# 清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/python3
"""Scrape proxy "ip:port" pairs from www.proxy.com.ru and save them to a file.

The listing pages are downloaded concurrently, decoded to text, scanned for
``<td>ip</td><td>port</td>`` table cells, and the resulting ``ip:port``
strings are written one per line to ``OUTPUT_PATH``.
"""
import os
import os.path
import urllib.request
import time
import re
from multiprocessing.dummy import Pool as ThreadPool

try:
    import chardet
except ImportError:
    # chardet is only needed to guess page encodings; without it
    # _decode_page() falls back to trial decoding, so the parsing helpers
    # stay importable on machines that do not have it installed.
    chardet = None

# Listing pages are numbered 1..PAGE_COUNT.
PAGE_URL_TEMPLATE = "http://www.proxy.com.ru/list_%d.html"
PAGE_COUNT = 41
OUTPUT_PATH = r'/home/mobilefzb/socket_list.txt'

# Matches one table row fragment: an IPv4 cell immediately followed by a
# port cell. Compiled once instead of on every parse call.
_SOCKET_CELL_RE = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>')


def url_content_read(url, timeout=30):
    """Download *url* and return the raw response body as bytes.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang a worker forever.

    Raises:
        OSError: On network failures (``urllib.error.URLError`` and socket
            timeouts are subclasses of it).
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read()


def url_socket_list_pharse(socket_raw_info_data):
    """Extract proxy endpoints from the HTML text of a listing page.

    Args:
        socket_raw_info_data: Decoded HTML text of one page.

    Returns:
        A list of ``"ip:port"`` strings in document order; empty when the
        text contains no matching table cells.
    """
    return [
        "%s:%s" % (ip, port)
        for ip, port in _SOCKET_CELL_RE.findall(socket_raw_info_data)
    ]


def _fetch_or_empty(url):
    """Fetch *url*, returning ``b""`` (and reporting it) when the fetch fails."""
    try:
        return url_content_read(url)
    except OSError as err:
        # One dead page must not discard the pages that did download.
        print("failed to fetch %s: %s" % (url, err))
        return b""


def _decode_page(raw):
    """Decode downloaded page bytes to text, guessing the encoding."""
    if chardet is None:
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            return raw.decode('gbk', 'ignore')

    # detect() may report None for the encoding; compare case-insensitively
    # so differently-cased names of the same encoding take the same branch.
    encoding = (chardet.detect(raw)['encoding'] or '').lower()
    if encoding == 'utf-8':
        return raw.decode('utf-8', 'ignore')
    if encoding == 'gb2312':
        return raw.decode('GB2312', 'ignore')
    # Anything else is treated as GBK.
    return raw.decode('gbk', 'ignore')


def main():
    """Download every listing page, parse it and write the proxy list."""
    urls = [PAGE_URL_TEMPLATE % i for i in range(1, PAGE_COUNT + 1)]

    # Downloads are I/O-bound, so a small thread pool overlaps the waits.
    start_time = time.time()
    pool = ThreadPool(4)
    try:
        pages = pool.map(_fetch_or_empty, urls)
    finally:
        pool.close()
        pool.join()
    print("time spent: %f" % (time.time() - start_time))

    # Decode the fetched pages from bytes to text.
    socket_info_raw_data_list = [_decode_page(page) for page in pages]

    # Parse the decoded pages. This is in-memory regex work, so a plain
    # loop replaces the former 40-thread pool with identical results.
    start_time = time.time()
    results = [
        url_socket_list_pharse(text) for text in socket_info_raw_data_list
    ]
    print("time spent: %f" % (time.time() - start_time))

    # Finally write the results out, one "ip:port" per line.
    with open(OUTPUT_PATH, 'w') as fp:
        fp.writelines("%s\n" % si_res for res in results for si_res in res)


if __name__ == "__main__":
    main()