# -*- coding: utf-8 -*-
# Python 2 script: crawls iconmatrix.sharpmark.net and downloads every
# icon image concurrently with a gevent pool.
from gevent import monkey
monkey.patch_all()  # patch the stdlib (sockets etc.) before other imports

from gevent.pool import Pool
import urllib2
import re
import os

all_pic_urls = []
base_url = "http://iconmatrix.sharpmark.net"
pic_save_path = './icons'


def get_page_count():
    """Read the pagination footer and return the number of the last page."""
    content = urllib2.urlopen(base_url).read()
    # The "尾页" (last page) link looks like <a href=".../NN/">尾页</a>;
    # capture the page number. The group matches digits only, so the extra
    # isalnum() check in the original is unnecessary.
    result = re.findall(r'([0-9]+)/">尾页', content)
    if not result:
        return 0
    return int(result[0])


def download_pic(pic_url):
    """Fetch one image and save it under pic_save_path."""
    data = urllib2.urlopen(pic_url).read()
    name = os.path.basename(pic_url)
    # os.path.join avoids the missing-slash bug in pic_save_path + name.
    with open(os.path.join(pic_save_path, name), "wb") as fp:
        fp.write(data)


def get_pic_urls(page_url):
    """Collect the icon <img> src attributes from one listing page."""
    global all_pic_urls
    content = urllib2.urlopen(page_url).read()
    m = re.findall(r'<img\sclass="redraw-icon\sicon-shadow"\ssrc="(.*?)"',
                   content)
    all_pic_urls += m


if __name__ == '__main__':
    # Make sure the output directory exists before any download starts.
    if not os.path.exists(pic_save_path):
        os.makedirs(pic_save_path)

    page_count = get_page_count()
    p = Pool(20)  # at most 20 concurrent greenlets

    # Phase 1: scrape every listing page for image URLs.
    for i in xrange(1, page_count + 1):
        page_url = base_url + '/apps/page/%d/' % i
        p.spawn(get_pic_urls, page_url)
    p.join()

    # Phase 2: download every collected image.
    for pic_url in all_pic_urls:
        p.spawn(download_pic, base_url + pic_url)
    p.join()
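
The script above is Python 2 only (urllib2, xrange). As a minimal sketch of the same two-phase crawl on Python 3, one can swap gevent for a standard-library thread pool; the sketch below carries over the site URL and regexes unchanged but is an assumption-based port, not tested against the live site:

# Python 3 sketch of the same crawler: stdlib only, thread pool instead of
# gevent. URL patterns copied from the Python 2 version above.
import os
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

BASE_URL = "http://iconmatrix.sharpmark.net"
SAVE_DIR = "./icons"


def fetch(url):
    # A lenient utf-8 decode is enough here: we only match ASCII markup
    # plus the "尾页" (last page) marker.
    return urlopen(url).read().decode("utf-8", errors="replace")


def page_count():
    m = re.findall(r'([0-9]+)/">尾页', fetch(BASE_URL))
    return int(m[0]) if m else 0


def pic_urls(page_url):
    return re.findall(r'<img\sclass="redraw-icon\sicon-shadow"\ssrc="(.*?)"',
                      fetch(page_url))


def download(pic_url):
    data = urlopen(pic_url).read()
    with open(os.path.join(SAVE_DIR, os.path.basename(pic_url)), "wb") as fp:
        fp.write(data)


if __name__ == "__main__":
    os.makedirs(SAVE_DIR, exist_ok=True)
    with ThreadPoolExecutor(max_workers=20) as pool:
        pages = [BASE_URL + "/apps/page/%d/" % i
                 for i in range(1, page_count() + 1)]
        # Phase 1: gather image paths from every listing page.
        all_pics = [u for urls in pool.map(pic_urls, pages) for u in urls]
        # Phase 2: download them all.
        list(pool.map(download, (BASE_URL + u for u in all_pics)))

Threads trade gevent's lightweight greenlets for a fixed pool of OS threads; for an I/O-bound job of this size the throughput is comparable, and no monkey-patching is required.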