# Crawl wallpaper galleries from http://www.gamersky.com/ent/wp/:
# read the index page, follow the first i_count article links, walk each
# article's pagination pages, and download every img1.gamersky.com JPEG found.
import re
import os
import urllib.request

home = "http://www.gamersky.com/ent/wp/"
i_count = 5  # number of article links to crawl from the index page


def downImg(url, title):
    # Save every wallpaper linked from one gallery page into a folder
    # named after the article.
    if not os.path.exists(title):
        os.makedirs(title)
    page = urllib.request.urlopen(url).read()
    try:
        page = page.decode('utf-8')
    except UnicodeDecodeError:
        print("try gbk code")
        page = page.decode('gbk')
    # s_key = r'src="(http://img1.gamersky.com/.+?\.jpg)"'
    s_key = (r'href="http://www.gamersky.com/showimage/id_gamersky\.shtml'
             r'\?(http://img1.gamersky.com/.+?\.jpg)"')
    re_c = re.compile(s_key)
    ls2 = re_c.findall(page)
    for l2 in ls2:
        try:
            (p2, f2) = os.path.split(l2)
            target = os.path.join(title, f2)
            if os.path.exists(target):  # skip images downloaded earlier
                continue
            print(l2)
            urllib.request.urlretrieve(l2, target)
        except Exception:
            print('down image error!')


if __name__ == '__main__':
    try:
        url = home
        page = urllib.request.urlopen(url).read()
        try:
            page = page.decode('utf-8')
        except UnicodeDecodeError:
            print("try gbk code")
            page = page.decode('gbk')
        print(len(page))
        # Collect article links from the index page.
        s_key = r'href="(http://www.gamersky.com/ent.+?\.shtml)"'
        re_c = re.compile(s_key)
        ls = re_c.findall(page)
        i = 0
        for l in ls:
            if i >= i_count:
                break
            i += 1
            print("(" + str(i) + "/" + str(i_count) + ") " + l)
            try:
                (path, file) = os.path.split(l)
                title = file.replace('.shtml', '')
                page = urllib.request.urlopen(l).read()
                page = page.decode('utf-8')
                # Pagination pages share the article URL minus '.shtml'
                # (e.g. foo_2.shtml); re.escape keeps the dots literal.
                url2 = l.replace('.shtml', '')
                s_key = 'href="(' + re.escape(url2) + '.+?)"'
                re_c = re.compile(s_key)
                ls2 = re_c.findall(page)
                j = 0
                for l2 in ls2:
                    j += 1
                    print("(" + str(j) + "/" + str(len(ls2)) + ") " + l2)
                    try:
                        downImg(l2, title)
                    except Exception:
                        print('error II !')
            except Exception:
                print('error!')
    except Exception:
        print("read index error!")
    print('finish!')
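The bare urllib.request.urlopen and urlretrieve calls above send Python's default User-Agent and wait indefinitely, so a slow or agent-filtering server can stall the whole crawl. Below is a minimal hardened fetch helper, a sketch only: the USER_AGENT string, the 10-second timeout, and the fetch/save names are illustrative assumptions, not part of the original script.

import urllib.request

# Assumed values for illustration; tune them for the target site.
USER_AGENT = "Mozilla/5.0 (compatible; image-crawler)"
TIMEOUT_SECONDS = 10

def fetch(url):
    """Fetch a URL with an explicit User-Agent and timeout; return raw bytes."""
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=TIMEOUT_SECONDS) as resp:
        return resp.read()

def save(url, path):
    """Download url to path; a drop-in stand-in for urlretrieve."""
    with open(path, "wb") as f:
        f.write(fetch(url))

Swapping the direct urlopen(...).read() calls for fetch(...) and urlretrieve(...) for save(...) would leave the rest of the script's logic unchanged.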