清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/env python
# coding: utf-8
"""Scrape the ``www...`` table cells from a paginated company listing.

Run as a script, every listing page from 1 to ``LAST_PAGE`` is downloaded,
each substring that starts at ``www`` and runs up to the next ``</td>`` is
extracted, and the results are appended to ``OUTPUT_PATH``, one per line.
"""
import io
import os
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import Request, urlopen
except ImportError:  # Python 2, which the script was originally written for
    from urllib2 import Request, urlopen

# Prefix of every listing page; the page number is appended to it.
BASE_URL = 'https://but/company/lists/page/'
# Number of the last listing page to fetch (pages are numbered from 1).
LAST_PAGE = 78
# File that the extracted entries are appended to.
OUTPUT_PATH = 'but.txt'


def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``User-Agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The page source as a text (unicode) string.

    Raises:
        Whatever ``urlopen`` raises on network/HTTP failure, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}
    req = Request(url, headers=headers)
    # closing() releases the connection even if read()/decode() fails; the
    # Python 2 response object is not a context manager by itself.
    with closing(urlopen(req)) as response:
        return response.read().decode('utf-8')


def urlPages(page):
    """Return the URL of listing page number *page*.

    Args:
        page: Page number; it is converted with ``str()`` and appended to
            ``BASE_URL``.
    """
    return BASE_URL + str(page)


def findList(html):
    """Return every ``www...</td>`` fragment found in *html*.

    The match is non-greedy and ``re.S`` lets it span line breaks, so each
    result starts at an occurrence of ``www`` and ends with the nearest
    following ``</td>`` (the closing tag is included in the result).

    Args:
        html: Page source to search.

    Returns:
        A list of matched strings, in document order; empty if none match.
    """
    return re.findall(r'www.*?</td>', html, re.S)


def main():
    """Fetch all listing pages and append the extracted entries to disk."""
    # Open the output once, explicitly as UTF-8, so non-ASCII entries are
    # written the same way on Python 2 and 3 regardless of the locale.
    with io.open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
        for page in range(1, LAST_PAGE + 1):
            html = getHtml(urlPages(page))
            for item in findList(html):
                # Drop the closing tag that the pattern captured.
                out.write(item.replace('</td>', '') + u'\n')
    print(u'\n\n本王的网站下载完毕啦!')


if __name__ == '__main__':
    main()