清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
# Scraper for the monthly air-quality (AQI) tables published on
# tianqihoubao.com.  For every city linked from the /aqi/ index page it
# downloads each monthly page and stores the table as a plain-text file under
# ./data/<city code>/<page name>.txt, one table row per line.
import os
import shutil
import re
import urllib.request

home = "http://www.tianqihoubao.com"

# Number of table cells that make up one output line in down2txt().
COLUMNS_PER_ROW = 9

# Text found between a closing '>' and the next '<' (i.e. a cell's content).
_CELL_RE = re.compile(">(.+?)<")
# City links on the /aqi/ index page: captures 'code.html">name'.
_CITY_LINK_RE = re.compile('href="/aqi/(.+?)</a>')


def _data_dir():
    """Return the ``data`` directory under the current working directory."""
    return os.path.join(os.getcwd(), "data")


def _fetch_page(url):
    """Download *url* and return its body decoded as GBK, else as UTF-8."""
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    try:
        return raw.decode("gbk")
    except UnicodeDecodeError:
        return raw.decode("utf-8")


def _extract_cells(page):
    """Return the non-empty cell texts of the first ``<table>`` in *page*.

    An empty list is returned when the page contains no table.
    """
    start = page.find("<table")
    if start == -1:
        return []
    end = page.find("</table>", start)
    table = page[start:end] if end != -1 else page[start:]
    tag_end = table.find(">")
    if tag_end == -1:
        return []
    # Drop the opening <table ...> tag itself, keeping its closing '>' so the
    # cell pattern can anchor on it.
    table = table[tag_end:]
    # Line breaks and <b> tags sit between a cell's tags and its text; remove
    # them so the text is directly enclosed by '>' and '<'.
    table = table.replace("\r\n", "").replace("</b>", "").replace("<b>", "")
    stripped = (cell.strip(" ") for cell in _CELL_RE.findall(table))
    return [cell for cell in stripped if cell]


def _load_index(path):
    """Return the set of ``"code name"`` records stored in the index file."""
    if not os.path.exists(path):
        return set()
    try:
        with open(path, "r") as index_file:
            return {line.strip("\n") for line in index_file}
    except (OSError, UnicodeError) as exc:
        # An unreadable index only costs re-checking cities; files that were
        # already saved are still skipped by down2txt().
        print("error!", repr(exc))
        return set()


def down2txt(code, tit, url, columns=COLUMNS_PER_ROW):
    """Save the table found at *url* to ``data/<code>/<tit>.txt``.

    Each output line holds *columns* cells, every cell followed by a single
    space.  Trailing cells that do not fill a whole line are discarded.
    Nothing is downloaded when the target file already exists.

    Args:
        code: City code; used as the sub-directory name.
        tit: File name without the ``.txt`` extension.
        url: Address of the page containing the table.
        columns: Number of cells per output line.

    Raises:
        ValueError: If *columns* is smaller than 1.
    """
    if columns < 1:
        raise ValueError("columns must be at least 1")

    city_dir = os.path.join(_data_dir(), code)
    path = os.path.join(city_dir, tit + ".txt")

    # Check before fetching so finished months cost no network round trip.
    if os.path.exists(path):
        print("文件已存在:" + tit + ".txt")
        return

    cells = _extract_cells(_fetch_page(url))

    os.makedirs(city_dir, exist_ok=True)
    # NOTE: the platform default encoding is kept so that files stay
    # compatible with ones written by earlier runs.
    with open(path, "w") as out:
        for first in range(0, len(cells) - columns + 1, columns):
            out.write(" ".join(cells[first:first + columns]) + " \n")


def down_city(name, code):
    """Download every monthly page linked from the city page of *code*.

    Args:
        name: Display name of the city (not used for the download itself).
        code: City code as it appears in the site's URLs.
    """
    url = home + "/aqi/" + code + ".html"
    print(url)
    page = _fetch_page(url)

    # Monthly pages are linked as /aqi/<code>-<period>.html in single quotes.
    link_re = re.compile("href='(/aqi/" + re.escape(code) + "-.+?html)'")
    for link in link_re.findall(page):
        month_url = home + link
        tit = link.replace("/aqi/", "").replace(".html", "")
        print(month_url)
        down2txt(code, tit, month_url)


def main():
    """Download all cities listed on the index page, skipping finished ones.

    Finished cities are recorded as ``"code name"`` lines in
    ``data/index.txt`` so that an interrupted run can be resumed.
    """
    page = _fetch_page(home + "/aqi/")
    links = _CITY_LINK_RE.findall(page)

    index_path = os.path.join(_data_dir(), "index.txt")
    finished = _load_index(index_path)

    index = 0
    for link in links:
        try:
            parts = (
                link.replace(" ", "")
                .replace('.html">', " ")
                .strip(" ")
                .split(" ")
            )
            if len(parts) != 2:
                continue
            code, name = parts
            index += 1
            print(str(index) + "/" + str(len(links)) + ": " + code + " " + name)

            # Look up the download record.
            record = code + " " + name
            if record in finished:
                print(name + " 已下载")
                continue

            down_city(name, code)

            # Save the record only after the whole city succeeded.
            os.makedirs(os.path.dirname(index_path), exist_ok=True)
            with open(index_path, "a") as index_file:
                index_file.write(record + "\n")
            finished.add(record)
        except Exception as exc:
            # One failing city must not stop the remaining downloads.
            print("error!", repr(exc))

    print("finished!")


if __name__ == "__main__":
    main()