# 清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
# Crawler for the mm.taobao.com listing: collects profile-page links from the
# paginated listing, then downloads every matching JPEG found on each profile.
import http.client
import os
import urllib.request

# Listing endpoint; the 1-based page number is appended to this prefix.
mmurl = "http://mm.taobao.com/json/request_top_list.htm?type=0&page="

# Running totals of image downloads, updated by getPicUrl().
SUCC = 0
FAIL = 0

# Directory (relative to the working directory) that receives the images.
IMAGE_DIR = "beautiful"
# Seconds to wait for a page so that a stalled server cannot hang the crawl.
REQUEST_TIMEOUT = 30
# Encoding the pages are decoded with.
PAGE_ENCODING = "GBK"

# Textual markers used to locate links inside the raw HTML.
_IMG_ATTR = 'src="'
_IMG_PREFIX = _IMG_ATTR + "http://img0"
_IMG_SUFFIX = ".jpg"
_DIV_OPEN = '<div class="pic s60">'
_DIV_CLOSE = "</div>"
_HREF_OPEN = "<a href="
_HREF_SUFFIX = ".htm"

# Failures treated as "this fetch did not work": network and file errors
# (OSError, which includes URLError and timeouts), malformed URLs
# (ValueError) and low-level HTTP protocol errors.
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def _fetch_html(url):
    """Return the body of *url* decoded as text, closing the response."""
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        # Only ASCII markers are searched for, so undecodable bytes are
        # replaced instead of aborting the whole crawl.
        return response.read().decode(PAGE_ENCODING, errors="replace")


def _extract_image_urls(page_html):
    """Return every ``http://img0...jpg`` URL that follows a ``src="``."""
    urls = []
    cursor = 0
    while True:
        start = page_html.find(_IMG_PREFIX, cursor)
        if start == -1:
            break
        end = page_html.find(_IMG_SUFFIX, start)
        if end == -1:
            break
        # Drop the leading 'src="' and keep the ".jpg" suffix.
        urls.append(page_html[start + len(_IMG_ATTR):end + len(_IMG_SUFFIX)])
        cursor = end
    return urls


def _extract_profile_links(page_html, per_page):
    """Return the profile links of the first *per_page* picture blocks."""
    links = []
    cursor = 0
    for _ in range(per_page):
        start = page_html.find(_DIV_OPEN, cursor)
        if start == -1:
            break  # fewer blocks than expected: stop instead of adding ""
        end = page_html.find(_DIV_CLOSE, start)
        if end == -1:
            break
        block = page_html[start:end]
        cursor = end
        href_start = block.find(_HREF_OPEN)
        if href_start == -1:
            continue
        href_end = block.find(_HREF_SUFFIX, href_start)
        if href_end == -1:
            continue
        # "+ 1" skips the quote character that opens the attribute value.
        link = block[href_start + len(_HREF_OPEN) + 1:href_end + len(_HREF_SUFFIX)]
        if link:
            links.append(link)
    return links


def getPicUrl(htmlurl, num) -> None:
    """Download every matching image found on one profile page.

    The page is fetched once. Each image is saved as
    ``<IMAGE_DIR>/<num>-<index>.jpg``, where ``index`` starts at 1 and is
    consumed even when that download fails. ``SUCC`` / ``FAIL`` are updated
    per image; a page that cannot be fetched counts as a single failure and
    the function returns instead of retrying forever.

    Args:
        htmlurl: URL of the profile page.
        num: 1-based sequence number of the profile, used in progress
            messages and in the file names.
    """
    global SUCC, FAIL

    try:
        page_html = _fetch_html(htmlurl)
    except _FETCH_ERRORS:
        print("图片获取失败,可能是服务器自动屏蔽掉了")
        FAIL += 1
        return

    for index, image_url in enumerate(_extract_image_urls(page_html), 1):
        print("正在下载第", num, "个美女的第", index, "张图片")
        target = os.path.join(IMAGE_DIR, str(num) + "-" + str(index) + ".jpg")
        try:
            # Created lazily so that a page without images leaves no
            # empty directory behind; a failure here counts like any other.
            os.makedirs(IMAGE_DIR, exist_ok=True)
            urllib.request.urlretrieve(image_url, target)
        except _FETCH_ERRORS:
            print("图片获取失败,可能是服务器自动屏蔽掉了")
            FAIL += 1
        else:
            SUCC += 1

    print("第", num, "个美女的图片下载完毕")


def getPageUrl(mmurl, pages=80, per_page=10) -> list:
    """Collect profile-page links from the paginated listing.

    Args:
        mmurl: Listing URL prefix; the page number (1..pages) is appended.
        pages: Number of listing pages to read.
        per_page: Maximum number of picture blocks examined on each page.

    Returns:
        The profile links in listing order. Blocks without a usable link are
        skipped rather than recorded as empty strings.

    Raises:
        OSError, ValueError: propagated from ``urllib.request.urlopen`` when a
            listing page cannot be fetched.
    """
    hreflist = []
    for page in range(1, pages + 1):
        cont = _fetch_html(mmurl + str(page))
        hreflist.extend(_extract_profile_links(cont, per_page))
        print("正在读取第" + str(page) + "页的美女图片地址")
    return hreflist


def main() -> None:
    """Crawl the listing, download all images and print the totals."""
    hreflist = getPageUrl(mmurl)
    print("共有美女个人图片页面", len(hreflist))
    for num, profile_url in enumerate(hreflist, 1):
        print("开始下载第", num, "个美女图")
        getPicUrl(profile_url, num)
    print("成功下载图片:", SUCC, "****下载图片失败:", FAIL)


if __name__ == "__main__":
    main()