# 清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
"""Scrape the "hot apps" listing on zuimeia.com into a text file.

The script downloads the hot-app listing page, follows every app link found
on it, extracts the title, categories, description and download URL from
each app page, and appends one record per app to ``OUTPUT_PATH``.

Requires Python 3 and the third-party ``requests`` package.
"""
import re

import requests

# Site root; the listing page yields site-relative links that are appended to it.
url = "http://zuimeia.com"
HOT_LIST_URL = 'http://zuimeia.com/community/app/hot/?platform=2'
OUTPUT_PATH = 'c:/wqa.txt'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Written after every record in the output file.
RECORD_SEPARATOR = '\n\n\n-----------------------------------------------------------------------------------------------------------------------------\n\n\n'

# Link to each app's detail page on the listing page.
pattern = re.compile(r'<a class="community-app-cover-wrapper" href="(.*?)" target="_blank">')

_TITLE_RE = re.compile(r'"app-title"><h1>(.*?)</h1>')
# NOTE(review): the '?' before 'platform' is unescaped, so it makes the
# preceding '/' optional instead of matching a literal '?'. The pattern text
# is left unchanged; confirm against the live markup before tightening it.
_CATEGORY_RE = re.compile(r'<a class="app-tag" href="/community/app/category/title/.*?/?platform=2">(.*?)</a>')
_DESCRIBE_RE = re.compile(r'<div id="article_content">(.*?)<div class="community-image-wrapper">')
_DOWNLOAD_RE = re.compile(r'<a class="download-button direct hidden" href="(.*?)"')

# Tags that srtReplace turns into line breaks.
_STRIPPED_TAGS = (
    '<p>', '<br>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<h7>',
    '<strong>', '</p>', '<br/>', '</h1>', '</h2>', '</h3>', '</h4>', '</h5>',
    '</h6>', '</h7>', '</strong>', '<b>', '</b>',
)


def requestsUrl(url):
    """Download one app detail page and extract its fields.

    Args:
        url: Absolute URL of the app detail page.

    Returns:
        A tuple ``(title, category, strdescribe, downloadUrl)``. ``title``,
        ``category`` and ``downloadUrl`` are the lists of regex matches found
        in the page; ``strdescribe`` is the cleaned description text, or an
        empty string when the page has no description block.
    """
    # Use the decoded text: the patterns are str, and str patterns cannot be
    # applied to the raw bytes in ``Response.content``.
    page = requests.get(url, timeout=REQUEST_TIMEOUT).text
    title = _TITLE_RE.findall(page)
    category = _CATEGORY_RE.findall(page)
    describe = _DESCRIBE_RE.findall(page)
    strdescribe = srtReplace(describe[0]) if describe else ''
    downloadUrl = _DOWNLOAD_RE.findall(page)
    return title, category, strdescribe, downloadUrl


def srtReplace(string):
    """Turn the formatting tags in an HTML fragment into plain line breaks.

    Every tag listed in ``_STRIPPED_TAGS`` is replaced by a newline, then
    every pair of consecutive newlines is removed.

    NOTE(review): pairs of newlines are removed outright rather than reduced
    to a single newline, so adjacent paragraphs end up joined. That behavior
    is kept as-is; confirm whether it is intended.

    Args:
        string: HTML fragment to clean.

    Returns:
        The cleaned text.
    """
    for tag in _STRIPPED_TAGS:
        string = string.replace(tag, '\n')
    return string.replace('\n\n', '')


def categornFinal(category):
    """Join category names into one string, each followed by ``'-->'``.

    Args:
        category: Iterable of category names.

    Returns:
        The concatenated string; empty when ``category`` is empty.
    """
    return ''.join(str(eachCategory) + '-->' for eachCategory in category)


def urlReplace(url):
    """Decode HTML-escaped ampersands (``&amp;``) in a URL taken from markup.

    Args:
        url: URL as it appears inside an HTML attribute.

    Returns:
        The URL with every ``&amp;`` replaced by ``&``.
    """
    return url.replace('&amp;', '&')


def _first(matches):
    """Return the first regex match, or an empty string when there is none."""
    return matches[0] if matches else ''


def _formatRecord(title, category, strdescribe, downloadUrl):
    """Build the text block written to the output file for one app."""
    return (
        'title:' + title + '\n'
        + 'category:' + category + '\n'
        + 'strdescribe:' + strdescribe + '\n'
        + 'downloadUrl:' + downloadUrl
        + RECORD_SEPARATOR
    )


def main():
    """Scrape every app linked from the hot list and append it to the output file."""
    listing = requests.get(HOT_LIST_URL, timeout=REQUEST_TIMEOUT).text
    urlList = pattern.findall(listing)

    for eachUrl in urlList:
        content = requestsUrl(url + eachUrl)
        record = _formatRecord(
            _first(content[0]),
            categornFinal(content[1]),
            content[2],
            urlReplace(_first(content[3])),
        )
        # Append per record so earlier results are on disk if a later page
        # fails; UTF-8 is set explicitly because the pages contain non-ASCII
        # text and the platform default encoding may not cover it.
        with open(OUTPUT_PATH, 'a', encoding='utf-8') as fd:
            fd.write(record)


if __name__ == "__main__":
    main()