清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | #!/usr/bin/env python #coding=utf-8 """ Author: Anemone Filename: getmain.py Last modified: 2015-02-19 16:47 E-mail: anemone@82flex.com """ import urllib2 from bs4 import BeautifulSoup import re import sys reload(sys) sys.setdefaultencoding( 'utf-8' ) def getEachArticle(url): # response = urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html') response = urllib2.urlopen(url) html = response.read() soup = BeautifulSoup(html) #.decode("utf-8").encode("gbk")) #for i in soup.find_all('div'): # print i,1 title = soup.find( "h1" ).string writer = soup.find(id = "pub_date" ).string.strip() _from = soup.find(id = "media_name" ).string.strip() text = soup.get_text() #.encode("utf-8") main = re.split( "BAIDU_CLB.*;" ,text) result = { "title" :title, "writer" :writer, "from" :_from, "context" :main[ 1 ]} return result #new=open("new.txt","w") #new.write(result["title"]+"\n\n") #new.write(result["writer"]+" "+result["from"]) #new.write(result["context"]) #new.close() def getCatalog(issue): url = "http://www.52duzhe.com/" + issue[: 4 ] + "_" + issue[ - 2 :] + "/" firstUrl = url + "duzh" + issue + "01.html" firstUrl = url + "index.html" duzhe = dict() response = urllib2.urlopen(firstUrl) html = response.read() soup = BeautifulSoup(html) firstUrl = url + soup.table.a.get( "href" ) response = urllib2.urlopen(firstUrl) html = response.read() soup = BeautifulSoup(html) all = soup.find_all( "h2" ) for i in all: print i.string duzhe[i.string] = list() for link in i.parent.find_all( "a" ): href = url + link.get( "href" ) print href while 1 : try : article = getEachArticle(href) break except : continue duzhe[i.string].append(article) return duzhe def readDuZhe(duzhe): for eachColumn in duzhe: for eachArticle in duzhe[eachColumn]: print eachArticle[ "title" ] if __name__ = = '__main__' : # issue=raw_input("issue(201501):") readDuZhe(getCatalog( "201424" )) |