'''
Live-follow the original poster of a Tianya (天涯) forum thread.
2015-04-26, written for Python 3.4.3.
'''
import os
import re
import time

import requests
from bs4 import BeautifulSoup

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'bbs.tianya.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
}
refushtime = 30  # default refresh interval in seconds
lastpage = 1     # number of the latest page
Furl = ''        # URL of the latest page
lzname = ''      # username of the original poster (楼主)
#Turl = 'http://bbs.tianya.cn/post-stocks-1131734-1.shtml'  # initial thread URL
Turl = 'http://bbs.tianya.cn/post-stocks-1345750-1.shtml'

def bsp(newurl):
    '''Fetch a page and return a BeautifulSoup object.'''
    html = requests.get(newurl, headers=header, timeout=10)
    return BeautifulSoup(html.content.decode(), 'html.parser')

def pagnum(soup):
    '''Extract the total page count from the page's inline JavaScript.'''
    tx = soup.find('script', {'type': 'text/javascript'}).text
    return int(re.search(r'pageCount : (\d+)', tx).group(1))

def pagepro():
    '''Build the URL of the latest page from the initial thread URL.'''
    global Furl
    tx1 = Turl.split('-')
    # ['http://bbs.tianya.cn/post', 'stocks', '1131734', '1.shtml']
    Furl = '%s-%s-%s-%d.shtml' % (tx1[0], tx1[1], tx1[2], lastpage)
    return Furl

def pagecollect():
    '''Collect the original poster's posts on the latest page.

    Returns a flat list alternating [time, content, time, content, ...].
    '''
    soup = bsp(Furl)
    txt = []
    for lzpost in soup.findAll('div', {'_host': lzname}):
        ntime = lzpost.find('div', {'class': 'atl-info'}).text  # post timestamp
        post = lzpost.find('div', {'class': 'atl-content'})     # outer container
        post = post.find('div', {'class': 'bbs-content'}).text  # actual post body
        txt.append(ntime)
        txt.append(post.strip())
    return txt

def formatprint(txt):
    '''Pretty-print the [time, content, ...] list collected above.'''
    if not txt:
        print('===========None============')
        return
    for i in range(0, len(txt), 2):
        print('=' * 30)
        print(txt[i])
        print(txt[i + 1].replace('-' * 29, '\n----------\n'))
        print('=' * 30)

def main():
    global lastpage, Furl, lzname
    soup = bsp(Turl)
    title = re.sub('_.*', '=====', soup.title.text)
    print('=====', title)
    lastpage = pagnum(soup)
    print('LastPage:', lastpage)
    Furl = pagepro()  # compose the latest-page URL
    print('LastURL:', Furl)
    lzname = soup.find('div', {'class': 'atl-menu clearfix js-bbs-act'})['js_activityusername']
    print('Lzname:', lzname)
    formatprint(pagecollect())  # first dump
    while True:
        time.sleep(refushtime)
        soup = bsp(Turl)  # re-fetch page 1 to see whether a new page appeared
        newpage = pagnum(soup)
        if newpage > lastpage:
            print('LastPage:', newpage)
            lastpage = newpage
            Furl = pagepro()
            formatprint(pagecollect())
        else:
            os.system('cls')  # clear the console (Windows only)
            print('==========Refresh==========')
            formatprint(pagecollect())

if __name__ == '__main__':
    main()
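The script only needs two third-party packages, requests and beautifulsoup4 (pip install requests beautifulsoup4); everything else is in the standard library. If you want to sanity-check the page-count extraction without hitting bbs.tianya.cn, the regex can be exercised against a made-up fragment of inline JavaScript. The sample_js string below is a hypothetical stand-in for what a real thread page embeds, not actual Tianya markup:

import re

# Hypothetical stand-in for the inline JavaScript on a Tianya thread page;
# the real script embeds many more fields, but only pageCount matters here.
sample_js = "var bbsGlobal = { pageCount : 17, replyCount : 512 };"

match = re.search(r'pageCount : (\d+)', sample_js)
print(int(match.group(1)))  # prints 17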