'''
Live-follow a thread on the Tianya forum (bbs.tianya.cn)
2015-04-26
Python 3.4.3
'''
import os
import re
import time
import requests
from bs4 import BeautifulSoup
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'bbs.tianya.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
}
refushtime = 30  # refresh interval in seconds
lastpage = 1     # number of the newest page seen so far
Furl = ''        # URL of the newest page
lzname = ''      # username of the thread author (the OP)
#Turl = 'http://bbs.tianya.cn/post-stocks-1131734-1.shtml'  # initial thread URL
Turl = 'http://bbs.tianya.cn/post-stocks-1345750-1.shtml'   # initial thread URL (first page)
def bsp(newurl):  # fetch a page and build its BeautifulSoup tree
    html = requests.get(newurl, headers=header, timeout=10)
    # .content.decode() assumes UTF-8; it renders the Chinese text correctly here
    soup = BeautifulSoup(html.content.decode(), 'html.parser')
    return soup
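# Usage sketch: bsp() is always called with a full thread-page URL, e.g.
#   soup = bsp(Turl)  # first page
#   soup = bsp(Furl)  # newest page, once pagepro() has rebuilt it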
def pagnum(soup):  # read the total page count from the page's inline script
    tx = soup.find('script', {'type': 'text/javascript'}).text
    req = re.search(r'pageCount : \d*,', tx).group(0)
    req = req[12:-1]  # drop the 12-char 'pageCount : ' prefix and the trailing comma
    return int(req)
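# The first inline <script> is assumed to embed a fragment like 'pageCount : 25,';
# the regex grabs that fragment and the slice leaves just the digits ('25').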
def pagepro():  # rebuild the thread URL so it points at the newest page
    global Furl
    tx1 = Turl.split('-')
    # e.g. ['http://bbs.tianya.cn/post', 'stocks', '1345750', '1.shtml']
    Furl = tx1[0] + '-' + tx1[1] + '-' + tx1[2] + '-' + '%d.shtml' % lastpage
    return Furl
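# Example: with Turl ending in '-1.shtml' and lastpage == 5, pagepro() returns
# 'http://bbs.tianya.cn/post-stocks-1345750-5.shtml'.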
def pagecollect():  # scrape the OP's posts (time + body) from the newest page
    soup = bsp(Furl)
    txt = []
    lzpost = soup.find_all('div', {'_host': lzname})  # post divs authored by the OP
    for item in lzpost:
        ntime = item.find('div', {'class': 'atl-info'}).text      # post timestamp
        post = item.find('div', {'class': 'atl-content'})         # outer content wrapper
        post = post.find('div', {'class': 'bbs-content'}).text    # actual post body
        txt.append(ntime)
        txt.append(post.strip())
    return txt
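# txt is a flat list alternating [time1, body1, time2, body2, ...];
# formatprint() below consumes it in pairs.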
def formatprint(txt):  # print the collected posts
    if not txt:
        print('===========None============')
        return
    for i in range(0, len(txt), 2):  # txt alternates [time, body, time, body, ...]
        print('=' * 30)
        print(txt[i])
        # post bodies may contain a 29-dash divider; shorten it for the console
        txt[i + 1] = txt[i + 1].replace('-' * 29, '\n----------\n')
        print(txt[i + 1])
        print('=' * 30)
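# Expected console shape per post:
#   ==============================
#   <timestamp line>
#   <post body>
#   ==============================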
def main():
    global refushtime, lastpage, Furl, Turl, lzname  # lzname must be global here,
    # otherwise pagecollect() would keep matching against the empty module-level value
    #Turl = input('>>')  # optionally prompt for the thread URL
    soup = bsp(Turl)
    title = re.sub('_.*', '=====', soup.title.text)  # strip the site suffix after the first '_'
    print('=====', title)
    lastpage = pagnum(soup)
    print('LastPage:', lastpage)
    Furl = pagepro()  # synthesize the newest-page URL
    print('LastURL:', Furl)
    lzname = soup.find('div', {'class': 'atl-menu clearfix js-bbs-act'})['js_activityusername']
    print('Lzname:', lzname)
    formatprint(pagecollect())  # first printout
    while True:
        time.sleep(refushtime)
        soup = bsp(Turl)  # refresh
        newpage = pagnum(soup)
        if newpage > lastpage:  # a new page has appeared; follow it
            print('LastPage:', newpage)
            lastpage = newpage
            Furl = pagepro()
            formatprint(pagecollect())
        else:  # still the same page; clear the screen and reprint it
            os.system('cls' if os.name == 'nt' else 'clear')  # 'cls' is Windows-only
            formatprint(pagecollect())
            print('==========Refresh==========')
if __name__ == '__main__':
    main()
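# To run (module filename is hypothetical): `python tianya_live.py`.
# Point Turl at the first page of any thread whose URL matches the
# post-<board>-<id>-<page>.shtml pattern.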