# -*- coding:utf-8 -*-
# Author: archingB
import re
import urllib2

readme = "Press Enter to start reading, q to quit, w to write the current item to save.txt"


def getpage(page):
    """Fetch one page of text jokes from qiushibaike.com and return the raw item strings."""
    url = 'http://www.qiushibaike.com/textnew/page/' + str(page)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8')
        # Each item sits between its content div and a 10-digit timestamp comment.
        pattern = re.compile(r'<div class="content">(.*?)<!--\d{10}-->', re.S)
        items = re.findall(pattern, content)
        return items
    except urllib2.URLError, e:
        # Report HTTP/network errors and return an empty page so the caller can stop cleanly.
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
        return []
def writes(item):
    """Append one item to save.txt, numbered by the global counter M."""
    item_swap = item.encode('utf-8')
    # 'with' guarantees the file is closed even if the write fails.
    with open('save.txt', 'a') as f:
        f.write(str(M) + '\n' + item_swap + '\n')
def main():
    global M
    P = 1  # current page number
    N = 0  # index of the current item on that page
    M = 1  # running number written in front of each saved item
    print readme
    items = getpage(P)
    raw_input()
    while True:
        if N >= len(items):
            # Current page exhausted (or fetch failed): move on to the next page.
            P += 1
            N = 0
            items = getpage(P)
            if not items:
                break
        items[N] = re.sub(r'<br/>', '\n', items[N])
        print N + 1, items[N]
        static = raw_input()
        if static == 'q' or static == 'Q':
            break
        if static == 'w' or static == 'W':
            writes(items[N])
            M += 1
        N += 1


main()
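
The script above targets Python 2 (urllib2, print statements, raw_input). If you are on Python 3, a minimal sketch of the same fetch-and-parse step could look like the following; it reuses the same URL, User-Agent and regex, and assumes qiushibaike.com still serves the markup that regex was written against, which may no longer be the case.

# Python 3 sketch of getpage(): same URL, headers and regex as above,
# but the site's markup may have changed since this was written.
import re
import urllib.error
import urllib.request


def getpage(page):
    url = 'http://www.qiushibaike.com/textnew/page/' + str(page)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
        return re.findall(r'<div class="content">(.*?)<!--\d{10}-->', content, re.S)
    except urllib.error.URLError as e:
        # HTTPError carries .code, plain URLError carries .reason.
        print(getattr(e, 'code', ''), getattr(e, 'reason', ''))
        return []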