#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup as bs
import csv
import os
import sys

# Python 2 hack so the csv module can write utf-8 strings directly.
reload(sys)
sys.setdefaultencoding('utf-8')


def GetAllLink():
    num = int(raw_input("How many pages to scrape:> "))
    if not os.path.exists('./data/'):
        os.mkdir('./data/')
    for i in range(num):
        if i + 1 == 1:
            url = 'http://nj.58.com/piao/'
        else:
            url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
        GetPage(url, i)


def GetPage(url, num):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    page = urllib2.urlopen(req).read().decode('utf-8')

    soup = bs(page, 'html.parser')
    table = soup.table
    tag = table.find_all('tr')              # keep only the listing rows we need
    soup2 = bs(str(tag), 'html.parser')

    title = soup2.find_all('a', 't')        # title and URL
    price = soup2.find_all('b', 'pri')      # sale price
    fixedprice = soup2.find_all('del')      # original price
    date = soup2.find_all('span', 'pr25')   # show time

    atitle = []
    ahref = []
    aprice = []
    afixedprice = []
    adate = []
    for i in title:
        atitle.append(i.get_text())
        ahref.append(i.get('href'))
    for i in price:
        aprice.append(i.get_text())
    for i in fixedprice:
        afixedprice.append(i.get_text())
    for i in date:
        adate.append(i.get_text())

    csvfile = open('./data/ticket_%s.csv' % num, 'w')
    writer = csv.writer(csvfile)
    writer.writerow(['title', 'url', 'price', 'original price', 'show time'])

    # Every listing has a title, but not necessarily a show time (date) or
    # an original price (del). Pad the shorter lists with '---' so indexing
    # below does not raise IndexError. (Caveat: padding at the end can
    # misalign values when a listing in the middle of the page lacks a field.)
    for shorter in (adate, afixedprice):
        while len(shorter) < len(atitle):
            shorter.append('---')

    for i in range(len(atitle)):
        writer.writerow([atitle[i], ahref[i], aprice[i], afixedprice[i], adate[i]])

    print "[Result]:> page %s saved!" % (num + 1)
    csvfile.close()


if __name__ == '__main__':
    GetAllLink()
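The script above is Python-2-only: urllib2, raw_input, and the setdefaultencoding hack all disappeared in Python 3. Below is a minimal Python 3 sketch of the same fetch-and-parse step, assuming the page still uses the markers the scraper relies on (a.t for the title link and so on); the output filename tickets.csv is just an illustration.

# Minimal Python 3 sketch of the fetch/parse/write pattern used above.
# Assumes the 58.com page structure is unchanged; 'tickets.csv' is a
# hypothetical output path chosen for this example.
import csv
import urllib.request
from bs4 import BeautifulSoup

url = 'http://nj.58.com/piao/'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
page = urllib.request.urlopen(req).read().decode('utf-8')
soup = BeautifulSoup(page, 'html.parser')

# Collect title text and href from each <a class="t"> listing link.
rows = []
for a in soup.find_all('a', 't'):
    rows.append([a.get_text(strip=True), a.get('href')])

with open('tickets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'url'])
    writer.writerows(rows)

In Python 3 the csv module accepts unicode natively, so no encoding hack is needed; walking each row and writing its fields in one pass also avoids the parallel-list padding problem of the original.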