Python crawler: scraping Baidu Tieba posts and images
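
The Python 2 script below walks the thread list of a chosen tieba, pulls the title, first-floor text, and images of each thread, saves the images to a local folder named after the tieba, and stores each post together with its text replies in a local MongoDB.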

# -*- coding: utf-8 -*-
"""Scrape Baidu Tieba posts and their images."""
import os
import json
import urllib
import urllib2
from lxml import etree
from pymongo import MongoClient
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack: default str/unicode conversions to UTF-8
client = MongoClient('localhost', 27017)  # local MongoDB instance
tb = u'四川大学'  # name of the tieba (forum) to scrape


def get_tz_id(tb_name, page_num):
    """Collect thread ids from the first page_num list pages of the tieba."""
    tz_id = []
    for page in range(1, page_num + 1):
        # Each list page holds 50 threads; pn is the offset of the first one.
        url = "http://tieba.baidu.com/f?kw=%s&pn=%s" % (
            urllib.quote(tb_name.encode('utf-8')), (page - 1) * 50)
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        ul_li = tree.xpath('//*[@id="thread_list"]/li')[1:]
        for li in ul_li:
            data_field = li.xpath('./@data-field')  # absent on Baidu ad entries
            if data_field:
                # data-field is a JSON attribute; parse it instead of eval()
                id_ = json.loads(data_field[0])['id']
                tz_id.append(id_)
    return tz_id
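# Example: get_tz_id(u'四川大学', 1) returns the ids of the threads listed on
# the first page, with Baidu ad entries filtered out.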


def save_img(path, img_id, url):
    """Download one image and save it as <path>/<img_id>.jpg."""
    try:
        picture = urllib2.urlopen(url).read()
    except urllib2.URLError, e:
        print e
        picture = False
    if picture:
        if not os.path.exists(path):  # create the target directory on first use
            os.makedirs(path)
        with open('%s/%s.jpg' % (path, img_id), "wb") as f:
            f.write(picture)
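
# Usage sketch (the URL is a placeholder): save_img(u'四川大学', '12345_1', img_url)
# writes 四川大学/12345_1.jpg under the current working directory.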


def store_mongodb(dic):
    """Insert one post document into database bdtb, collection named after the tieba."""
    database = client.bdtb
    return database[tb].insert_one(dic)  # insert_one (PyMongo >= 3) supersedes the deprecated insert()


def get_info(tz_id):
    """Parse one thread page; store its first floor and all text replies."""
    tz_url = 'http://tieba.baidu.com/p/%s' % tz_id
    html = urllib2.urlopen(tz_url).read()
    tree = etree.HTML(html)
    # The trailing space inside the class value matches Tieba's actual markup.
    first_floor = tree.xpath('//div[@class="l_post j_l_post l_post_bright noborder "]')
    title = tree.xpath('//div[@class="core_title core_title_theme_bright"]/h1/@title')
    content = first_floor[0].xpath('./div[3]/div[1]/cc/div')[0]
    info = {}

    if content.xpath('./img'):  # the first floor contains images
        text = content.xpath('string(.)').strip()
        if len(text) == 0:
            return False  # skip posts that have no text at all
        number = 1
        image_li = []
        for each in content.xpath('./img'):
            src = each.xpath('./@src')[0]
            if 'static' in src:  # Tieba emoticons are served from a static path
                continue
            img_id = '%s_%s' % (tz_id, number)
            save_img(tb, img_id, src)  # download to <tb>/<img_id>.jpg
            image_li.append('%s/%s' % (tb, img_id))
            number += 1
        info['content'] = text
        info['image'] = image_li
    else:
        # string(.) also collects text after <br> tags, unlike .text
        info['content'] = content.xpath('string(.)').strip()
        info['image'] = None  # store a BSON null instead of the string 'null'
    info['source'] = tb
    info['title'] = ''.join(title)
    data_field = first_floor[0].xpath('./@data-field')[0]
    data_info = json.loads(data_field)  # JSON metadata attached to the floor div
    info['dateline'] = data_info['content']['date']  # creation time
    info['sex'] = data_info['author']['user_sex']
    info['author'] = data_info['author']['user_name']
    # Reply floors carry the same class with two trailing spaces.
    reply_floor = tree.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    reply_li = []
    for each_floor in reply_floor:
        if not each_floor.xpath('./div[3]/div[1]/cc/div'):
            continue  # a Baidu ad floor: skip it rather than abort the post
        reply_content = each_floor.xpath('./div[3]/div[1]/cc/div')[0].xpath('string(.)').strip()
        if len(reply_content) > 0:  # keep only replies that contain text
            re_info = json.loads(each_floor.xpath('./@data-field')[0])
            reply_li.append({
                'dateline': re_info['content']['date'],
                'author': re_info['author']['user_name'],
                'content': reply_content,
            })
    info['reply'] = reply_li
    store_mongodb(info)
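
# A stored document has roughly this shape (values are illustrative):
# {'title': ..., 'author': ..., 'sex': ..., 'dateline': ...,
#  'source': u'四川大学', 'content': ..., 'image': [u'四川大学/<tz_id>_1', ...],
#  'reply': [{'author': ..., 'dateline': ..., 'content': ...}, ...]}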


def main():
    id_list = get_tz_id(tb, 1)  # first list page only; raise the 2nd argument for more
    for each in id_list:
        get_info(each)
    client.close()


if __name__ == "__main__":
    main()
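
To spot-check the result after a run, query the collection directly (a minimal sketch; it assumes the same local MongoDB and tieba name as above):

# -*- coding: utf-8 -*-
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
doc = client.bdtb[u'四川大学'].find_one()  # one stored post, or None if nothing was saved
if doc:
    print doc['title'], len(doc['reply'])
client.close()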