清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | # -*- coding: utf-8 -*- from scrapy.spider import Spider from scrapy.http.request import Request from scrapy.selector import Selector from scrapy.exceptions import NotSupported from scrapy import log import urlparse import json import urllib from meizi.items import ImoocItem __author__ = 'lpe234' class ImoocSpider(Spider): """ 本爬虫旨在帮助不方便联网观看视频教程的同学。 其实“慕课网”还是蛮不错,希望各位不要乱跑本脚本!!! """ name = 'immoc' start_urls = [ 'http://www.imooc.com/course/list' ] def parse( self , response): """ 爬虫默认入口,旨在获取所有视频教程的 标题 以及 详细页 数据 :param response: :return: """ sel = Selector(response) # 获取本页所有视频节点 nodes = sel.xpath( '//ul/li[@class="course-one"]' ) for node in nodes: try : title = node.xpath( './a/h5/span/text()' ).extract()[ 0 ] href = node.xpath( './a/@href' ).extract()[ 0 ] href = urlparse.urljoin(response.url, href) # 将 url 中的 view -> learn 即可得到 视频详细页列表 href = href.replace(u 'view' , u 'learn' ) if title and href: yield Request( url = href, callback = self .parse_list, meta = { 'title' : title} ) except (IndexError, TypeError): continue # 多页情况 pages = sel.xpath( '//div[@class="page"]/a[@href]' ) for page in pages: try : href = page.xpath( './@href' ).extract()[ 0 ] href = urlparse.urljoin(response.url, href) if href and href.startswith( 'http' ): yield Request( url = href, callback = self .parse, ) except (IndexError, TypeError, NotSupported): continue def parse_list( self , response): """ 视频列表页 解析 :param response: :return: """ """ {"result":0,"data":{"result":{"mid":8124,"mpath": ["http:\/\/v1.mukewang.com\/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7\/H.mp4", "http:\/\/v1.mukewang.com\/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7\/M.mp4", "http:\/\/v1.mukewang.com\/f9fd506a-bf14-4ede-96c4-1d69d2fd26e7\/L.mp4"], "cpid":"2095","name":"NSMutableDictionary","time":"34","practise":[]}},"msg":"\u6210\u529f"} """ # 查看js文件可以得到 视频地址获取接口如下 GET -> json pre_href = 'http://www.imooc.com/course/ajaxmediainfo/?mid={}' title = response.meta.get( 'title' ) sel = Selector(response) nodes = sel.xpath( '//div[@class="course_chapter_list"]//a[@class="studyvideo"]' ) videos = [] for node in nodes: try : section = node.xpath( './text()' ).extract()[ 0 ] section = section.strip().replace( '\r' , ' ').replace(' \n ', ' ') video_id = node.xpath( './@href' ).extract()[ 0 ] video_id = video_id.split( '/' )[ - 1 ] if section and video_id: href = pre_href.format(video_id) response = urllib.urlopen(href) data = json.loads(response.read()) # 此处存在三种模式 0:超清 1:高清 2:普清 url = data[ 'data' ][ 'result' ][ 'mpath' ][ 0 ] videos.append({section: url}) except (IndexError, TypeError, ValueError): continue item = ImoocItem() item[ 'title' ] = title item[ 'videos' ] = videos yield item def parse_details( self , response): pass |