Multi-threaded Douban photo album downloader

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2014-12-22 14:46:40
# @Author  : kuas (hukuas@gmail.com)
# @Version : $Id$
  
import _thread
from http import cookiejar
import os
import random
import re
import threading
import time
import urllib.request
 
 
# Pool of real-world browser User-Agent headers. One entry is picked at random
# per opener (see getRandomHeaders) so requests look like they come from many
# different browsers, reducing the chance of being rate-limited by Douban.
userAgents = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0'},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"},
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0"},
    {"User-Agent":"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"},
    {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
    {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"},
    {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
    {"User-Agent":"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}, 
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"}]
 
## Only the Douban album URL (ActivityURL) needs to be configured.
ActivityURL = "http://www.douban.com/online/11748222/album/131483485"
SAVE_DIR = "D:\\downPic\\" ## directory where downloaded images are saved
downPicCount = 16 ## number of image-download threads
clawThreadCount = 1 ## number of page-crawling (link-scraping) threads
 
## The values below need no manual configuration.
pics = []  # shared queue of full-size image URLs awaiting download (guarded by picLock)
urls = []  # shared queue of album page URLs awaiting crawling (guarded by urlLock)
openers = []  # pool of urllib openers, each with its own cookie jar and User-Agent
exitFlag = 0  # set to 1 by the main loop to ask all worker threads to exit
picLock = threading.Lock()
urlLock = threading.Lock()
pageNum = 0  # number of album pages queued so far (guarded by urlLock)
ActDir = "tmp\\"  # per-album subdirectory name, filled in by initData()
PageSize = 18 # photos per page (18 for personal albums, 90 for activity albums)
MaxPageNum = 10  ## total number of pages, computed by initData()
 
def getRandomHeaders():
    """Build the header list for an opener, with a randomly chosen User-Agent.

    Returns a list of (header-name, value) tuples suitable for assignment
    to ``opener.addheaders``.
    """
    ua = random.choice(userAgents)["User-Agent"]
    return [
        ("User-Agent", ua),
        ("Accept-Language", "zh-cn,zh;q=0.8;"),
        ("Cache-Control", "max-age=0"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    ]
 
def initOpeners(openerCount):
    """Populate the global ``openers`` pool with ``openerCount`` URL openers.

    Each opener gets its own CookieJar and a randomly chosen browser
    User-Agent, so successive requests look like independent clients.

    Fix: the original also called urllib.request.install_opener() on every
    iteration. Only the last install could take effect, and nothing in this
    script uses the implicit global opener (all requests go through
    getRandomOpener()), so the redundant global side effect is removed.
    """
    for _ in range(openerCount):
        jar = cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
        opener.addheaders = getRandomHeaders()
        openers.append(opener)
         
def getRandomOpener():
    """Return one opener picked uniformly at random from the global pool."""
    pool = openers
    return pool[random.randrange(len(pool))]
 
def downLoadImage(url, savePath):
    """Download ``url`` to ``savePath``, retrying up to 3 times on failure.

    Best-effort: each failure is printed and retried; after 3 failed
    attempts the function gives up silently (same contract as before).

    Fix: the original opened the response and the output file without
    ``with``/``finally`` — a failure during read/write leaked both handles.
    """
    for _attempt in range(3):
        try:
            # `with` guarantees both the HTTP response and the output file
            # are closed even when read()/write() raises.
            with getRandomOpener().open(url) as resp, open(savePath, 'wb') as out:
                out.write(resp.read())
            return
        except Exception as e:
            print("DownLoad image %s Error:%s"%(url,str(e)))
     
     
def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Retries up to 3 times, printing each error; returns "" if every
    attempt fails (same contract as before).

    Fix: the original closed the response only on the success path — a
    failure in read()/decode() leaked the connection. ``with`` closes it
    on every path.
    """
    for _attempt in range(3):
        try:
            with getRandomOpener().open(url) as resp:
                return resp.read().decode('utf-8')
        except Exception as e:
            print("getHtml %s Error:%s"%(url,str(e)))
    return ""
 
class ClawThread(threading.Thread):
    """Crawler thread: pops album-page URLs from the shared ``urls`` queue,
    fetches each page and extracts its photo URLs into ``pics``.

    When the queue is empty it schedules up to 5 more pages (until
    ``MaxPageNum`` is reached), then exits once all pages are queued.
    """
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name
    def run(self):
        global urls,pageNum
        while exitFlag == 0:
            # Random jitter so multiple crawlers don't hammer the site in sync.
            time.sleep(0.1 * random.randint(0, 10))
            pUrl = ""
            # Fix: the original called urlLock.acquire()/release() manually and
            # hit `break` while still holding the lock, which would deadlock
            # any other crawler thread. `with` releases the lock on every exit.
            with urlLock:
                if urls:
                    pUrl = urls.pop(0)
                elif pageNum == MaxPageNum:
                    break  # all pages queued and consumed; lock is released
                else:
                    # Queue up to 5 more page URLs at a time.
                    endAddPage = min(pageNum + 5, MaxPageNum)
                    while pageNum < endAddPage:
                        urls.append(ActivityURL % (pageNum*PageSize))
                        pageNum += 1
            if pUrl != "":
                html = getHtml(pUrl)
                getPicURL(html)
 
class DownPicThread(threading.Thread):
    """Worker thread that downloads queued photo URLs into SAVE_DIR/ActDir.

    Pops one URL at a time from the shared ``pics`` queue (under
    ``picLock``) and skips files that already exist on disk.
    """
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name
    def run(self):
        global pics
        while exitFlag == 0:
            # Small random jitter between downloads.
            time.sleep(0.01 * random.randint(0, 10))
            picLock.acquire()
            nextUrl = pics.pop(0) if pics else ""
            picLock.release()
            if nextUrl == "":
                continue
            fileName = nextUrl[nextUrl.rindex('/')+1:]
            filePath = SAVE_DIR + ActDir + fileName
            if not os.path.exists(filePath):
                downLoadImage(nextUrl, filePath)
             
 
def getPicURL(html):
    """Extract photo thumbnail URLs from an album page's HTML and append
    their full-size counterparts to the shared ``pics`` queue."""
    pattern = re.compile(r"http://img\d\.douban\.com/view/photo/thumb/public/p\d+\.jpg")
    # Douban serves the original image at the same path with "photo" in
    # place of "thumb".
    fullSize = [thumb.replace("thumb", "photo") for thumb in pattern.findall(html)]
    with picLock:
        pics.extend(fullSize)
 
def initData():
    """Inspect ActivityURL to detect the album type, set the per-album
    output directory and page size, turn ActivityURL into a paging
    template, and compute the total page count ``MaxPageNum``.

    Fix: the original computed ``MaxPageNum = maxStartInt/PageSize + 1``
    with true division, which yields a float in Python 3. The main loop's
    exit test ``pageNum == MaxPageNum`` then never becomes True whenever
    ``maxStartInt`` is not an exact multiple of ``PageSize``, so the
    script never terminated. Floor division keeps MaxPageNum an int.
    """
    global ActivityURL,ActDir,SAVE_DIR,PageSize,MaxPageNum
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)

    html = getHtml(ActivityURL)
    # Largest "?start=N" offset linked from the first page tells us how far
    # the album's paging goes.
    maxStartInt = 0
    for startStr in re.findall(r"\?start=\d+", html):
        maxStartInt = max(int(startStr[7:]), maxStartInt)
    nums = re.findall(r"/\d+", ActivityURL)

    if len(nums) == 2:  # online-activity album: /online/<id>/album/<id>
        ActDir = nums[1][1:] + "/"
        PageSize = 90
        ActivityURL = ActivityURL + "?start=%d&sortby=popularity"
    elif len(nums) == 1:  # personal album: /album/<id>
        ActDir = nums[0][1:] + "/"
        PageSize = 18
        ActivityURL = ActivityURL + "?start=%d"
    MaxPageNum = maxStartInt // PageSize + 1
    print("总页数:%d"%(MaxPageNum))
    print("PageSize:%d"%(PageSize))
     
if __name__ == '__main__':
    # Set up the opener pool and discover album layout / page count.
    initOpeners(10)
    initData()

    targetDir = SAVE_DIR + ActDir
    if not os.path.exists(targetDir):
        os.mkdir(targetDir)

    # Start crawler threads first, then downloader threads.
    threads = []
    for idx in range(clawThreadCount):
        worker = ClawThread("%d"%(idx))
        worker.start()
        threads.append(worker)
    for idx in range(downPicCount):
        worker = DownPicThread("%d"%(idx))
        worker.start()
        threads.append(worker)

    # Poll once a second until every page has been queued and both the
    # URL queue and the picture queue are drained.
    finished = False
    while not finished:
        time.sleep(1)
        print("Downing:%d  ----------- Finished:%d"%(len(pics),len(os.listdir(targetDir))))
        finished = len(urls) == 0 and len(pics) == 0 and pageNum == MaxPageNum
    exitFlag = 1  # signal all worker threads to exit

    print("Have DownLoaded %d files!"%(len(os.listdir(targetDir))))
    print("Waiting for all thread exit...")
    for worker in threads:
        worker.join()