#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import re
import os
def getMeinv():
    # Walk the listing pages one by one, parsing out album links and titles.
    # Start at page 1.
    page = 1
    # Fetch the listing page.
    responseHtml = getHtml('http://sexy.faceks.com/?page=' + str(page))
    # Parse each album's link and title out of the page source.
    linkAndName = getLinkAndName(responseHtml)
    # Keep crawling until a page yields no albums.
    while linkAndName:
        # Process every album found on this page.
        for l in linkAndName:
            print l[0], l[1]
            # Strip any HTML tags embedded in the title to get a usable folder
            # name (non-greedy, so each tag is matched on its own).
            fileDirReg = r'<.*?>'
            fileDirPattern = re.compile(fileDirReg)
            fileDirFindList = re.findall(fileDirPattern, l[1])
            fileName = l[1]
            print fileDirFindList
            for f in fileDirFindList:
                fileName = fileName.replace(f, "")
            fileName = fileName.replace(' ', "")
            if not fileName:
                fileName = 'unnamed'
            makeDir(fileName)
            try:
                # Fetch the album page and download every image on it.
                picResponseHtml = getHtml(l[0])
                picList = getImageUrl(picResponseHtml)
                for p in picList:
                    writeImage(fileName, p)
            except Exception:
                # Skip albums that fail; move on to the next one.
                continue
        # Advance to the next listing page.
        page += 1
        responseHtml = getHtml('http://sexy.faceks.com/?page=' + str(page))
        linkAndName = getLinkAndName(responseHtml)
# Fetch a page's source with an 8-second timeout.
# On any exception, retry; after 5 attempts in total, give up and return None.
def getHtml(reqUrl):
    for i in xrange(5):
        try:
            response = urllib2.urlopen(reqUrl, timeout=8)
        except Exception:
            continue
        else:
            return response.read()
    return None
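# Example: getHtml('http://sexy.faceks.com/?page=1') returns the page source
# as a str, or None once all five attempts have failed.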
# Parse (album link, album title) pairs out of a listing page.
def getLinkAndName(responseHtml):
    # getHtml returns None when every attempt failed; treat that as an empty
    # page so the crawl loop in getMeinv terminates cleanly.
    if not responseHtml:
        return []
    reg = r'<a class="img" href="(.*?)">[\s]*<img src=".*?" />[\s]*</a>[\s]*</div>[\s]*<div class="text"><p>(.*?)</p>'
    pattern = re.compile(reg)
    findList = re.findall(pattern, responseHtml)
    return findList
# Parse every image URL out of an album page. The non-greedy (.*?) stops each
# match at the first closing quote, so multiple tags on one line stay separate.
def getImageUrl(responseHtml):
    if not responseHtml:
        return []
    reg = r'<img src="(.*?)"/>'
    pattern = re.compile(reg)
    findList = re.findall(pattern, responseHtml)
    return findList
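# Example (hypothetical markup): given the line
#   '<img src="http://example.com/a.jpg"/><img src="http://example.com/b.jpg"/>'
# getImageUrl returns ['http://example.com/a.jpg', 'http://example.com/b.jpg'].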
# Create a folder for the album under the working directory if it is missing.
def makeDir(dirName):
    dirPath = os.path.join(os.getcwd(), dirName)
    if not os.path.exists(dirPath):
        print 'Folder missing, creating:', dirName
        os.mkdir(dirPath)
    else:
        print dirName, 'already exists'
# Download one image into the album folder, with an 8-second timeout.
# On any exception, retry; give up after 5 attempts in total.
def writeImage(dirName, url):
    filePath = os.path.join(os.getcwd(), dirName, os.path.basename(url))
    print 'Writing', os.path.basename(url)
    for i in xrange(5):
        try:
            # urllib.urlretrieve(url, filePath) would also work here
            r = urllib2.urlopen(url, timeout=8)
            with open(filePath, "wb") as f:
                f.write(r.read())
        except Exception:
            continue
        else:
            break
if __name__ == '__main__':
    getMeinv()
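The script above is Python 2 throughout (urllib2, print statements, xrange). Under Python 3, urlopen moved to urllib.request; a minimal sketch of the fetch helper ported to Python 3, assuming the pages decode as UTF-8, would look like:

import urllib.request

# Python 3 counterpart of getHtml: same 8s timeout and 5-attempt retry loop.
def get_html(req_url):
    for _ in range(5):
        try:
            with urllib.request.urlopen(req_url, timeout=8) as response:
                # The charset is an assumption; the site may declare another one.
                return response.read().decode('utf-8', errors='replace')
        except Exception:
            continue
    return None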