清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/env python
# -*- coding: utf8 -*-
# 通过输入的网址获取其依赖的站点(html中引用到的)
# 依赖文件格式如下:
# *.microsoft.com
# *.outlook.com
# *.apple.com
# *.ibm.com
import urllib2
import urlparse
import socket
import sys
import re
def printHelp():
    """Print the command-line usage of both invocation modes."""
    # A single parenthesised argument makes print(...) behave the same
    # whether it is parsed as a Python 2 statement or a function call.
    print('Approach 1: python DepSpy.py url dstfile')
    print(' * url starts with http:// or https://.')
    print(' * dstfile is the full name of output file,')
    # Results are printed when no dstfile is given, i.e. they go to stdout
    # (the previous wording said "stdin").
    print(' results output to stdout if dstfile is empty.')
    print('\r\nApproach 2: python DepSpy.py urlfile dstfile')
    print(' * urlfile is the full name of file listing input urls(splitted by \\n).')
    print(' * dstfile is the full name of output file,')
    print(' results output to stdout if dstfile is empty.')
# Route the command line to the matching feature.
def dispatch(args):
    """Run the feature selected by the command-line arguments.

    args is an argv-style list (args[0] is the script name):
      * no argument, or a lone help flag: print the usage text;
      * args[1] starting with http:// or https://: analyse that single URL;
      * otherwise args[1] is taken as the name of a file listing URLs, and
        every URL in it is analysed in turn.

    Returns the list of dependent hosts without duplicates, in first-seen
    order. An empty list is returned when only the help text was shown or
    when an error occurred (the error is printed, not raised).
    """
    helpFlags = ('h', '/h', '-h', '?', '/?', '-?', 'help', '-help', '/help')
    try:
        if len(args) < 2:
            printHelp()
            return []
        target = args[1]
        if len(args) == 2 and target in helpFlags:
            printHelp()
            # Always hand back a list so the caller can iterate the result.
            return []
        if target.startswith(('http://', 'https://')):
            # The argument is a single URL.
            return getDependHost(target)
        # The argument is the name of a URL list file.
        hosts = []
        seen = set()  # O(1) membership test while `hosts` keeps the order
        for u in readURLList(target):
            u = u.strip()
            if not u:
                # A blank line would only produce a failed request.
                continue
            print('---- Dealing with: ' + u + ' ----')
            for host in getDependHost(u):
                if host not in seen:
                    seen.add(host)
                    hosts.append(host)
        return hosts
    except Exception as e:
        # Best-effort CLI: report the problem and return an empty result.
        print(e)
        return []
# Patterns used to find dependent sites.
# _pattern captures the absolute http(s) URL held by the src/href attribute
# of a <script> or <link> tag. The search for the attribute is confined to
# the tag itself ([^>]*?), so several tags on one line are each reported and
# an href belonging to some other tag is not picked up. Tag and attribute
# names are matched case-insensitively; the value may be single- or
# double-quoted.
_pattern = re.compile(
    r'<(?:script|link)\b[^>]*?\s(?:src|href)\s*=\s*'
    r'["\'](https?://[^"\'\s>]+)["\']',
    re.IGNORECASE)
# _pwww matches the first label of a host name together with its dot
# (e.g. "www."). The hyphen is placed last in the class so that it is
# unambiguously a literal character.
_pwww = re.compile(r'^[a-z0-9_-]+\.')
def getDependHost(url):
    """Fetch url and return the hosts its HTML depends on.

    The page is downloaded and every link found by _pattern is reduced to a
    host pattern:
      * hosts containing "baidu.com" are kept verbatim;
      * hosts with fewer than two dots get a "*." prefix;
      * otherwise the first label is replaced by "*." (www.a.com -> *.a.com).

    url may start with http:// or https://; when it has neither prefix,
    "http://" is prepended.

    Returns the host patterns without duplicates, in first-seen order, and
    excluding the pattern of url itself. The list is also printed. On any
    error the exception is printed and an empty list is returned.
    """
    def toHostPattern(link):
        netloc = urlparse.urlparse(link).netloc
        if 'baidu.com' in netloc:
            # Baidu hosts are handled separately: keep the exact host.
            return netloc
        if netloc.count('.') < 2:
            return '*.' + netloc
        return _pwww.sub('*.', netloc)

    try:
        # Only add a scheme when none is present; prepending "http://" to
        # an https URL would produce an unusable address.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        resp = urllib2.urlopen(url)
        try:
            html = resp.read()
        finally:
            # Release the connection even if reading fails.
            resp.close()
        selfHost = toHostPattern(url)
        ret = []
        seen = set([selfHost])  # seeding with selfHost excludes the page's own host
        for link in _pattern.findall(html):
            host = toHostPattern(link)
            if host not in seen:
                seen.add(host)
                ret.append(host)
        print(ret)
        return ret
    except Exception as e:
        # Best-effort: one unreachable page must not abort a batch run.
        print(e)
        return []
# Read the URL list.
def readURLList(path):
    """Return the entries of the URL list file at path, one per line.

    Carriage returns are removed and every "*" is replaced by "www", so a
    line such as "*.example.com" becomes "www.example.com". Surrounding
    whitespace is stripped and blank lines are dropped, so the result never
    contains empty entries.

    Errors raised while opening or reading the file propagate to the caller.
    """
    with open(path, 'r') as fp:
        text = fp.read()
    lines = text.replace('\r', '').replace('*', 'www').split('\n')
    stripped = [ln.strip() for ln in lines]
    return [ln for ln in stripped if ln]
# Program entry point.
if __name__ == '__main__':
    socket.setdefaulttimeout(60)  # global default timeout for new sockets
    # Guard against a None result so the loops below always get an iterable.
    lst = dispatch(sys.argv) or []
    if len(sys.argv) > 2:
        # A second argument names the output file: one host per line.
        distFilename = sys.argv[2]
        try:
            # `with` closes the file even when a write fails.
            # NOTE(review): text mode plus an explicit '\r\n' may yield
            # '\r\r\n' on platforms that translate newlines - confirm.
            with open(distFilename, 'w') as fp:
                for it in lst:
                    fp.write(it + '\r\n')
        except Exception:
            print('Write File Error')
    else:
        # No output file given: print the hosts instead.
        try:
            for it in lst:
                print(it)
        except Exception:
            print('Error')