Extracting a list of URLs from a page with Python
# -*- coding: utf-8 -*-
# Python 2: fetch a page, collect same-site links, and check each one's HTTP status code.
from bs4 import BeautifulSoup
import time
import urllib2

t = time.time()
websiteurls = {}          # links seen in earlier scans (checked against below)

def scanpage(url):
    websiteurl = url
    t = time.time()       # start time for this scan
    n = 0                 # number of links successfully checked
    html = urllib2.urlopen(websiteurl).read()
    soup = BeautifulSoup(html, "html.parser")
    Upageurls = {}        # links found on this page

    # Collect every <a href> that points back into the same site.
    pageurls = soup.find_all("a", href=True)
    for link in pageurls:
        href = link.get("href")
        if websiteurl in href and href not in Upageurls and href not in websiteurls:
            Upageurls[href] = 0

    # Request each collected link once and record its status code.
    for link in Upageurls.keys():
        t2 = time.time()
        try:
            code = urllib2.urlopen(link).getcode()
        except Exception:
            print "connect failed:", link
        else:
            Upageurls[link] = code
            print n, link, code
            print time.time() - t2          # time spent on this one request
            n += 1

    print "total is " + repr(n) + " links"
    print time.time() - t                   # total time for the scan

scanpage("http://news.163.com/")
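The script above targets Python 2 (urllib2 and print statements). As a rough sketch only, the same idea under Python 3 could be written with urllib.request instead of urllib2; the target URL and the same-site filter are taken from the original, while the function name scan_page and the timeout value are illustrative assumptions.

# Python 3 sketch of the same idea (assumption: urllib.request replaces urllib2).
import time
import urllib.request
from bs4 import BeautifulSoup

def scan_page(url, timeout=10):     # hypothetical name and timeout, not from the original
    start = time.time()
    html = urllib.request.urlopen(url, timeout=timeout).read()
    soup = BeautifulSoup(html, "html.parser")

    # Same-site filter as in the original: keep hrefs that contain the page URL.
    found = {a["href"] for a in soup.find_all("a", href=True) if url in a["href"]}

    checked = 0
    for link in found:
        t0 = time.time()
        try:
            code = urllib.request.urlopen(link, timeout=timeout).getcode()
        except Exception:
            print("connect failed:", link)
        else:
            print(checked, link, code, "%.2fs" % (time.time() - t0))
            checked += 1

    print("total is", checked, "links")
    print("elapsed:", time.time() - start)

scan_page("http://news.163.com/")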