# [web-page banner captured with the listing; not part of the script] 清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
# -*- coding: UTF-8 -*-
#!/usr/local/bin/python
# The encoding declaration is kept first so that it stays within the first
# two lines of the file, which is where Python 2 looks for it.
# Imitates the template of a target site.  Contact: QQ29295842
"""Turn pages of a live web site into keyword templates.

A page is downloaded, decoded, and rewritten line by line: titles, meta
tags, links and paragraph text are replaced with ``{placeholder}`` tags.
The result is written to ``index.html`` (home page) or ``index2.html``
(one randomly chosen inner page).

The file targets Python 2 but only uses syntax that Python 3 also accepts.
"""
import base64
import binascii
import gzip
import io
import os
import random  # used to shuffle the crawl candidates
import re
import sys
import threading
import time
import urllib

try:  # Python 2 module names
    import ConfigParser  # INI configuration reader
    import httplib
    import Queue
    import StringIO
    import thread
    import urllib2
except ImportError:  # Python 3: bind the renamed modules to the legacy names
    import configparser as ConfigParser
    import http.client as httplib
    import queue as Queue
    import io as StringIO
    import _thread as thread
    import urllib.request as urllib2

try:
    import chardet
except ImportError:  # not referenced in this file, so a missing install is not fatal
    chardet = None

try:
    # Project-local module that provides ``Clist``.  It keeps its historical
    # name even though that hides the builtin ``list`` inside this module.
    import list
except ImportError:
    # Only www_index2() needs it; that method reports the problem itself.
    pass

sys.path.append('..')
if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Both string types of the running interpreter: (str, unicode) on Python 2,
# (str, str) on Python 3.
_TEXT_TYPES = (type(""), type(u""))

# Pages shorter than this many bytes are treated as failed downloads.
_MIN_PAGE_BYTES = 5000

# At most this many anchors of the home page are inspected for crawl targets.
_MAX_ANCHORS = 3000

# Anchor scanning runs on the raw (undecoded) page, hence the bytes patterns.
_ANCHOR = re.compile(br'<a.+?href=.+?>.+?</a>')
_ANCHOR_TEXT = re.compile(br'(?<=>).*?(?=</a>)')
_ANCHOR_HREF = re.compile(br'(?<=href\=\").*?(?=\")')

# Non-empty lines, with or without a trailing newline.
_LINE = re.compile(r'[^\n]+\n?')

# (pattern, replacement) pairs applied in this order by th_re().
#   {time}       current-time tag (fixed)
#   {tmkeyword}  main keyword tag (fixed)
#   {tmkeyword2} one keyword drawn at random from the whole site template
_FIXED_RULES = (
    (re.compile(r'<title>[\s\S]*</title>'),
     '<title>{time}</title>'),
    (re.compile(r'<a title="[\s\S]*" href="/">'),
     '<a title="{time}" href="/">'),
    (re.compile(r'<strong>[\s\S]*</strong>'),
     '<strong>{time}</strong>'),
    (re.compile(r'>[\s\S]*</span>'),
     '>{tmkeyword2}</span>'),
    (re.compile(r'<p[\s\S]*>[\s\S]*</p>'),
     '<p>{tmkeyword2}</p>'),
    (re.compile(r'<meta content="[\s\S]*" name="keywords">'),
     '<meta content="{tmkeyword}" name="keywords">'),
    (re.compile(r'<meta name=".*?" content="[\s\S]*" />'),
     '<meta name="keywords" content="{tmkeyword}" />'),
    (re.compile(r'<meta name=".*?" content="[\s\S]*">'),
     '<meta name="description" content="{tmkeyword}{tmkeyword2}">'),
)

# Anchor spellings recognised by th_re(), tried in this order.
_LINK_PATTERNS = (
    re.compile(r'<A.*?href="[\s\S]*</A>'),
    re.compile(r'<a.*?href="[\s\S]*</a>'),
    re.compile(r'<a.*?href="[\s\S]*</A>'),
)
_HREF_VALUE = re.compile(r'(?<=href\=\").*?(?=\")')
# Substring tests, so ".asp" also covers ".aspx" and ".htm" covers ".html".
_PAGE_SUFFIXES = (".asp", ".php", ".htm")

# Body-text replacement: the longer the matched text, the more sentences.
_BODY = re.compile(r'>*.+?[\s\S]*</p>')
_BODY_SENTENCE = u'{skeyword}{content}{content}。'
_BODY_TIERS = ((150, 4), (100, 2), (50, 1))  # (minimum size, sentence count)


def _to_native(value):
    """Return *value* as the interpreter's native ``str`` type."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):  # Python 3 bytes -> text
        return value.decode("utf-8", "replace")
    return value.encode("utf-8")  # Python 2 unicode -> byte string


def _to_text(value):
    """Return *value* as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _like(template, sample):
    """Return the unicode *template* in the same string type as *sample*."""
    if isinstance(sample, type(u"")):
        return template
    return template.encode("utf-8")


def _byte_len(value):
    """Return the size of *value* in UTF-8 bytes.

    The body-text thresholds were originally applied to UTF-8 encoded lines,
    so text is measured the same way to keep them meaning the same thing.
    """
    if isinstance(value, type(u"")):
        return len(value.encode("utf-8", "replace"))
    return len(value)


def _replace_links(data, pattern, site_url):
    """Replace every match of *pattern* in *data* with a link placeholder.

    The placeholder is chosen from the first ``href="..."`` value in *data*:
    ``{link}`` for an absolute URL containing *site_url* or a relative URL
    with a page suffix, ``{ylinks}`` for any other absolute URL, and
    ``{newslist}`` for any other relative URL (a directory link).
    """
    if not pattern.search(data):
        return data
    hrefs = _HREF_VALUE.findall(data)
    if not hrefs:
        return data
    href = hrefs[0]
    if "http://" in href or "https://" in href:
        # An unknown site URL must not classify every link as internal.
        tag = '{link}' if site_url and site_url in href else '{ylinks}'
    elif any(suffix in href for suffix in _PAGE_SUFFIXES):
        tag = '{link}'
    else:
        tag = '{newslist}'
    return pattern.sub(tag, data)


def _replace_body(data):
    """Replace paragraph text in *data* with a content template.

    The size of the first match selects exactly one tier of _BODY_TIERS;
    matches below the smallest tier leave *data* unchanged.
    """
    matches = _BODY.findall(data)
    if not matches:
        return data
    size = _byte_len(matches[0])
    for minimum, sentences in _BODY_TIERS:
        if size >= minimum:
            template = u'>' + _BODY_SENTENCE * sentences + u'</p>'
            return _BODY.sub(_like(template, data), data)
    return data


class www_mb(threading.Thread):
    """Downloads pages of one site and saves them as placeholder templates."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.url = ""  # site currently being processed

    def www_index(self, url):
        """Template the page at *url* into ``index.html``.

        Returns 0 when the page cannot be decoded, otherwise None.
        """
        self.url = url
        ok, raw = self.url_http_200(url)
        if not ok:
            print("NO")
            return None
        print("OK")
        _, text = self.utf_8_G(raw)  # decode
        if text == "no":
            print(u"读取或转换失败")
            return 0
        print("1111111")
        text = self.open_file_null(text)
        print("2222")
        self.TXT_file_add("index.html", u"%s" % (text,))
        return None

    def www_index2(self, url):
        """Template one randomly chosen inner page into ``index2.html``.

        Links are collected from the page at *url*, de-duplicated and
        shuffled through the project's ``list.Clist`` helper, and the first
        candidate that downloads successfully is templated.

        Returns 0 when no candidate could be downloaded or decoded,
        otherwise None.  Raises RuntimeError when the project-local ``list``
        module is not importable.
        """
        if not hasattr(list, "Clist"):
            raise RuntimeError("www_index2 needs the project-local 'list' module")
        link_store = list.Clist()  # initialise the helper
        link_store.list_del()  # clear its list
        self.url = url
        ok, page = self.url_http_200(url)
        if not ok:
            return None
        print("OK")
        for candidate in self._collect_links(page, url):
            link_store.liet_add(candidate)
        link_store.liet_lsqc()  # drop duplicates
        random.shuffle(link_store.list_2)  # randomise the crawl order

        raw = b""
        for candidate in link_store.list_2:
            print(candidate)
            ok, fetched = self.url_http_200(candidate)
            if ok:
                print("OK")
                raw = fetched
                break  # the first page that downloads is enough
        if not raw:
            # Do not overwrite index2.html with an empty template.
            print("NO")
            return 0
        _, text = self.utf_8_G(raw)  # decode
        if text == u"no":
            print(u"读取或转换失败")
            return 0
        text = self.open_file_null(text)
        self.TXT_file_add("index2.html", u"%s" % (text,))
        return None

    def _collect_links(self, page, url):
        """Return crawl candidates found in the anchors of the raw *page*.

        Absolute links containing the site URL are kept as they are;
        relative links are prefixed with ``url + "/"``; other absolute links
        are ignored.
        """
        found = []
        for anchor in _ANCHOR.findall(page)[:_MAX_ANCHORS]:
            if not _ANCHOR_TEXT.findall(anchor):
                continue
            hrefs = _ANCHOR_HREF.findall(anchor)
            if not hrefs:
                continue
            href = _to_native(hrefs[0])
            if self.url in href:
                if len(href) - len(url) >= 2:
                    found.append(href)  # internal link, full URL
            elif "http://" not in href and "https://" not in href:
                if len(href) >= 2:
                    found.append(url + "/" + href)  # points into a directory
        return found

    def th_re(self, data):
        """Return *data* with recognised HTML fragments replaced by tags.

        The rules of _FIXED_RULES run first (title, strong, span, paragraph
        and meta tags), then anchors are replaced with ``{link}``,
        ``{ylinks}`` or ``{newslist}``, and finally paragraph text is
        replaced with a ``{skeyword}{content}`` template sized by the length
        of the matched text.  Anything that is not a string is returned
        unchanged.
        """
        if not isinstance(data, _TEXT_TYPES):
            return data
        for pattern, replacement in _FIXED_RULES:
            data = pattern.sub(replacement, data)
        site_url = getattr(self, "url", "")
        for pattern in _LINK_PATTERNS:
            data = _replace_links(data, pattern, site_url)
        return _replace_body(data)

    def open_file_null(self, file_data):
        """Return *file_data* with empty lines removed and lines templated.

        Lines containing ``//`` or ``!`` are copied verbatim; every other
        line goes through th_re().  A final line without a trailing newline
        is kept.  Input that is not a string yields an empty result.
        """
        kept = []
        try:
            for line in _LINE.findall(file_data):
                if "//" in line or "!" in line:
                    kept.append(line)
                else:
                    kept.append(self.th_re(line))
        except TypeError as error:
            print(u"zzzzzzzzzzzzz %s" % (error,))
        return "".join(kept)

    def TXT_file_add(self, file_nem, data):
        """Write *data* to the file *file_nem* as UTF-8, replacing it.

        *data* is a string or an iterable of strings.  Returns 0 when
        writing fails, otherwise None.
        """
        chunks = [data] if isinstance(data, _TEXT_TYPES + (bytes,)) else data
        try:
            with io.open(file_nem, "w", encoding="utf-8") as handle:
                for chunk in chunks:
                    handle.write(_to_text(chunk))
        except (IOError, OSError, UnicodeError, TypeError) as error:
            # The payload is not echoed: it can be a whole page, and
            # formatting it here could raise a second error.
            print(u"写入TXT失败 %s %s" % (file_nem, error))
            return 0
        return None

    def utf_8_G(self, data):
        """Decode the byte string *data*.

        Returns ``(codec_name, text)`` on success and ``("utf_8_G", "no")``
        when no codec fits.  UTF-8 is tried first because its decoder is
        strict, whereas the GBK decoder accepts many UTF-8 byte sequences
        and would return them as wrong characters.
        """
        last_error = None
        # GBK is a superset of GB2312, so the last attempt is only a safety net.
        for codec in ("utf-8", "gbk", "gb2312"):
            try:
                return codec, data.decode(codec)
            except (UnicodeError, AttributeError) as error:
                last_error = error
        print(u"utf_8_G %s" % (last_error,))
        return "utf_8_G", "no"

    def url_http_200(self, url):
        """Download *url* with a Googlebot user agent.

        Returns ``(True, body)`` when the download succeeds and the body is
        at least _MIN_PAGE_BYTES long, otherwise ``(False, "")``.
        """
        try:
            request = urllib2.Request(url)
            request.add_header(
                'User-Agent',
                "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
            response = urllib2.urlopen(request, timeout=10)  # 10 second timeout
            try:
                body = response.read()
            finally:
                response.close()
        except Exception:
            # Deliberately not BaseException, so Ctrl-C still stops the script.
            return False, ""
        if len(body) >= _MIN_PAGE_BYTES:
            return True, body
        return False, ""


if __name__ == '__main__':
    m = www_mb()
    m.www_index("http://www.deerdun.com")
    m.www_index2("http://www.deerdun.com")