#!/usr/local/bin/python
# -*- coding: UTF-8 -*-
# Mimics the template of a target site's main pages
# QQ29295842
##################################################
import threading
import re
import sys
import base64
import httplib, StringIO, gzip, urllib
import binascii
import chardet
import ConfigParser   # read INI configuration
import random         # shuffle list order
import os
import urllib2
import time
import Queue
import list           # local helper module providing Clist
import thread
#import php_data
#import g
#import sc_html       # HTML generation
#import sitemap_xml   # sitemap generation
#import shell_links   # SHELL SEO

sys.path.append('..')
reload(sys)
sys.setdefaultencoding("utf-8")


class www_mb(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)
        self.url = ""

    def www_index(self, url):
        """Fetch the homepage, swap its content for template tags, save as index.html."""
        self.url = url
        s1, s2 = self.url_http_200(url)
        if s1 == True:
            print "OK"
            utf_gbk, data = self.utf_8_G(s2)   # decode the raw bytes
            if data == "no":
                print u"read or charset conversion failed"
                return 0
            data = self.open_file_null(data)   # strip blank lines, insert template tags
            self.TXT_file_add("index.html", u"%s" % (data))
        else:
            print "NO"

    def www_index2(self, url):
        """Collect on-site links from the homepage, fetch the first reachable one, save as index2.html."""
        LS = list.Clist()   # initialise the list helper class
        LS.list_del()       # empty the list
        self.url = url
        s1, data = self.url_http_200(url)
        if s1 == True:
            print "OK"
            p = re.compile(r'<a.+?href=.+?>.+?</a>')
            pname = re.compile(r'(?<=>).*?(?=</a>)')
            phref = re.compile(r'(?<=href\=\").*?(?=\")')
            sarr = p.findall(data)   # pick out the <a></a> tags one by one
            # collect into a list and de-duplicate there, to reduce MySQL load
            i = 0
            for every in sarr:
                if i >= 3000:
                    break
                else:
                    i += 1
                    sname = pname.findall(every)
                    if sname:
                        shref = phref.findall(every)
                        if shref:
                            if (self.url in str(shref[0])):
                                if (len(shref[0]) - len(url)) >= 2:
                                    LS.liet_add(str(shref[0]))   # on-site link, full URL
                            else:
                                if not ("http://" in str(shref[0]) or "https://" in str(shref[0])):
                                    if len(shref[0]) >= 2:
                                        LS.liet_add(url + "/" + str(shref[0]))   # relative link into a directory
            LS.liet_lsqc()              # de-duplicate the list
            random.shuffle(LS.list_2)   # shuffle away the original ordering
            data2 = ""
            for i in range(len(LS.list_2)):
                print LS.list_2[i]
                s1, data = self.url_http_200(LS.list_2[i])
                if s1 == True:
                    print "OK"
                    data2 = data
                    break               # first page that loads wins
            utf_gbk, data = self.utf_8_G(data2)   # decode
            if data == u"no":
                print u"read or charset conversion failed"
                return 0
            data = self.open_file_null(data)
            self.TXT_file_add("index2.html", u"%s" % (data))

    def th_re(self, data):
        """Replace concrete page content with the template's placeholder tags."""
        try:
            try:
                # {time}: current-time tag (fixed)
                p = re.compile(r'<title>[\s\S]*</title>')
                data = p.sub('<title>{time}</title>', data)
                # e.g. matches <title>【德尔顿官网】,热水器,电热水器,磁能热水器,安全健康热水器</title>
            except:
                pass
            try:
                # {time}: current-time tag (fixed)
                p = re.compile(r'<a title="[\s\S]*" href="/">')
                data = p.sub('<a title="{time}" href="/">', data)
                # e.g. matches <a title="热水器十大品牌 电热水器哪个牌子好 磁能热水器-德尔顿" href="/">
            except:
                pass
            try:
                # {time}: current-time tag (fixed)
                p = re.compile(r'<strong>[\s\S]*</strong>')
                data = p.sub('<strong>{time}</strong>', data)
                # e.g. matches <strong>磁能热水器行业德尔顿...</strong>
            except:
                pass
            try:
                # {tmkeyword2}: one keyword drawn at random from the whole site template
                p = re.compile(r'>[\s\S]*</span>')
                data = p.sub('>{tmkeyword2}</span>', data)
                # e.g. matches >劳女士</span>
            except:
                pass
            try:
                # {tmkeyword2}: one keyword drawn at random from the whole site template
                p = re.compile(r'<p[\s\S]*>[\s\S]*</p>')
                data = p.sub('<p>{tmkeyword2}</p>', data)
                # e.g. matches >看看离您最近的体验店在哪</p>
            except:
                pass
            try:
                # {tmkeyword}: main-keyword tag (fixed)
                p = re.compile(r'<meta content="[\s\S]*" name="keywords">')
                data = p.sub('<meta content="{tmkeyword}" name="keywords">', data)
                # e.g. matches <meta content="热水器,电热水器,即热式电热水器,磁能热水器" name="keywords">
            except:
                pass
            try:
                # {tmkeyword}: main-keyword tag (fixed)
                p = re.compile(r'<meta name=".*?" content="[\s\S]*" />')
                data = p.sub('<meta name="keywords" content="{tmkeyword}" />', data)
            except:
                pass
            try:
                # {tmkeyword}: main keyword (fixed); {tmkeyword2}: random keyword
                p = re.compile(r'<meta name=".*?" content="[\s\S]*">')
                data = p.sub('<meta name="description" content="{tmkeyword}{tmkeyword2}">', data)
                # e.g. matches <meta name="description" content="德尔顿中国健康热水器,...招商代理电话:400-888-1682。">
            except:
                pass
            try:
                # {link}: random internal-link tag (upper-case <A ...</A> variant)
                p = re.compile(r'<A.*?href="[\s\S]*</A>')
                sarr = p.findall(data)
                if len(sarr) >= 1:
                    phref = re.compile(r'(?<=href\=\").*?(?=\")')
                    sarr = phref.findall(data)
                    if ("http://" in str(sarr[0]) or "https://" in str(sarr[0])):
                        if (self.url in str(sarr[0])):       # fixed: original referenced the undefined names url/shref here
                            data = p.sub('{link}', data)     # replace with an internal link
                        else:
                            data = p.sub('{ylinks}', data)   # replace with an external link
                    else:
                        if (".asp" in str(sarr[0]) or ".aspx" in str(sarr[0]) or ".php" in str(sarr[0]) or ".htm" in str(sarr[0]) or ".html" in str(sarr[0])):
                            data = p.sub('{link}', data)       # replace with an internal link
                        else:
                            data = p.sub('{newslist}', data)   # directory-page directory link (random)
                # e.g. matches <A href="/guanyu/shipin/">企业视频</A>
            except:
                pass
            try:
                # {link}: random internal-link tag (lower-case <a ...</a> variant)
                p = re.compile(r'<a.*?href="[\s\S]*</a>')
                sarr = p.findall(data)
                if len(sarr) >= 1:
                    phref = re.compile(r'(?<=href\=\").*?(?=\")')
                    sarr = phref.findall(data)
                    if ("http://" in str(sarr[0]) or "https://" in str(sarr[0])):
                        if (self.url in str(sarr[0])):
                            data = p.sub('{link}', data)
                        else:
                            data = p.sub('{ylinks}', data)
                    else:
                        if (".asp" in str(sarr[0]) or ".aspx" in str(sarr[0]) or ".php" in str(sarr[0]) or ".htm" in str(sarr[0]) or ".html" in str(sarr[0])):
                            data = p.sub('{link}', data)
                        else:
                            data = p.sub('{newslist}', data)
            except:
                pass
            try:
                # {link}: random internal-link tag (mixed-case <a ...</A> variant)
                p = re.compile(r'<a.*?href="[\s\S]*</A>')
                sarr = p.findall(data)
                if len(sarr) >= 1:
                    phref = re.compile(r'(?<=href\=\").*?(?=\")')
                    sarr = phref.findall(data)
                    if ("http://" in str(sarr[0]) or "https://" in str(sarr[0])):
                        if (self.url in str(sarr[0])):
                            data = p.sub('{link}', data)
                        else:
                            data = p.sub('{ylinks}', data)
                    else:
                        if (".asp" in str(sarr[0]) or ".aspx" in str(sarr[0]) or ".php" in str(sarr[0]) or ".htm" in str(sarr[0]) or ".html" in str(sarr[0])):
                            data = p.sub('{link}', data)
                        else:
                            data = p.sub('{newslist}', data)
            except:
                pass
            try:
                # body text: replace paragraphs with keyword/content tags, scaled by paragraph length
                p = re.compile(r'>*.+?[\s\S]*</p>')
                sarr = p.findall(data)
                if len(sarr) >= 1:
                    if len(sarr[0]) >= 150:
                        data = p.sub('>{skeyword}{content}{content}。{skeyword}{content}{content}。{skeyword}{content}{content}。{skeyword}{content}{content}。</p>', data)
                    if len(sarr[0]) >= 100:
                        data = p.sub('>{skeyword}{content}{content}。{skeyword}{content}{content}。</p>', data)
                    if len(sarr[0]) >= 50:
                        data = p.sub('>{skeyword}{content}{content}。</p>', data)
            except:
                pass
            return data
        except Exception, e:
            return data

    def open_file_null(self, file_data):
        """Drop blank lines; pass comment/script-like lines through and template the rest."""
        data = ""
        try:
            p = re.compile(r'.+?\n')
            sarr = p.findall(file_data)
            for every in sarr:
                if ("//" in str(every) or "!" in str(every)):
                    data += str(every)               # leave script/DOCTYPE-like lines untouched
                else:
                    data += self.th_re(str(every))   # everything else gets templated
            return data
        except Exception, e:
            print u"open_file_null failed:", e
            return data

    def TXT_file_add(self, file_nem, data):
        """Write the processed page out to a text file."""
        try:
            file_object = open(file_nem, 'w')
            file_object.writelines(data)
            file_object.close()
        except Exception, e:
            print u"writing TXT failed", file_nem, data, e
            return 0

    def utf_8_G(self, data):
        """Decode raw bytes, trying gbk, utf-8 and gb2312 in turn."""
        try:
            try:
                return "gbk", data.decode('gbk')
            except Exception, e:
                pass
            try:
                return "utf-8", data.decode('utf-8')
            except Exception, e:
                pass
            try:
                return "gb2312", data.decode('gb2312')
            except Exception, e:
                pass
        except Exception, e:
            print "utf_8_G", e
        return "utf_8_G", "no"   # all decode attempts failed

    def url_http_200(self, url):
        """Fetch a URL with a Googlebot User-Agent; treat pages under 5000 bytes as failures."""
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent', "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
            s = urllib2.urlopen(req, timeout=10)   # 10-second timeout
            int_url_read = s.read()
            if len(int_url_read) >= 5000:
                return True, int_url_read
            else:
                return False, ""
        except BaseException, e:
            return False, ""


if __name__ == '__main__':
    m = www_mb()
    #http://www.deerdun.com
    m.www_index("http://www.deerdun.com")
    #http://www.ybfhcl.com
    m.www_index2("http://www.deerdun.com")
    # p = re.compile('(one|two|three)')
    # print p.sub('num', 'one word two words three words apple', 2)
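The script imports chardet but never calls it; utf_8_G instead tries gbk, utf-8 and gb2312 in a fixed order. Below is a minimal sketch of how that decode step could lean on chardet's statistical detection instead. The helper name detect_decode is hypothetical, not part of the original script; it only assumes Python 2 and the chardet package the script already imports.

# -*- coding: UTF-8 -*-
# Sketch only: a chardet-based alternative to utf_8_G's trial-and-error decode.
# detect_decode is a hypothetical helper, not a function from the original script.
import chardet

def detect_decode(raw):
    guess = chardet.detect(raw)              # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    enc = guess.get('encoding') or 'utf-8'   # fall back to utf-8 when detection gives up
    try:
        return enc, raw.decode(enc)
    except (UnicodeDecodeError, LookupError):
        return enc, "no"                     # mirror utf_8_G's "no" failure sentinel

The trade-off: detection covers encodings the fixed gbk/utf-8/gb2312 chain would misdecode, but a low-confidence guess can still pick the wrong codec, so the original's explicit fallback order is a defensible design for sites whose charset is known in advance.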