Imitating a Site's Main Template: Homepage and Inner Pages (a "Template Thief")
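The Python 2 script below fetches a target site's homepage and one randomly chosen inner page, strips blank lines, rewrites the title, meta keywords/description, links, and paragraph text into placeholder tags such as {time}, {tmkeyword}, {link}, and {content}, and saves the results as index.html and index2.html.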


#!/usr/local/bin/python
#-*- coding: UTF-8 -*-
# Imitate the main site's template
#QQ29295842
##################################################
import threading
import httplib
import re
#import php_data
#import g
import sys
import base64
#import eval
#import list
import StringIO, gzip, urllib
import binascii
import chardet
import ConfigParser  # read INI configuration files
import random        # used to shuffle list order
import os
#import sc_html      # HTML generation
import urllib2
import time
import Queue
#import sitemap_xml  # sitemap generation
#import shell_links  # SHELL SEO
import list
import thread
sys.path.append('..')
reload(sys)
sys.setdefaultencoding("utf-8")
 
class www_mb(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.url=""
 
    def www_index(self,url):
        # Fetch the homepage, rewrite it into template tags, save as index.html
        self.url=url
        s1,s2=self.url_http_200(url)
        if s1==True:
            print "OK"
            utf_gbk, data = self.utf_8_G(s2)  # decode to unicode
            if data == "no":
                print u"read or decode failed"
                return 0
            data=self.open_file_null(data)
            #self.th_re(data)
            self.TXT_file_add("index.html",u"%s"%(data))
        else:
            print "NO"
 
    def www_index2(self,url):
        # Collect in-site links from the homepage, fetch one at random,
        # rewrite it into template tags, save as index2.html
        LS = list.Clist()  # initialise the helper list class
        LS.list_del()  # clear the list
        self.url=url
        s1,data=self.url_http_200(url)
        if s1==True:
            print "OK"
            p = re.compile( r'<a.+?href=.+?>.+?</a>' )
            pname = re.compile( r'(?<=>).*?(?=</a>)' )
            phref = re.compile( r'(?<=href\=\").*?(?=\")')
            sarr = p.findall(data)  # find each <a>...</a> tag; collected into a list and de-duplicated to reduce MySQL load
            i=0
            for every in sarr:
                if i>=3000:
                    break
                else:
                    i+=1
                sname = pname.findall( every )
                if not sname:
                    continue  # no anchor text; skip (also keeps a stale shref from being reused)
                shref = phref.findall( every )
                if shref:
                    if (self.url in str(shref[0])):
                        if (len(shref[0])-len(url))>=2:
                            LS.liet_add(str(shref[0]))  # on-site link, absolute URL
                    else:
                        if not("http://" in str(shref[0]) or "https://" in str(shref[0])):
                            if len(shref[0])>=2:
                                LS.liet_add(url+"/"+str(shref[0]))  # relative link; prefix the base URL
            LS.liet_lsqc()  # de-duplicate the list
            random.shuffle(LS.list_2)  # shuffle the original ordering
            data2=""
 
            for i in range(len(LS.list_2)):
                print LS.list_2[i]
                s1,data=self.url_http_200(LS.list_2[i])
                if s1==True:
                    print "OK"
                    data2=data
                    break  # stop at the first inner page that fetches OK
 
            utf_gbk, data = self.utf_8_G(data2)  # decode to unicode
            if data == u"no":
                print u"read or decode failed"
                return 0
            data=self.open_file_null(data)
            self.TXT_file_add("index2.html",u"%s"%(data))
 
 
    def th_re(self,data):
        # Rewrite one line of HTML: swap titles, meta tags, links and body
        # text for placeholder tags that get filled in later
        try:
            try:
                # {time}  current-time tag (fixed)
                p = re.compile( r'<title>[\s\S]*</title>')
                data= p.sub('<title>{time}</title>', data)
                # sample match: <title>【德尔顿官网】,热水器,电热水器,磁能热水器,安全健康热水器</title>
            except :
                pass
 
            try:
                # {time}  current-time tag (fixed)
                p = re.compile( r'<a title="[\s\S]*" href="/">')
                data= p.sub('<a title="{time}" href="/">', data)
                # sample match: <a title="热水器十大品牌 电热水器哪个牌子好 磁能热水器-德尔顿" href="/">
            except :
                pass
 
            try:
                # {time}  current-time tag (fixed)
                p = re.compile( r'<strong>[\s\S]*</strong>')
                data= p.sub('<strong>{time}</strong>', data)
                # sample match: <strong>磁能热水器行业德尔顿...</strong>
            except :
                pass
 
            try:
                # {tmkeyword2}  one keyword drawn at random for the whole template
                p = re.compile( r'>[\s\S]*</span>')
                data= p.sub('>{tmkeyword2}</span>', data)
                # sample match: >劳女士</span>
            except :
                pass
 
            try:
                # {tmkeyword2}  one keyword drawn at random for the whole template
                p = re.compile( r'<p[\s\S]*>[\s\S]*</p>')
                data= p.sub('<p>{tmkeyword2}</p>', data)
                # sample match: >看看离您最近的体验店在哪</p>
            except :
                pass
 
            try:
                # {tmkeyword}  main-keyword tag (fixed)
                p = re.compile( r'<meta content="[\s\S]*" name="keywords">')
                data= p.sub('<meta content="{tmkeyword}" name="keywords">', data)
                # sample match: <meta content="热水器,电热水器,即热式电热水器,磁能热水器" name="keywords">
            except :
                pass
 
            try:
                # {tmkeyword}  main-keyword tag (fixed)
                p = re.compile( r'<meta name=".*?" content="[\s\S]*" />')
                data= p.sub('<meta name="keywords" content="{tmkeyword}" />', data)
            except :
                pass
 
            try:
                # {tmkeyword}  main-keyword tag (fixed)
                # {tmkeyword2} one keyword drawn at random for the whole template
                p = re.compile( r'<meta name=".*?" content="[\s\S]*">')
                data= p.sub('<meta name="description" content="{tmkeyword}{tmkeyword2}">', data)
            except :
                pass
 
            # {link}  random internal-link tag; the three patterns cover the
            # upper/lower-case <A>/<a> variants that were handled in three
            # near-identical blocks
            for link_pat in ( r'<A.*?href="[\s\S]*</A>',
                              r'<a.*?href="[\s\S]*</a>',
                              r'<a.*?href="[\s\S]*</A>' ):
                try:
                    p = re.compile( link_pat )
                    sarr = p.findall(data)
                    if len(sarr)>=1:
                        phref = re.compile( r'(?<=href\=\").*?(?=\")')
                        sarr = phref.findall(data)
                        if ("http://" in str(sarr[0]) or "https://" in str(sarr[0])):
                            if (self.url in str(sarr[0])):
                                data= p.sub('{link}', data)  # replace with internal-link tag
                            else:
                                data= p.sub('{ylinks}', data)  # replace with external-link tag
                        else:
                            if (".asp" in str(sarr[0]) or ".aspx" in str(sarr[0]) or ".php" in str(sarr[0]) or ".htm" in str(sarr[0]) or ".html" in str(sarr[0])):
                                data= p.sub('{link}', data)  # replace with internal-link tag
                            else:
                                data= p.sub('{newslist}', data)  # directory-page link tag (random)
                    # sample match: <A href="/guanyu/shipin/">企业视频</A>
                except :
                    pass
 
            try:
                # body text: the longer the paragraph, the more {content} tags
                # (elif so only the largest matching rule fires)
                p = re.compile( r'>*.+?[\s\S]*</p>')
                sarr = p.findall(data)
                if len(sarr)>=1:
                    if len(sarr[0])>=150:
                        data= p.sub('>{skeyword}{content}{content}。{skeyword}{content}{content}。{skeyword}{content}{content}。{skeyword}{content}{content}。</p>', data)
                    elif len(sarr[0])>=100:
                        data= p.sub('>{skeyword}{content}{content}。{skeyword}{content}{content}。</p>', data)
                    elif len(sarr[0])>=50:
                        data= p.sub('>{skeyword}{content}{content}。</p>', data)
            except :
                pass
 
            return data
        except Exception,e:
            return data
 
 
 
 
    def open_file_null(self,file_data):  # strip blank lines; run each remaining line through th_re
        data=""
        try:
            p = re.compile( r'.+?\n')
            sarr = p.findall(file_data)
            for every in sarr:
                if ("//" in str(every) or "!" in str(every)):
                    data+=str(every)  # leave script/doctype-like lines untouched
                else:
                    data+=self.th_re(str(every))
            return data
        except Exception,e:
            print u"open_file_null failed:",e
            return data
 
    def TXT_file_add(self,file_nem,data):  # write text out to a file
        try:
            file_object = open(file_nem,'w')
            file_object.writelines(data)
            file_object.close()
        except Exception,e:
            print u"writing TXT failed",file_nem,e
            return 0
 
    def utf_8_G(self,data):  # decode raw bytes, trying common Chinese encodings in turn
        try:
            try:
                return "gbk",data.decode('gbk')
            except Exception,e:
                pass
            try:
                return "utf-8",data.decode('utf-8')
            except Exception,e:
                pass
            try:
                return "gb2312",data.decode('gb2312')
            except Exception,e:
                pass
            return "utf_8_G","no"  # every decode failed
        except Exception,e:
            print "utf_8_G",e
            return "utf_8_G","no"
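 
    # chardet is imported above but never used; a hypothetical alternative
    # decoder could let chardet guess the encoding first. A sketch, not part
    # of the original flow (utf_8_G_chardet is an illustrative name):
    def utf_8_G_chardet(self,data):
        guess = chardet.detect(data)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99}
        enc = guess.get('encoding') or 'utf-8'
        try:
            return enc,data.decode(enc)
        except Exception,e:
            return "utf_8_G","no"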
 
    def url_http_200(self,url):
        # Fetch a URL with a Googlebot User-Agent; responses under 5000 bytes count as failures
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent',"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
            s = urllib2.urlopen(req,timeout=10)  # 10-second timeout
            int_url_read = s.read()
            if len(int_url_read)>=5000:
                return True,int_url_read
            else:
                return False,""
        except BaseException, e:
            return False,""
 
 
if __name__=='__main__':
    m=www_mb()
    m.www_index("http://www.deerdun.com")   # another test site: http://www.ybfhcl.com
    m.www_index2("http://www.deerdun.com")
 
 
 
#    p = re.compile('(one|two|three)')
#    print p.sub('num', 'one word two words three words apple', 2)
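
The script as written targets Python 2 (httplib, urllib2, Queue, print statements). For anyone on Python 3, a minimal sketch of the fetch check might look like the following; it is an illustrative port, not part of the original script, with urllib.request standing in for urllib2:

# Python 3 sketch of url_http_200 (illustrative)
import urllib.request

def url_http_200(url):
    req = urllib.request.Request(url, headers={
        'User-Agent': "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"})
    try:
        with urllib.request.urlopen(req, timeout=10) as s:
            body = s.read()
        # same heuristic as the original: short responses count as failures
        return (True, body) if len(body) >= 5000 else (False, b"")
    except Exception:
        return (False, b"")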