清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Sync Shishicai (SSC) lottery data from cp.360.cn into the ``haoma`` table."""
import datetime
from datetime import timedelta
import time
import torndb
import sys
import requests
import re
from mylogger import get_logger

# Python 2 only: make utf-8 the implicit str<->unicode codec.  Guarded so
# that the module does not die with NameError on an interpreter without
# the ``reload`` builtin.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

DBHOST = "localhost:3306"
SCHEMA = "CAIPIAO"
DBUSER = "user"
DBPASSWD = "passwd"

# NOTE: the connection is opened at import time and shared by all methods.
db = torndb.Connection(host=DBHOST, database=SCHEMA, user=DBUSER, password=DBPASSWD)
cplog = get_logger("caipiao")


class Data_Sync(object):
    """Incrementally copy draw results from the 360 chart page into MySQL.

    The newest row of ``haoma`` decides where to resume.  Every calendar day
    from that row's date up to today is downloaded, and the rows after the
    already-stored period are inserted.
    """

    # Each match is a 3-tuple: (period, opening <td> tag, lottery number).
    ssc_re = re.compile(r'<td class=\'gray\'>(.*?)</td>(<td class=\'red big\'>|<td style=\'width:65px\'>)(.*?)</td>.*?<tr>')
    # A storable draw is exactly five digits (columns a..e of ``haoma``).
    _number_re = re.compile(r'\d{5}\Z')

    # A stored period at or above this value means the day is treated as complete.
    PERIODS_PER_DAY = 120
    # Extra download attempts for one day before that day is skipped.
    MAX_RETRIES = 3

    def __init__(self, start_date="20150101", sleep_secs=10, run_ever=True):
        """Create a syncer.

        :param start_date: ``YYYYMMDD`` string used when the table is empty;
            values not greater than ``"20130101"`` fall back to ``"20150101"``.
        :param sleep_secs: seconds to wait when there is nothing new to store.
        :param run_ever: when true (default) ``run`` loops forever; when
            false ``run`` performs a single sync pass and returns.
        """
        self.start_date = start_date if start_date > "20130101" else "20150101"
        self.run_ever = run_ever
        self.base_url = "http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span="
        self.latest_date = ''
        self.latest_period = ''
        self.need_sleep = False
        self.sleep_secs = sleep_secs

    def run(self):
        """Alternate between syncing and sleeping.

        Loops forever unless ``run_ever`` is false, in which case it returns
        after the first sync pass.
        """
        while True:
            if self.need_sleep:
                time.sleep(self.sleep_secs)
                self.need_sleep = False
                continue
            self.sync_data_from_360()
            if not self.run_ever:
                break

    def _download_day(self, day):
        """Download and parse the result page for the single date ``day``."""
        dl_date = day.strftime("%Y-%m-%d")
        return self.download_with_requests(self.base_url + dl_date + "_" + dl_date)

    def sync_data_from_360(self):
        """Bring the table up to date, starting from its newest stored row.

        Update rules:
          1. While the stored date is before today, download that day, store
             the periods after the stored one, then move to the next day.
          2. Finally download today's page and store any periods that are
             not in the table yet.
        """
        self.get_latest_haoma_from_mysql()
        if not self.latest_date:
            cplog.info("db has no data, so start at {0}".format(self.start_date))
            self.latest_date = self.start_date
            self.latest_period = "000"
        cplog.info("in db, item_date={0}, period={1}".format(self.latest_date, self.latest_period))

        # "Today" is taken as UTC+8 (presumably the site's local time - confirm).
        cur_date = datetime.datetime.utcnow() + timedelta(hours=8)
        latest_date = datetime.datetime.strptime(self.latest_date, "%Y%m%d")

        dl_times = 0
        while (cur_date - latest_date).days > 0:
            if int(self.latest_period) < self.PERIODS_PER_DAY:
                data = self._download_day(latest_date)
                if not data and dl_times < self.MAX_RETRIES:
                    dl_times += 1
                    time.sleep(2)
                    continue
                if data:
                    self.latest_date = latest_date.strftime('%Y%m%d')
                    self.insert_into_mysql(self.latest_date, data[int(self.latest_period):])
                # else: retries exhausted, skip this day.
            # Moving to a new day: the stored-period offset and the retry
            # counter belong to the day just finished and must not leak
            # into the next one (otherwise its first draws are dropped).
            dl_times = 0
            latest_date += timedelta(1)
            self.latest_period = "000"

        # Update today's data.
        data = self._download_day(latest_date)
        if data:
            self.latest_date = latest_date.strftime('%Y%m%d')
            self.insert_into_mysql(self.latest_date, data[int(self.latest_period):])
        else:
            # Back off instead of hammering the server in a tight loop.
            cplog.info("download failed, wait for {0} seconds".format(self.sleep_secs))
            self.need_sleep = True

    def insert_into_mysql(self, item_date, datas):
        """Insert the drawn periods in ``datas`` for ``item_date``.

        :param item_date: ``YYYYMMDD`` string of the draw date.
        :param datas: ``ssc_re`` match tuples; index 0 is the period and
            index 2 the lottery number.

        Entries whose number is not exactly five digits are skipped.  When
        nothing is left to insert, ``need_sleep`` is set.  A database error
        is logged and terminates the process with exit status 1.
        """
        insert_datas = []
        for data in datas:
            period = data[0]
            lottery_number = data[2]
            if not self._number_re.match(lottery_number):
                continue
            a, b, c, d, e = lottery_number
            insert_datas.append(
                (item_date, period, item_date + period, lottery_number, a, b, c, d, e))

        if not insert_datas:
            cplog.info("no more new data to sync, wait for {0} seconds".format(self.sleep_secs))
            self.need_sleep = True
            return

        cplog.info("current insert into haoma:{0}, {1}".format(item_date, datas))
        sql = "insert into haoma(item_date, period, date_period, lottery_number, a, b, c, d, e) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        try:
            db.executemany(sql, insert_datas)
        except Exception as e:
            message = "insert into haoma failed: {0}".format(e)
            cplog.info(message)
            # A string argument is written to stderr and yields exit status 1.
            sys.exit(message)

    def get_latest_haoma_from_mysql(self):
        """Load date and period of the newest ``haoma`` row, if one exists."""
        sql = "select * from haoma order by date_period desc limit 1"
        ret = db.get(sql)
        if ret:
            self.latest_date = ret.item_date
            self.latest_period = ret.period

    def download_with_requests(self, url):
        """Fetch ``url`` and return the list of ``ssc_re`` matches.

        Returns an empty list on a non-200 status or on any ordinary
        exception raised while downloading or parsing (best effort, logged).
        ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.
        """
        cplog.info("download: {0}".format(url))
        try:
            r = requests.get(url, timeout=10)
            if r.status_code != 200:
                cplog.info("download err, http status_code:{0}".format(r.status_code))
                return []
            body = r.content
            if not isinstance(body, str):
                # Only reached where bytes is not str; the pattern is a text pattern.
                body = body.decode('utf-8', 'replace')
            return self.ssc_re.findall(body)
        except Exception as e:
            cplog.info("call requests raise Exception: {0}".format(e))
            return []


def run():
    """Script entry point: sync from 2014-01-01 onward, polling every 30 seconds."""
    sync = Data_Sync(start_date="20140101", sleep_secs=30)
    sync.run()


if __name__ == "__main__":
    run()