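# [Web-page banner, not part of the script] "A Tsinghua expert spent three months painstakingly compiling several hundred GB of resources - shared for free! ....>>>"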
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Sync Shishicai (SSC) lottery data from cp.360.cn into the ``haoma`` table.

NOTE: this script targets Python 2 (``reload(sys)`` /
``sys.setdefaultencoding`` below exist only there).
"""
import datetime
from datetime import timedelta
import time
import torndb
import sys
import requests
import re
from mylogger import get_logger

# Python 2 only: make implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

DBHOST = "localhost:3306"
SCHEMA = "CAIPIAO"
DBUSER = "user"
DBPASSWD = "passwd"

# Module-level side effect: the connection object is created at import time.
db = torndb.Connection(host=DBHOST, database=SCHEMA, user=DBUSER, password=DBPASSWD)
cplog = get_logger("caipiao")


class Data_Sync(object):
    """Incrementally copy draw results from the 360 chart page into ``haoma``.

    Each call to :meth:`sync_data_from_360` reads the newest row stored in
    the database, downloads every day between that row and "now" (UTC+8),
    and inserts the draws that are not stored yet.
    """

    # One match per table row.  Group 1 is the text of the 'gray' cell (used
    # as the period), group 2 is the opening tag of the number cell and
    # group 3 is the text of that cell (used as the lottery number).
    ssc_re = re.compile(r'<td class=\'gray\'>(.*?)</td>(<td class=\'red big\'>|<td style=\'width:65px\'>)(.*?)</td>.*?<tr>')

    # A stored period >= this value is treated as "that day is complete".
    PERIODS_PER_DAY = 120
    # Extra download attempts for a past day before the day is skipped.
    MAX_DOWNLOAD_RETRIES = 3
    # Pause between those attempts, in seconds.
    RETRY_DELAY_SECS = 2
    # Every lottery number is unpacked into the five columns a..e.
    NUMBER_LENGTH = 5

    def __init__(self, start_date="20150101", sleep_secs = 10, run_ever=True):
        """Configure the synchroniser.

        :param start_date: first day to fetch when the table is empty, as a
            ``YYYYMMDD`` string; values not greater than ``"20130101"`` are
            replaced by ``"20150101"``.
        :param sleep_secs: seconds :meth:`run` waits when there was nothing
            new to store.
        :param run_ever: when true (default) :meth:`run` loops forever; when
            false it performs a single synchronisation pass and returns.
        """
        self.start_date = start_date if start_date > "20130101" else "20150101"
        self.run_ever = run_ever
        self.base_url = "http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span="
        self.latest_date = ''
        self.latest_period = ''
        self.need_sleep = False
        self.sleep_secs = sleep_secs

    def run(self):
        """Synchronise repeatedly, sleeping whenever a pass found no new data.

        Returns after one pass when ``run_ever`` is false (the flag used to
        be stored but ignored).
        """
        while True:
            self.sync_data_from_360()
            if not self.run_ever:
                break
            if self.need_sleep:
                time.sleep(self.sleep_secs)
                self.need_sleep = False

    def _day_url(self, day):
        """Return the download URL covering exactly the given day."""
        dl_date = day.strftime("%Y-%m-%d")
        return self.base_url + dl_date + "_" + dl_date

    def sync_data_from_360(self):
        """Bring the database up to date, starting from its newest row.

        Update rules:

        1. While the stored date is before the current day, download that
           day, insert the draws after the stored period, and advance one
           day.
        2. Finally download the current day and insert the draws after the
           stored period.
        """
        self.get_latest_haoma_from_mysql()
        if not self.latest_date:
            cplog.info("db has no data, so start at {0}".format(self.start_date))
            self.latest_date = self.start_date
            self.latest_period = "000"
        cplog.info("in db, item_date={0}, period={1}".format(self.latest_date, self.latest_period))

        # "Now" is taken in UTC+8; the stored date is midnight of its day.
        cur_date = datetime.datetime.utcnow() + timedelta(hours=8)
        latest_date = datetime.datetime.strptime(self.latest_date, "%Y%m%d")

        dl_times = 0
        while (cur_date - latest_date).days > 0:
            if int(self.latest_period) < self.PERIODS_PER_DAY:
                data = self.download_with_requests(self._day_url(latest_date))
                if not data and dl_times < self.MAX_DOWNLOAD_RETRIES:
                    dl_times += 1
                    time.sleep(self.RETRY_DELAY_SECS)
                    continue
                if data:
                    self.latest_date = latest_date.strftime('%Y%m%d')
                    lottery_numbers = data[int(self.latest_period):]
                    self.insert_into_mysql(self.latest_date, lottery_numbers)
                # else: retries are exhausted and this day is skipped.
            # Moving on to the next day: give it a fresh retry budget and
            # read it from its first draw.  (Previously neither value was
            # reset when a day was skipped, so the following day got no
            # retries and was sliced from a stale offset.)
            dl_times = 0
            latest_date += timedelta(1)
            self.latest_period = "000"

        # Update the current day's data.
        data = self.download_with_requests(self._day_url(latest_date))
        if data:
            lottery_numbers = data[int(self.latest_period):]
            self.latest_date = latest_date.strftime('%Y%m%d')
            self.insert_into_mysql(self.latest_date, lottery_numbers)
        else:
            # Nothing came back (error or empty page): back off instead of
            # letting run() re-poll the server in a tight loop.
            self.need_sleep = True

    def insert_into_mysql(self, item_date, datas):
        """Insert the given draws for ``item_date`` into ``haoma``.

        :param item_date: day of the draws as a ``YYYYMMDD`` string.
        :param datas: sequence of ``ssc_re`` match tuples
            ``(period, cell_tag, lottery_number)``.

        Rows whose number contains no digit are skipped, as are rows whose
        number is not exactly five characters long (these used to raise
        ``ValueError`` while unpacking).  When nothing is left to insert,
        ``need_sleep`` is set.  A database error is logged and terminates
        the process with exit status 1.
        """
        insert_datas = []
        for data in datas:
            period = data[0]
            date_period = item_date + period
            lottery_number = data[2]
            if not re.search(r'\d+', lottery_number):
                continue
            if len(lottery_number) != self.NUMBER_LENGTH:
                cplog.info("skip malformed lottery number {0!r} for {1}".format(lottery_number, date_period))
                continue
            a, b, c, d, e = list(lottery_number)
            insert_datas.append((item_date, period, date_period, lottery_number, a, b, c, d, e))

        if not insert_datas:
            cplog.info("no more new data to sync, wait for {0} seconds".format(self.sleep_secs))
            self.need_sleep = True
            return

        cplog.info("current insert into haoma:{0}, {1}".format(item_date, datas))
        sql = "insert into haoma(item_date, period, date_period, lottery_number, a, b, c, d, e) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        try:
            db.executemany(sql, insert_datas)
        except Exception as err:
            # Goes to the log instead of the Python-2-only ``print e``.
            cplog.info("insert into haoma failed: {0}".format(err))
            sys.exit(1)

    def get_latest_haoma_from_mysql(self):
        """Load date and period of the row with the greatest ``date_period``.

        Leaves ``latest_date`` / ``latest_period`` untouched when the query
        returns no row.
        """
        sql = "select * from haoma order by date_period desc limit 1"
        ret = db.get(sql)
        if ret:
            self.latest_date = ret.item_date
            self.latest_period = ret.period

    def download_with_requests(self, url):
        """Fetch ``url`` and return every ``ssc_re`` match found in the body.

        :returns: list of ``(period, cell_tag, lottery_number)`` tuples; an
            empty list on a non-200 response or when the request raised.
        """
        cplog.info("download: {0}".format(url))
        data = []
        try:
            r = requests.get(url, timeout=10)
            if r.status_code == 200:
                data = self.ssc_re.findall(r.content)
            else:
                cplog.info("download err, http status_code:{0}".format(r.status_code))
        except Exception as e:
            # Best effort: any request failure is logged and reported as
            # "no data" so the caller can retry.
            cplog.info("call requests raise Exception: {0}".format(e))
        # Returned after the try statement rather than from a ``finally``
        # clause, which would also discard KeyboardInterrupt / SystemExit.
        return data


def run():
    """Start a synchroniser that begins at 2014-01-01 and polls every 30 s."""
    sync = Data_Sync(start_date="20140101", sleep_secs=30)
    sync.run()


if __name__ == "__main__":
    run()