PS:昨天写了一个用Flask开发的双色球分析工具,和一群学友交流后,他们建议增加自动获取最新开奖信息的功能。Python轻巧易用,于是就有了这个用Python获取双色球彩票开奖信息的工具。我对爬虫本就只是略知皮毛,本次代码的防封机制以及大数据分多文件保存等部分功能,是在DeepSeek Chat的协助下完善的(数据量较大(超过1000条)时建议保存为多个文件;Excel单个工作表最多支持约100万行,但实际使用建议不超过10万行)。
注意本代码仅用于学习交流,切勿用于非法用途!
import json
import random
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests


class SSQSpider:
    """Download, parse and export SSQ (double-colour ball) lottery draw notices.

    The spider queries the paginated JSON endpoint stored in ``base_url``,
    converts each draw into a flat dict and can write the result to an
    ``.xlsx`` workbook.
    """

    def __init__(self):
        self.base_url = "https://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice"
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            # "br" is deliberately not advertised: requests can only decode
            # Brotli when an optional brotli package is installed, and an
            # undecoded body makes response.json() fail.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.cwl.gov.cn",
            "Referer": "https://www.cwl.gov.cn/ygkj/wqkjgg/ssq/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # NOTE(review): these cookie values are hard-coded; presumably they
        # were captured from a browser session and may expire - confirm.
        self.cookies = {
            "HMF_CI": "ccdeb225dfb17006dccc0241dfe329e21005024861f408cc58a70bd6fb877731381b1f33b9130951fda8c4cc26ba65dd51ce93586d3d79177bb66d4a320174ddcd",
            "21_vq": "46",
        }

    def get_ssq_data(self, page_no: int = 1, page_size: int = 30) -> Optional[Dict]:
        """Request one page of draw notices.

        A random 0.5-2 s pause precedes every request to throttle the crawl.

        Args:
            page_no: 1-based page number sent as ``pageNo``.
            page_size: Number of records requested per page (``pageSize``).

        Returns:
            The decoded JSON payload, or ``None`` when the request fails,
            the server answers with an HTTP error status, or the body is
            not valid JSON (the error is printed).
        """
        time.sleep(random.uniform(0.5, 2))
        try:
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params={
                    "name": "ssq",
                    "pageNo": page_no,
                    "pageSize": page_size,
                    "systemType": "PC",
                },
                cookies=self.cookies,
                timeout=10,
            )
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on older requests
            # versions, where the decode error is not a RequestException.
            print(f"第 {page_no} 页请求失败: {e}")
            return None

    @staticmethod
    def _first_prize(item: Dict) -> Dict:
        """Return the ``num``/``bonus`` pair of the first-prize grade of *item*.

        Falls back to ``"0"`` for both fields when no first-prize grade is
        present.
        """
        for grade in item.get("prizegrades") or []:
            # Compare as text so that both "1" and 1 identify the first prize.
            if isinstance(grade, dict) and str(grade.get("type")) == "1":
                # NOTE(review): the endpoint may name these fields
                # "typenum"/"typemoney" instead of "num"/"bonus"; both
                # spellings are accepted - confirm against a live response.
                return {
                    "num": grade.get("num", grade.get("typenum", "0")),
                    "bonus": grade.get("bonus", grade.get("typemoney", "0")),
                }
        return {"num": "0", "bonus": "0"}

    def parse_data(self, raw_data: Dict) -> List[Dict]:
        """Convert a raw API payload into a list of flat row dicts.

        Args:
            raw_data: Payload as returned by :meth:`get_ssq_data`; its
                ``"result"`` entry must be a list of draw dicts.

        Returns:
            One dict per valid draw. Draws with an unparsable date, a red
            ball count other than six, or a missing blue ball are skipped.
            An empty list is returned when the payload has no usable
            ``"result"`` list.
        """
        if not isinstance(raw_data, dict) or not isinstance(raw_data.get("result"), list):
            return []

        structured_data = []
        for item in raw_data["result"]:
            try:
                # Drop the weekday suffix, e.g. "2025-08-03(日)" -> "2025-08-03".
                date_str = item.get("date", "").split("(")[0].strip()
                try:
                    draw_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                except ValueError:
                    print(f"日期格式无效: {date_str}")
                    continue

                red_balls = [ball.strip() for ball in item.get("red", "").split(",")]
                blue_ball = item.get("blue", "").strip()
                if len(red_balls) != 6 or not blue_ball:
                    continue

                first_prize = self._first_prize(item)

                structured_data.append({
                    "期号": item.get("code", "").strip(),
                    "开奖日期": draw_date.strftime("%Y-%m-%d"),
                    "星期": item.get("week", "").strip(),
                    "红球1": red_balls[0],
                    "红球2": red_balls[1],
                    "红球3": red_balls[2],
                    "红球4": red_balls[3],
                    "红球5": red_balls[4],
                    "红球6": red_balls[5],
                    "蓝球": blue_ball,
                    "开奖号码": " ".join(red_balls) + " + " + blue_ball,
                    "奖池金额(元)": item.get("poolmoney", "0"),
                    "一等奖注数": first_prize["num"],
                    "一等奖奖金(元)": first_prize["bonus"],
                    "销售金额(元)": item.get("sales", "0"),
                })
            except (AttributeError, TypeError, ValueError) as e:
                # A malformed entry must not abort the rest of the page.
                print(f"解析数据条目时出错: {e}")
                continue

        return structured_data

    def save_to_excel(self, data: List[Dict], filename: str) -> bool:
        """Write parsed draw rows to an Excel workbook.

        Rows are sorted by draw date (newest first), written to a single
        sheet, and the ball columns are coloured red/blue.

        Args:
            data: Rows as produced by :meth:`parse_data`.
            filename: Destination ``.xlsx`` path.

        Returns:
            ``True`` when the file was written, ``False`` when there is no
            valid data or writing failed (the reason is printed).
        """
        if not data:
            print("警告: 没有有效数据可保存")
            return False

        try:
            df = pd.DataFrame(data)

            # Normalise dates to ISO strings; unparsable values become NaN
            # and are dropped below.
            df["开奖日期"] = pd.to_datetime(df["开奖日期"], errors="coerce").dt.strftime("%Y-%m-%d")
            df = df.dropna(subset=["开奖日期"])

            if df.empty:
                print("警告: 处理后没有有效数据")
                return False

            # ISO date strings sort chronologically.
            df = df.sort_values("开奖日期", ascending=False)

            sheet_name = "双色球开奖记录"
            with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
                df.to_excel(writer, index=False, sheet_name=sheet_name)

                workbook = writer.book
                worksheet = writer.sheets[sheet_name]

                # Column widths, keyed by Excel column range.
                col_widths = {
                    "A:A": 10, "B:B": 12, "C:C": 8, "D:I": 6, "J:J": 6,
                    "K:K": 22, "L:L": 16, "M:M": 12, "N:N": 14, "O:O": 14,
                }
                for col_range, width in col_widths.items():
                    worksheet.set_column(col_range, width)

                red_format = workbook.add_format({"bold": True, "font_color": "#FF0000"})
                blue_format = workbook.add_format({"bold": True, "font_color": "#0000FF"})

                # Data occupies rows 2..len(df)+1 (row 1 is the header).
                last_row = len(df) + 1
                for col in "DEFGHI":
                    worksheet.conditional_format(
                        f"{col}2:{col}{last_row}",
                        {"type": "no_blanks", "format": red_format},
                    )
                worksheet.conditional_format(
                    f"J2:J{last_row}",
                    {"type": "no_blanks", "format": blue_format},
                )

            print(f"成功保存数据到 {filename} (共 {len(df)} 条记录)")
            return True

        except Exception as e:
            # Top-level boundary for the export: report the failure (missing
            # engine, unwritable path, ...) instead of crashing the caller.
            print(f"保存Excel文件失败: {e}")
            return False

    def fetch_multi_pages(self, start_page: int = 1, end_page: int = 10,
                          page_size: int = 50) -> List[Dict]:
        """Fetch and parse an inclusive range of pages.

        Each page is attempted up to three times, with a 2 s pause between
        attempts; a page that still fails is skipped. A random 1-3 s pause
        separates successfully fetched pages.

        Args:
            start_page: First page number to fetch.
            end_page: Last page number to fetch (inclusive).
            page_size: Records requested per page.

        Returns:
            All parsed rows, in the order the pages were fetched.
        """
        all_data = []

        for page in range(start_page, end_page + 1):
            print(f"正在获取第 {page}/{end_page} 页...")

            raw_data = None
            for retries_left in range(2, -1, -1):
                raw_data = self.get_ssq_data(page_no=page, page_size=page_size)
                if raw_data is not None:
                    break
                if retries_left:
                    print(f"第 {page} 页获取失败,剩余重试次数: {retries_left}")
                    time.sleep(2)

            if raw_data is None:
                print(f"第 {page} 页获取失败,跳过")
                continue

            page_data = self.parse_data(raw_data)
            if page_data:
                all_data.extend(page_data)
                print(f"第 {page} 页获取成功,累计 {len(all_data)} 条数据")
            else:
                print(f"第 {page} 页无有效数据")

            # Throttle between pages only; nothing follows the last page.
            if page < end_page:
                time.sleep(random.uniform(1, 3))

        return all_data


def main():
    """Prompt for a page range, crawl it, and save the draws to Excel."""
    print("双色球历史开奖数据采集程序")
    print("=" * 50)

    spider = SSQSpider()

    while True:
        try:
            start_page = int(input("请输入起始页码(默认1): ") or 1)
            end_page = int(input("请输入结束页码(建议不超过50): "))
            if start_page < 1 or end_page < start_page:
                raise ValueError
            break
        except ValueError:
            print("输入无效,请输入有效的页码数字(结束页码≥起始页码≥1)")

    print("\n开始获取数据...")
    start_time = time.time()

    all_data = spider.fetch_multi_pages(start_page, end_page)
    if not all_data:
        print("未能获取任何有效数据,程序结束")
        return

    filename = f"ssq_data_p{start_page}-p{end_page}.xlsx"
    if not spider.save_to_excel(all_data, filename):
        print("数据保存失败,请检查错误信息")
        return

    time_used = round(time.time() - start_time, 2)
    # The input validation above guarantees at least one page.
    avg_time = round(time_used / (end_page - start_page + 1), 2)
    # ISO date strings compare chronologically, so min/max give the true
    # range regardless of the order in which rows were collected.
    draw_dates = [row["开奖日期"] for row in all_data]

    print("\n数据获取完成!")
    print("统计信息:")
    print(f"- 共获取 {len(all_data)} 条开奖记录")
    print(f"- 时间范围: {min(draw_dates)} 至 {max(draw_dates)}")
    print(f"- 总耗时: {time_used} 秒")
    print(f"- 平均每页耗时: {avg_time} 秒")
    print(f"- 数据已保存到: {filename}")


if __name__ == "__main__":
    main()
运行方式:
安装依赖库:
pip install pandas xlsxwriter requests
双色球历史开奖数据采集程序
==================================================
请输入起始页码(默认1): 1
请输入结束页码(建议不超过50): 2
开始获取数据...
正在获取第 1/2 页...
第 1 页获取成功,累计 50 条数据
正在获取第 2/2 页...
第 2 页获取成功,累计 100 条数据
成功保存数据到 ssq_data_p1-p2.xlsx (共 100 条记录)
数据获取完成!
统计信息:
- 共获取 100 条开奖记录
- 时间范围: 2024-12-05 至 2025-08-03
- 总耗时: 7.45 秒
- 平均每页耗时: 3.73 秒
- 数据已保存到: ssq_data_p1-p2.xlsx
评论(0)
暂无评论