
PS:昨天写了一个用 Flask 开发的双色球分析工具。和一群学友交流后,他们建议增加自动获取最新开奖信息的功能;考虑到 Python 的轻巧,于是就有了这个用 Python 获取双色球开奖数据的工具。我对爬虫只是略知皮毛,本次部分功能的完善(代码的防封机制,以及大数据分多文件保存)由 DeepSeek Chat 协助完成。提示:大量数据(超过 1000 条)建议保存为多个文件;Excel 单个工作表最多支持约 100 万行,但实际使用建议不超过 10 万行。
注意:本代码仅用于学习交流,切勿用于非法用途!
import requests
import json
import time
import random
import pandas as pd
from typing import Dict, List, Optional
from datetime import datetime
class SSQSpider:
    """Fetch, parse and export 双色球 (SSQ) draw records from cwl.gov.cn.

    Typical use: ``fetch_multi_pages`` downloads and parses a range of result
    pages, and ``save_to_excel`` writes the parsed rows to an ``.xlsx`` file.
    """

    # Name of the single worksheet written by save_to_excel.
    SHEET_NAME = "双色球开奖记录"

    def __init__(self):
        """Set up the endpoint URL plus the headers and cookies sent with each request."""
        self.base_url = "https://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice"
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            # "br" is intentionally not advertised: the response body can only
            # be decoded when an optional Brotli package is installed, so
            # offering it risks an undecodable body and a JSON parse failure.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.cwl.gov.cn",
            "Referer": "https://www.cwl.gov.cn/ygkj/wqkjgg/ssq/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # NOTE(review): hard-coded cookie values; presumably copied from a
        # browser session and may stop working over time — confirm.
        self.cookies = {
            "HMF_CI": "ccdeb225dfb17006dccc0241dfe329e21005024861f408cc58a70bd6fb877731381b1f33b9130951fda8c4cc26ba65dd51ce93586d3d79177bb66d4a320174ddcd",
            "21_vq": "46",
        }

    def get_ssq_data(self, page_no: int = 1, page_size: int = 30) -> Optional[Dict]:
        """Request one page of draw notices.

        Args:
            page_no: 1-based page number sent as ``pageNo``.
            page_size: Number of records requested per page (``pageSize``).

        Returns:
            The decoded JSON object, or ``None`` when the request fails, the
            server answers with an HTTP error status, or the body is not a
            JSON object. Failures are reported on stdout, never raised.
        """
        # Random short pause before every request to keep the request rate low.
        time.sleep(random.uniform(0.5, 2))
        params = {
            "name": "ssq",
            "pageNo": page_no,
            "pageSize": page_size,
            "systemType": "PC",
        }
        try:
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params=params,
                cookies=self.cookies,
                timeout=10,
            )
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException covers network/HTTP errors; ValueError covers
            # a body that is not valid JSON.
            print(f"第 {page_no} 页请求失败: {e}")
            return None
        if not isinstance(payload, dict):
            print(f"第 {page_no} 页请求失败: 响应格式异常")
            return None
        return payload

    @staticmethod
    def _text(value) -> str:
        """Return *value* as stripped text; ``None`` becomes an empty string."""
        return "" if value is None else str(value).strip()

    @staticmethod
    def _first_prize(prize_grades) -> Dict:
        """Pick the first-prize entry out of a ``prizegrades`` list.

        Returns a dict with ``num`` and ``bonus`` keys; both are ``"0"`` when
        no first-prize entry is present.
        """
        fallback = {"bonus": "0", "num": "0"}
        if not isinstance(prize_grades, list):
            return fallback
        for grade in prize_grades:
            if not isinstance(grade, dict):
                continue
            # Compare as text so that both "1" and the integer 1 match.
            if str(grade.get("type")).strip() != "1":
                continue
            # NOTE(review): the "typenum"/"typemoney" fallbacks assume the
            # endpoint may use those key names instead of "num"/"bonus" —
            # verify against a live response.
            return {
                "num": grade.get("num", grade.get("typenum", "0")),
                "bonus": grade.get("bonus", grade.get("typemoney", "0")),
            }
        return fallback

    def _parse_item(self, item: Dict) -> Optional[Dict]:
        """Convert one raw result entry into a flat row, or ``None`` if invalid."""
        # Drop the weekday suffix, e.g. "2025-08-03(日)" -> "2025-08-03".
        date_str = self._text(item.get("date")).split("(")[0].strip()
        try:
            draw_date = datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            print(f"日期格式无效: {date_str}")
            return None

        red_balls = [ball.strip() for ball in self._text(item.get("red")).split(",")]
        blue_ball = self._text(item.get("blue"))
        # A valid draw has exactly six non-empty red balls and one blue ball.
        if len(red_balls) != 6 or not all(red_balls) or not blue_ball:
            return None

        first_prize = self._first_prize(item.get("prizegrades"))
        return {
            "期号": self._text(item.get("code")),
            "开奖日期": draw_date.strftime("%Y-%m-%d"),
            "星期": self._text(item.get("week")),
            "红球1": red_balls[0],
            "红球2": red_balls[1],
            "红球3": red_balls[2],
            "红球4": red_balls[3],
            "红球5": red_balls[4],
            "红球6": red_balls[5],
            "蓝球": blue_ball,
            "开奖号码": " ".join(red_balls) + " + " + blue_ball,
            "奖池金额(元)": item.get("poolmoney", "0"),
            "一等奖注数": first_prize.get("num", "0"),
            "一等奖奖金(元)": first_prize.get("bonus", "0"),
            "销售金额(元)": item.get("sales", "0"),
        }

    def parse_data(self, raw_data: Dict) -> List[Dict]:
        """Turn a raw API response into a list of flat row dicts.

        Args:
            raw_data: Decoded response whose ``"result"`` key holds a list of
                draw entries.

        Returns:
            One dict per valid draw, in the order they appear in the
            response. Entries with an unparseable date, a red-ball count
            other than six, an empty ball, or a missing blue ball are
            skipped. An empty list is returned when *raw_data* is not a dict
            or has no ``"result"`` list.
        """
        if not isinstance(raw_data, dict):
            return []
        entries = raw_data.get("result")
        if not isinstance(entries, list):
            return []

        rows = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            try:
                row = self._parse_item(entry)
            except (AttributeError, TypeError, ValueError) as e:
                # Best effort: one malformed entry must not abort the page.
                print(f"解析数据条目时出错: {e}")
                continue
            if row is not None:
                rows.append(row)
        return rows

    def save_to_excel(self, data: List[Dict], filename: str) -> bool:
        """Write parsed rows to an Excel workbook, newest draw first.

        Args:
            data: Rows as produced by ``parse_data``.
            filename: Path of the ``.xlsx`` file to create.

        Returns:
            ``True`` when the file was written, ``False`` when there was
            nothing to save or writing failed (the reason is printed).
        """
        if not data:
            print("警告: 没有有效数据可保存")
            return False
        try:
            df = pd.DataFrame(data)
            # Normalise dates to "YYYY-MM-DD" text; unparseable values become
            # missing and are dropped on the next line.
            df["开奖日期"] = pd.to_datetime(df["开奖日期"], errors="coerce").dt.strftime("%Y-%m-%d")
            df = df.dropna(subset=["开奖日期"])
            if df.empty:
                print("警告: 处理后没有有效数据")
                return False
            # ISO date strings sort chronologically, so a text sort suffices.
            df = df.sort_values("开奖日期", ascending=False)

            with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
                df.to_excel(writer, index=False, sheet_name=self.SHEET_NAME)
                workbook = writer.book
                worksheet = writer.sheets[self.SHEET_NAME]

                # Column widths, keyed by spreadsheet column range.
                col_widths = {
                    "A:A": 10, "B:B": 12, "C:C": 8,
                    "D:J": 6, "K:K": 22,
                    "L:L": 16, "M:M": 12, "N:N": 14, "O:O": 14,
                }
                for col_range, width in col_widths.items():
                    worksheet.set_column(col_range, width)

                # Bold red for the six red-ball columns (D-I) and bold blue
                # for the blue-ball column (J); row 1 is the header.
                last_row = len(df) + 1
                red_format = workbook.add_format({"bold": True, "font_color": "#FF0000"})
                blue_format = workbook.add_format({"bold": True, "font_color": "#0000FF"})
                worksheet.conditional_format(
                    f"D2:I{last_row}",
                    {"type": "no_blanks", "format": red_format},
                )
                worksheet.conditional_format(
                    f"J2:J{last_row}",
                    {"type": "no_blanks", "format": blue_format},
                )
            print(f"成功保存数据到 {filename} (共 {len(df)} 条记录)")
            return True
        except Exception as e:
            # Deliberately broad: this is the export boundary and any failure
            # (missing engine, unwritable path, ...) is reported, not raised.
            print(f"保存Excel文件失败: {e}")
            return False

    def fetch_multi_pages(
        self,
        start_page: int = 1,
        end_page: int = 10,
        page_size: int = 50,
        max_retries: int = 3,
    ) -> List[Dict]:
        """Download and parse every page from *start_page* to *end_page*.

        Args:
            start_page: First page to fetch (inclusive).
            end_page: Last page to fetch (inclusive).
            page_size: Records requested per page.
            max_retries: Maximum number of attempts per page.

        Returns:
            All parsed rows in fetch order. Pages that still fail after
            *max_retries* attempts are skipped.
        """
        all_data = []
        for page in range(start_page, end_page + 1):
            print(f"正在获取第 {page}/{end_page} 页...")
            raw_data = None
            for attempts_left in range(max_retries, 0, -1):
                raw_data = self.get_ssq_data(page_no=page, page_size=page_size)
                if raw_data is not None:
                    break
                if attempts_left > 1:
                    print(f"第 {page} 页获取失败,剩余重试次数: {attempts_left-1}")
                    time.sleep(2)
            if raw_data is None:
                print(f"第 {page} 页获取失败,跳过")
                continue

            page_data = self.parse_data(raw_data)
            if page_data:
                all_data.extend(page_data)
                print(f"第 {page} 页获取成功,累计 {len(all_data)} 条数据")
            else:
                print(f"第 {page} 页无有效数据")

            # Pause between pages; pointless after the final one.
            if page < end_page:
                time.sleep(random.uniform(1, 3))
        return all_data
def _prompt_page_range():
    """Ask for start/end page numbers until a valid pair is entered."""
    while True:
        try:
            # An empty answer for the start page falls back to 1.
            start_page = int(input("请输入起始页码(默认1): ") or 1)
            end_page = int(input("请输入结束页码(建议不超过50): "))
        except ValueError:
            print("输入无效,请输入有效的页码数字(结束页码≥起始页码≥1)")
            continue
        if start_page >= 1 and end_page >= start_page:
            return start_page, end_page
        print("输入无效,请输入有效的页码数字(结束页码≥起始页码≥1)")


def main():
    """Interactive entry point: prompt for a page range, fetch, save, report.

    Fetches the requested pages with ``SSQSpider``, writes them to
    ``ssq_data_p<start>-p<end>.xlsx`` in the current directory and prints
    summary statistics.
    """
    print("双色球历史开奖数据采集程序")
    print("=" * 50)
    spider = SSQSpider()
    start_page, end_page = _prompt_page_range()

    print("\n开始获取数据...")
    start_time = time.time()
    all_data = spider.fetch_multi_pages(start_page, end_page)
    if not all_data:
        print("未能获取任何有效数据,程序结束")
        return

    filename = f"ssq_data_p{start_page}-p{end_page}.xlsx"
    if not spider.save_to_excel(all_data, filename):
        print("数据保存失败,请检查错误信息")
        return

    time_used = round(time.time() - start_time, 2)
    page_count = end_page - start_page + 1
    avg_time = round(time_used / page_count, 2)
    # Dates are "YYYY-MM-DD" strings, so min/max give the true range without
    # relying on the order in which the rows were fetched.
    draw_dates = [row["开奖日期"] for row in all_data]
    print("\n数据获取完成!")
    print("统计信息:")
    print(f"- 共获取 {len(all_data)} 条开奖记录")
    print(f"- 时间范围: {min(draw_dates)} 至 {max(draw_dates)}")
    print(f"- 总耗时: {time_used} 秒")
    print(f"- 平均每页耗时: {avg_time} 秒")
    print(f"- 数据已保存到: {filename}")
# Run the interactive collector only when executed as a script, not on import.
if __name__ == "__main__":
    main()
运行方式:
1. 安装依赖库:
pip install pandas xlsxwriter requests
2. 运行脚本,按提示输入起始页码和结束页码。运行示例如下:
双色球历史开奖数据采集程序
==================================================
请输入起始页码(默认1): 1
请输入结束页码(建议不超过50): 2
开始获取数据...
正在获取第 1/2 页...
第 1 页获取成功,累计 50 条数据
正在获取第 2/2 页...
第 2 页获取成功,累计 100 条数据
成功保存数据到 ssq_data_p1-p2.xlsx (共 100 条记录)
数据获取完成!
统计信息:
- 共获取 100 条开奖记录
- 时间范围: 2024-12-05 至 2025-08-03
- 总耗时: 7.45 秒
- 平均每页耗时: 3.73 秒
- 数据已保存到: ssq_data_p1-p2.xlsx

评论(0)
暂无评论