boxmoe_header_banner_img

Hello! 欢迎来到豌豆高清!

文章导读

Python爬取双色球彩票开奖信息


avatar
admin 2025 年 8 月 4 日 83

PS:昨天写了一个基于 Flask 开发的双色球分析工具,和一群学友交流后,他们建议增加自动获取最新开奖信息的功能。考虑到 Python 的轻巧,于是就有了这个用 Python 获取双色球开奖信息的工具。我对爬虫本来只是略知皮毛,本次部分功能的完善由 DeepSeek Chat 协助完成,包括代码的防封机制,以及大数据量分多文件保存的处理(数据量较大(超过 1000 条)时建议保存为多个文件;Excel 单个工作表最多支持约 100 万行,但实际使用建议不超过 10 万行)。

注意:本代码仅用于学习交流,切勿用于非法用途!

import requests
import json
import time
import random
import pandas as pd
from typing import Dict, List, Optional
from datetime import datetime

class SSQSpider:
    """Fetch, parse and export Double Color Ball (SSQ) lottery draw results.

    Pages of draw notices are requested from the JSON endpoint stored in
    ``base_url``, flattened into one dict per draw by :meth:`parse_data`, and
    written to a formatted Excel workbook by :meth:`save_to_excel`.
    """

    def __init__(self):
        """Set up the endpoint URL plus the headers and cookies sent with it."""
        self.base_url = "https://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice"
        # Browser-like headers sent with every request.
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.cwl.gov.cn",
            "Referer": "https://www.cwl.gov.cn/ygkj/wqkjgg/ssq/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        # NOTE(review): hard-coded cookie values, presumably captured from a
        # browser session; they may expire and need refreshing -- confirm.
        self.cookies = {
            "HMF_CI": "ccdeb225dfb17006dccc0241dfe329e21005024861f408cc58a70bd6fb877731381b1f33b9130951fda8c4cc26ba65dd51ce93586d3d79177bb66d4a320174ddcd",
            "21_vq": "46"
        }

    def get_ssq_data(self, page_no: int = 1, page_size: int = 30) -> Optional[Dict]:
        """Request one page of draw notices and return the decoded JSON.

        Args:
            page_no: 1-based page number sent as ``pageNo``.
            page_size: Number of records requested per page (``pageSize``).

        Returns:
            The decoded JSON body, or ``None`` when the request, the HTTP
            status check or the JSON decoding fails (the error is printed).
        """
        try:
            # Random pause before every request to avoid a fixed request rhythm.
            time.sleep(random.uniform(0.5, 2))
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params={
                    "name": "ssq",
                    "pageNo": page_no,
                    "pageSize": page_size,
                    "systemType": "PC"
                },
                cookies=self.cookies,
                timeout=10
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Deliberately best-effort: any failure is reported and the caller
            # decides whether to retry or skip the page.
            print(f"第 {page_no} 页请求失败: {str(e)}")
            return None

    def parse_data(self, raw_data: Dict) -> List[Dict]:
        """Flatten a raw API response into one record dict per valid draw.

        Entries are skipped when the date cannot be parsed as ``YYYY-MM-DD``
        (after dropping a trailing ``(...)`` suffix), when there are not
        exactly six non-empty red balls, or when the blue ball is missing.

        Args:
            raw_data: Decoded response; draws are read from its ``"result"``
                list.

        Returns:
            A list of records keyed by the Chinese column names used in the
            exported workbook. Empty when ``raw_data`` is not a dict or has no
            ``"result"`` list.
        """
        if not isinstance(raw_data, dict) or not isinstance(raw_data.get("result"), list):
            return []

        structured_data = []
        for item in raw_data["result"]:
            try:
                # Drop the weekday suffix, e.g. "2025-08-03(日)" -> "2025-08-03".
                # ``or ""`` keeps an explicit null from raising on ``.split``.
                date_str = (item.get("date") or "").split("(")[0].strip()

                try:
                    draw_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                except ValueError:
                    print(f"日期格式无效: {date_str}")
                    continue

                # Strip each ball so "01, 02" does not yield " 02".
                red_balls = [ball.strip() for ball in (item.get("red") or "").split(",")]
                blue_ball = (item.get("blue") or "").strip()

                if len(red_balls) != 6 or not all(red_balls) or not blue_ball:
                    continue

                # A missing/null prize list must not discard a valid draw.
                # The type is compared as text so 1 and "1" both match.
                first_prize = next(
                    (
                        p for p in (item.get("prizegrades") or [])
                        if isinstance(p, dict) and str(p.get("type")) == "1"
                    ),
                    {"bonus": "0", "num": "0"}
                )
                # NOTE(review): the live endpoint appears to name these fields
                # "typenum"/"typemoney" rather than "num"/"bonus" -- confirm.
                # Both spellings are accepted; "num"/"bonus" take precedence.
                first_prize_count = first_prize.get("num", first_prize.get("typenum", "0"))
                first_prize_bonus = first_prize.get("bonus", first_prize.get("typemoney", "0"))

                structured_data.append({
                    "期号": (item.get("code") or "").strip(),
                    "开奖日期": draw_date.strftime("%Y-%m-%d"),
                    "星期": (item.get("week") or "").strip(),
                    "红球1": red_balls[0],
                    "红球2": red_balls[1],
                    "红球3": red_balls[2],
                    "红球4": red_balls[3],
                    "红球5": red_balls[4],
                    "红球6": red_balls[5],
                    "蓝球": blue_ball,
                    "开奖号码": " ".join(red_balls) + " + " + blue_ball,
                    "奖池金额(元)": item.get("poolmoney", "0"),
                    "一等奖注数": first_prize_count,
                    "一等奖奖金(元)": first_prize_bonus,
                    "销售金额(元)": item.get("sales", "0")
                })
            except (AttributeError, TypeError, ValueError) as e:
                # Malformed entry (e.g. not a dict, or a non-string field):
                # report it and carry on with the remaining draws.
                print(f"解析数据条目时出错: {str(e)}")
                continue

        return structured_data

    def save_to_excel(self, data: List[Dict], filename: str) -> bool:
        """Write draw records to a formatted ``.xlsx`` file.

        Rows whose draw date cannot be parsed are dropped, the rest are sorted
        by draw date (newest first) and written to a single sheet with column
        widths set and the ball columns highlighted.

        Args:
            data: Records to save; assumed to have the column layout produced
                by :meth:`parse_data` (columns D-I red balls, J blue ball).
            filename: Path of the workbook to create.

        Returns:
            ``True`` on success, ``False`` when there is nothing to save or
            writing fails (the reason is printed).
        """
        if not data:
            print("警告: 没有有效数据可保存")
            return False

        try:
            df = pd.DataFrame(data)

            # Normalise the date column to "YYYY-MM-DD" text; unparseable
            # values become missing and are dropped below.
            df["开奖日期"] = pd.to_datetime(df["开奖日期"], errors="coerce").dt.strftime("%Y-%m-%d")
            df = df.dropna(subset=["开奖日期"])

            if df.empty:
                print("警告: 处理后没有有效数据")
                return False

            # Newest draw first (ISO date strings sort chronologically).
            df = df.sort_values("开奖日期", ascending=False)

            sheet_name = "双色球开奖记录"
            with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
                df.to_excel(writer, index=False, sheet_name=sheet_name)

                workbook = writer.book
                worksheet = writer.sheets[sheet_name]

                # set_column accepts "first:last" ranges directly, so ranges
                # do not need to be expanded column by column.
                column_widths = {
                    "A:A": 10, "B:B": 12, "C:C": 8,
                    "D:I": 6, "J:J": 6, "K:K": 22,
                    "L:L": 16, "M:M": 12, "N:N": 14, "O:O": 14,
                }
                for col_range, width in column_widths.items():
                    worksheet.set_column(col_range, width)

                red_format = workbook.add_format({"bold": True, "font_color": "#FF0000"})
                blue_format = workbook.add_format({"bold": True, "font_color": "#0000FF"})

                # Row 1 is the header, so data occupies rows 2..len(df)+1.
                last_row = len(df) + 1
                worksheet.conditional_format(
                    f"D2:I{last_row}",
                    {"type": "no_blanks", "format": red_format}
                )
                worksheet.conditional_format(
                    f"J2:J{last_row}",
                    {"type": "no_blanks", "format": blue_format}
                )

            print(f"成功保存数据到 {filename} (共 {len(df)} 条记录)")
            return True

        except Exception as e:
            # I/O boundary: report any failure instead of crashing the caller.
            print(f"保存Excel文件失败: {str(e)}")
            return False

    def fetch_multi_pages(
        self,
        start_page: int = 1,
        end_page: int = 10,
        page_size: int = 50,
        max_retries: int = 3,
    ) -> List[Dict]:
        """Fetch and parse an inclusive range of pages.

        Each page is attempted up to ``max_retries`` times with a 2-second
        pause between attempts; pages that still fail are skipped.

        Args:
            start_page: First page to fetch (inclusive).
            end_page: Last page to fetch (inclusive).
            page_size: Records requested per page.
            max_retries: Maximum number of attempts per page.

        Returns:
            All parsed records, in the order the pages were fetched.
        """
        all_data = []
        for page in range(start_page, end_page + 1):
            print(f"正在获取第 {page}/{end_page} 页...")

            raw_data = None
            for attempt in range(1, max_retries + 1):
                raw_data = self.get_ssq_data(page_no=page, page_size=page_size)
                if raw_data is not None:
                    break
                remaining = max_retries - attempt
                if remaining > 0:
                    print(f"第 {page} 页获取失败,剩余重试次数: {remaining}")
                    time.sleep(2)

            if raw_data is None:
                print(f"第 {page} 页获取失败,跳过")
                continue

            page_data = self.parse_data(raw_data)
            if page_data:
                all_data.extend(page_data)
                print(f"第 {page} 页获取成功,累计 {len(all_data)} 条数据")
            else:
                print(f"第 {page} 页无有效数据")

            # Pause between pages only; no need to wait after the last one.
            if page < end_page:
                time.sleep(random.uniform(1, 3))

        return all_data


def main():
    """Interactively collect a page range of draws and export them to Excel.

    Prompts for the start and end page until a valid range is entered
    (``end_page >= start_page >= 1``), fetches the pages, saves the records
    to ``ssq_data_p<start>-p<end>.xlsx`` and prints summary statistics.
    """
    print("双色球历史开奖数据采集程序")
    print("=" * 50)

    spider = SSQSpider()

    while True:
        try:
            # An empty answer for the start page falls back to 1.
            start_page = int(input("请输入起始页码(默认1): ") or 1)
            end_page = int(input("请输入结束页码(建议不超过50): "))
            if start_page < 1 or end_page < start_page:
                raise ValueError
            break
        except ValueError:
            print("输入无效,请输入有效的页码数字(结束页码≥起始页码≥1)")

    print("\n开始获取数据...")
    start_time = time.time()
    all_data = spider.fetch_multi_pages(start_page, end_page)

    if not all_data:
        print("未能获取任何有效数据,程序结束")
        return

    filename = f"ssq_data_p{start_page}-p{end_page}.xlsx"
    if not spider.save_to_excel(all_data, filename):
        print("数据保存失败,请检查错误信息")
        return

    # Elapsed time covers both fetching and saving.
    time_used = round(time.time() - start_time, 2)
    page_count = end_page - start_page + 1  # always >= 1 after validation
    avg_time = round(time_used / page_count, 2)

    # Use min/max instead of the list ends so the reported range does not
    # depend on the order in which records were fetched. ISO "YYYY-MM-DD"
    # strings compare chronologically.
    draw_dates = [record["开奖日期"] for record in all_data]

    print("\n数据获取完成!")
    print("统计信息:")
    print(f"- 共获取 {len(all_data)} 条开奖记录")
    print(f"- 时间范围: {min(draw_dates)} 至 {max(draw_dates)}")
    print(f"- 总耗时: {time_used} 秒")
    print(f"- 平均每页耗时: {avg_time} 秒")
    print(f"- 数据已保存到: {filename}")

# Run the interactive collector only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


运行方式:
1. 安装依赖库:
pip install pandas xlsxwriter requests
2. 运行脚本并按提示输入起始页码和结束页码,示例输出如下:

双色球历史开奖数据采集程序
==================================================
请输入起始页码(默认1): 1
请输入结束页码(建议不超过50): 2

开始获取数据...
正在获取第 1/2 页...
第 1 页获取成功,累计 50 条数据
正在获取第 2/2 页...
第 2 页获取成功,累计 100 条数据
成功保存数据到 ssq_data_p1-p2.xlsx (共 100 条记录)

数据获取完成!
统计信息:
- 共获取 100 条开奖记录
- 时间范围: 2024-12-05 至 2025-08-03
- 总耗时: 7.45 秒
- 平均每页耗时: 3.73 秒
- 数据已保存到: ssq_data_p1-p2.xlsx



评论(0)

查看评论列表

暂无评论


发表评论