# Lottery data scraper: collects the draw history and stores it in an Excel (.xls) workbook. The code follows.

# -*- coding: utf-8 -*-
# author:Apples
from requests import get
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
import time


def request_content(start, end):
    """Fetch the draw-history page for a period range and return its data rows.

    Args:
        start: First period number; interpolated into the ``start`` query
            parameter of the history URL.
        end: Last period number; interpolated into the ``end`` query
            parameter of the history URL.

    Returns:
        The ``<tr>`` tags with CSS class ``t_tr1`` found inside
        ``<tbody id="tdata">``. An empty list is returned when the page
        contains no such table body, so callers can iterate the result
        unconditionally.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or when the
            6-second timeout expires (propagated from ``requests.get``).
    """
    url_link = 'https://datachart.500.com/ssq/history/newinc/history.php?start={0}&end={1}'.format(start, end)
    headers = {
        # A freshly generated User-Agent is sent with every request.
        'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux', 'win', 'android'))
    }
    response = get(url_link, headers=headers, timeout=6)
    # Fail loudly on an error page instead of trying to parse it.
    response.raise_for_status()
    page_content = BeautifulSoup(response.content, "html.parser")
    table_body = page_content.find('tbody', id='tdata')
    if table_body is None:
        # No data table in the response: report "no rows" rather than
        # crashing with an IndexError.
        return []
    return table_body.find_all('tr', 't_tr1')


class ssqclazz:
    """One draw record, filled from a single row of the history table.

    Every attribute starts out as an empty string and is overwritten by
    :meth:`tr_tag` with the ``.string`` of the matching ``<td>`` cell.
    """

    # Attribute names in the same order as the cells of a table row.
    _FIELDS = (
        'period',         # period (issue) number
        'red_1',          # red balls
        'red_2',
        'red_3',
        'red_4',
        'red_5',
        'red_6',
        'blue_1',         # blue ball
        'happy_sunday',   # "Happy Sunday"
        'pool_prize',     # prize pool (yuan)
        'first_count',    # first prize: number of winning bets
        'first_prize',    # first prize: amount (yuan)
        'second_count',   # second prize: number of winning bets
        'second_prize',   # second prize: amount (yuan)
        'total_prize',    # total amount wagered (yuan)
        'lottery_date',   # draw date
    )

    def __init__(self):
        for name in self._FIELDS:
            setattr(self, name, '')

    def __str__(self):
        """Return all fields, in row order, joined by commas."""
        return ','.join(format(getattr(self, name)) for name in self._FIELDS)

    def tr_tag(self, tag):
        """Populate the record from the ``<td>`` cells of a ``<tr>`` tag.

        Cells are consumed strictly by position; a row with fewer cells
        than fields raises ``IndexError`` after the leading fields have
        already been assigned.
        """
        cells = tag.find_all('td')
        for position, name in enumerate(self._FIELDS):
            setattr(self, name, cells[position].string)


def _clean_cell(value):
    """Return the text to store in a sheet cell for one record field.

    ``None`` becomes an empty string, surrounding whitespace is trimmed,
    and thousands separators are dropped from purely numeric values
    (``'1,234,567'`` -> ``'1234567'``). Anything else (for example a
    dashed date) is returned unchanged apart from the trimming.
    """
    text = '' if value is None else str(value).strip()
    digits = text.replace(',', '')
    return digits if digits.replace('.', '', 1).isdigit() else text


def main():
    """Scrape the draw history year by year into ``test.xls``.

    One worksheet is created per two-digit year, starting at 3 and ending
    at the current year. Each table row becomes one sheet row with exactly
    one record attribute per column, so columns line up across rows. The
    workbook is saved after every year, which keeps earlier years on disk
    if a later request fails.
    """
    import xlwt  # imported here, as in the original script section

    # Record attributes in output-column order.
    columns = (
        'period',
        'red_1', 'red_2', 'red_3', 'red_4', 'red_5', 'red_6',
        'blue_1',
        'happy_sunday',
        'pool_prize',
        'first_count', 'first_prize',
        'second_count', 'second_prize',
        'total_prize',
        'lottery_date',
    )

    wbk = xlwt.Workbook()
    ymin = 3  # the lottery went online in 2003
    ymax = time.localtime().tm_year - 2000
    print('===抓取資料開始===,200%s-20%s' % (ymin, ymax))
    for year in range(ymin, ymax + 1):
        sheet = wbk.add_sheet(str(year), cell_overwrite_ok=True)
        # Zero-pad the year so single-digit years give five-character
        # bounds such as 03001.
        # NOTE(review): assumes the site's period numbers are YYNNN; confirm.
        start = '{0:02d}001'.format(year)
        end = '{0:02d}300'.format(year)
        for row, tr in enumerate(request_content(start, end)):
            ssqobj = ssqclazz()
            ssqobj.tr_tag(tr)
            # Write field by field instead of regex-splitting the joined
            # string: that split values containing separators across
            # several cells and shifted the following columns.
            for column, name in enumerate(columns):
                sheet.write(row, column, _clean_cell(getattr(ssqobj, name)))
            print(str(ssqobj))
        print()
        time.sleep(3)  # pause between yearly requests
        wbk.save('test.xls')
    print('抓取完畢!!!')


if __name__ == '__main__':
    main()