1. 程式人生 > >python爬取雙色球資料+資料統計

python爬取雙色球資料+資料統計

彩票資料爬取---寫入mysql:
import requests
import re#python中的正則表示式(re模組)
import xlwt
import time
import pymysql as MySQLdb

# NOTE(review): `flag` is not referenced anywhere else in the visible code.
flag=True
# Collects the kjrq values that send_data() reads back from the database.
allres=[]
def get_all_page():
    """Fetch the first list page and return the total number of list pages.

    The count is taken from the first ``<strong>`` element that follows
    ``class="pg"`` in the page markup. It is also stored in the module-level
    global ``all_page``, which ``get_num`` reads.

    Returns:
        int: the total page count.

    Raises:
        IndexError: if the pager markup is not found in the response.
        ValueError: if the captured text is not an integer.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    global all_page
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html"
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=30)
    response.encoding = 'utf-8'
    matches = re.findall(r"class=\"pg\".*?<strong>(.*?)</strong>", response.text)
    if not matches:
        # Same exception type the bare [0] lookup raised, but with context.
        raise IndexError("page count not found in %s" % url)
    all_page = int(matches[0])
    return all_page

def get_num():
    """Scrape every list page and pass each matched table row to ``send_data``.

    The number of pages comes from the module-level global ``all_page``, which
    ``get_all_page`` sets, so ``get_all_page`` must have been called first.

    For every row the pattern captures nine values: the first two centred
    cells (forwarded as ``kjrq`` and ``qs``), the six ``<em class="rr">``
    values (joined with ``|`` and forwarded as ``red_ball``) and the final
    plain ``<em>`` value (forwarded as ``blue_ball``).

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # Compiled once up front: the pattern is identical for every page.
    row_pattern = re.compile(
        r"<tr>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\" style=\"padding-left:10px;\">.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em>(.*?)</em></td>",
        re.S | re.M,
    )
    for page_num in range(1, all_page + 1):
        url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_"+str(page_num)+".html"
        # A timeout keeps one stalled page from blocking the whole crawl.
        response = requests.get(url=url, timeout=30)
        # Pause between page requests.
        time.sleep(2)
        response.encoding = 'utf-8'
        for row in row_pattern.findall(response.text):
            kjrq = row[0]
            qs = row[1]
            # Groups 3-8 are the six red numbers; findall already yields str.
            red_ball = "|".join(row[2:8])
            blue_ball = row[8]
            send_data(kjrq, qs, red_ball, blue_ball)

def send_data(kjrq,qs,red_ball,blue_ball):
    '''
    Insert one draw into the ``ssq`` table unless its ``kjrq`` is already stored.

    Opens a new MySQL connection, reads every stored ``kjrq`` value (each one
    is also appended to the module-level list ``allres``), and inserts the
    row only when ``kjrq`` is not among them. Database errors raised inside
    the ``try`` block are printed rather than propagated; the cursor and the
    connection are always closed.

    :param kjrq: value for the ``kjrq`` column, also used as the duplicate key
    :param qs: value for the ``qs`` column
    :param red_ball: value for the ``red_ball`` column
    :param blue_ball: value for the ``blue_ball`` column
    '''
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = conn.cursor()

    print(u'扒取到的最新期號為:%s' % kjrq)

    try:
        cur.execute("SELECT	 kjrq FROM	ssq ORDER BY kjrq DESC")
        stored = [row[0] for row in cur.fetchall()]
        allres.extend(stored)
        # fetchall() returns row tuples, so the scraped value has to be
        # compared with the column values, not with the rows themselves.
        # Both sides go through str() in case the driver returns a non-str
        # type for the column -- NOTE(review): confirm the kjrq column type.
        already_stored = str(kjrq) in set(str(value) for value in stored)
        if already_stored:
            print(u'*****<<資料已經存在,不需要更新!>>*****')
        else:
            sql_insert = """\
                  insert into ssq(kjrq,qs,red_ball,blue_ball)
                   VALUES (%s,%s,%s,%s)
                   """
            cur.execute(
                sql_insert, (kjrq, qs, red_ball, blue_ball)
            )
            conn.commit()
            print(u'*****<<更新期號成功,更新內容是:%s>>*****' % str(kjrq))
    except Exception as e:
        print(e)
    finally:
        cur.close()
        conn.close()
# Script entry point. get_all_page() has to run first because it sets the
# global `all_page` that get_num() iterates over.
if __name__ == '__main__':
    get_all_page()
    get_num()

彩票資料統計分析:

import numpy as np
import pandas as pd
import pymysql
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pylab import *#支援中文
import operator
# Select the SimHei font for sans-serif text (the pylab import above is
# commented as being there for Chinese text support).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Module-level containers for the lottery data
alldata=[]  # raw rows appended by getData()
red_balls=[]  # one list of floats per row, filled by analysis()
blue_balls=[]  # fourth column of each row, filled by analysis()
qs=[]  # second column of each row, filled by analysis()
kjrq=[]  # first column of each row, filled by analysis()
#Read rows from the database: nums == 0 (the default) loads every row, nums > 0 loads only the first nums rows.
def getData(nums=0):
    """Append rows of the ``ssq`` table to the module-level list ``alldata``.

    Rows are ordered by ``kjrq`` descending. Errors raised while querying are
    printed rather than propagated; the cursor and the connection are always
    closed. Rows accumulate in ``alldata`` across calls.

    Args:
        nums: when greater than 0, only the first ``nums`` rows of that
            ordering are loaded; otherwise every row is loaded.
    """
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = db.cursor()
    try:
        if nums>0:
            # Bind the limit as a query parameter instead of splicing it
            # into the SQL text.
            cur.execute("SELECT	 * FROM	ssq ORDER BY kjrq DESC limit %s", (int(nums),))
        else:
            cur.execute("SELECT	 * FROM	ssq ORDER BY kjrq DESC")
        alldata.extend(cur.fetchall())
    except Exception as e:
        print(e)
    finally:
        cur.close()
        db.close()

def write2mysql(data=None):
    """Insert item-set counts into the ``fpgroupth`` table, skipping known ones.

    The ``numbers`` values already present in the table are loaded first;
    each key of ``data`` that is not among them is inserted together with its
    value and committed straight away. An error during the insert loop is
    printed and stops the loop; rows committed before it stay in place. The
    cursor and the connection are closed on every path.

    Args:
        data: mapping of ``numbers`` (table key) to ``count``. ``None`` (the
            default) is treated as an empty mapping.
    """
    if data is None:
        data = {}
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = db.cursor()
    sql_insert = """\
                                  insert into fpgroupth(numbers,`count`)
                                   VALUES (%s,%s)
                                   """
    try:
        # A failure of this lookup still propagates to the caller, but the
        # enclosing finally now releases the cursor and the connection.
        cur.execute("SELECT	 numbers FROM	fpgroupth ORDER BY `count` DESC")
        frequent = set(row[0] for row in cur.fetchall())

        try:
            for numbers, times in data.items():
                if numbers in frequent:
                    print(u'*****<<資料已經存在,不需要更新!>>*****')
                else:
                    cur.execute(
                        sql_insert, (numbers, times)
                    )
                    db.commit()
        except Exception as e:
            print(e)
    finally:
        cur.close()
        db.close()
#Bayesian ridge regression model
def bayes():
    """Fit a Bayesian ridge regression on the loaded draws and print predictions.

    Uses the module-level ``red_balls`` as features and ``blue_balls`` as the
    target, then prints the predicted values for two hard-coded sample rows.
    """
    reg = linear_model.BayesianRidge()
    # A regression target must be numeric; converting here also covers the
    # case where the stored values are strings (redStatisticCount converts
    # blue_balls the same way).
    reg.fit(red_balls, list(map(float, blue_balls)))
    prediction = reg.predict([[4.0, 2.0, 5.0, 12.0, 20.0, 22.0], [1.0, 7.0, 8.0, 15.0, 23.0, 31.0]])
    # predict() returns an array, which cannot be concatenated to a str
    # directly; the sibling classifiers wrap their result in str() as well.
    print("貝葉斯分類器"+str(prediction))
#SGD linear classifier
def sGDClassifier():
    """Train an SGDClassifier on the loaded draws and print one prediction.

    The model uses hinge loss with an L2 penalty, is fitted with the
    module-level ``red_balls`` as features and ``blue_balls`` as labels, and
    is then asked to classify a single hard-coded sample row.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = SGDClassifier(loss="hinge", penalty="l2")
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("sgd分類器:"+str(predicted))
def svmsClassfier():
    """Train an ``svm.SVC`` on the loaded draws and print one prediction.

    Fits on the module-level ``red_balls`` (features) and ``blue_balls``
    (labels), then classifies a single hard-coded sample row.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]]
    model = svm.SVC()
    model.fit(red_balls, blue_balls)
    predicted = model.predict(sample)
    print("svm分類器:"+str(predicted))
def randForest():
    """Train a 10-tree random forest on the loaded draws and print one prediction.

    Fits on the module-level ``red_balls`` (features) and ``blue_balls``
    (labels), then classifies a single hard-coded sample row.
    """
    sample = [[1.0, 2.0, 4.0, 12.0, 25.0, 33.0]]
    forest = RandomForestClassifier(n_estimators=10)
    fitted = forest.fit(red_balls, blue_balls)
    predicted = fitted.predict(sample)
    print("random分類器:"+str(predicted))
def analysis(nums=10):
    """Load draws from the database, unpack them, and show the red-ball chart.

    Each row in ``alldata`` is split into the module-level lists: column 0
    goes to ``kjrq``, column 1 to ``qs``, column 2 (a ``|``-separated string)
    to ``red_balls`` as a list of floats, and column 3 to ``blue_balls``.
    NOTE(review): this assumes ``SELECT *`` returns the columns in the order
    kjrq, qs, red_ball, blue_ball -- confirm against the table definition.

    Args:
        nums: forwarded to ``getData``; the default of 10 keeps the previous
            hard-coded behaviour, 0 loads every stored draw.
    """
    # Load the rows from the database
    getData(nums=nums)
    # Split every row into the per-column containers used by the analyses
    for res in alldata:
        red_numbers = str(res[2]).split("|")
        red_balls.append(list(map(float, red_numbers)))
        blue_balls.append(res[3])
        qs.append(res[1])
        kjrq.append(res[0])
    # Other analyses that can be called here instead of singeShow():
    # sGDClassifier(), svmsClassfier(), randForest(), trendAnaly(),
    # fpgroupth(), redTrendAnaly()
    # The string literal below is a disabled combined red/blue frequency
    # plot; it is never executed.
    '''
    res=redStatisticCount(red=True)
    blue_res=redStatisticCount(red=False)
    val=res.values()
    blue_val=blue_res.values()
    X=[i for i in range(1,34)]
    print(res)
    print(val)  
    print(X)
    plt.plot(X, val, marker='o', mec='r', mfc='w',label=u'紅球曲線圖')
    plt.plot(X, blue_val, marker='*', mec='r', mfc='w',label=u'藍球曲線圖')
    plt.legend()  # 讓圖例生效
    plt.xticks(X, X, rotation=45)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"紅球數字") #X軸標籤
    plt.ylabel("出現的次數") #Y軸標籤
    plt.title("紅球的歷史次數統計") #標題
    plt.show()
        '''
    singeShow(red=True)
#Frequency count per ball number
def redStatisticCount(red=True):
    """Count how often each number occurs among the loaded balls.

    Args:
        red: when true, count over every value in the module-level
            ``red_balls``; otherwise count over ``blue_balls`` after
            converting each entry to ``float``.

    Returns:
        dict: maps each distinct value (in ascending order, as produced by
        ``np.unique``) to its number of occurrences as an ``int``. Empty
        input yields an empty dict.
    """
    if red:
        arr = np.array(red_balls)
    else:
        # A flat float array is all the counting needs. The earlier
        # np.reshape(values, n, 1) call passed 1 as the ``order`` argument,
        # which is not a valid order.
        arr = np.array(list(map(float, blue_balls)))
    values, counts = np.unique(arr, return_counts=True)
    return dict((value, int(count)) for value, count in zip(values, counts))
def singeShow(red=True):
    """Plot how often each ball number occurred, for one ball colour.

    Args:
        red: when true, chart the red-ball counts; otherwise chart the
            blue-ball counts. The curve label, x-axis label and title follow
            the chosen colour.
    """
    res = redStatisticCount(red=red)
    # Materialise the dict views so matplotlib receives plain sequences.
    X = list(res.keys())
    val1 = list(res.values())
    if red:
        curve_label = u'紅球曲線圖'
        x_label = u"紅球數字"
        title = "紅球的歷史次數統計"
    else:
        # The blue chart used to be labelled as the red one.
        curve_label = u'藍球曲線圖'
        x_label = u"藍球數字"
        title = "藍球的歷史次數統計"
    plt.plot(X, val1, marker='o', mec='r', mfc='w', label=curve_label)
    # Annotate every point with its count
    for a, b in zip(X, val1):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # show the legend
    plt.xticks(X, X, rotation=45)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(x_label)  # x-axis label
    plt.ylabel("出現的次數")  # y-axis label
    plt.title(title)  # chart title
    plt.show()
#Trend chart of the recently drawn blue-ball numbers
def trendAnaly():
    """Plot the blue-ball number of each loaded draw against its ``qs`` value.

    Both series are plotted in the reverse of their stored order. The
    module-level lists ``qs`` and ``blue_balls`` are left untouched.
    """
    # Reverse a copy: reversing ``qs`` itself flipped the shared list on
    # every call, so a second chart got X out of step with its values.
    X = list(reversed(qs))
    blue = list(map(int, blue_balls))
    blue.reverse()
    plt.plot(X, blue, marker='o', mec='r', mfc='w',label=u'籃球走勢圖')
    # Annotate every point with its number
    for a, b in zip(X, blue):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # show the legend
    plt.xticks(X, X, rotation=45)
    # y-axis ticks: -1, 1, ..., 15
    my_y_ticks = np.arange(-1, 17, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期數")  # x-axis label
    plt.ylabel("籃球號碼")  # y-axis label
    plt.title("籃球的趨勢統計")  # chart title
    plt.show()
#Frequent-pattern mining with Spark FP-Growth
def fpgroupth():
    """Mine frequent red-ball combinations and store them via ``write2mysql``.

    Runs ``FPGrowth.train`` (minSupport 0.005, 10 partitions) over the
    module-level ``red_balls`` on a local Spark session. Every frequent item
    set with more than one element is printed and recorded as
    ``str(items) -> int(frequency)``; the records are printed sorted by
    frequency, highest first, and then handed to ``write2mysql``.
    """
    builder = SparkSession.builder.appName("fpgroupth").master("local[*]")
    spark = builder.getOrCreate()
    transactions = spark.sparkContext.parallelize(red_balls, 10)
    model = FPGrowth.train(transactions, minSupport=0.005, numPartitions=10)

    dictdata = {}
    for itemset in model.freqItemsets().collect():
        items = itemset[0]
        freq = itemset[1]
        # Single-element sets are not combinations; skip them.
        if len(items) <= 1:
            continue
        dictdata[str(items)] = int(freq)
        print(str(items)+"=="+str(freq))

    # Order the (item set, frequency) pairs by frequency, highest first
    by_frequency = operator.itemgetter(1)
    sort_dict = sorted(dictdata.items(), key=by_frequency, reverse=True)
    print(sort_dict)
    # Persist to the database
    write2mysql(data=dictdata)
#Trend chart of the recently drawn red-ball numbers
def redTrendAnaly():
    """Plot each of the six red-ball positions against the ``qs`` values.

    One line is drawn per position (columns 0-5 of ``red_balls``), all in
    the reverse of the stored order, and every point is annotated with its
    value. The module-level lists ``qs`` and ``red_balls`` are left
    untouched.
    """
    # Reverse copies: reversing the shared lists in place flipped them for
    # every later caller.
    X = list(reversed(qs))
    y = np.array(list(reversed(red_balls)))
    # Marker face colour for positions 1..6
    face_colors = ['w', 'b', 'g', 'y', 'r', 'm']
    columns = [y[:, position] for position in range(len(face_colors))]
    for index, column in enumerate(columns):
        plt.plot(X, column, marker='o', mec='r', mfc=face_colors[index],
                 label=u'紅球%d走勢圖' % (index + 1))
    # Annotate every point with its number
    for column in columns:
        for a, b in zip(X, column):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()  # show the legend
    plt.xticks(X, X, rotation=45)
    # y-axis ticks: -1, 1, ..., 33
    my_y_ticks = np.arange(-1, 35, 2)
    plt.yticks(my_y_ticks)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期數")  # x-axis label
    # The y-label and title used to name the wrong ball; this chart shows
    # the red balls.
    plt.ylabel("紅球號碼")  # y-axis label
    plt.title("紅球的趨勢統計")  # chart title
    plt.show()
# Script entry point: load the data and show the chart selected in analysis().
if __name__ == '__main__':
    analysis()

效果圖:

紅球各個數字的歷史出現次數

最近10期紅球走勢:

最近10期藍球走勢:

fpgroupth--挖掘頻繁模式效果

參考部落格:

http://blog.51cto.com/tdcqvip/2105499