
Scraping NBA Data with a Python Crawler

The site scraped is stat-nba.com; this post collects NBA 2016-2017 regular-season data up to January 7, 2017.

Changing url_header and url_tail lets you scrape other data of interest.
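
For example, to pull the 2015-2016 regular season instead, only the Season0 and Season1 query parameters in url_tail should need to change. This is a rough sketch based on the URL pattern used in the script; the query interface of stat-nba.com may have changed since this was written:

url_header = 'http://stat-nba.com/query_team.php?page='
# identical to the url_tail in the script below, with only Season0/Season1 swapped
url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2015&Season1=2016#label_show_result'
# url_header + str(page_index) + url_tail is then the URL of one result page, e.g. page 0:
print(url_header + '0' + url_tail)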

The source code is as follows:

#coding=utf-8
# Python 2 script: force UTF-8 as the default encoding so the Chinese team
# names scraped from stat-nba.com can be written to the text and Excel files.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *   # legacy .xls writer; provides Workbook

def getURLLists(url_header,url_tail,pages):
    """
    Build the URL list for all result pages (page 0 through the last page).
    """
    url_lists = []
    url_0 = url_header+'0'+url_tail
    print url_0
    url_lists.append(url_0)
    for i in range(1,pages+1):
        url_temp = url_header+str(i)+url_tail
        url_lists.append(url_temp)
    return url_lists
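
# For instance, getURLLists(url_header, url_tail, 2) returns three URLs,
# url_header + '0' + url_tail through url_header + '2' + url_tail,
# i.e. result pages 0, 1 and 2 of the query.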

def getNBAAllData(url_lists):
    """
    Fetch the 2016-2017 NBA regular-season data from every page URL.
    """
    datasets = []
    for item in url_lists:
        data1 = getNBASingleData(item)
        datasets.extend(data1)
    # drop the empty strings left over from splitting the page text
    for item in datasets[:]:
        if len(item) == 0:
            datasets.remove(item)
    return datasets

def getNBASingleData(url):
    """
    Fetch the regular-season data contained in a single result page.
    """
    # requests.get(url).text would work just as well here
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html,'html.parser')
    # every cell of the results table sits inside the page's <tbody>;
    # splitting its text on newlines yields one field per list element
    data = soup.html.body.find('tbody').text
    list_data = data.split('\n')
    return list_data

def saveDataToExcel(datasets,sheetname,filename):
    """
    Write the flat data list into an .xls workbook, 24 columns per game row.
    """
    book = Workbook()
    sheet = book.add_sheet(sheetname)
    # header row, columns 0-23: No., team, date, result, home/away, matchup,
    # FG%, FGM, FGA, 3P%, 3PM, 3PA, FT%, FTM, FTA, REB, OREB, DREB,
    # AST, STL, BLK, TOV, PF, PTS
    sheet.write(0,0,u'序號')
    sheet.write(0,1,u'球隊')
    sheet.write(0,2,u'時間')
    sheet.write(0,3,u'結果')
    sheet.write(0,4,u'主客')
    sheet.write(0,5,u'比賽')
    sheet.write(0,6,u'投籃命中率')
    sheet.write(0,7,u'命中數')
    sheet.write(0,8,u'出手數')
    sheet.write(0,9,u'三分命中率')
    sheet.write(0,10,u'三分命中數')
    sheet.write(0,11,u'三分出手數')
    sheet.write(0,12,u'罰球命中率')
    sheet.write(0,13,u'罰球命中數')
    sheet.write(0,14,u'罰球出手數')
    sheet.write(0,15,u'籃板')
    sheet.write(0,16,u'前場籃板')
    sheet.write(0,17,u'後場籃板')
    sheet.write(0,18,u'助攻')
    sheet.write(0,19,u'搶斷')
    sheet.write(0,20,u'蓋帽')
    sheet.write(0,21,u'失誤')
    sheet.write(0,22,u'犯規')
    sheet.write(0,23,u'得分')

    num = 24
    row_cnt = 0
    data_cnt = 0
    data_len = len(datasets)
    print 'data_len:',data_len
    while data_cnt < data_len:
        row_cnt += 1
        print 'row:',row_cnt
        for col in range(num):
            if data_cnt >= data_len:   # guard against a trailing partial row
                break
            sheet.write(row_cnt,col,datasets[data_cnt])
            data_cnt += 1
    book.save(filename)

def writeDataToTxt(datasets):
    """
    Dump the flat data list to nba_data.txt, one game (24 fields) per line.
    """
    fp = open('nba_data.txt','w')
    line_cnt = 1
    for i in range(len(datasets)-1):
        # align the team-name column: when the team name is short, or the team
        # is the Philadelphia 76ers, append two tabs after it; otherwise one tab
        if (line_cnt % 24 == 2 and len(datasets[i]) < 5) or datasets[i] == u'費城76人':
            fp.write(datasets[i]+'\t\t')
        else:
            fp.write(datasets[i]+'\t')
        line_cnt += 1
        if line_cnt % 24 == 1:
            fp.write('\n')
    fp.close()

if __name__ == "__main__":

    # number of additional result pages to fetch beyond page 0
    pages = int(1132/150)
    url_header = 'http://stat-nba.com/query_team.php?page='
    url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'
    url_lists = getURLLists(url_header,url_tail,pages)
    datasets = getNBAAllData(url_lists)

    writeDataToTxt(datasets)

    sheetname = 'nba normal data 2016-2017'
    str_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    filename = 'nba_normal_data'+str_time+'.xls'
    saveDataToExcel(datasets,sheetname,filename)
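
The script above is Python 2 code (print statements, urllib.urlopen, reload(sys) and the pyExcelerator package). Purely as a hedged sketch, the same fetch-parse-save pipeline might look roughly like this under Python 3, using requests plus BeautifulSoup and writing CSV instead of .xls; the names fetch_rows and save_csv are illustrative, and it assumes the page still serves all result cells inside a single <tbody>, 24 cells per game, as it did in early 2017:

# Rough Python 3 sketch (illustrative, not part of the original script).
import csv
import requests
from bs4 import BeautifulSoup

def fetch_rows(url):
    # download one result page and split the <tbody> text into non-empty cells
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:          # layout changed or the page failed to load
        return []
    return [cell for cell in tbody.text.split('\n') if cell]

def save_csv(cells, filename, cols=24):
    # write the flat cell list as CSV, one game (cols fields) per row
    with open(filename, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        for start in range(0, len(cells) - cols + 1, cols):
            writer.writerow(cells[start:start + cols])

if __name__ == '__main__':
    page0 = ('http://stat-nba.com/query_team.php?page=0'
             '&QueryType=game&order=1&crtcol=date_out&GameType=season'
             '&PageNum=3000&Season0=2016&Season1=2017#label_show_result')
    save_csv(fetch_rows(page0), 'nba_normal_data.csv')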