
Python stock data scraping: learning requests, etree, and BeautifulSoup

I've recently been looking into backtesting stock data (really, I want to do quantitative trading), but none of the APIs that serve the data directly are very stable (tushare times out, and Yahoo's needs a fix before it works, and even then it's shaky):

# fix package for the Yahoo stock data API
from pandas_datareader import data as pdr
import fix_yahoo_finance
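
For reference, the fix package's documented usage is to patch pandas_datareader before requesting data. A minimal sketch (the ticker and date range below are just placeholders):

from pandas_datareader import data as pdr
import fix_yahoo_finance as yf

yf.pdr_override()  # patch pandas_datareader's Yahoo reader

# placeholder ticker and dates -- swap in whatever you actually need
df = pdr.get_data_yahoo("SPY", start="2017-01-01", end="2017-04-30")
print(df.head())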

In the end I decided to learn Python scraping myself. I had long heard of the Python crawler's reputation; having tried it, I think it's OK.

import requests
from bs4 import BeautifulSoup
import re

# Step 1: fetch the stock list from eastmoney.com;
# Step 2: pull out each stock code, append it to the Baidu Stocks link, then visit each of those links in turn to collect the stock's details;
# Step 3: save the results to a file.

def getHTMLText(url, code="utf-8"):
    try:
        r = requests.get(url)
        r.raise_for_status()  # raise an exception on a bad HTTP status
        r.encoding = code     # set the encoding
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, "GB2312")  # just fetches the raw html text
    soup = BeautifulSoup(html, 'html.parser')  # parse the html; at this point the whole page source is tidied up
    a = soup.find_all('a')  # walk the page and find every <a> tag
    for i in a:
        # a[1] = <a href="http://finance.eastmoney.com/yaowen.html" target="_blank">要聞</a>
        # type(a[1]) = bs4.element.Tag
        try:
            # take the href attribute of the <a> tag, inspect the link it holds,
            # and pull out the code digits at the end of the link
            href = i.attrs['href']
            # a[1].attrs['href'] = 'http://finance.eastmoney.com/yaowen.html'
            # Shenzhen codes start with sz, Shanghai codes with sh, and the stock number
            # has 6 digits, so the regex can be written as [s][hz]\d{6}
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:  # try...except handles the tags that don't match
            continue

def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)  # handle one stock at a time
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            # find() returns the whole <div class="stock-bets"> block, e.g.:
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # <div class="stock-bets">
            #     <h1>
            #         <a class="bets-name" href="/fund/sh500038.html">
            #             基金通乾 (<span>500038</span>)
            #         </a>
            #         <span class="state f-up">已收盤 2016-09-02 09:04:50</span>
            #     </h1>
            #     <div class="price s-stop ">
            #         <strong class="_close">0.94</strong>
            #         <span>--</span>
            #         <span>0.00%</span>
            #     </div>
            #     <div class="bets-content">
            #         <div class="bets-col-8">
            #             <dl><dt>最高</dt><dd class="s-down">--</dd></dl>
            #             <dl><dt>最低</dt><dd class="s-down">--</dd></dl>
            #             <dl><dt>今開</dt><dd class="">--</dd></dl>
            #             <dl><dt>昨收</dt><dd>0.94</dd></dl>
            #             <dl><dt>成交額</dt><dd>--</dd></dl>
            #             <dl><dt>成交量</dt><dd>--</dd></dl>
            #             <dl><dt>淨值</dt><dd>0.9515</dd></dl>
            #             <dl><dt>折價率</dt><dd>-1.42</dd></dl>
            #         </div>
            #         <div class="clear"></div>
            #     </div>
            # </div>
            # find_all pulls the name element out of stockInfo:
            # <a class="bets-name" href="/fund/sh500038.html">
            #     基金通乾 (<span>500038</span>)
            # </a>
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # .text extracts the text sitting outside the tag markup
            infoDict.update({'股票名稱': name.text.split()[0]})
            # the other stock fields live in <dt> and <dd> tags: dt holds the key,
            # dd holds the value; grab all the keys and values:
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text    # .text extracts straight from <dt>最高</dt>
                val = valueList[i].text  # .text extracts straight from <dd>0.94</dd>
                infoDict[key] = val      # store the value under its key in the dict
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            print("\rProgress: {:.2f}%".format(count*100/len(lst)), end="")
        except:
            count = count + 1
            print("\rProgress: {:.2f}%".format(count*100/len(lst)), end="")
            continue

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
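
As a quick sanity check of that [s][hz]\d{6} pattern on its own (the sample link is hypothetical but mirrors the eastmoney URL shape):

import re

href = 'http://quote.eastmoney.com/sh600000.html'  # hypothetical sample href
print(re.findall(r"[s][hz]\d{6}", href))  # -> ['sh600000']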

There is a huge problem here, though: written this way, the scraper can only fetch a single day's worth of data.

Still, as the first crawler I've written for practice, I kept every intermediate step as a comment, treating it as a kind of study notes.

Next comes the code that can fetch historical data.

import time
import requests
from lxml import etree
import re
import pandas as pd

class StockCode(object):
    def __init__(self):
        self.start_url = "http://quote.eastmoney.com/stocklist.html#sh"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        # make the request and get the response back
        response = requests.get(self.start_url, headers=self.headers)
        if response.status_code == 200:
            return etree.HTML(response.content)

    def get_code_list(self, response):
        # build the list of stock codes
        node_list = response.xpath('//*[@id="quotesearch"]/ul[1]/li')
        code_list = []
        for node in node_list:
            try:
                code = re.match(r'.*?\((\d+)\)', etree.tostring(node).decode()).group(1)
                print(code)
                code_list.append(code)
            except:
                continue
        return code_list

    def run(self):
        html = self.parse_url()
        return self.get_code_list(html)
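
To see what that re.match is doing, here is a standalone check (the <li> markup is a hypothetical sample shaped like the eastmoney list page):

from lxml import etree
import re

# hypothetical sample of one list item from the stocklist page
li = etree.HTML('<li><a href="http://quote.eastmoney.com/sh600000.html">'
                '浦發銀行(600000)</a></li>').xpath('//li')[0]
# serialize the node back to text and capture the digits inside the parentheses
print(re.match(r'.*?\((\d+)\)', etree.tostring(li, encoding='unicode')).group(1))  # -> 600000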

## download the historical trade records
class Download_HistoryStock(object):
    def __init__(self, code):
        self.code = code
        self.start_url = "http://quotes.money.163.com/trade/lsjysj_" + self.code + ".html"
        print(self.start_url)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        response = requests.get(self.start_url, headers=self.headers)
        print(response.status_code)
        if response.status_code == 200:
            return etree.HTML(response.content)
        return False

    def get_date(self, response):
        # get the earliest and latest available dates from the page
        start_date = ''.join(response.xpath('//input[@name="date_start_type"]/@value')[0].split('-'))
        end_date = ''.join(response.xpath('//input[@name="date_end_type"]/@value')[0].split('-'))
        return start_date,end_date

    def download(self, start_date, end_date):
        # 163's csv service; the leading "0" in front of the code marks a Shanghai listing
        download_url = "http://quotes.money.163.com/service/chddata.html?code=0"+self.code+"&start="+start_date+"&end="+end_date+"&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP"
        data = requests.get(download_url)
        # stream the csv response to disk in chunks
        with open('E:/data/historyStock/' + self.code + '.csv', 'wb') as f:
            for chunk in data.iter_content(chunk_size=10000):
                if chunk:
                    f.write(chunk)
        print('Stock ---', self.code, 'historical data downloaded')

    def run(self):
        try:
            html = self.parse_url()
            start_date,end_date = self.get_date(html)
            self.download(start_date, end_date)
        except Exception as e:
            print(e)
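
The hardcoded "0" market prefix in download() is worth a note: 163's chddata.html endpoint appears to expect 0 in front of Shanghai codes and 1 in front of Shenzhen codes, which matches the Shanghai-only filter applied further down. A hypothetical helper covering both markets could look like this (the startswith heuristic about code ranges is my assumption, so verify it before relying on it):

def market_prefixed(code):
    # hypothetical helper: treat codes starting with 6 (SH A shares) or 9 (SH B shares)
    # as Shanghai, everything else as Shenzhen -- an assumption, not 163's documented spec
    return ('0' if code.startswith(('6', '9')) else '1') + code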

if __name__ == '__main__':
    code = StockCode()
    code_list = code.run()

    # iterate the full code list (use dcodes from the resume snippet below instead
    # when picking up an interrupted run)
    for temp_code in code_list:
        time.sleep(1)  # throttle to roughly one request per second
        download = Download_HistoryStock(temp_code)
        download.run()

What follows are a few extra housekeeping steps, recorded here for reference.

# keep only Shanghai main-board codes (600000 and above)
code_df = pd.Series(code_list).astype('int')
code_list = code_df[code_df >= 600000].astype('str').tolist()

# resume support: collect the filenames already in the download directory,
# then take the set difference with code_list
import os
dir = os.fsencode('E:/data/historyStock/')
codes = []
for file in os.listdir(dir):
    filename = os.fsdecode(file)
    code = str(filename[0:6])  # the first six characters of the filename are the stock code
    codes.append(code)

# dcodes holds the codes still missing; re-run the download loop over it to resume
dcodes = list(set(code_list).difference(set(codes)))
# read the local csv files back in, concatenate them, and write to mysql
dfs = []
for code in codes:
    everydf = pd.read_csv('E:/data/historyStock/%s.csv' % code,
                          encoding='gbk').sort_values(by='日期')
    dfs.append(everydf)
stock = pd.concat(dfs)
stock.to_csv('E:/data/Stock.csv', encoding='gbk')  # write gbk so the read-back below matches
stock = pd.read_csv('E:/data/Stock.csv', encoding='gbk')

import MySQLdb as mdb
from sqlalchemy import create_engine

# connection string format: user:password@host/database
# here: sec_user:[email protected]/securities_master
engine = create_engine('mysql://sec_user:[email protected]/securities_master?charset=utf8')
# write the concatenated data into the database
stock.to_sql('historystock', engine)
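
As a quick check that the load worked, you can query a few rows back out (a minimal sketch, assuming the historystock table created by to_sql above):

check = pd.read_sql('SELECT * FROM historystock LIMIT 5', engine)
print(check)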