python股票資料爬蟲requests、etree、BeautifulSoup學習
阿新 • 發佈:2019-01-10
最近在研究股票資料回測(其實想做量化交易),但是能直接提供資料的API都不太穩定(tushare超時,雅虎的要修復才能用,也不太穩定)
#雅虎股票資料API的修復包
from pandas_datareader import data as pdr
import fix_yahoo_finance
最後還是打算自己學習下python的爬蟲,很早就聽說過py爬蟲的大名,嘗試了下 我覺得OK。
import requests
from bs4 import BeautifulSoup
import re
#步驟1: 從東方財富網獲取股票列表;
#步驟2: 逐一獲取股票程式碼,並增加到百度股票的連結中,最後對這些連結進行逐個的訪問獲得股票的資訊;
#步驟3: 將結果儲存到檔案。
def getHTMLText(url, code="utf-8"):
    """Fetch *url* and return its decoded text, or "" on any request failure.

    Args:
        url: the page to fetch.
        code: character encoding used to decode the response body
            (default "utf-8").

    Returns:
        The page text, or the empty string when the request fails or the
        server answers with an error status — callers test for "".
    """
    try:
        # Timeout added so a stalled server cannot hang the whole crawl.
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # turn 4xx/5xx responses into an exception
        r.encoding = code  # force the declared encoding before .text decodes
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures are
        # treated as "no page"; programming errors now surface normally.
        return ""
def getStockList(lst, stockURL):
    """Append every sh/sz stock code found on *stockURL* to *lst* (in place).

    The Eastmoney list page is GB2312-encoded. Codes appear inside the href
    of <a> tags, e.g. ".../sh600000.html": Shanghai codes start with "sh",
    Shenzhen codes with "sz", followed by 6 digits.

    Args:
        lst: list to extend with codes such as "sh600000".
        stockURL: URL of the Eastmoney stock list page.
    """
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once instead of re-parsing the pattern for every anchor.
    code_pattern = re.compile(r"[s][hz]\d{6}")
    for anchor in soup.find_all('a'):
        # .get avoids the KeyError the original silenced with a bare except.
        href = anchor.attrs.get('href', '')
        match = code_pattern.search(href)
        if match:
            lst.append(match.group())
def getStockInfo(lst, stockURL, fpath):
    """Scrape one Baidu stock page per code and append the records to *fpath*.

    For each code the page's <div class="stock-bets"> section is parsed:
    the <a class="bets-name"> holds the display name, and the <dt>/<dd>
    pairs hold field names and values, e.g. <dt>最高</dt><dd>--</dd>.
    One dict per stock is written as a text line.

    Args:
        lst: stock codes such as "sh600000".
        stockURL: base URL; code + ".html" is appended to form the page URL.
        fpath: output text file, opened in append mode with utf-8.
    """
    count = 0
    for stock in lst:
        # BUG FIX: the original skipped the counter on empty pages and
        # duplicated the progress print in both success and except paths,
        # so progress could never reach 100%. Count every stock once and
        # print in a single `finally`.
        count += 1
        try:
            html = getHTMLText(stock_url := stockURL + stock + ".html")
            if html == "":
                continue
            soup = BeautifulSoup(html, 'html.parser')
            # Everything we need sits inside <div class="stock-bets">.
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            infoDict = {}
            # <a class="bets-name"> text looks like "基金通乾 (500038)";
            # split() keeps only the name before the whitespace.
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名稱': name.text.split()[0]})
            # <dt> tags are the keys, <dd> tags the matching values.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # A malformed or missing page must not abort the whole run;
            # narrowed from a bare `except:` so KeyboardInterrupt escapes.
            continue
        finally:
            print("\r當前進度: {:.2f}%".format(count * 100 / len(lst)), end="")
def main():
    """Crawl the Eastmoney stock list, then scrape and save each stock's
    details from Baidu into a local text file."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Guard added so importing this module no longer starts a full crawl.
if __name__ == '__main__':
    main()
這裡有個巨大的問題,那就是這樣寫只能爬取1天的資料
不過作為我練習的第一個爬蟲程式,我把每個步驟的中間過程都作為註釋記錄,當作一種筆記學習吧。
接下來是能獲取歷史資料的程式碼
import time
import requests
from lxml import etree#
import re
import pandas as pd
class StockCode(object):
    """Scrape the Eastmoney list page and extract all numeric stock codes."""

    def __init__(self):
        self.start_url = "http://quote.eastmoney.com/stocklist.html#sh"
        # BUG FIX: the original value began with a stray ":" which made the
        # User-Agent header malformed.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        """GET the list page; return a parsed lxml tree, or None on failure."""
        response = requests.get(self.start_url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return etree.HTML(response.content)
        return None

    def get_code_list(self, response):
        """Extract the 6-digit codes from the parsed page.

        Each <li> under #quotesearch reads like "平安銀行(000001)"; the code
        is the digit group inside the parentheses.

        Args:
            response: lxml tree from parse_url, or None.

        Returns:
            List of code strings (empty when the page was not fetched).
        """
        code_list = []
        # BUG FIX: the original called .xpath on None when parse_url failed.
        if response is None:
            return code_list
        node_list = response.xpath('//*[@id="quotesearch"]/ul[1]/li')
        for node in node_list:
            # Replaces the bare try/except: simply skip nodes with no code.
            match = re.match(r'.*?\((\d+)\)', etree.tostring(node).decode())
            if match:
                code = match.group(1)
                print(code)
                code_list.append(code)
        return code_list

    def run(self):
        """Fetch and parse the page; return the list of stock codes."""
        html = self.parse_url()
        return self.get_code_list(html)
# Downloads the complete trading history of one stock.
class Download_HistoryStock(object):
    """Download one stock's full trading history from 163.com as a CSV file."""

    def __init__(self, code, out_dir='E:/data/historyStock/'):
        """
        Args:
            code: 6-digit stock code, e.g. "600000".
            out_dir: directory the CSV is written to (generalized from the
                original hard-coded path; default preserves old behavior).
        """
        self.code = code
        self.out_dir = out_dir
        self.start_url = "http://quotes.money.163.com/trade/lsjysj_" + self.code + ".html"
        print(self.start_url)
        # BUG FIX: removed the stray leading ":" from the User-Agent value.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def parse_url(self):
        """GET the stock's history page; return an lxml tree, or False."""
        # BUG FIX: self.headers was built but never sent with the request.
        response = requests.get(self.start_url, headers=self.headers, timeout=10)
        print(response.status_code)
        if response.status_code == 200:
            return etree.HTML(response.content)
        return False

    def get_date(self, response):
        """Read the earliest and latest trade dates from the page's hidden
        form inputs and return them as ("YYYYMMDD", "YYYYMMDD")."""
        start_date = ''.join(response.xpath('//input[@name="date_start_type"]/@value')[0].split('-'))
        end_date = ''.join(response.xpath('//input[@name="date_end_type"]/@value')[0].split('-'))
        return start_date, end_date

    def download(self, start_date, end_date):
        """Stream the CSV service response to <out_dir>/<code>.csv."""
        download_url = ("http://quotes.money.163.com/service/chddata.html?code=0"
                        + self.code + "&start=" + start_date + "&end=" + end_date
                        + "&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;"
                          "VOTURNOVER;VATURNOVER;TCAP;MCAP")
        data = requests.get(download_url, timeout=30)
        with open(self.out_dir + self.code + '.csv', 'wb') as f:
            # Stream in chunks so a large history never sits fully in memory.
            for chunk in data.iter_content(chunk_size=10000):
                if chunk:
                    f.write(chunk)
        print('股票---', self.code, '歷史資料正在下載')

    def run(self):
        """Fetch the page, extract the date range, download the CSV.
        Any failure is printed and swallowed so batch runs continue."""
        try:
            html = self.parse_url()
            start_date, end_date = self.get_date(html)
            self.download(start_date, end_date)
        except Exception as e:
            print(e)
if __name__ == '__main__':
    # Build the code list once, then download each stock's history with a
    # one-second pause between requests to stay polite to the server.
    code = StockCode()
    code_list = code.run()
    # BUG FIX: the original iterated `dcodes`, a name only defined in the
    # later resume-from-breakpoint snippet; at this point only `code_list`
    # exists, so the loop raised NameError.
    for temp_code in code_list:
        time.sleep(1)
        download = Download_HistoryStock(temp_code)
        download.run()
後面是一些額外的操作,當作記錄
#
# Keep only codes whose numeric value is >= 600000 (Shanghai main board).
code_df = pd.Series(code_list).astype('int')
code_list = code_df[code_df >= 600000].astype('str').tolist()

# Resume support: list the CSVs already on disk and subtract their codes
# from code_list, so only the missing stocks get downloaded.
import os

# Renamed from `dir`, which shadowed the builtin of the same name.
data_dir = os.fsencode('E:/data/historyStock/')
codes = []
for file in os.listdir(data_dir):
    filename = os.fsdecode(file)
    codes.append(filename[0:6])  # filenames are "<code>.csv"
dcodes = list(set(code_list).difference(set(codes)))
# Merge every per-stock CSV (gbk-encoded, from 163.com) into one frame,
# each sorted by its trade-date column, then persist the combined file.
dfs = []
for code in codes:
    everydf = pd.read_csv('E:/data/historyStock/%s.csv' % code,
                          encoding='gbk').sort_values(by='日期')
    dfs.append(everydf)
stock = pd.concat(dfs)
# BUG FIX: the original wrote with pandas' default utf-8 and then re-read
# with encoding='gbk', corrupting the Chinese column headers; write and
# read back with the same encoding.
stock.to_csv('E:/data/Stock.csv', encoding='gbk')
stock = pd.read_csv('E:/data/Stock.csv', encoding='gbk')
import MySQLdb as mdb
from sqlalchemy import create_engine
# Connection URL format: user:password@host/database
# (here: sec_user / password @ localhost / securities_master, utf8 charset).
engine = create_engine('mysql://sec_user:[email protected]/securities_master?charset=utf8')#
# Write the combined history frame into the `historystock` table.
stock.to_sql('historystock',engine)