python網路爬蟲--抓取股票資訊到Mysql
1.建表
mysql -u root -p123456
create database test default character set utf8;
use test;
-- A-share stock diagnosis data, one row per stock (a股).
-- NOTE: MySQL requires whitespace after "--", so the original trailing
-- "--a股" comment was a syntax error, not a comment.
create table stocks
(
    code          varchar(10)  comment '程式碼',
    name          varchar(30)  comment '名稱',
    score         varchar(5)   comment '分數',
    advise        varchar(10)  comment '建議',
    survey        varchar(500) comment '概述',
    trend         varchar(500) comment '趨勢',
    tec_score     varchar(5)   comment '技術面分數',
    tec_content   varchar(500) comment '技術面概述',
    -- Unified to varchar(5): the loader script inserts the same scraped
    -- string into all five *_score columns, and an empty or non-numeric
    -- scrape would be rejected by the original DECIMAL(3,1) columns.
    funds_score   varchar(5)   comment '資金面分數',
    funds_content varchar(500) comment '資金面概述',
    msg_score     varchar(5)   comment '訊息面分數',
    msg_content   varchar(500) comment '訊息面概述',
    trade_score   varchar(5)   comment '行業面分數',
    trade_content varchar(500) comment '行業面概述',
    basic_score   varchar(5)   comment '基本面分數',
    basic_content varchar(500) comment '基本面概述',
    opt_trend     varchar(500) comment '選股動向',
    tec_form      varchar(500) comment '技術形態',
    buy_signal    varchar(500) comment '買入訊號'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Per-stock price/earnings ratio data (市盈率), bulk-loaded from D:/syl.txt.
-- NOTE: MySQL requires whitespace after "--", so the original trailing
-- "--市盈率" comment was a syntax error, not a comment.
create table syl
(
    id       varchar(5)  comment '序號',
    code     varchar(10) comment '程式碼',
    name     varchar(30) comment '名稱',
    sy_j     varchar(10) comment '市盈率',
    sy_d     varchar(10) comment '市盈率(動)',
    price    varchar(10) comment '價格',
    zdf      varchar(10) comment '漲跌幅(%)',
    cha_rate varchar(10) comment '換手率(%)'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2.抓取牛叉診股資訊
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: xiaobao time: 2018/4/5
# Scrape per-stock diagnosis pages ("牛叉診股") for every Shanghai/Shenzhen
# A-share code and load the parsed fields into MySQL table `test.stocks`
# (the table is truncated first).
import urllib.request
import re
import time
import traceback

from bs4 import BeautifulSoup
import pymysql

# time.clock() was deprecated and removed in Python 3.8; perf_counter()
# is the direct replacement for wall-clock benchmarking.
start = time.perf_counter()

db = pymysql.connect(host='localhost', user='root', passwd='123456',
                     db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("truncate table stocks")
cursor.execute("set names utf8")

# Collect A-share codes (prefix '0' = Shenzhen, '6' = Shanghai) from the
# eastmoney full stock list. Anchor text looks like "名稱(600000)".
listing = urllib.request.urlopen('http://quote.eastmoney.com/stocklist.html#sz')
bs = BeautifulSoup(listing, "html.parser")
anchors = bs.find_all('a', attrs={'target': '_blank'}, href=re.compile("quote"))
stocks = []
for anchor in anchors:
    parts = re.findall(r'[^()]+', anchor.get_text())
    if len(parts) == 2:
        candidate = parts[1]
        if candidate.startswith('0') or candidate.startswith('6'):
            stocks.append(candidate)

# Parameterized insert: the original built SQL with %-string formatting,
# which breaks on any quote character in the scraped text and is injectable.
INSERT_SQL = "insert into stocks values (" + ", ".join(["%s"] * 19) + ")"

for stock in stocks:
    try:
        page = urllib.request.urlopen('http://doctor.300033.info/' + stock)
        soup = BeautifulSoup(page, "html.parser")

        # Header block: "名稱(程式碼)" plus overall score and advice.
        stockname = soup.find('div', attrs={'class': 'stockname'}).get_text()
        name = re.findall(r'[^()]+', stockname)[0]
        code = re.findall(r'[^()]+', stockname)[1]
        score = soup.find('div', attrs={'class': 'stockvalue'}).get_text()
        advise = soup.find('span', attrs={'class': 'cur'}).get_text()

        # Summary sentence: drop the leading clause and the 10-char tail.
        title = soup.find('strong', attrs={'class': 'title'}).get_text()
        survey = ','.join(title.split(',')[1:])[:-10]

        # Short-term + mid-term trend text (first 5 chars are labels).
        short = soup.find('li', attrs={'class': 'short'}).get_text()
        mid = soup.find('li', attrs={'class': 'mid'}).get_text()
        trend = short[5:] + mid[5:]

        # Five dimension texts and their scores (strip the trailing '分').
        dim = [p.get_text()
               for p in soup.find_all('p', attrs={'class': 'content'})]
        grade = [div.get_text().replace('分', '')
                 for div in soup.find_all('div', attrs={'class': 'label'})]

        # Technical section splits into up to three blocks:
        # 選股動向 / 技術形態 / 買入訊號.
        techcont = soup.find('div',
                             attrs={'class': 'techcont', 'display': ''}).get_text()
        tec = (techcont.replace('選股動向:', '')
                       .replace('技術形態:', '')
                       .replace('買入訊號:', '')
                       .strip().split('\n\n\n'))
        opt_trend = tec[0].replace('\n', ',')
        tec_form = tec[1].replace('\n', ',')[1:] if len(tec) > 1 else ''
        buy_signal = tec[2].replace('\n', ',')[1:] if len(tec) > 2 else ''

        row = (code, name, score, advise, survey, trend,
               grade[0], dim[0], grade[1], dim[1], grade[2], dim[2],
               grade[3], dim[3], grade[4], dim[4],
               opt_trend, tec_form, buy_signal)
        try:
            cursor.execute(INSERT_SQL, row)
            db.commit()
        except Exception:
            db.rollback()
    except Exception:
        # Best-effort per stock: log the failure (the original silently
        # swallowed everything despite importing traceback) and continue.
        traceback.print_exc()

db.close()
print(time.perf_counter() - start)
3.抓取個股市盈率
#!/usr/bin/env python # -*- coding: utf-8 -*- # author: xiaobao time: 2018/4/6 import os import sys import time import codecs import pymysql import traceback from bs4 import BeautifulSoup from urllib import request,parse from selenium import webdriver from selenium.webdriver.chrome.options import Options pages = ['http://data.10jqka.com.cn/market/ggsyl/'] for url in pages: chrome_options = Options() chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-gpu') chrome_options.add_experimental_option("excludeSwitches",["ignore-certificate-errors"]) chrome_options.binary_location = r'C:\\Program Files (x86)\\Google\\Chrome\\Application' browser = webdriver.Chrome(chrome_options=chrome_options) browser.get(url) html = browser.page_source soup = BeautifulSoup(html,'html.parser') for link in soup.find_all('a',text='尾頁'): end = int(link.get('page')) tmp = 'D:\\temp.txt' f = open(tmp,'w') f.truncate() f.close() count = 1 with open(tmp,"a+") as f: sys.stdout = f while count <= end: try: browser.find_element_by_link_text('下一頁').click() html = browser.page_source soup = BeautifulSoup(html, 'html.parser') except: pass time.sleep(3) tables = soup.find_all('table') tab = tables[1] for tr in tab.find_all('tr'): for td in tr.find_all('td'): print(td.get_text(), end=',') print() count = count + 1 browser.quit() f.close() if url == 'http://data.10jqka.com.cn/market/ggsyl/': result = 'D:\\syl.txt' f1 = open(tmp,'r') f2 = codecs.open(result,'w','utf-8') f2.truncate() for line in f1: data=line.strip() if len(data)!=0: f2.write(data[:-1]) f2.write('\r\n') f1.close() f2.close() os.remove(tmp) db = pymysql.connect(host='localhost', user='root', passwd='123456', db='test', charset='utf8') cursor = db.cursor() cursor.execute("truncate table syl") try: cursor.execute('load data infile "D:/syl.txt" into table syl fields terminated by "," lines terminated by "\n" ') db.commit() except: # traceback.print_exc() db.rollback() db.close()
4.結果分析
-- Final report: diagnosis data joined with P/E figures. LEFT JOIN keeps
-- stocks that have no matching syl row (their price/P-E columns are NULL).
-- All columns are table-qualified because both tables define `code` and
-- `name`, and aliases use explicit AS.
select
    a.code          as '程式碼',
    a.name          as '名稱',
    a.score         as '分數',
    a.advise        as '建議',
    a.survey        as '概述',
    a.trend         as '趨勢',
    a.tec_score     as '技術面分數',
    a.tec_content   as '技術面概述',
    a.funds_score   as '資金面分數',
    a.funds_content as '資金面概述',
    a.msg_score     as '訊息面分數',
    a.msg_content   as '訊息面概述',
    a.trade_score   as '行業面分數',
    a.trade_content as '行業面概述',
    a.basic_score   as '基本面分數',
    a.basic_content as '基本面概述',
    a.opt_trend     as '選股動向',
    a.tec_form      as '技術形態',
    a.buy_signal    as '買入訊號',
    b.price         as '價格',
    b.sy_j          as '市盈率',
    b.sy_d          as '市盈率(動)',
    b.zdf           as '漲跌幅(%)',
    b.cha_rate      as '換手率(%)'
from stocks as a
left join syl as b
    on a.code = b.code;