
Python web crawler: scraping stock data into MySQL

1. Creating the tables

mysql -u root -p123456
create database test default character set utf8;

create table stocks  -- A-shares
(
  code varchar(10) comment 'stock code',
  name varchar(30) comment 'name',
  score varchar(5) comment 'score',
  advise varchar(10) comment 'advice',
  survey varchar(500) comment 'summary',
  trend varchar(500) comment 'trend',
  tec_score varchar(5) comment 'technical score',
  tec_content varchar(500) comment 'technical summary',
  funds_score decimal(3,1) comment 'capital-flow score',
  funds_content varchar(500) comment 'capital-flow summary',
  msg_score varchar(5) comment 'news score',
  msg_content varchar(500) comment 'news summary',
  trade_score decimal(3,1) comment 'industry score',
  trade_content varchar(500) comment 'industry summary',
  basic_score varchar(5) comment 'fundamentals score',
  basic_content varchar(500) comment 'fundamentals summary',
  opt_trend varchar(500) comment 'stock-picking signals',
  tec_form varchar(500) comment 'technical pattern',
  buy_signal varchar(500) comment 'buy signal'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

create table syl  -- per-stock P/E ratios
(
  id varchar(5) comment 'row number',
  code varchar(10) comment 'stock code',
  name varchar(30) comment 'name',
  sy_j varchar(10) comment 'P/E ratio',
  sy_d varchar(10) comment 'P/E ratio (dynamic)',
  price varchar(10) comment 'price',
  zdf varchar(10) comment 'price change (%)',
  cha_rate varchar(10) comment 'turnover rate (%)'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
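
Before moving on, it is worth confirming that both tables exist and the column definitions took effect. A minimal check from Python, assuming the same root/123456 credentials used above:

import pymysql

# Quick sanity check on the schema created above.
db = pymysql.connect(host='localhost', user='root', passwd='123456',
                     db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("show tables")
print(cursor.fetchall())          # expect (('stocks',), ('syl',))
cursor.execute("describe stocks")
for column in cursor.fetchall():
    print(column[0], column[1])   # column name and type
db.close()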

2. Scraping the 牛叉診股 stock-diagnosis pages

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: xiaobao  time: 2018/4/5
import urllib.request
import re
from bs4 import BeautifulSoup
import pymysql
import traceback
import time

start = time.perf_counter()  # time.clock() is deprecated and was removed in Python 3.8
db = pymysql.connect(host='localhost', user='root',
                     passwd='123456', db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("truncate table stocks")
cursor.execute("set names utf8")

# Collect every A-share code from the Eastmoney stock list. Link text has
# the form name(code); A-share codes start with 0 or 6.
page = urllib.request.urlopen('http://quote.eastmoney.com/stocklist.html#sz')
bs = BeautifulSoup(page, "html.parser")
links = bs.find_all('a', attrs={'target': '_blank'}, href=re.compile("quote"))
stocks = []
for s in links:
    parts = re.findall(r'[^()]+', s.get_text())
    if len(parts) == 2 and (parts[1].startswith('0') or parts[1].startswith('6')):
        stocks.append(parts[1])

for stock in stocks:
    try:
        url = 'http://doctor.300033.info/' + stock
        soup = BeautifulSoup(urllib.request.urlopen(url), "html.parser")
        stockname = soup.find('div', attrs={'class': 'stockname'}).get_text()
        name = re.findall(r'[^()]+', stockname)[0]
        code = re.findall(r'[^()]+', stockname)[1]
        score = soup.find('div', attrs={'class': 'stockvalue'}).get_text()
        advise = soup.find('span', attrs={'class': 'cur'}).get_text()
        title = soup.find('strong', attrs={'class': 'title'}).get_text()
        # Keep everything after the first comma, minus the 10-character tail.
        survey = ','.join(title.split(',')[1:])[:-10]
        short = soup.find('li', attrs={'class': 'short'}).get_text()
        mid = soup.find('li', attrs={'class': 'mid'}).get_text()
        trend = short[5:] + mid[5:]  # drop the 5-character label prefixes
        # One content paragraph and one score label per dimension:
        # technical, capital flow, news, industry, fundamentals.
        dim = [p.get_text() for p in soup.find_all('p', attrs={'class': 'content'})]
        grade = [d.get_text().replace('分', '') for d in soup.find_all('div', attrs={'class': 'label'})]
        techcont = soup.find('div', attrs={'class': 'techcont', 'display': ''}).get_text()
        tec = (techcont.replace('選股動向:', '').replace('技術形態:', '')
                       .replace('買入訊號:', '').strip().split('\n\n\n'))
        opt_trend = tec[0].replace('\n', ',')
        tec_form = tec[1].replace('\n', ',')[1:] if len(tec) > 1 else ''
        buy_signal = tec[2].replace('\n', ',')[1:] if len(tec) > 2 else ''
        # Parameterized insert: scraped text can contain quotes that would
        # break a string-formatted SQL statement.
        sql = "insert into stocks values(" + ",".join(["%s"] * 19) + ")"
        row = (code, name, score, advise, survey, trend,
               grade[0], dim[0], grade[1], dim[1], grade[2], dim[2],
               grade[3], dim[3], grade[4], dim[4],
               opt_trend, tec_form, buy_signal)
        try:
            cursor.execute(sql, row)
            db.commit()
        except Exception:
            db.rollback()
    except Exception:
        pass  # skip stocks whose diagnosis page is missing or malformed

db.close()
end = time.perf_counter()
print(end - start)
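
The stock-list parsing above leans entirely on re.findall(r'[^()]+', t), which splits link text of the form name(code) into two parts. A standalone illustration (the sample string is hypothetical but matches the page's format):

import re

# Anything that is not a parenthesis becomes one match.
t = '平安銀行(000001)'            # sample link text from the stock list page
parts = re.findall(r'[^()]+', t)
print(parts)                      # ['平安銀行', '000001']
name, code = parts[0], parts[1]
print(code.startswith('0') or code.startswith('6'))  # True: an A-share code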

3. Scraping per-stock P/E ratios

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: xiaobao  time: 2018/4/6
import os
import time
import codecs
import pymysql
import traceback
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

pages = ['http://data.10jqka.com.cn/market/ggsyl/']

for url in pages:
    # Headless Chrome: the 10jqka ranking tables are rendered by
    # JavaScript, so a plain HTTP fetch would not see them.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
    chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get(url)

    # The "尾頁" (last page) link carries the total page count.
    end = 1
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for link in soup.find_all('a', text='尾頁'):
        end = int(link.get('page'))

    tmp = 'D:\\temp.txt'
    count = 1
    with open(tmp, 'w') as f:
        while count <= end:
            # Scrape the current page before paging forward; clicking
            # first would skip page 1.
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            tab = soup.find_all('table')[1]
            for tr in tab.find_all('tr'):
                for td in tr.find_all('td'):
                    print(td.get_text(), end=',', file=f)
                print(file=f)
            count = count + 1
            if count <= end:
                try:
                    browser.find_element_by_link_text('下一頁').click()
                    time.sleep(3)  # give the next page time to render
                except Exception:
                    break
    browser.quit()

    if url == 'http://data.10jqka.com.cn/market/ggsyl/':
        result = 'D:\\syl.txt'

    # Clean up: drop blank lines (header-only rows print as empty lines)
    # and the trailing comma on every data row.
    f1 = open(tmp, 'r')
    f2 = codecs.open(result, 'w', 'utf-8')
    for line in f1:
        data = line.strip()
        if len(data) != 0:
            f2.write(data[:-1])
            f2.write('\r\n')
    f1.close()
    f2.close()
    os.remove(tmp)

db = pymysql.connect(host='localhost', user='root', passwd='123456', db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("truncate table syl")

try:
    # The rows above were written with \r\n line endings.
    cursor.execute('load data infile "D:/syl.txt" into table syl '
                   'fields terminated by "," lines terminated by "\r\n"')
    db.commit()
except Exception:
    # traceback.print_exc()
    db.rollback()

db.close()
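
Note that server-side load data infile requires the MySQL server itself to be able to read D:/syl.txt, and it is refused when secure_file_priv excludes that directory. If the server is locked down, the client-side LOCAL variant is an alternative; a sketch, assuming local_infile is enabled on both the server and the PyMySQL connection:

import pymysql

# LOCAL variant: the client reads the file, so secure_file_priv does not
# apply. Assumes the server allows it (SET GLOBAL local_infile = 1).
db = pymysql.connect(host='localhost', user='root', passwd='123456',
                     db='test', charset='utf8', local_infile=True)
cursor = db.cursor()
cursor.execute('load data local infile "D:/syl.txt" into table syl '
               'fields terminated by "," lines terminated by "\r\n"')
db.commit()
db.close()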

4. Analyzing the results

select
  a.code         'code',
  a.name         'name',
  score          'score',
  advise         'advice',
  survey         'summary',
  trend          'trend',
  tec_score      'technical score',
  tec_content    'technical summary',
  funds_score    'capital-flow score',
  funds_content  'capital-flow summary',
  msg_score      'news score',
  msg_content    'news summary',
  trade_score    'industry score',
  trade_content  'industry summary',
  basic_score    'fundamentals score',
  basic_content  'fundamentals summary',
  opt_trend      'stock-picking signals',
  tec_form       'technical pattern',
  buy_signal     'buy signal',
  b.price        'price',
  sy_j           'P/E ratio',
  sy_d           'P/E ratio (dynamic)',
  zdf            'price change (%)',
  cha_rate       'turnover rate (%)'
from stocks a
left join syl b
  on a.code = b.code;
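
The same join can also be driven from Python, for example to pull the highest-scoring stocks together with their P/E ratios. A small sketch against the tables built above (the + 0 casts the varchar score so MySQL sorts it numerically):

import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='123456',
                     db='test', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    select a.code, a.name, a.score, b.sy_d, b.price
    from stocks a
    left join syl b on a.code = b.code
    order by a.score + 0 desc
    limit 10
""")
for code, name, score, pe_dynamic, price in cursor.fetchall():
    print(code, name, score, pe_dynamic, price)
db.close()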