1. 程式人生 > >python 爬蟲 requests+BeautifulSoup 爬取巨潮資訊公司概況代碼實例

python 爬蟲 requests+BeautifulSoup 爬取巨潮資訊公司概況代碼實例

pan 字符 selenium 5.0 target 自我 color list tails

第一次寫一個算是比較完整的爬蟲,自我感覺極差啊:代碼 low、效率差,也沒有保存到本地文件或者數據庫,還強行使用了一波多線程,導致數據順序發生了變化……

貼在這裏,引以為戒吧。

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 21:41:34 2018
@author: brave-man
blog: http://www.cnblogs.com/zrmw/
"""

import requests
from bs4 import BeautifulSoup
import json
from threading import Thread

def getDetails(url, timeout=10):
    """Fetch one company profile page, print its key fields and return them.

    The page is decoded as GBK and parsed with BeautifulSoup. The stock code
    is read from the first cell of the first ``.table`` element; the other
    fields are read from the ``.zx_data2`` elements at indexes 0, 1, 2 and 4.

    Args:
        url: Address of the company profile page.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so one stalled connection cannot hang the crawl.

    Returns:
        A dict with the keys ``code``, ``Entire_Name``, ``English_Name``,
        ``Address`` and ``Legal_Representative``. The same dict is printed.

    Raises:
        IndexError: If the page has no ``.table`` element or fewer than five
            ``.zx_data2`` elements.
        requests.RequestException: If the HTTP request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, "html.parser")

    # Take the six characters starting at the first ASCII digit of the cell.
    # This does not depend on the exact label text or leading whitespace that
    # precedes the code, unlike stripping a fixed set of label characters.
    cell_text = soup.select(".table")[0].td.text
    start = next(
        (i for i, ch in enumerate(cell_text) if "0" <= ch <= "9"),
        len(cell_text),
    )
    code = cell_text[start:start + 6]

    # Query the field cells once instead of once per field.
    fields = soup.select(".zx_data2")
    details = {
        "code": code,
        "Entire_Name": fields[0].text.strip("\r\n "),
        "English_Name": fields[1].text.strip("\r\n "),
        "Address": fields[2].text.strip("\r\n "),
        "Legal_Representative": fields[4].text.strip("\r\n "),
    }

    # The dict holds only strings, so it can be passed to json.dumps as-is
    # when it is later stored; printing it directly gives the same output as
    # printing a dumps/loads round trip.
    print(details)
    return details
# Page that lists every company, grouped into ".company-list" elements.
_COMPANY_LIST_URL = "http://www.cninfo.com.cn/cninfo-new/information/companylist"

# Profile-page URL template for each code list, in the same order as the
# lists produced by getCode(): index 0 -> szmb, 1 -> szsme, 2 -> szcn, 3 -> shmb.
_BRIEF_URL_TEMPLATES = (
    "http://www.cninfo.com.cn/information/brief/szmb{}.html",
    "http://www.cninfo.com.cn/information/brief/szsme{}.html",
    "http://www.cninfo.com.cn/information/brief/szcn{}.html",
    "http://www.cninfo.com.cn/information/brief/shmb{}.html",
)

# Seconds to wait for the company-list request before giving up.
_REQUEST_TIMEOUT = 10


def getCode():
    """Collect the stock codes from the company list page and crawl them.

    The first six characters of every link text inside each of the first four
    ``.company-list`` elements are taken as a stock code. The first list is
    printed, then all four lists are handed to ``getAll``.

    Returns:
        Whatever ``getAll`` returns (currently ``None``).

    Raises:
        IndexError: If the page has fewer than four ``.company-list`` elements.
        requests.RequestException: If the HTTP request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
    res = requests.get(_COMPANY_LIST_URL, headers=headers, timeout=_REQUEST_TIMEOUT)
    # The original value "gb1232" is not a valid codec name; use the same
    # encoding as getDetails(). Only the leading six characters of each link
    # text are kept below.
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, "html.parser")

    boards = soup.select(".company-list")
    L = [
        [link.text[:6] for link in boards[index].find_all("a")]
        for index in range(4)
    ]
    print(L[0])
    return getAll(L)


def getAll(L):
    """Fetch the profile page of every stock code, one list after another.

    Pages are requested sequentially, so the printed output follows the order
    of the input lists.

    Args:
        L: Indexable collection of four iterables of stock codes, matched by
            position to ``_BRIEF_URL_TEMPLATES``.
    """
    for index, template in enumerate(_BRIEF_URL_TEMPLATES):
        for code in L[index]:
            try:
                getDetails(template.format(code))
            except (requests.RequestException, IndexError, AttributeError) as exc:
                # A network failure or a page without the expected elements
                # should not abort the rest of the crawl; report and move on.
                print("skipped {}: {!r}".format(code, exc))


if __name__ == "__main__":
    getCode()

沒有考慮實際生產中突發的狀況,比如網速延遲卡頓等問題。

速度是真慢,有時間會分享給大家 selenium + 瀏覽器 的爬取巨潮資訊的方法代碼。晚安~

python 爬蟲 requests+BeautifulSoup 爬取巨潮資訊公司概況代碼實例