1. 程式人生 > >python抓取動態資料 A股上市公司基本資訊

python抓取動態資料 A股上市公司基本資訊

1.背景

之前寫的抓取A股所有上市公司資訊的小程式在上交所網站改版後,需要同步修改

Python 2.7.9

2.分析過程

以抓取宇通客車【600066】資訊為例


紅框中的內容是需要抓取的資訊,檢視網頁原始碼


可以看到公司資訊並沒有直接寫到html中,使用chrome “開發者工具”快捷鍵F12,檢視瀏覽器與伺服器的互動過程(在這一步走了彎路,使用selenium+PhantomJS模擬瀏覽器然後分析html以及使用ghost.py+BeautifulSoup都沒有成功)


可以在標紅線的url上看到返回的公司資訊,剩下的就是模擬瀏覽器請求這個url了,request header中的Referer一定不能省略,不然會報403


返回的資訊是json格式的,可以使用python自帶的json庫轉換為dict,可以參考searchJ.js來獲得想要的資訊

# -*- coding: utf-8 -*- 
'''
Created on 2016年4月19日
@author: a
'''
import urllib2
import json
from time import sleep


class JSONObject:
    '''Attribute-style view of a decoded JSON object.

    Suitable as the ``object_hook`` of ``json.loads``: the dict built for a
    JSON object is handed to the constructor and its keys become readable
    as attributes of the instance.
    '''

    def __init__(self, d):
        # Adopt the mapping itself (no copy) as the attribute namespace, so
        # every key of ``d`` is reachable as ``instance.<key>``.
        self.__dict__ = d

class AchieveSSEStockInfo:
    '''Collect information about one Shanghai Stock Exchange (SSE) listed company.

    Data is read from the JSONP endpoints on query.sse.com.cn.  The basic
    company record and the share-capital record are downloaded on first use
    and cached on the instance; the two listing-date getters send a request
    on every call.

    Getters that return a raw field yield '-' when the value is unavailable
    (request failed, empty result, or the field is missing or null).

    Fields produced by the getters named in ``__public__``:
        companyCode                 company code
        companyShortName            company short name
        companyName                 full company name
        companyEnlishName           English name
        ipoAddress                  registered address
        aSharesCode                 A-share code
        aSharesShortName            A-share short name
        aSharesIPODate              A-share listing date
        aSharesTotalCapital         A-share total share capital
        aSharesOutstandingCaptial   A-share outstanding share capital
        bSharesCode                 B-share code
        bSharesShortName            B-share short name
        bSharesIPODate              B-share listing date
        bSharesTotalCapital         B-share total share capital
        bSharesOutstandingCaptial   B-share outstanding share capital
        area                        region
        province                    province
        city                        city
        trade                       industry
        website                     company website

    Also available, but not listed in ``__public__``: ``getStatus``
    (derived from the A-share and B-share status) and ``getTotalCapital``.
    '''

    # Getter names in report order.  Callers iterate over this list, so the
    # order must not be changed.
    __public__ = ['getCompanyCode', 'getCompanyShortName', 'getCompanyName', 'getCompanyEnlishName', 'getIpoAddress', 'getASharesCode',
                  'getASharesShortName', 'getASharesIPODate', 'getASharesTotalCapital', 'getASharesOutstandingCaptial', 'getBSharesCode',
                  'getBSharesShortName', 'getBSharesIPODate', 'getBSharesTotalCapital', 'getBSharesOutstandingCaptial', 'getArea', 'getProvince', 'getCity', 'getTrade', 'getWebsite']

    # Presumably the field names carried by the basicURLA record; the list is
    # not referenced by any code in this class.
    achieveIndexFromURLA = ['CHANGEABLE_BOND_ABBR', 'OFFICE_ZIP', 'AREA_NAME_DESC', 'FULL_NAME_IN_ENGLISH', 'COMPANY_CODE', 'CSRC_MIDDLE_CODE_DESC', 'SECURITY_ABBR_A', 'COMPANY_ADDRESS', 'SECURITY_CODE_A', 'SECURITY_CODE_B', 'SECURITY_30_DESC', 'COMPANY_ABBR', 'OFFICE_ADDRESS', 'CHANGEABLE_BOND_CODE', 'ENGLISH_ABBR', 'LEGAL_REPRESENTATIVE', 'REPR_PHONE', 'E_MAIL_ADDRESS', 'FOREIGN_LISTING_ADDRESS', 'STATE_CODE_A_DESC', 'SSE_CODE_DESC', 'FOREIGN_LISTING_DESC', 'SECURITY_CODE_A_SZ', 'CSRC_GREAT_CODE_DESC', 'WWW_ADDRESS', 'CSRC_CODE_DESC', 'STATE_CODE_B_DESC', 'FULLNAME']

    # Sentinel returned wherever a value could not be obtained.
    _PLACEHOLDER = '-'

    def __init__(self, stockCode):
        '''Prepare the query URLs for *stockCode*; no request is sent here.'''
        self.stockCode = stockCode
        self.basicURLA = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_C', stockCode)
        self.basicURLB = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_AGSSR_C', stockCode)
        self.basicURLC = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_BGSSR_C', stockCode)
        # basicURLD and basicURLE are built but not used by any method of
        # this class.
        self.basicURLD = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_MSXX_C', stockCode)
        self.basicURLE = r'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback46644&isPagination=true&stockCode=' + str(stockCode) + '&tradeBeginDate=19700101&tradeEndDate=20161001&order=tradeBeginDate%7Cdesc&sqlId=PL_SCRL_SCRLB&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&pageHelp.pageSize=5&_=1475720975596'
        self.capitalURL = 'http://query.sse.com.cn/security/stock/queryCompanyStockStruct.do?jsonCallBack=jsonpCallback86976&isPagination=false&companyCode=' + str(stockCode) + '&_=1475732919742'

        # Caches filled on the first successful download.
        self.stockBasicInfo = None
        self.stockCapitalInfo = None

    # ------------------------------------------------------------------
    # Public getters
    # ------------------------------------------------------------------

    def getCompanyCode(self):
        '''Return the COMPANY_CODE field.'''
        return self.__getBasicValue('COMPANY_CODE')

    def getStatus(self):
        '''Return False when both share statuses are '-' or either one
        contains u'摘牌' (delisted); True otherwise.'''
        status = self.__getBasicValue('STATE_CODE_A_DESC') + '/' + self.__getBasicValue('STATE_CODE_B_DESC')
        return not (status == '-/-' or u'摘牌' in status)

    def getCompanyShortName(self):
        '''Return "<COMPANY_ABBR>/<ENGLISH_ABBR>".'''
        return self.__getBasicValue('COMPANY_ABBR') + '/' + self.__getBasicValue('ENGLISH_ABBR')

    def getCompanyName(self):
        '''Return the FULLNAME field.'''
        return self.__getBasicValue('FULLNAME')

    def getCompanyEnlishName(self):
        '''Return the FULL_NAME_IN_ENGLISH field.'''
        return self.__getBasicValue('FULL_NAME_IN_ENGLISH')

    def getIpoAddress(self):
        '''Return the COMPANY_ADDRESS field.'''
        return self.__getBasicValue('COMPANY_ADDRESS')

    def getASharesCode(self):
        '''Return the SECURITY_CODE_A field.'''
        return self.__getBasicValue('SECURITY_CODE_A')

    def getASharesShortName(self):
        '''Return the A-share short name (same value as getCompanyShortName).'''
        # NOTE(review): this reports the company abbreviations, not the
        # SECURITY_ABBR_A field - confirm that is intended.
        return self.getCompanyShortName()

    def getASharesIPODate(self):
        '''Return LISTINGDATEA from the basicURLB query, or '-' if unavailable.'''
        return self.__getListingDate(self.basicURLB, 'LISTINGDATEA')

    def getTotalCapital(self):
        '''Return the totalShares field of the capital record.'''
        return self.__getCapitalValue('totalShares')

    def getASharesTotalCapital(self):
        '''Return totalNonFlowShare plus outstanding A shares as a float.

        A part that is '-' or empty counts as zero, so the result is 0.0
        when neither is available.  A non-numeric part raises ValueError.
        '''
        total = 0.0
        parts = (self.__getCapitalValue('totalNonFlowShare'),
                 self.getASharesOutstandingCaptial())
        for part in parts:
            if part != '-' and part:
                total += float(part)
        return total

    def getASharesOutstandingCaptial(self):
        '''Return the AShares field of the capital record.'''
        return self.__getCapitalValue('AShares')

    def getBSharesTotalCapital(self):
        '''Return the B-share total capital, reported as the outstanding B shares.'''
        return self.getBSharesOutstandingCaptial()

    def getBSharesOutstandingCaptial(self):
        '''Return the BShares field of the capital record.'''
        return self.__getCapitalValue('BShares')

    def getBSharesCode(self):
        '''Return the SECURITY_CODE_B field.'''
        return self.__getBasicValue('SECURITY_CODE_B')

    def getBSharesShortName(self):
        '''Return '' when the B-share code contains '-', else the A-share short name.'''
        if '-' in self.getBSharesCode():
            return ''
        return self.getASharesShortName()

    def getBSharesIPODate(self):
        '''Return LISTINGDATEB from the basicURLC query, or '-' if unavailable.'''
        return self.__getListingDate(self.basicURLC, 'LISTINGDATEB')

    def getArea(self):
        '''Return the AREA_NAME_DESC field.'''
        return self.__getBasicValue('AREA_NAME_DESC')

    def getProvince(self):
        '''Return the province, reported as the same value as getArea.'''
        return self.getArea()

    def getCity(self):
        '''Return the city, reported as the same value as getArea.'''
        return self.getArea()

    def getTrade(self):
        '''Return the SSE_CODE_DESC field.'''
        # The CSRC industry classification (category / major class / middle
        # class) would instead combine CSRC_CODE_DESC, CSRC_GREAT_CODE_DESC
        # and CSRC_MIDDLE_CODE_DESC.
        return self.__getBasicValue('SSE_CODE_DESC')

    def getWebsite(self):
        '''Return the WWW_ADDRESS field.'''
        return self.__getBasicValue('WWW_ADDRESS')

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    @staticmethod
    def _stripJsonp(text):
        '''Return the JSON text wrapped by a JSONP response ``callback(<json>)``.

        The slice runs from the first '(' to the last ')', so parentheses
        inside the JSON payload are preserved.  Raises ValueError when
        either parenthesis is absent.
        '''
        start = text.index('(') + 1
        end = text.rindex(')')
        return text[start:end]

    @staticmethod
    def _toDict(record):
        '''Copy the attributes of a decoded JSON object into a plain dict,
        skipping names that start with a double underscore.'''
        return dict((name, value) for name, value in vars(record).items()
                    if not name.startswith('__'))

    def __getDatas(self, url, basicInfo=True):
        '''Download the JSONP document at *url* and return its ``result`` member.

        basicInfo selects the Referer header: the company page when True,
        the capital page when False.

        Returns None when all five attempts to open the URL fail, '-' when
        ``result`` is empty, and the decoded ``result`` otherwise (JSON
        objects are decoded into JSONObject instances).  Raises ValueError
        for a body that is not JSONP/JSON and AttributeError when the
        decoded document has no ``result`` member.
        '''
        request = urllib2.Request(url)

        # No Accept-Encoding header is sent: urllib2 does not decompress the
        # response body, so a compressed reply could not be parsed.
        request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        request.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
        request.add_header('Cache-Control', 'max-age=0')
        request.add_header('Connection', 'keep-alive')
        request.add_header('Host', 'query.sse.com.cn')
        request.add_header('Upgrade-Insecure-Requests', '1')
        if basicInfo:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        else:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/capital/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36')

        # Try up to five times; give up with a message and None if every
        # attempt fails.
        maxNum = 5
        for _ in range(maxNum):
            try:
                response = urllib2.urlopen(url=request, timeout=15)
            except Exception:
                continue
            break
        else:
            print('URLError: <urlopen error timed out> All times is failed ')
            return None

        # Slow down after a successful request so the server does not block us.
        sleep(5)

        try:
            body = response.read()
        finally:
            response.close()

        pythonObjData = json.loads(self._stripJsonp(body), object_hook=JSONObject)
        if not pythonObjData.result:
            return self._PLACEHOLDER
        return pythonObjData.result

    def __getListingDate(self, url, key):
        '''Query *url* and return field *key* of its first record, or '-'.'''
        try:
            records = self.__getDatas(url)
            if records == '-' or records is None:
                return self._PLACEHOLDER
            value = self._toDict(records[0]).get(key)
        except Exception:
            return self._PLACEHOLDER
        return self._PLACEHOLDER if value is None else value

    def __getBasicValue(self, key):
        '''Return field *key* of the basic company record, or '-'.'''
        try:
            # First use: download the record and cache it on the instance.
            if self.stockBasicInfo is None:
                records = self.__getDatas(self.basicURLA)
                if records == '-' or records is None:
                    return self._PLACEHOLDER
                self.stockBasicInfo = self._toDict(records[0])
            value = self.stockBasicInfo.get(key)
        except Exception:
            return self._PLACEHOLDER
        return self._PLACEHOLDER if value is None else value

    def __getCapitalValue(self, key):
        '''Return field *key* of the share-capital record, or '-'.'''
        try:
            # First use: download the record and cache it on the instance.
            if self.stockCapitalInfo is None:
                record = self.__getDatas(self.capitalURL, basicInfo=False)
                if record == '-' or record is None:
                    return self._PLACEHOLDER
                self.stockCapitalInfo = self._toDict(record)
            value = self.stockCapitalInfo.get(key)
        except Exception:
            return self._PLACEHOLDER
        return self._PLACEHOLDER if value is None else value

    def __mergeBasicURL(self, sqlId, stockCode):
        '''Build the commonQuery.do URL for query *sqlId* and *stockCode*.'''
        return 'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback12345&isPagination=false&sqlId=' + sqlId + '&productid=' + str(stockCode) + '&_=14555555555552'
    

if __name__ == '__main__':
    # Demo: print every field named in __public__ for each stock code in the
    # range.  The loop variable is the code that gets queried.
    for stockCode in range(600001, 600003):
        stock = AchieveSSEStockInfo(stockCode)
        for getterName in stock.__public__:
            getter = getattr(stock, getterName)
            # Single-argument print keeps the "<name> <value>" output format.
            print('%s %s' % (getterName, getter()))


附錄:

1.使用requests庫抓取頁面的時候的編碼問題 https://segmentfault.com/q/1010000000341014
2.openpyxl參考手冊 http://openpyxl.readthedocs.io/en/default/   http://openpyxl.readthedocs.io/en/default/usage.html
3.urllib2使用 http://zhuoqiang.me/python-urllib2-usage.html#http
4.讀寫json資料 http://python3-cookbook.readthedocs.io/zh_CN/latest/c06/p02_read-write_json_data.html
5.python中 class 或物件屬性轉化成dict 、dict轉換成物件 http://blog.csdn.net/chenyulancn/article/details/8203763
6.【原創】說說JSON和JSONP,也許你會豁然開朗,含jQuery用例 http://www.cnblogs.com/dowinning/archive/2012/04/19/json-jsonp-jquery.html

7.Applying borders to a cell in OpenPyxl   http://stackoverflow.com/questions/24917201/applying-borders-to-a-cell-in-openpyxl

後記: