Scrapy+Selenium爬蟲爬取天眼查資料
阿新 • • 發佈:2018-12-16
#難點:
-
1.資料介面很難找到,反爬措施很強,所以用 Selenium 模擬瀏覽器抓取
-
2.頁面資料的字型經過了混淆處理(字型反爬),需要進行反向破解
###本文用的是天眼查移動端 m.tianyancha.com 進行抓取,輸入公司名可以抓取前面5條具體資訊展示
###還有網站的字型反爬對映每天都會更新,所以後面使用時需要重新核對並更新程式碼中的數字對映表,可以用 FontCreator 軟體檢視字型檔案
###程式碼抓取也有些注意點,用的是 Google Chrome 無頭(headless)瀏覽器
# Spider file
# -*- coding: utf-8 -*-
import re
from urllib.parse import quote

import scrapy
from fontTools.ttLib import TTFont  # kept from the original file; unused in the code shown here
from tianyancha.items import TianyanchaItem

# Maps a digit as it appears in the obfuscated page text to the real digit.
# The original author notes this table must be maintained by hand whenever
# the site changes its anti-scraping font.
aa = {2: 0, 8: 2, 0: 4, 7: 6, 9: 7, 6: 8, 4: 9, 1: 1, 5: 5, 3: 3}

# Marker placed in Request.meta; the project's downloader middleware checks
# for it before rendering the page with headless Chrome.
_RENDER_META = {"nihao": "dawang"}

# Fields that are copied verbatim from the page: (item key, row index, XPath tail).
# NOTE(review): the row index presumably addresses one row of the company
# detail table -- confirm against the live page layout.
_PLAIN_FIELDS = (
    ("industry", 5, "text()"),
    ("the_enterprise_type", 6, "text()"),
    ("registration_number", 7, "text()"),
    ("organization_code", 8, "text()"),
    ("credit_code", 9, "text()"),
    ("business_period", 10, "text()"),
    ("approval_date", 11, "text()"),
    ("registration_authority", 13, "text()"),
    ("registered_address", 14, "text()"),
    ("scope_of_business", 15, "span/span[2]/div/text/text()"),
)


def _row_xpath(row, tail):
    """Build the absolute XPath for the value cell of detail row *row*."""
    return "/html/body/div[3]/div[1]/div[7]/div/div[%d]/span[2]/%s" % (row, tail)


def _decode_digits(text):
    """Translate every decimal digit of *text* through ``aa``.

    Characters that are not decimal digits are kept unchanged instead of
    raising ``ValueError`` as the original per-character ``int()`` call did.
    """
    return "".join(
        str(aa[int(ch)]) if ch.isdecimal() else ch
        for ch in text
    )


class ChaSpider(scrapy.Spider):
    """Search m.tianyancha.com for a company name and scrape each result page."""

    name = 'cha'
    allowed_domains = ['m.tianyancha.com']

    def start_requests(self):
        """Prompt for a company name on stdin and request the search page."""
        keyword = input("請輸入要查詢的企業名:").strip()
        # Percent-encode the keyword so characters such as '&' or '#' cannot
        # truncate or alter the query string.
        url = "https://m.tianyancha.com/search?key=%s" % quote(keyword, safe="")
        yield scrapy.Request(url=url, callback=self.parse, meta=dict(_RENDER_META))

    def parse(self, response):
        """Follow every company link found on the search result page."""
        hrefs = response.xpath('//div[contains(@class,"col-xs-10")]/a/@href').extract()
        for href in hrefs:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.new_parse,
                meta=dict(_RENDER_META),
            )

    def new_parse(self, response):
        """Extract one ``TianyanchaItem`` from a company detail page.

        A field whose node is missing from the page is stored as ``None``
        rather than aborting the whole item with ``IndexError``.
        """
        item = TianyanchaItem()
        item["company"] = response.xpath(
            '//div[@class="over-hide"]/div/text()'
        ).extract_first()
        # The legal-representative field was disabled in the original code:
        # item["boss"] = response.xpath('/html/body/div[3]/div[1]/div[7]/div/div[1]/span[2]/a/text()').extract()[0]

        # Registration date: digits are obfuscated; dashes are dropped before
        # decoding, so the stored value is the bare digit string.
        raw_date = response.xpath(_row_xpath(3, "text/text()")).extract_first()
        if raw_date is None:
            item["registration_time"] = None
        else:
            item["registration_time"] = _decode_digits(raw_date.replace("-", ""))

        # Registered capital: only the first run of digits is decoded, then
        # the unit suffix is appended.
        raw_capital = response.xpath(_row_xpath(4, "text/text()")).extract_first()
        digits = re.search(r"\d+", raw_capital) if raw_capital is not None else None
        if digits is None:
            item["the_registered_capital"] = None
        else:
            item["the_registered_capital"] = _decode_digits(digits.group()) + "萬"

        for key, row, tail in _PLAIN_FIELDS:
            item[key] = response.xpath(_row_xpath(row, tail)).extract_first()

        print(item)
        yield item
# Middlewares file
from scrapy.http import HtmlResponse
from selenium import webdriver


class TianyanchaDownloaderMiddleware(object):
    """Downloader middleware that renders marked requests in headless Chrome."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        """Fetch ``request.url`` with headless Chrome when the request is marked.

        Only requests whose meta carries ``"nihao": "dawang"`` are rendered.
        Any other request returns ``None`` so it is left unmodified, instead
        of raising ``KeyError`` on the missing meta key.

        Returns:
            An ``HtmlResponse`` built from the rendered page source, or
            ``None`` for unmarked requests.
        """
        if request.meta.get("nihao") != "dawang":
            return None

        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        # NOTE(review): newer Selenium releases expect ``options=`` instead of
        # ``chrome_options=`` -- confirm against the installed version.
        driver = webdriver.Chrome(chrome_options=option)
        try:
            driver.get(request.url)
            content = driver.page_source
        finally:
            # Always shut the browser down, even when the page load fails,
            # so no Chrome process is left running.
            driver.quit()
        return HtmlResponse(request.url, encoding="utf-8", body=content, request=request)