scrapy模擬瀏覽器翻頁爬取智聯
阿新 • • 發佈:2018-12-05
智聯爬取中,頁碼的數字和URL是不匹配的,盲目地拼接URL會造成錯誤,因此可以採用模擬瀏覽器的方式爬取網頁
要模擬瀏覽器需要知道scrapy流程,簡圖如下:
這裡只是簡單地寫一些偽碼,涉及的資料清洗部分請看scrapy資料清洗
# Scrapy downloader middleware that renders pages with a real Chrome browser
# (Selenium), so pagination works even though page numbers don't map to URLs.
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
# FIX: TimeoutException was caught below but never imported.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware(object):
    def __init__(self):
        # One shared Chrome instance for the whole crawl.
        self.options = Options()
        self.browser = webdriver.Chrome(chrome_options=self.options)

    def process_request(self, request, spider):
        """Fetch the page with the browser instead of Scrapy's downloader.

        request.meta['page'] encodes the intent:
          '0' -> initial request: load request.url in the browser;
          '2' -> pagination: click the "next page" button on the page
                 that is already loaded (no real download happens).

        Returns an HtmlResponse built from the rendered page source,
        which short-circuits Scrapy's normal download for this request.
        """
        if int(request.meta['page']) == 2:
            # Scroll to the bottom so the pager is rendered and clickable.
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(1)
            div = self.browser.find_element_by_css_selector('.soupager')
            next_page = div.find_elements_by_tag_name('button')
            # Two equivalent options — pick one:
            # next_page[1].click()
            for button in next_page:
                if button.text == "下一頁":
                    button.click()
        else:
            if int(request.meta['page']) == 0:
                try:
                    print('url is :::', request.url)
                    self.browser.get(request.url)
                except TimeoutException as e:
                    print('超時')
        # Give the page time to finish rendering before capturing it.
        time.sleep(5)
        return HtmlResponse(url=self.browser.current_url,
                            body=self.browser.page_source,
                            encoding='utf-8',
                            request=request)

# Sometimes the page never finishes loading (the spinner in the corner keeps
# turning); forcing the load to stop makes the content appear:
# browser.execute_script('window.stop()')
# -*- coding: utf-8 -*-
import time

import lxml.html
import scrapy
from lxml import etree
from scrapy import Request


def parse_div_content(html_str):
    """Placeholder for parsing one job's detail page."""
    pass


def parse_lxml_zhilian(html_str):
    """Extract job detail URLs (and titles, for logging) from one listing
    fragment.  Returns the list of href strings — possibly empty.

    FIX: defined at module level; the original put this inside ``Jobparse``
    without ``self`` yet called it as a bare function, a NameError.
    """
    tree = lxml.html.fromstring(html_str)
    job_urls = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_names = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')
    print(job_urls)
    print(job_names)
    return job_urls


class Jobparse():
    """Backward-compatible shim: exposes the module-level helpers as
    static methods so existing ``Jobparse``-based callers keep working."""
    parse_div_content = staticmethod(parse_div_content)
    parse_lxml_zhilian = staticmethod(parse_lxml_zhilian)


# Global counter used to balance page-turning speed against download speed.
count = 0


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        # meta['page'] == '0' tells the Selenium middleware to load this URL
        # in the browser.
        url_str = "https://sou.zhaopin.com/?pageSize=60&jl=489&kw=python&kt=3"
        yield Request(url_str, callback=self.parse, dont_filter=True,
                      meta={'page': '0'})

    def parse(self, response):
        rs = response.css('div.contentpile_content_wrapper:nth-child(2)').extract()
        page_next = response.css('.soupager').extract()
        # Assume 60 items per page: every page turn adds 60 pending downloads.
        global count
        count += 60
        for r in rs:
            # FIX: parse_lxml_zhilian returns a *list* of hrefs; the original
            # passed the whole list as Request(url=...).  Yield one Request
            # per extracted URL instead.
            for job_url in parse_lxml_zhilian(r):
                yield Request(url=job_url, callback=self.parse_detail,
                              meta={'page': '3'}, dont_filter=True)
        if len(page_next) > 0:
            # When more than 300 items are pending, pause the pagination so
            # the detail downloads can catch up.
            if count > 300:
                time.sleep(0.5)
            # meta['page'] == '2': the Selenium middleware clicks "next page"
            # in the browser — this request triggers no real download.
            yield Request(url=response.url, callback=self.parse,
                          meta={'page': '2'}, dont_filter=True)

    def parse_detail(self, response):
        # Every downloaded detail page decrements the pending counter.
        # (FIX: removed the dead `pass` the original left before this.)
        global count
        count -= 1

# Browser-driven and normal requests coexist: as long as the middleware does
# not return a response, the request falls through to the regular downloader.
# The counter above keeps pagination speed and download speed in sync.