
Scrapy: simulating a browser to paginate through Zhilian job listings

On Zhilian, the page number shown in the pager does not map directly onto the URL, so blindly concatenating page numbers into URLs produces wrong results. Instead, we can drive a real browser to turn the pages.
To simulate a browser you need to understand how requests flow through Scrapy; a simplified diagram:
[Scrapy architecture diagram: engine, scheduler, downloader middlewares, spiders]
What follows is only rough pseudocode; for the data-cleaning part involved, see the separate post on Scrapy data cleaning.
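The post does not show it, but the middleware below only takes effect once it is registered in the project's settings.py. A minimal sketch, assuming the Scrapy project module is named zhilian (adjust the dotted path to your own project):

# settings.py -- enable the selenium downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'zhilian.middlewares.SeleniumMiddleware': 543,  # priority between Scrapy's built-ins
}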

middlewares.py

import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        self.browser = webdriver.Chrome(options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 2:
            # Scroll to the bottom so the pager is rendered, then click "next page"
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(1)
            div = self.browser.find_element_by_css_selector('.soupager')
            next_page = div.find_elements_by_tag_name('button')
            # Two alternatives, pick one:
            # next_page[1].click()
            for button in next_page:
                if button.text == "下一頁":
                    button.click()
                    break
            time.sleep(1)
            # Hand the freshly loaded listing back to Scrapy instead of re-downloading
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding='utf-8', request=request)
        elif int(request.meta['page']) == 0:
            try:
                print('url is :::', request.url)
                self.browser.get(request.url)
            except TimeoutException:
                print('page load timed out')
            time.sleep(5)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding='utf-8', request=request)
        # For any other page flag (e.g. detail pages with page == 3) nothing is
        # returned, so the request falls through to Scrapy's normal downloader.


#Sometimes during crawling the page never finishes loading (the little spinner in the
#corner keeps turning); stopping the load makes the already-rendered content available.
#browser.execute_script('window.stop()') does exactly that.
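A minimal sketch of that trick, combining a page-load timeout with window.stop(); the 10-second timeout is an assumed value to tune per site:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

browser = webdriver.Chrome()
browser.set_page_load_timeout(10)  # assumed value; adjust for the target site
try:
    browser.get('https://sou.zhaopin.com/?pageSize=60&jl=489&kw=python&kt=3')
except TimeoutException:
    # Kill the spinner; whatever has rendered so far stays available
    browser.execute_script('window.stop()')
html = browser.page_source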

spiders.py

# -*- coding: utf-8 -*-
import time

import lxml.html

import scrapy
from scrapy import Request


class Jobparse():
    # Placeholder left as pseudocode in the original post
    def __init__(self):
        pass


def parse_div_content(html_str):
    # Placeholder for detail-page parsing (pseudocode)
    pass


def parse_lxml_zhilian(html_str):
    # Extract job detail URLs and titles from one listing block
    tree = lxml.html.fromstring(html_str)
    job_url = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_name = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')

    print(job_url)
    print(job_name)
    return job_url

# Global counter used to balance pagination speed against download speed
count = 0


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        url_str = "https://sou.zhaopin.com/?pageSize=60&jl=489&kw=python&kt=3"
        # page 0 tells the middleware to load this URL in the browser
        yield Request(url_str, callback=self.parse, dont_filter=True, meta={'page': '0'})

    def parse(self, response):
        rs = response.css('div.contentpile__content__wrapper:nth-child(2)').extract()
        page_next = response.css('.soupager').extract()
        # Assume 60 items per page; each page turn adds 60 pending items
        global count
        count += 60
        for r in rs:
            # parse_lxml_zhilian returns a list of detail URLs
            for job_url in parse_lxml_zhilian(r):
                yield Request(url=job_url, callback=self.parse_detail,
                              meta={'page': '3'}, dont_filter=True)

        if len(page_next) > 0:
            # If more than 300 items are pending, pause paging so downloads catch up
            if count > 300:
                time.sleep(0.5)
            # page 2 makes the selenium middleware click "next page";
            # this request does not trigger a real download
            yield Request(url=response.url, callback=self.parse,
                          meta={'page': '2'}, dont_filter=True)

    def parse_detail(self, response):
        # Each downloaded detail page decrements the pending counter
        global count
        count -= 1



#Mixing browser simulation with normal downloads: whenever the middleware does not
#return a response, the request falls through to the downloader automatically.
#Pagination speed and download speed are kept in sync via the global count variable.
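One loose end in the pseudocode above: the Chrome instance is never shut down. A minimal sketch of cleanup using Scrapy's from_crawler hook and the spider_closed signal (the signal wiring is standard Scrapy; adding it to this middleware is an assumption):

from scrapy import signals

class SeleniumMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware and quit the browser when the spider closes
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.browser.quit()  # release the Chrome process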