1. 程式人生 > >使用scrapy框架+模擬瀏覽器方法實現爬取智聯的職位資訊

使用scrapy框架+模擬瀏覽器方法實現爬取智聯的職位資訊

由於智聯的頁面是由js動態載入的,一般的方法只能得到js載入前的頁面,為了得到載入過的頁面需要通過模擬瀏覽器來拿到完整的頁面.
下面的程式碼只是簡單的實現,爬取智聯頁面的部分功能,其他根據需要自己實現

中介軟體(middlewares.py)程式碼:


from scrapy.http import HtmlResponse
from selenium import webdriver
import time
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options


class SeleniumMiddleware(object):
    """Downloader middleware that renders JavaScript-driven pages in Firefox.

    The spider tags each request with ``meta['page']``:

    * ``0`` - load ``request.url`` in the browser and return the rendered page.
    * ``2`` - scroll the already-loaded page to the bottom, click the "next
      page" button and return the newly rendered page.
    * anything else (or no ``page`` key at all) - return ``None`` so the
      request is handled by Scrapy's regular downloader.
    """

    def __init__(self):
        self.options = Options()
        # Uncomment to run Firefox without a visible window.
        # self.options.add_argument('-headless')
        # A Chrome driver can be used instead by swapping in webdriver.Chrome
        # together with selenium.webdriver.chrome.options.Options.
        self.browser = webdriver.Firefox(executable_path=r"D:\python_others\Spider\code\day06\tools\geckodriver.exe",
                                         firefox_options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and make sure the browser is closed with the spider."""
        # Local import: only needed when Scrapy instantiates the middleware.
        from scrapy import signals

        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Quit the browser so no Firefox/geckodriver process is left behind."""
        self.browser.quit()

    def process_request(self, request, spider):
        """Serve browser-rendered HTML for paging requests.

        Args:
            request: the Scrapy request; ``request.meta['page']`` selects the action.
            spider: the spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser's current page for page
            values ``0`` and ``2``; ``None`` otherwise, which hands the request
            to the normal downloader.
        """
        try:
            # The spider stores the page marker as a string ("0", "2", "3"),
            # so normalise it to int before comparing.
            page = int(request.meta.get('page'))
        except (TypeError, ValueError):
            # No usable marker: not one of ours, let Scrapy download it.
            return None

        if page == 2:
            # Scroll to the bottom so the pager is rendered and clickable.
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(3)
            div = self.browser.find_element_by_css_selector(".soupager")
            # find_elements (plural) returns a list; the second button is "next page".
            buttons = div.find_elements_by_tag_name("button")
            buttons[1].click()
            # Give the next page's AJAX content time to load before reading it.
            time.sleep(5)
        elif page == 0:
            # Selenium raises its own TimeoutException, not the builtin TimeoutError.
            from selenium.common.exceptions import TimeoutException

            try:
                print("url is ::::", request.url)
                self.browser.get(request.url)
            except (TimeoutError, TimeoutException):
                print("超時")
            time.sleep(5)
        else:
            return None

        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8",
                            request=request)

# 在模擬瀏覽器過程中,如果還想讓downloader實現下載,只要中介軟體不return(即回傳None)就可以
# 出現頁面一直載入的情況時(瀏覽器顯示頁面一直在載入),只要叉掉載入過程的小圓圈,頁面就會載入出來
# 可以使用 browser.execute_script('window.stop()') 這個方法來停止載入

爬蟲檔案(spider.py)程式碼:

# -*- coding: utf-8 -*-
import time
import scrapy
import lxml.html
from scrapy import Request


class JobDes(object):
    """Plain record describing one job posting."""

    def __init__(self):
        # detail_url: link to the posting's detail page; title: posting title.
        # Both start out as empty strings and are filled in by the caller.
        self.detail_url, self.title = "", ""


def parse_lxml_zhilian(html_str):
    """Extract job links from a rendered Zhaopin result page (or fragment).

    Args:
        html_str: HTML markup as ``str`` or ``bytes``.

    Returns:
        A list with the ``href`` of every job anchor found, in document
        order; an empty list when ``html_str`` is empty or blank.
    """
    # lxml raises on an empty document, so treat blank input as "no jobs".
    if not html_str or not html_str.strip():
        return []

    tree = lxml.html.fromstring(html_str)
    job_url = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_name = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')

    print(job_url)
    print(job_name)

    # Hand the links back to the caller instead of only printing them.
    return job_url

# Global variable used to judge whether paging is running far ahead of the local download speed.
count = 0
class ZhaopinSpider(scrapy.Spider):
    """Crawl Zhaopin search results that are rendered by a browser middleware.

    Requests carry a ``meta['page']`` marker that the downloader middleware
    reads: ``"0"`` for the initial page load, ``"2"`` for "click next page",
    and ``"3"`` for job-detail pages fetched by the normal downloader.
    """

    name = 'zhaopin'

    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        """Yield the first search-results request (browser-rendered, page "0")."""
        url_str = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'
        yield Request(url=url_str, callback=self.parse, meta={"page": "0"})

    def parse(self, response):
        """Yield one request per job link found, then a request for the next page.

        Notes:
            * Paging through the AJAX-loaded results is done by the simulated browser.
            * The site's markup is not stable; be careful with positional CSS
              selectors such as ``nth-child(1)`` and debug against a simple page.
        """
        global count

        rs = response.css('#listContent > div:nth-child(1)')
        # Other selectors tried for the pager:
        #   pagination_content > div > button:nth-child(7), button.btn:nth-child(8)
        page_next = response.xpath('//*[@id="pagination_content"]/div/button[2]')
        print("rs is :::::", rs)
        print("page_next is :::::", page_next)

        for r in rs:
            # Read the links straight from the Scrapy selector; the search is
            # relative to this listing node.
            hrefs = r.xpath('.//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href').extract()
            for href in hrefs:
                # Track outstanding detail downloads (decremented in parse_detal).
                count += 1
                # urljoin copes with relative and protocol-relative links.
                yield Request(url=response.urljoin(href), callback=self.parse_detal, meta={"page": "3"},
                              dont_filter=True)

        if page_next:
            # Never sleep/busy-wait here: callbacks run in the reactor thread, so
            # blocking would also stop the downloads we are waiting for. A lower
            # priority lets queued detail requests go first instead.
            yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True,
                          priority=-1)

    def parse_detal(self, response=None):
        """Callback for a job-detail page; currently only updates the pending counter.

        Args:
            response: the detail-page response passed in by Scrapy.
        """
        global count
        count = max(count - 1, 0)