
scrapy-boss直聘


  Hi everyone. It has been a while since my last Scrapy crawling example. Two days ago a colleague mentioned that sites like Lagou and Boss直聘 are fairly hard to crawl, so yesterday afternoon I started crawling the "python爬蟲" (Python crawler) listings on Boss直聘, and it turned out to be much simpler than I expected.

Problems to solve:

  Most of the information on Boss直聘 is served by static loading, with a small amount loaded dynamically.

  1. Static loading: the company details and the job description (figure 1_1)

  2. Dynamic loading: the search box on the home page, used to search for "python爬蟲" (figure 1_2)

Approach:

  1. Static loading: regular crawling (easy)

  2. Dynamic loading: Selenium (easy); a minimal sketch follows the figures below

[Figure 1_1: company details and job description page]

[Figure 1_2: home-page search box, searching for "python爬蟲"]
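If point 2 above sounds vague, here is a minimal sketch (my own condensed version; the full implementation is in spider/boss.py, section 3_5): drive the real search box with Selenium, wait for the page to settle, then hand the resulting URL back to Scrapy as an ordinary request.

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://www.zhipin.com/')
time.sleep(3)

driver.find_element_by_name('query').send_keys(u'python爬蟲')   # type the keyword into the search box
driver.find_element_by_class_name('btn-search').click()         # trigger the search
time.sleep(3)

search_url = driver.current_url    # URL after the redirect; feed this to scrapy.Request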

As usual, here is a screenshot of the crawl results; feel free to try it yourselves:

[Figure: sample of the crawl results]

(3) Now for the main part

3_1. The fields to extract: items.py

import scrapy

class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    #pass
    job_title = scrapy.Field()
    salary    = scrapy.Field()
    address   = scrapy.Field()
    job_time  = scrapy.Field()
    education    = scrapy.Field()
    company      = scrapy.Field()
    company_info = scrapy.Field()
    detail_text  = scrapy.Field()

3_2. Setting a proxy: middlewares.py

from scrapy import signals   # needed for the spider_opened signal below


class BossSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # route every request through the fixed proxy
        print('http://10.240.252.16:911')
        request.meta['proxy'] = 'http://10.240.252.16:911'

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BossDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
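Note that the USER_AGENT pool defined later in settings.py (section 3_4) is never actually rotated by the code above. If you want a random User-Agent per request, a small downloader middleware along the following lines should do it; RandomUserAgentMiddleware is my own addition here, not part of the original project.

import random

from Boss.settings import USER_AGENT   # the UA pool from section 3_4


class RandomUserAgentMiddleware(object):
    # Hypothetical helper: pick a random User-Agent from the pool
    # for every outgoing request.
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(list(USER_AGENT))

It would also need to be registered in DOWNLOADER_MIDDLEWARES, as shown at the end of section 3_4.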

3_3. Saving the data (to MongoDB): pipelines.py

import scrapy
import pymongo
from scrapy.item import Item

class BossPipeline(object):
    def process_item(self, item, spider):
        return item

class MongoDBPipeline(object):    # store scraped items in MongoDB
    @classmethod
    def from_crawler(cls, crawler):
        cls.DB_URL = crawler.settings.get("MONGO_DB_URL", 'mongodb://localhost:27017/')
        cls.DB_NAME = crawler.settings.get("MONGO_DB_NAME", 'scrapy_data')
        return cls()

    def open_spider(self,spider):
        self.client = pymongo.MongoClient(self.DB_URL)
        self.db     = self.client[self.DB_NAME]

    def close_spider(self,spider):
        self.client.close()

    def process_item(self,item,spider):
        collection = self.db[spider.name]
        post = dict(item) if isinstance(item,Item) else item
        collection.insert_one(post)   # insert() is removed in pymongo 4; insert_one() stores a single document

        return item
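To check that the pipeline is really writing documents, a quick look from a Python shell works (assuming the defaults used here: a local MongoDB, database boss_detail, and a collection named after the spider, i.e. boss):

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['boss_detail']          # MONGO_DB_NAME from settings.py
collection = db['boss']             # the collection is named after the spider

print(collection.find_one())        # one stored job posting
for doc in collection.find().limit(3):
    print(doc['job_title'], doc['salary'])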

3_4.settings.py

MONGO_DB_URL = 'mongodb://localhost:27017/'
MONGO_DB_NAME = 'boss_detail'

USER_AGENT = {       # pool of browser User-Agent strings
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
}

FEED_EXPORT_FIELDS = ['job_title', 'salary', 'address', 'job_time', 'education', 'company', 'company_info']

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    #'Boss.middlewares.BossDownloaderMiddleware': 543,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 543,
    'Boss.middlewares.BossSpiderMiddleware': 123,
}

ITEM_PIPELINES = {
    'Boss.pipelines.MongoDBPipeline': 300,
}
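FEED_EXPORT_FIELDS only takes effect when you export through a feed, e.g. scrapy crawl boss -o boss.csv; it fixes the column order of the exported file. And if you add the RandomUserAgentMiddleware sketched in section 3_2 (again, my own addition, not part of the original code), it would be registered here as well:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 543,
    'Boss.middlewares.BossSpiderMiddleware': 123,
    'Boss.middlewares.RandomUserAgentMiddleware': 400,   # hypothetical UA rotator from section 3_2
}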

3_5.spider/boss.py

#-*- coding:utf-8 -*-
import time
from selenium import webdriver
import pdb
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys    import Keys
from lxml import etree
import re
from bs4 import BeautifulSoup
import scrapy
from Boss.items import BossItem
from Boss.settings import USER_AGENT
from scrapy.linkextractors import LinkExtractor

chrome_options = Options()
driver = webdriver.Chrome()

class BossSpider(scrapy.Spider):
    name = 'boss'
    allowed_domains = ['www.zhipin.com']
    start_urls = ['http://www.zhipin.com/']

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '11',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.zhipin.com',
        'Origin': 'www.zhipin.com',
        'Referer': 'http://www.zhipin.com/',
        'User-Agent': USER_AGENT,
        'X-Requested-With': 'XMLHttpRequest',
    }

    def start_requests(self):
        driver.get(
            self.start_urls[0]
            )
        time.sleep(3)

        # search for "python爬蟲"
        driver.find_element_by_name('query').send_keys(u'python爬蟲')
        time.sleep(3)
        driver.find_element_by_class_name('btn-search').click()
        time.sleep(3)

        new_url = driver.current_url.encode('utf8')   # URL after the redirect triggered by the search
        yield scrapy.Request(new_url)

    def parse(self, response):
        # extract the links to the individual job detail pages
        links = LinkExtractor(restrict_css="div.info-primary>h3>a")
        link = links.extract_links(response)
        for each_link in link:
            yield scrapy.Request(each_link.url,callback=self.job_detail)


    def job_detail(self, response):
        spiderItem = BossItem()
        # the fields we want to extract
        spiderItem['job_title']    = response.css('div.job-primary.detail-box div.name h1::text').extract()[0]
        spiderItem['salary']       = response.css('div.job-primary.detail-box span.badge ::text').extract()[0]
        spiderItem['address']      = response.css('div.job-primary.detail-box p::text').extract()[0]
        spiderItem['job_time']     = response.css('div.job-primary.detail-box p::text').extract()[1]
        spiderItem['education']    = response.css('div.job-primary.detail-box p::text').extract()[2]
        spiderItem['company']      = response.css('div.job-primary.detail-box div.info-company h3.name a::text').extract()[0]
        spiderItem['company_info'] = response.css('div.job-primary.detail-box div.info-company>p::text').extract()[0]

        detail = response.css('div.job-sec div.text ::text').extract()
        details = ''.join(detail)      # join the list of strings into one string
        spiderItem['detail_text'] = details

        print(spiderItem)
        yield spiderItem
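One thing the spider never does is close the Chrome window it opened at module level. Scrapy calls a spider's closed() method (if defined) when the crawl finishes, so a small addition like this would tidy that up; it is my suggestion, not part of the original code:

    def closed(self, reason):
        # Called by Scrapy when the crawl ends; shut down the
        # Selenium-driven Chrome instance so it doesn't linger.
        driver.quit()

Run the whole thing with scrapy crawl boss from the project root.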
