
Scraping Job Listings from a Recruitment Site (Liepin)

It's job-hunting season again; as the saying goes, "Golden September, Silver October." Going through job postings one by one is tedious, so I figured: why not scrape all the listings I care about and then query everything with a single SQL statement? Much more convenient, right?

Note:
If start_urls contains only one URL, the spider will fetch at most 40 records (and some of them will be duplicates).
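One way around this is to pre-build one start_urls entry per result page. The sketch below is not from the original post: it relies on the pn{n}/ path segment described in the spider comments further down, and SEARCH_QUERY is a stand-in for the full query string used in the spider's start_urls.

# Hypothetical helper: build start_urls for the first few result pages.
# SEARCH_QUERY is a stand-in; in practice, reuse the full query string
# from the spider's start_urls below.
SEARCH_QUERY = 'dqs=020&salary=&isAnalysis=true&searchType=1&d_pageSize=40&key=java'

def build_start_urls(pages=5):
    urls = ['https://www.liepin.com/sh/zhaopin/?' + SEARCH_QUERY]  # page 1 has no pn segment
    for n in range(1, pages):  # pages 2..N use pn1/, pn2/, ...
        urls.append('https://www.liepin.com/sh/zhaopin/pn{}/?{}'.format(n, SEARCH_QUERY))
    return urls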

Spider module:
This is the job search itself. I originally meant to search for Python / crawler positions, but somewhere along the way it turned into Java. I really can't let go of my mother tongue~

import scrapy
from liepinSpider.items import LiepinspiderItem

class LisPinSpider(scrapy.Spider):
    name = 'liepin'
    allowed_domains = ['www.liepin.com']
    start_urls = [
        'https://www.liepin.com/sh/zhaopin/?dqs=020&salary=&isAnalysis=true&init=1&searchType=1&fromSearchBtn=1&jobTitles=&industries=&industryType=&d_headId=89d222c119810d9835c864b9842ca41a&d_ckId=89d222c119810d9835c864b9842ca41a&d_sfrom=search_city&d_curPage=0&d_pageSize=40&siTag=&key=java'
    ]
    # Page 2 of the results is the same URL with pn1/ inserted after /zhaopin/
    # (pn2/ for page 3, and so on).
    # A gripe about Liepin's pagination: after searching with a keyword and a
    # region, turning the page drops both, so the URL has to be re-assembled by
    # hand. Worse, someone without a technical background has no way to reach
    # page 2 at all.
    # To make this more flexible, read the keyword from input() and splice it
    # into the `key` parameter (Chinese keywords need to be URL-encoded), e.g.:
    # https://www.liepin.com/sh/zhaopin/pn1/?dqs=&salary=&isAnalysis=true&init=1&searchType=1&fromSearchBtn=1&jobTitles=&industries=&industryType=&d_headId=89d222c119810d9835c864b9842ca41a&d_ckId=89d222c119810d9835c864b9842ca41a&d_sfrom=search_city&d_curPage=0&d_pageSize=40&siTag=&key=java

    def parse(self, response):
        job_list = response.css('.sojob-list li')
        for li in job_list:
            html_url = li.css('.job-name a::attr(href)').extract_first()
            yield scrapy.Request(html_url, callback=self.content)
        # The request for the next page can be issued here:
        # yield scrapy.Request('<assembled next-page url>', callback=self.parse)

    def content(self, response):
        item = LiepinspiderItem()
        # URL of the detail page currently being parsed
        html_url = response.url
        title = response.css('.title-info h1::text').extract_first()
        company = response.css('.title-info h3 a::text').extract_first()
        money = response.css('.job-item-title::text').extract_first()
        address = response.css('.basic-infor a::text').extract_first()
        times = response.css('.basic-infor time::attr(title)').extract_first()
        job_query_list = response.css('.job-qualifications span::text').extract()
        job_query = ''
        for job_querys in job_query_list:
            job_query += job_querys + ','
        tag_list = response.css('.tag-list span::text').extract()
        tags = ''
        for tag_span in tag_list:
            tags += tag_span + ','
        job_contents = response.css('.job-description div::text').extract()
        job_content = ''
        for job in job_contents:
            job_content += job.replace('\r\n', '')
        # Don't forget to declare these fields in items.py!
        # detail page url
        item['html_url'] = html_url
        # job title
        item['title'] = title
        # company name
        item['company'] = company
        # salary
        item['money'] = money.strip()
        # company address (district level only; the full address is on the page
        # if you want to grab it as well)
        item['address'] = address
        # publish time
        item['times'] = times
        # abbreviated job requirements
        item['job_query'] = job_query
        # benefits / perks
        item['tags'] = tags
        # detailed responsibilities and requirements
        item['job_content'] = job_content.strip()
        yield item
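The spider imports LiepinspiderItem and fills nine fields, but the original post doesn't show items.py. A minimal sketch, assuming exactly the field names the spider uses:

# liepinSpider/items.py -- minimal sketch matching the fields filled by the spider
import scrapy

class LiepinspiderItem(scrapy.Item):
    html_url = scrapy.Field()     # detail page url
    title = scrapy.Field()        # job title
    company = scrapy.Field()      # company name
    money = scrapy.Field()        # salary
    address = scrapy.Field()      # district-level address
    times = scrapy.Field()        # publish time
    job_query = scrapy.Field()    # abbreviated job requirements
    tags = scrapy.Field()         # benefits / perks
    job_content = scrapy.Field()  # full job description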

Pipelines module:
Don't forget to enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'liepinSpider.pipelines.LiepinspiderPipeline': 1,
}
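Two more settings.py lines that are my own assumption, not something from the original post: many job sites disallow crawling in robots.txt and block Scrapy's default user agent, so the spider may come back empty until they are set.

# settings.py -- assumed extras, adjust or drop as needed
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'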

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from scrapy.exceptions import DropItem
# Helper: open a new connection to the local MySQL database
def dbHandle():
    conn = pymysql.connect(
        host='localhost',
        user='root',
        passwd='Cs123456.',
        charset='utf8',
        db='liepin',
        use_unicode=False
    )
    return conn


class LiepinspiderPipeline(object):

    def process_item(self, item, spider):
        # connect to the database
        db = dbHandle()
        # open a cursor
        cursor = db.cursor()
        # build the INSERT statement via string formatting
        sql = 'insert into liepin_list (url, title, company, money, address, times, job_query, tags, job_content)  ' \
              'value ("{html_url}", "{title}", "{company}", "{money}", "{address}", "{times}", "{job_query}", "{tags}", "{job_content}");'.format(
            **item)
        try:
            # only insert if this detail-page url has not been stored yet
            if self.db_distinct(item['html_url']):
                try:
                    cursor.execute(sql)
                    db.commit()
                except:
                    # roll back the failed insert and drop the item
                    db.rollback()
                    raise DropItem('SQL execution error')
            else:
                raise DropItem('record already exists')
        finally:
            cursor.close()
        return item

    # use the detail-page url to check whether this posting has already been stored
    def db_distinct(self, html_url):
        db = dbHandle()
        cursor = db.cursor()
        sql = 'select * from liepin_list where url ="{}"'.format(html_url)

        cursor.execute(sql)
        data = cursor.fetchone()
        cursor.close()
        # True means "not stored yet, go ahead and insert"
        return data is None
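One caveat with the string-formatted INSERT above: it breaks as soon as a scraped field contains a double quote, and it is open to SQL injection. A hedged alternative sketch of the same insert using pymysql's parameter binding (same table, same columns, only the way values are passed changes):

# Sketch: let pymysql escape the values instead of building the SQL by hand
sql = ('insert into liepin_list '
       '(url, title, company, money, address, times, job_query, tags, job_content) '
       'values (%s, %s, %s, %s, %s, %s, %s, %s, %s)')
cursor.execute(sql, (
    item['html_url'], item['title'], item['company'], item['money'],
    item['address'], item['times'], item['job_query'], item['tags'],
    item['job_content'],
))
db.commit()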

Table schema: (screenshot from the original post, not reproduced here)
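Since the screenshot isn't reproduced here, below is a minimal sketch of a table definition that matches the columns the pipeline inserts; the column types and lengths are my assumptions, so adjust them to your data.

# Sketch: create the liepin_list table used by the pipeline (types/lengths assumed)
import pymysql

CREATE_SQL = """
create table if not exists liepin_list (
    id          int primary key auto_increment,
    url         varchar(255),
    title       varchar(255),
    company     varchar(255),
    money       varchar(64),
    address     varchar(255),
    times       varchar(64),
    job_query   varchar(255),
    tags        varchar(255),
    job_content text
) default charset=utf8;
"""

conn = pymysql.connect(host='localhost', user='root', passwd='Cs123456.',
                       db='liepin', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(CREATE_SQL)
conn.commit()
conn.close()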

A sample of the scraped data, queried back out: (screenshot from the original post, not reproduced here)
And that's it, all done. Thanks for reading.