
前程無憂


# -*- coding: utf-8 -*-
import scrapy
import re
from zhaopin_project.items import LagouItem

class QianchengwuyouSpider(scrapy.Spider):
    name = 'qianchengwuyou'
    allowed_domains = ['51job.com']
    start_urls = ['http://51job.com/']

    def parse(self, response):
        for i in range(1,1620):
            # Listing page URL for page i; each page is handed off to parse_detail
            base_url = 'https://search.51job.com/list/010000,000000,0000,32,9,99,%2B,2,{}.html'.format(i)
            yield scrapy.Request(base_url, callback=self.parse_detail)

    def parse_detail(self, response):
        # Links to the individual job detail pages on this listing page
        html_str = response.xpath('//div[@class="el"]/p/span/a/@href').extract()
        for html_list in html_str:
            yield scrapy.Request(html_list, callback=self.parse_list)

    def parse_list(self, response):
        try:
            # Job title
            title = response.xpath('//div[@class="cn"]/h1/text()').extract_first()
            # Monthly salary
            salary = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
            # Location, experience, education and date are packed into the title
            # attribute of the "msg ltype" paragraph, separated by "&nbsp;&nbsp;|&nbsp;&nbsp;"
            p = re.findall(r'<p class="msg ltype" title="(.*)">', response.text)[0]
            ss = p.split('&nbsp;&nbsp;|&nbsp;&nbsp;')
            # Location
            position = ss[0]
            # Experience requirement
            jingyan = ss[1]
            # Education requirement (only four segments when the posting omits it)
            if len(ss) == 4:
                xueli = '學歷不限'
            else:
                xueli = ss[2]
            # Publication date
            shijian = ss[-1]
            # Source site
            fabu = '前程無憂'
            # Job description
            job_bt = response.xpath('//div[@class="tBorderTop_box"]/div/p/text()').extract()
            job_bt = ''.join(job_bt)

            item = LagouItem()
            item['title'] = title
            item['salary'] = salary
            item['position'] = position
            item['jingyan'] = jingyan
            item['xueli'] = xueli
            item['shijian'] = shijian
            item['fabu'] = fabu
            item['job_bt'] = job_bt
            yield item
        except Exception:
            # Skip detail pages whose layout does not match the XPaths above
            pass
