scrapy爬取前程無憂51job網職位資訊並存儲到資料庫
阿新 • • 發佈:2018-12-17
- spiders中程式碼如下
import re

import scrapy
from scrapy import Request

from QianCheng.items import QianchengItem


class ExampleSpider(scrapy.Spider):
    """Spider that collects job postings from 51job, starting at the Zhengzhou page.

    Every request carries a ``meta['page']`` tag ('0', '1' or '2').  According
    to the original author's note, the tag is recognised in a downloader
    middleware, which performs the page-turning for requests tagged '2'.
    NOTE(review): the meaning of '0' and '1' is not visible here -- presumably
    "start page" and "job detail page"; confirm against the middleware.
    """

    name = '51job'

    def start_requests(self):
        """Yield the single entry request for the Zhengzhou landing page."""
        url_str = 'https://www.51job.com/zhengzhou/'
        yield Request(
            url=url_str,
            callback=self.parse,
            dont_filter=True,
            meta={'page': '0'},
        )

    def parse(self, response):
        """Follow every job link on a listing page, then request the next page.

        Yields one detail request (handled by ``parse_dail``) per link found in
        the ``div.el`` rows, and -- only when the response URL contains
        "search" -- one more request for the same URL tagged as page '2'.
        """
        for row in response.xpath('//div[@class = "el"]'):
            for link in row.xpath('p/span[1]/a[@href]/@href').extract():
                yield Request(
                    url=link,
                    callback=self.parse_dail,
                    meta={'page': '1'},
                )

        if re.search(r'search', response.url):
            # Tag the request with page '2'; the middleware recognises the tag
            # and performs the page turn.
            yield Request(
                url=response.url,
                callback=self.parse,
                meta={'page': '2'},
            )

    def parse_dail(self, response):
        """Build one ``QianchengItem`` from a job detail page and yield it."""

        def joined(query):
            # Concatenate every match of the XPath query into a single string
            # (an empty string when nothing matches).
            return ''.join(response.xpath(query).extract())

        posting = QianchengItem()
        posting['job_name'] = joined('//h1[@title]/@title')
        posting['company'] = joined('//p[@class="cname"]/a[@title]/@title')
        posting['saray'] = joined('//div[@class="cn"]/strong/text()')
        posting['company_desc'] = joined('//div[@class="tmsg inbox"]/text()').strip()
        yield posting
- scrapy.items
import scrapy
class QianchengItem(scrapy.Item):
    """Container for the fields scraped from one job posting."""

    # Title of the job posting.
    job_name = scrapy.Field()

    # Name of the hiring company.
    company = scrapy.Field()

    # NOTE(review): presumably the salary text; the spelling "saray" is kept
    # because the name is used as an item key elsewhere in the project.
    saray = scrapy.Field()

    # Free-text description block from the posting.
    company_desc = scrapy.Field()
- scrapy.pipelines獲取資料並進行儲存操作
import sqlite3


class QianchengPipeline(object):
    """Item pipeline that stores every scraped job posting in a SQLite table.

    The table ``zhaopin`` is created on start-up if it does not exist yet and
    receives one row per item, committed immediately.
    """

    def __init__(self, db_path="qiancheng.db"):
        """Open the database and make sure the ``zhaopin`` table exists.

        Args:
            db_path: Path of the SQLite database file.  Defaults to
                ``qiancheng.db`` in the working directory, which is what the
                pipeline used before the path became configurable.
        """
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            "create table IF NOT EXISTS zhaopin("
            "job_name varchar(200),"
            "company varchar(500),"
            "saray varchar(100),"
            "company_desc varchar(100))"
        )

    def process_item(self, item, spider):
        """Insert one item into ``zhaopin`` and pass it on unchanged.

        Args:
            item: Mapping with the keys ``job_name``, ``company``, ``saray``
                and ``company_desc``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Bind the values through placeholders instead of formatting them into
        # the SQL text: scraped strings routinely contain quotes, which would
        # otherwise break the statement or allow SQL injection.
        self.cursor.execute(
            "insert into zhaopin values(?,?,?,?)",
            (
                item["job_name"],
                item["company"],
                item["saray"],
                item["company_desc"],
            ),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the database connection when the spider finishes."""
        self.conn.close()
- scrapy.settings中:
# Project identity and the package in which spiders live / are generated.
BOT_NAME = 'QianCheng'
SPIDER_MODULES = ['QianCheng.spiders']
NEWSPIDER_MODULE = 'QianCheng.spiders'

# robots.txt rules are not honoured by this crawl.
ROBOTSTXT_OBEY = False

# Downloader middleware enabled for the project, with its order value.
DOWNLOADER_MIDDLEWARES = {
    'QianCheng.middlewares.SeleniumMiddlewares': 543,
}

# Item pipeline enabled for the project, with its order value.
ITEM_PIPELINES = {
    'QianCheng.pipelines.QianchengPipeline': 300,
}
- 儲存結果如下: