1. 程式人生 > scrapy爬取中關村在線手機頻道

scrapy爬取中關村在線手機頻道

tex ice extract base .section title .html release nbsp

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from pyquery import PyQuery as pq
 4 
 5 from zolphone.items import ZolphoneItem
 6 
 7 
 8 class PhoneSpider(scrapy.Spider):
 9     name = "phone"
10     # allowed_domains = ["www.zol.com.cn"]
11     # start_url = ‘http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_1.html‘
12 start_url = http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_ 13 14 def start_requests(self): 15 16 for page in range(1, 209): 17 url = self.start_url + str(page) + .html 18 yield scrapy.Request(url,callback=self.parse_index) 19 20 21 def
parse_index(self, response): 22 base_url = http://detail.zol.com.cn 23 doc = pq(response.text) 24 lis = doc(.list-box .list-item).items() 25 for result in lis: 26 detail_url = base_url + result.find(.pro-intro h3 a).attr(href) 27 yield scrapy.Request(url=detail_url, callback=self.parse_detail)
28 29 def parse_detail(self,response): 30 doc = pq(response.text) 31 title1 = response.css(.page-title h1::text).extract_first() 32 title2 = doc(.page-title h2).text() 33 price = doc(.product-price .price-type).text() 34 release_time = doc(.section div h3 .showdate).text() 35 print(title1, title2, price, release_time) 36 item = ZolphoneItem() 37 item[title1] = title1 38 item[title2] = title2 39 item[price] = price 40 item[release_time] = release_time 41 42 yield item
 1 import scrapy
 2 
 3 
 4 class ZolphoneItem(scrapy.Item):
 5     # define the fields for your item here like:
 6     # name = scrapy.Field()
 7     title1 = scrapy.Field()
 8     title2 = scrapy.Field()
 9     price = scrapy.Field()
10     release_time = scrapy.Field()

scrapy爬取中關村在線手機頻道