1. 程式人生 >> 44.scrapy爬取鏈家網站二手房資訊-2

44.scrapy爬取鏈家網站二手房資訊-2

全面採集二手房資料:

網站二手房總資料量為27650條,但有的引數欄位會出現一些問題,因為只給返回100頁資料,具體檢視就需要去細分請求url引數去請求網站資料。
我這裡大概的獲取了一下篩選條件引數,一些存在問題的引數也沒做細化處理,大致的採集資料量為21096,實際19794條。

看一下執行完成結果:

{'downloader/exception_count': 199,
'downloader/exception_type_count/twisted.internet.error.NoRouteError': 192,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 7,
'downloader/request_bytes': 9878800,
'downloader/request_count': 21096,
'downloader/request_method_count/GET': 21096,
'downloader/response_bytes': 677177525,
'downloader/response_count': 20897,
'downloader/response_status_count/200': 20832,
'downloader/response_status_count/301': 49,
'downloader/response_status_count/302': 11,
'downloader/response_status_count/404': 5,
'dupefilter/filtered': 53,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 11, 12, 8, 49, 42, 371235),
'httperror/response_ignored_count': 5,
'httperror/response_ignored_status_count/404': 5,
'log_count/DEBUG': 21098,
'log_count/ERROR': 298,
'log_count/INFO': 61,
'request_depth_max': 3,
'response_received_count': 20837,
'retry/count': 199,
'retry/reason_count/twisted.internet.error.NoRouteError': 192,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 7,
'scheduler/dequeued': 21096,
'scheduler/dequeued/memory': 21096,
'scheduler/enqueued': 21096,
'scheduler/enqueued/memory': 21096,
'spider_exceptions/TypeError': 298,
'start_time': datetime.datetime(2018, 11, 12, 7, 59, 52, 608383)}
2018-11-12 16:49:42 [scrapy.core.engine] INFO: Spider closed (finished)

採集資料如圖:

num = 296910/15=19794條

 

2. lianjia.py

# -*- coding: utf-8 -*-
import scrapy


class LianjiaSpider(scrapy.Spider):
    """Crawl second-hand housing listings from gz.lianjia.com.

    The site only serves 100 pages (30 items each, i.e. 3000 items) per
    query, so the crawl fans out over price/area filter combinations to
    keep every combination under that cap.

    Crawl flow:
        parse         -> enumerate price (p1..p7) / area (a1..a7) filters
        parse_detail  -> read a combination's hit count, emit page requests
        parse_detail2 -> collect listing-detail links from one result page
        parse_detail3 -> scrape one listing and append it to 'text2'
    """

    name = 'lianjia'
    allowed_domains = ['gz.lianjia.com']
    start_urls = ['https://gz.lianjia.com/ershoufang/pg1/']

    # (print label, 1-based <li> position) inside the listing page's
    # div.introContent > div.base block; consumed by parse_detail3.
    FIELDS = [
        ('房屋戶型', 1), ('所在樓層', 2), ('建築面積', 3), ('戶型結構', 4),
        ('套內面積', 5), ('建築型別', 6), ('房屋朝向', 7), ('建築結構', 8),
        ('裝修狀況', 9), ('梯戶比例', 10), ('配備電梯', 11), ('產權年限', 12),
    ]

    def parse(self, response):
        """Yield one request per price/area filter combination."""
        for i in range(1, 8):        # price filter p1..p7
            for j in range(1, 8):    # area filter a1..a7
                url = 'https://gz.lianjia.com/ershoufang/p{}a{}pg1'.format(i, j)
                yield scrapy.Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Read the total hit count for this filter combination and emit
        one request per result page (30 listings per page)."""
        counts = response.xpath(
            "//h2[@class='total fl']/span/text()").extract_first()
        if counts is None:
            # Layout change or anti-bot page: nothing to paginate.
            return
        counts = int(counts.strip())
        # Ceiling division. BUG FIX: the original guarded the whole
        # computation with `if counts % 30 > 0`, which left p_num
        # undefined — and yielded no pages at all — whenever the hit
        # count was an exact multiple of 30.
        p_num = (counts + 29) // 30
        for k in range(1, p_num + 1):
            link_url = response.url.split('pg')[0] + 'pg{}/'.format(k)
            yield scrapy.Request(url=link_url, callback=self.parse_detail2)

    def parse_detail2(self, response):
        """Follow every listing-detail link on a result page."""
        link_urls = response.xpath(
            "//div[@class='info clear']/div[@class='title']/a/@href").extract()
        for link_url in link_urls:
            yield scrapy.Request(url=link_url, callback=self.parse_detail3)

    def parse_detail3(self, response):
        """Scrape one listing page and append its fields to 'text2'.

        extract_first() returns None for any field absent from the page;
        defaulting to '' here fixes the `str + None` TypeError that
        aborted 298 items in the original run (see the crawl stats:
        spider_exceptions/TypeError: 298).
        """
        title = response.xpath(
            "//div[@class='title']/h1[@class='main']/text()"
        ).extract_first() or ''
        print('標題: ' + title)
        dist = response.xpath(
            "//div[@class='areaName']/span[@class='info']/a/text()"
        ).extract_first() or ''
        print('所在區域: ' + dist)

        contents = response.xpath(
            "//div[@class='introContent']/div[@class='base']")
        values = [title, dist]
        for label, pos in self.FIELDS:
            value = contents.xpath(
                "./div[@class='content']/ul/li[{}]/text()".format(pos)
            ).extract_first() or ''
            print(label + ': ' + value)
            values.append(value)

        with open('text2', 'a', encoding='utf-8') as f:
            f.write('\n'.join(values))
            f.write('\n' + '=' * 50 + '\n')
        print('-' * 100)
3.程式碼還需要細分的話,就多配置url的請求引數,縮小篩選範圍,獲取頁面就更精準,就能避免篩選到超過3000條的資料型別,可以再去細分。