利用BeautifulSoup和Xpath爬取趕集網北京二手房房價資訊

文章開始把我喜歡的這句話送個大家:這個世界上還有什麼比自己寫的程式碼執行在一億人的電腦上更酷的事情嗎,如果有那就是讓這個數字再擴大十倍!

1.BeautifulSoup實現

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 17:41:06 2018
Beautiful Soup爬取
@author: Macbook
"""

import requests
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import csv
import time

# Desktop Chrome User-Agent header so the site serves the regular HTML page.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def get_one_page(url):
    """Fetch one listing page and return its HTML text, or None on failure.

    Any network-level error (RequestException) and any non-200 status both
    map to None so the caller only has to handle one failure value.
    """
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None

def parse_one_page(content):
    """Parse one listing page with BeautifulSoup; yield one dict per listing.

    Some listings lack a field (e.g. no decorate or floor info); those are
    skipped individually instead of aborting the whole page, which is what
    the original broad ``except`` silently did.

    :param content: HTML text of the page, or None (yields nothing).
    """
    if content is None:
        return
    soup = BeautifulSoup(content, 'html.parser')
    listing_root = soup.find('div', class_=re.compile('js-tips-list'))
    if listing_root is None:
        # Page layout changed or we got a captcha page — nothing to parse.
        return
    for div in listing_root.find_all('div', class_=re.compile('ershoufang-list')):
        try:
            # .contents lists a tag's children including whitespace text
            # nodes, hence the odd-looking indices 1, 5, 9, 13, 17.
            size = div.find('dd', class_=re.compile('size')).contents
            yield {
                'Name': div.find('a', class_=re.compile('js-title')).text,
                'Type': size[1].text,
                'Area': size[5].text,
                'Towards': size[9].text,
                'Floor': size[13].text.replace('\n', ''),
                'Decorate': size[17].text,
                'Address': div.find('span', class_=re.compile('area')).text.strip().replace(' ', '').replace('\n', ''),
                'TotalPrice': div.find('span', class_=re.compile('js-price')).text
                              + div.find('span', class_=re.compile('yue')).text,
                'Price': div.find('div', class_=re.compile('time')).text,
            }
        except (AttributeError, IndexError):
            # A field is missing for this listing — skip just this one.
            continue

def main():
    """Crawl listing pages 1-499 and append every listing to Data.csv.

    Fixes over the original: the page is parsed once (not twice — once for
    printing and once for writing), and the CSV header is written a single
    time instead of before every page, which corrupted the output file.
    """
    fieldnames = ['Name', 'Type', 'Area', 'Towards', 'Floor', 'Decorate',
                  'Address', 'TotalPrice', 'Price']
    # NOTE(review): 'gbk' is kept from the original for Excel-on-Windows
    # compatibility, but it cannot encode every character; consider
    # 'utf-8-sig' if writes start failing.
    with open('Data.csv', 'a', encoding='gbk', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(1, 500):
            url = 'http://bj.ganji.com/fang5/o{}/'.format(i)
            content = get_one_page(url)
            print('第{}頁抓取完畢'.format(i))
            for item in parse_one_page(content):
                print(item)
                writer.writerow(item)
            time.sleep(2)  # throttle: crawling too fast triggers a captcha

# Run the BeautifulSoup crawler only when executed as a script.
if __name__=='__main__':
    main()

2.Xpath實現

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 15:37:50 2018
爬取趕集網北京二手房資料
主要練習應用Xpath判斷某些元素是否存在,以防某些元素不存在導致爬取中斷
@author: Macbook
"""
import requests
from lxml import etree
from requests.exceptions import RequestException
import multiprocessing
import time

# Desktop Chrome User-Agent header. The original string was missing spaces
# ('Mozilla/5.0(Windows NT 10.0;WOW64)...'), which malforms the UA and can
# get the request rejected; this matches the correct UA used in part 1.
headers = {
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def get_one_page(url):
    """Request one listing page; return the HTML body or None on failure.

    Network errors and non-200 responses are collapsed into a single None
    result so callers need only one check.
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except RequestException:
        pass
    return None
        
def parse_one_page(content):
    """Parse one listing page with XPath; yield one dict per listing.

    The module docstring says the goal is to survive listings with missing
    elements, but the original had no guard: an empty XPath result raised
    IndexError, which the broad ``except`` swallowed, silently dropping the
    rest of the page. Here each listing is guarded individually and only
    the incomplete one is skipped.

    :param content: HTML text of the page, or None (yields nothing).
    """
    if content is None:
        return
    selector = etree.HTML(content)
    if selector is None:
        return
    rows = selector.xpath('//*[@id="f_mew_list"]/div[6]/div[1]/div[3]/div[1]/div')
    for div in rows:
        try:
            # The address is split over several text nodes; stitch together
            # the district, estate name and street fragments.
            addr = div.xpath('dl/dd[3]//text()')
            yield {
                'Name': div.xpath('dl/dd[1]/a/text()')[0],
                'Type': div.xpath('dl/dd[2]/span[1]/text()')[0],
                'Area': div.xpath('dl/dd[2]/span[3]/text()')[0],
                'Towards': div.xpath('dl/dd[2]/span[5]/text()')[0],
                'Floor': div.xpath('dl/dd[2]/span[7]/text()')[0].strip().replace('\n', ""),
                'Decorate': div.xpath('dl/dd[2]/span[9]/text()')[0],
                'Address': addr[1] + addr[3].replace('\n', '') + addr[4].strip(),
                'TotalPrice': div.xpath('dl/dd[5]/div[1]/span[1]/text()')[0]
                              + div.xpath('dl/dd[5]/div[1]/span[2]/text()')[0],
                'Price': div.xpath('dl/dd[5]/div[2]/text()')[0],
            }
        except IndexError:
            # This listing is missing a field — skip it, keep the page.
            continue
        
def main():
    """Crawl listing pages 1-499 and print every parsed listing.

    Fixes over the original: the host was misspelled as 'bj.ganji/com'
    (slash instead of dot), so every request failed; and the crawl is now
    throttled like the BeautifulSoup version ('time' was imported but
    never used).
    """
    for page in range(1, 500):
        url = "http://bj.ganji.com/fang5/o{}/".format(page)
        content = get_one_page(url)
        print('第{}頁抓取完畢'.format(page))
        for item in parse_one_page(content):
            print(item)
        time.sleep(2)  # throttle: crawling too fast triggers a captcha
                        
# Run the XPath crawler only when executed as a script.
if __name__ == '__main__':
        main()
        
        
                
        
                

加油吧,程式設計師!