利用BeautifulSoup和Xpath爬取趕集網北京二手房房價資訊
阿新 • • 發佈:2018-12-30
利用BeautifulSoup和Xpath爬取趕集網北京二手房房價資訊
文章開始,把我喜歡的這句話送給大家:這個世界上還有什麼比自己寫的程式碼執行在一億人的電腦上更酷的事情嗎?如果有,那就是讓這個數字再擴大十倍!
1.BeautifulSoup實現
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape ganji.com Beijing second-hand housing listings with Beautiful Soup.

Created on Fri Oct 12 17:41:06 2018

Each result page is downloaded, every listing on it is turned into a dict,
and the dicts are printed and appended to ``Data.csv``.

@author: Macbook
"""
import csv
import re
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

# Column order of the CSV file; also the keys of every yielded record.
FIELDNAMES = ['Name', 'Type', 'Area', 'Towards', 'Floor', 'Decorate',
              'Address', 'TotalPrice', 'Price']

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10

# Class-name patterns, compiled once instead of once per listing.
_LIST_CONTAINER = re.compile('js-tips-list')
_LISTING = re.compile('ershoufang-list')
_TITLE = re.compile('js-title')
_SIZE = re.compile('size')
_AREA = re.compile('area')
_PRICE = re.compile('js-price')
_YUE = re.compile('yue')
_TIME = re.compile('time')


def get_one_page(url):
    """Return the body of *url* as text, or None on any failure.

    A non-200 status code and any ``RequestException`` (including a
    timeout) both yield None.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _parse_listing(div):
    """Build one record dict from a single listing tag.

    Raises AttributeError or IndexError when an expected node is missing.
    """
    # Tag.contents lists the tag's children; the wanted fields sit at the
    # odd-spaced indexes below (assumes the page layout the original script
    # was written against -- TODO confirm against the live markup).
    size = div.find('dd', class_=_SIZE).contents
    return {
        'Name': div.find('a', class_=_TITLE).text,
        'Type': size[1].text,
        'Area': size[5].text,
        'Towards': size[9].text,
        'Floor': size[13].text.replace('\n', ''),
        'Decorate': size[17].text,
        'Address': div.find('span', class_=_AREA).text.strip().replace(' ', '').replace('\n', ''),
        'TotalPrice': div.find('span', class_=_PRICE).text + div.find('span', class_=_YUE).text,
        'Price': div.find('div', class_=_TIME).text,
    }


def parse_one_page(content):
    """Yield one record dict per complete listing found in *content*.

    Yields nothing when *content* is empty/None or the listing container
    is absent. Listings that lack a field (e.g. no decoration or floor
    information) are skipped so that the rest of the page is still parsed.
    """
    if not content:
        return
    soup = BeautifulSoup(content, 'html.parser')
    container = soup.find('div', class_=_LIST_CONTAINER)
    if container is None:
        return
    for div in container.find_all('div', class_=_LISTING):
        try:
            record = _parse_listing(div)
        except (AttributeError, IndexError):
            # Incomplete listing: skip it rather than abort the whole page.
            continue
        yield record


def _append_rows(path, rows):
    """Append *rows* to the CSV at *path*, writing the header only once."""
    # errors='replace' keeps one character outside GBK from crashing the crawl.
    with open(path, 'a', encoding='gbk', errors='replace', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        if f.tell() == 0:
            # In append mode the position starts at end-of-file, so 0 means
            # the file is new or empty and still needs its header row.
            writer.writeheader()
        writer.writerows(rows)


def main():
    """Crawl result pages 1 through 499, printing and saving every record."""
    for i in range(1, 500):
        url = 'http://bj.ganji.com/fang5/o{}/'.format(i)
        content = get_one_page(url)
        print('第{}頁抓取完畢'.format(i))
        # Parse once and reuse the result for both printing and saving.
        records = list(parse_one_page(content))
        for record in records:
            print(record)
        if records:
            # Data.csv is created in the current working directory.
            _append_rows('Data.csv', records)
        # Throttle requests: crawling too fast makes the site ask for
        # verification.
        time.sleep(2)


if __name__ == '__main__':
    main()
2. XPath 實現
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape ganji.com Beijing second-hand housing listings with lxml XPath.

Created on Fri Oct 12 15:37:50 2018

Mainly an exercise in coping with elements that may be absent from a
listing, so that a missing element does not interrupt the crawl.

@author: Macbook
"""
import multiprocessing
import time

import requests
from lxml import etree
from requests.exceptions import RequestException

headers = {
    'User-Agent': 'Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10

# Location of the per-listing <div> elements on a result page.
LISTINGS_XPATH = '//*[@id="f_mew_list"]/div[6]/div[1]/div[3]/div[1]/div'


def get_one_page(url):
    """Return the body of *url* as text, or None on any failure.

    A non-200 status code and any ``RequestException`` (including a
    timeout) both yield None.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _parse_listing(div):
    """Build one record dict from a single listing element.

    Raises IndexError when any of the expected nodes is missing, because
    the corresponding XPath query then returns too few results.
    """
    # The address is spread over several text nodes; query them once and
    # stitch the relevant pieces together.
    address = div.xpath('dl/dd[3]//text()')
    return {
        'Name': div.xpath('dl/dd[1]/a/text()')[0],
        'Type': div.xpath('dl/dd[2]/span[1]/text()')[0],
        'Area': div.xpath('dl/dd[2]/span[3]/text()')[0],
        'Towards': div.xpath('dl/dd[2]/span[5]/text()')[0],
        'Floor': div.xpath('dl/dd[2]/span[7]/text()')[0].strip().replace('\n', ""),
        'Decorate': div.xpath('dl/dd[2]/span[9]/text()')[0],
        'Address': address[1] + address[3].replace('\n', '') + address[4].strip(),
        'TotalPrice': div.xpath('dl/dd[5]/div[1]/span[1]/text()')[0] + div.xpath('dl/dd[5]/div[1]/span[2]/text()')[0],
        'Price': div.xpath('dl/dd[5]/div[2]/text()')[0],
    }


def parse_one_page(content):
    """Yield one record dict per complete listing found in *content*.

    Yields nothing when *content* is empty/None or cannot be parsed.
    Listings with a missing field are skipped so that the remaining
    listings on the page are still returned.
    """
    if not content:
        return
    try:
        selector = etree.HTML(content)
    except (etree.LxmlError, ValueError):
        return
    if selector is None:
        return
    for div in selector.xpath(LISTINGS_XPATH):
        try:
            record = _parse_listing(div)
        except IndexError:
            # Incomplete listing: skip it rather than abort the whole page.
            continue
        yield record


def main():
    """Crawl result pages 1 through 499 and print every record."""
    for i in range(1, 500):
        url = "http://bj.ganji.com/fang5/o{}/".format(i)
        content = get_one_page(url)
        print('第{}頁抓取完畢'.format(i))
        for div in parse_one_page(content):
            print(div)
        # Throttle requests so the site does not start asking for
        # verification.
        time.sleep(2)


if __name__ == '__main__':
    main()
加油吧,程式設計師!