Python爬蟲專案--爬取自如網房源資訊
阿新 • • 發佈:2019-02-16
本次爬取自如網房源資訊所用到的知識點:
1. requests get請求
2. lxml解析html
3. Xpath
4. MongoDB儲存
正文
1.分析目標站點
1. url: http://hz.ziroom.com/z/nl/z2.html?p=2 的p引數控制分頁
2. get請求
2.獲取單頁原始碼
# -*- coding: utf-8 -*-
import requests
import time
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch the raw HTML of one listing page and print it (demo stage).

    page -- 1-based page number appended as the `p` query parameter.
    Returns None on any request failure.
    """
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        # Demo stage: just print the page source; later sections return it.
        if res.status_code == 200:
            print(res.text)
    except RequestException:
        return None


def main():
    page = 1
    get_one_page(page)


if __name__ == '__main__':
    main()
    time.sleep(1)
3.解析單頁原始碼
1. 解析html文件, 目的: 測試XPath表示式
將獲取的原始碼儲存到當前資料夾下的"result.html"中, 然後通過XPath對其進行相應內容的提取, 當然你也可以使用某些線上工具.
from lxml import etree

# Parse the saved HTML document offline -- the goal is to test the XPath
# expressions before wiring them into the crawler.
# NOTE: the original listing opened "./resul.html", which does not match the
# "result.html" file the article says was saved; typo fixed here.
html = etree.parse("./result.html", etree.HTMLParser())
# Every <li> under the house list; the first <li> is a header row, skip it.
results = html.xpath('//ul[@id="houseList"]/li')
for result in results[1:]:
    # Drop the first five characters (the listing-type prefix) when present.
    title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
    location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
    # Join the span texts with spaces, then remove only the first space.
    area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
    nearby = result.xpath("./div/div/p[2]/span/text()")[0]
    print(title)
    print(location)
    print(area)
    print(nearby)
2. 解析原始碼
from lxml import etree


def parse_one_page(sourcehtml):
    """Parse one page of listing HTML, yielding a dict per house entry.

    sourcehtml -- the raw HTML text of one listing page.
    Yields dicts with keys: title, location, area, nearby.
    """
    contentTree = etree.HTML(sourcehtml)  # build the element tree
    # Every <li> under the house list; the first <li> is a header row, skip it.
    results = contentTree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Hoist the xpath result so it is evaluated once, not twice.
        title_texts = result.xpath("./div/h3/a/text()")
        # Drop the first five characters (the listing-type prefix) when present.
        title = title_texts[0][5:] if len(title_texts[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the span texts with spaces, then remove only the first space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        nearby = result.xpath("./div/div/p[2]/span/text()")[0]
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }


def main():
    page = 1
    html = get_one_page(page)
    print(type(html))
    # parse_one_page() is a generator: it does nothing until iterated, so the
    # original bare call `parse_one_page(html)` was a no-op and is removed.
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
4.獲取多個頁面
def parse_one_page(sourcehtml):
    '''Parse one page of listing HTML, yielding a dict per house entry.

    sourcehtml -- the raw HTML text of one listing page.
    Yields dicts with keys: title, location, area, nearby.
    '''
    contentTree = etree.HTML(sourcehtml)  # build the element tree
    # Every <li> under the house list; the first <li> is a header row, skip it.
    results = contentTree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Hoist the xpath result so it is evaluated once, not twice.
        title_texts = result.xpath("./div/h3/a/text()")
        # Drop the first five characters (the listing-type prefix) when present.
        title = title_texts[0][5:] if len(title_texts[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the span texts with spaces, then remove only the first space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Some rows have no "nearby" span at all, which raised IndexError on
        # a plain [0]; guard on the (hoisted) xpath result before indexing.
        nearby_texts = result.xpath("./div/div/p[2]/span/text()")
        nearby = nearby_texts[0].strip() if len(nearby_texts) > 0 else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        print(nearby)  # debug output; runs each time the generator resumes
def get_pages():
    """Return the total number of result pages, read from page 1's pager."""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    # The second <span> of the pager holds text like "共N頁"; strip those
    # characters from both ends to leave just the digits.
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共頁"))
    return pages
def main():
    """Crawl every listing page and print each parsed house record."""
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        for item in parse_one_page(html):
            print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
5. 儲存到MongoDB中
需確保MongoDB已啟動服務, 否則必然會儲存失敗
def save_to_mongodb(result):
    """Insert one record into local MongoDB (db "iroomz", coll "roominfo").

    result -- a dict describing one house listing.
    Prints a success/failure message; never raises.
    """
    # Connect to the MongoDB server on localhost.
    client = pymongo.MongoClient(host="localhost")
    # Select the database -- named "iroomz" in the code, although the
    # surrounding article text calls it "ziroom".
    db = client.iroomz
    # Select the collection that stores the room records.
    db_table = db.roominfo
    try:
        # Collection.insert() was deprecated and removed in PyMongo 4;
        # insert_one() is the supported single-document equivalent.
        if db_table.insert_one(result):
            print("---儲存到資料庫成功---", result)
    except Exception:
        print("---儲存到資料庫失敗---", result)
6.完整程式碼
# -*- coding: utf-8 -*-
'''
有需要Python學習資料的小夥伴嗎?小編整理一套Python資料和PDF,感興趣者可以加學習群:548377875,反正閒著也是閒著呢,不如學點東西啦~~
'''
import requests
import time
import pymongo
from lxml import etree
from requests.exceptions import RequestException
def get_one_page(page):
    '''Fetch the raw HTML of one listing page.

    page -- 1-based page number appended as the `p` query parameter.
    Returns the page source as text, or None on non-200 / request failure.
    '''
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
        }
        # Without a timeout requests can block forever on a dead connection;
        # a Timeout is a RequestException subclass, so the error path below
        # still returns None as before.
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None
def parse_one_page(sourcehtml):
    '''Parse one page of listing HTML and save each record to MongoDB.

    sourcehtml -- the raw HTML text of one listing page.
    Side effect: calls save_to_mongodb() once per house entry.
    '''
    contentTree = etree.HTML(sourcehtml)  # build the element tree
    # Every <li> under the house list; the first <li> is a header row, skip it.
    results = contentTree.xpath('//ul[@id="houseList"]/li')
    for result in results[1:]:
        # Hoist the xpath result so it is evaluated once, not twice.
        title_texts = result.xpath("./div/h3/a/text()")
        # Drop the first five characters (the listing-type prefix) when present.
        title = title_texts[0][5:] if len(title_texts[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the span texts with spaces, then remove only the first space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Some rows have no "nearby" span; guard before indexing to avoid the
        # IndexError described in the article's summary section.
        nearby_texts = result.xpath("./div/div/p[2]/span/text()")
        nearby = nearby_texts[0].strip() if len(nearby_texts) > 0 else ""
        data = {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        save_to_mongodb(data)
def get_pages():
    """Return the total number of result pages, read from page 1's pager."""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    # The second <span> of the pager holds text like "共N頁"; strip those
    # characters from both ends to leave just the digits.
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共頁"))
    return pages
def save_to_mongodb(result):
    """Insert one record into local MongoDB (db "iroomz", coll "roominfo").

    result -- a dict describing one house listing.
    Prints a success/failure message; never raises.
    """
    # Connect to the MongoDB server on localhost.
    client = pymongo.MongoClient(host="localhost")
    # Select the database -- named "iroomz" in the code, although the
    # surrounding article text calls it "ziroom".
    db = client.iroomz
    # Select the collection that stores the room records.
    db_table = db.roominfo
    try:
        # Collection.insert() was deprecated and removed in PyMongo 4;
        # insert_one() is the supported single-document equivalent.
        if db_table.insert_one(result):
            print("---儲存到資料庫成功---", result)
    except Exception:
        print("---儲存到資料庫失敗---", result)
def main():
    """Crawl every listing page and store each parsed record in MongoDB."""
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        # Final parse_one_page() is a plain function (it saves to MongoDB
        # itself), so a bare call is correct here.
        parse_one_page(html)


if __name__ == '__main__':
    main()
    time.sleep(1)
7.最終結果
總結
1. 在第三步中XPath使用注意事項
title = result.xpath("./div/h3/a/text()")
此處的點'.'不能忘記, 它表示當前節點, 如果不加'.', '/'就表示從根節點開始選取
2. 在第四步獲取多個頁面時出現索引超出範圍錯誤
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()
IndexError: list index out of range
造成這種錯誤原因有兩種:
1) [index] index超出list範圍
2) [index] index索引內容為空
因為這裡的nearby的index是0, 排除第一種情況, 那麼這裡就是空行了, 加句if判斷就可以解決
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()
#改寫以後:
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()"))>0 else ""
以上主要是對爬蟲過程學習的總結, 若有不對的地方, 還請指正, 謝謝!