# Scrape ~3,000 second-hand home listings in Hangzhou from Lianjia
# Author: 阿新 — published 2018-12-09
# Imports
import requests
import lxml.html
import csv
# Target listing-page URL template
lianjiaUrl='https://hz.lianjia.com/ershoufang/pg{}/'
# Fetch a page
def getSource(url):
responce=requests.get(url)
responce.encoding='utf-8'
return responce.content
# Parse the listings on one page
def getEveryItme(source):
selector=lxml.html.document_fromstring(source)
movieItemList=selector.xpath('//div[@class="info clear"]' )
movieList = []
for eachMovie in movieItemList:
movieDict = {}
title = eachMovie.xpath('div[@class="title"]/a/text()')[0]
print(title)
address = eachMovie.xpath('div[@class="address"]/div[@class="houseInfo"]/a/text()')[0]
detailed =eachMovie.xpath('div[@class="address"]/div[@class="houseInfo"]/text()' )[0]
flood = eachMovie.xpath('div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
followInfo = eachMovie.xpath('div[@class="followInfo"]/text()')[0]
movieDict['title'] = title
movieDict['address'] = ''.join(address + detailed)
movieDict['flood'] = flood
movieDict['followInfo' ] = followInfo
print(movieDict)
movieList.append(movieDict)
return movieList
# Save as CSV
def writeData(movieList):
with open('./lianjia.csv','w',encoding='utf-8-sig',newline='') as f:
writer=csv.DictWriter(f,fieldnames=['title','address','flood','followInfo'])
writer.writeheader()
for each in movieList:
writer.writerow(each)
# Entry point
if __name__=='__main__':
movieList=[]
for i in range(1,101):
pageLink=lianjiaUrl.format(i)
print(pageLink)
source=getSource(pageLink)
movieList += getEveryItme(source)
print(movieList[:101])
writeData(movieList)