
Scraping 3,000 Lianjia second-hand housing listings in Hangzhou

Import the packages

import requests
import lxml.html
import csv

The target page URL

lianjiaUrl='https://hz.lianjia.com/ershoufang/pg{}/'
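
The {} placeholder is filled with the page number via str.format; for example, page 2 expands to:

print(lianjiaUrl.format(2))  # https://hz.lianjia.com/ershoufang/pg2/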

Request the URL

def getSource(url):
    # Request the page and return its decoded HTML text.
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text
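
Lianjia may reject the default requests client, so it is common to send a browser-like User-Agent and a timeout. This is only a sketch; the header value and the 10-second timeout are assumptions, not part of the original code:

def getSource(url):
    # Variant with a browser-like header and a timeout (both assumed values).
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    return response.text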

Parse the listings on each page

def getEveryItem(source):
    # Parse one result page and return a list of dicts, one per listing.
    selector = lxml.html.document_fromstring(source)
    movieItemList = selector.xpath('//div[@class="info clear"]')
    movieList = []
    for eachMovie in movieItemList:
        movieDict = {}
        title = eachMovie.xpath('div[@class="title"]/a/text()')[0]
        print(title)
        address = eachMovie.xpath('div[@class="address"]/div[@class="houseInfo"]/a/text()')[0]
        detailed = eachMovie.xpath('div[@class="address"]/div[@class="houseInfo"]/text()')[0]
        flood = eachMovie.xpath('div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
        followInfo = eachMovie.xpath('div[@class="followInfo"]/text()')[0]
        movieDict['title'] = title
        movieDict['address'] = address + detailed
        movieDict['flood'] = flood
        movieDict['followInfo'] = followInfo
        print(movieDict)
        movieList.append(movieDict)
    return movieList
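
Each field is taken with a bare [0], so a listing that lacks one of these nodes raises IndexError and stops the whole crawl. A hedged alternative (the helper below is hypothetical, not from the original code) is to fall back to an empty string when an XPath has no match:

def firstOrEmpty(element, path):
    # Hypothetical helper: first XPath match, or '' when nothing matches.
    result = element.xpath(path)
    return result[0] if result else ''

Each lookup in getEveryItem then becomes, for example, title = firstOrEmpty(eachMovie, 'div[@class="title"]/a/text()').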

Save as CSV

def writeData(movieList):
    with open('./lianjia.csv','w',encoding='utf-8-sig',newline='') as f:
        writer=csv.DictWriter(f,fieldnames=['title','address','flood','followInfo'])
        writer.writeheader()
        for each in movieList:
            writer.writerow(each)
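
As a quick sanity check (not part of the original post), the file can be read back with csv.DictReader to confirm how many rows were written:

def checkData(path='./lianjia.csv'):
    # Hypothetical helper: re-read the CSV and report the row count.
    with open(path, encoding='utf-8-sig', newline='') as f:
        rows = list(csv.DictReader(f))
    print(len(rows), 'rows written')
    return rows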

Run the code

if __name__ == '__main__':
    movieList = []
    # 100 result pages with roughly 30 listings each gives about 3,000 records.
    for i in range(1, 101):
        pageLink = lianjiaUrl.format(i)
        print(pageLink)
        source = getSource(pageLink)
        movieList += getEveryItem(source)
    print(movieList[:101])
    writeData(movieList)
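
Firing 100 requests back-to-back may get the crawler rate-limited or blocked. A minimal, hedged adjustment (not in the original code) is to pause briefly between pages with the standard-library time module; the one-second delay is an arbitrary choice:

import time

for i in range(1, 101):
    pageLink = lianjiaUrl.format(i)
    print(pageLink)
    source = getSource(pageLink)
    movieList += getEveryItem(source)
    time.sleep(1)  # brief pause between pages to stay polite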