1. 程式人生 > >鏈家深圳租房信息爬取練習 附加源碼

鏈家深圳租房信息爬取練習 附加源碼

list enc dom \n referer brush csv文件 writer nec

from urllib import request
from time import sleep
from  lxml import  etree
import csv
# import random    # sleep(random.random() * 2) 隨機秒數（random.random() 不接受參數）
# 參數部分
# sz_url = ‘https://sz.lianjia.com/zufang/‘
#
# header = {
#
# ‘Referer‘: ‘https://sz.lianjia.com/zufang/‘,
# ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36‘,
# }
# # 請求部分
# res = request.Request(sz_url,headers=header)
#
# response = request.urlopen(res)
# result = response.read().decode()
# # print(result)
# # 篩選部分
# html = etree.HTML(result)
# name_list = html.xpath(‘//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a‘)
# with open(‘house.csv‘,"wb") as f:
#     for name in name_list:
#         title=name.attrib["title"]
#         f.write(title.encode())
#         f.write(‘\n‘.encode())
#         print(title)


# --------------------------------------------------------------------------------------------------------------
# # 參數部分
# sz_url = ‘https://sz.lianjia.com/zufang/105101400296.html‘
#
# header = {
#
# ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36‘,
# }
# # 請求部分
# res = request.Request(sz_url,headers=header)
#
# response = request.urlopen(res)
# result = response.read().decode()
# # print(result)
#
# html = etree.HTML(result)
# name_list = html.xpath(‘//div[@class="brokerName"]/a‘)
#
# for name in name_list:
#     text = name.text
#     print(text)

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@code  tree@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# 請求測試
def getRequet(url, xpath, **headers):
    """Download *url* and return the nodes selected by *xpath*.

    Args:
        url: Address of the page to fetch.
        xpath: XPath expression evaluated against the parsed HTML.
        **headers: Extra HTTP request headers. Names containing ``-`` have
            to be passed by unpacking a dict, e.g.
            ``getRequet(url, xpath, **{'Referer': url})``. Values given here
            override the built-in defaults.

    Returns:
        The result of ``html.xpath(xpath)``, or an empty list when the
        response body could not be parsed into a document.

    Raises:
        urllib.error.URLError: Propagated from ``request.urlopen`` when the
            request fails.
    """
    default_headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    # Defaults go first so that caller-supplied headers win on conflict
    # (the previous order silently discarded any caller override).
    merged_headers = {**default_headers, **headers}
    req = request.Request(url, headers=merged_headers)
    # Use the response as a context manager so the socket is always closed.
    with request.urlopen(req) as response:
        result = response.read().decode()
    html = etree.HTML(result)
    if html is None:
        # etree.HTML yields None for an empty body; report "no matches"
        # instead of failing with AttributeError on .xpath.
        return []
    return html.xpath(xpath)
def main():
    """Scrape the Shenzhen rental listing page and save titles and contacts.

    For every listing link found on the index page, the listing's detail
    page is fetched to read the contact (broker) name. ``house.csv`` is
    written as UTF-8 with two lines per listing: the listing title, then
    the contact name (an empty line when no contact could be found).
    """
    zf_url = 'https://sz.lianjia.com/zufang/'  # index page to crawl
    zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing links
    name_xpath = '//div[@class="brokerName"]/a'  # contact name on a detail page

    # Text mode with explicit UTF-8 and newline='\n' produces the same bytes
    # as encoding each piece by hand, without per-item encode() calls.
    with open('house.csv', 'w', encoding='utf-8', newline='\n') as f:
        house_list = getRequet(zf_url, zf_xpath)
        for house in house_list:
            attrib = house.attrib
            house_name = attrib['title']
            url = attrib['href']

            # Report the detail page actually being fetched, not the index URL.
            print('正在下載:', url)

            contacts = getRequet(url, name_xpath)
            # A page without a broker node (or with an empty one) must not
            # abort the whole crawl; fall back to an empty contact name.
            username = (contacts[0].text or '') if contacts else ''

            f.write(house_name + '\n')
            f.write(username + '\n')

            # Pause between requests to avoid hammering the site.
            sleep(1)
        print('下載完成')
    # The with-statement closes the file; no explicit close() is needed.

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

  

鏈家深圳租房信息爬取練習 附加源碼