1. 程式人生 > Python爬取鏈家地鐵房資料

Python爬取鏈家地鐵房資料

#coding=gbk
#因為涉及到中文,utf-8會報錯
### 環境:Python 3.6###
import requests
import re
import pandas as pd
import csv
from bs4 import BeautifulSoup
def generate_allurl(user_in_nub):
    """Yield listing-page URLs for page numbers 1 through ``int(user_in_nub) - 1``.

    Args:
        user_in_nub: Exclusive upper bound for the page number; anything
            accepted by ``int()`` (an int or a numeric string).

    Yields:
        One URL string per page number, in ascending order.
    """
    page_template = 'https://bj.lianjia.com/ditiefang/li647/pg{}/'
    stop = int(user_in_nub)
    page_no = 1
    # The bound is exclusive: page number ``stop`` itself is never produced.
    while page_no < stop:
        yield page_template.format(page_no)
        page_no += 1
def main():
    """Scrape Lianjia subway-housing listing pages and save the rows to a CSV file.

    Walks every page URL produced by ``generate_allurl``, extracts each
    listing's house code (via regex on the raw HTML), unit price and subway
    text (via the ``.unitPrice`` / ``.subway`` CSS classes), and writes the
    collected rows to ``D:/dst8.csv``.

    Pages that cannot be fetched (network error, timeout, or a non-200
    status) are reported and skipped, so one bad page does not discard the
    rows already collected.
    """
    # Compiled once, outside the page loop: captures the data-housecode
    # attribute of each listing's image link.
    housecode_re = re.compile('<li.*?class="clear">.*?<a.*?class="img.*?".*?data-housecode="(.*?)"')
    # Explicit column order keeps the CSV header stable across pandas
    # versions and still writes a header when no rows were collected.
    columns = ['編號', '單價', '地鐵']
    rows = []
    for page_url in generate_allurl(35):  # 34 pages in total; the end bound is exclusive
        print("頁碼"+page_url)
        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            res = requests.get(page_url, timeout=10)
        except requests.RequestException as exc:
            print('request failed, skipping page: {}'.format(exc))
            continue
        if res.status_code != 200:
            print('unexpected HTTP status {}, skipping page'.format(res.status_code))
            continue

        soup = BeautifulSoup(res.text, 'lxml')  # parse the page HTML
        house_codes = housecode_re.findall(res.text)  # one match per listing on the page
        print(len(house_codes))

        # Select once per page instead of re-scanning the document for every row.
        unit_prices = soup.select('.unitPrice')
        subways = soup.select('.subway')
        if not (len(house_codes) == len(unit_prices) == len(subways)):
            # Rows are paired by position; a mismatch means some listing lacks a
            # field, so report it instead of failing with IndexError mid-run.
            print('warning: field counts differ (codes={}, prices={}, subways={})'.format(
                len(house_codes), len(unit_prices), len(subways)))

        for code, price_tag, subway_tag in zip(house_codes, unit_prices, subways):
            info = {
                '編號': code + '號',
                '單價': price_tag.text,
                '地鐵': subway_tag.text,
            }
            rows.append(info)
            # Print only the new row; printing the whole list per row is quadratic.
            print(info)

    print(len(rows))
    frame = pd.DataFrame(rows, columns=columns)
    # Write the result to CSV.
    frame.to_csv('D:/dst8.csv', index=False)
# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()