Python爬取鏈家地鐵房資料
阿新 • • 發佈:2019-02-12
# Scrape Lianjia (Beijing) subway-adjacent second-hand listings into a CSV.
# Environment: Python 3.6. This file contains Chinese string literals, so
# save it as UTF-8 (the Python 3 default source format).
import csv
import re

import pandas as pd

# Listing index page; "{}" is replaced by the 1-based page number.
LISTING_URL = 'https://bj.lianjia.com/ditiefang/li647/pg{}/'

# Captures the data-housecode attribute of each listing's image link.
# Compiled once here instead of once per downloaded page.
HOUSECODE_RE = re.compile(
    '<li.*?class="clear">.*?<a.*?class="img.*?".*?data-housecode="(.*?)"'
)


def generate_allurl(user_in_nub):
    """Yield the listing-page URLs for pages 1 up to user_in_nub - 1.

    Args:
        user_in_nub: Exclusive upper bound on the page number; anything
            accepted by ``int()``. Passing 35 yields pages 1 through 34.

    Yields:
        str: The URL of each listing page, in ascending page order.
    """
    for page in range(1, int(user_in_nub)):
        yield LISTING_URL.format(page)


def _parse_listings(html):
    """Extract one record per listing from the HTML of a listing page.

    Args:
        html: The page source as text.

    Returns:
        list[dict]: One dict per listing with the house code, the unit
        price text and the subway text.
    """
    # Imported here so generate_allurl stays importable without bs4.
    from bs4 import BeautifulSoup

    house_codes = HOUSECODE_RE.findall(html)
    print(len(house_codes))  # number of listings matched on this page

    soup = BeautifulSoup(html, 'lxml')
    # Select each column once per page rather than once per row.
    unit_prices = soup.select('.unitPrice')
    subways = soup.select('.subway')

    if not (len(house_codes) == len(unit_prices) == len(subways)):
        # zip() below stops at the shortest column; report the mismatch
        # instead of raising IndexError and losing every collected row.
        print('warning: field counts differ (codes={}, prices={}, subways={})'
              .format(len(house_codes), len(unit_prices), len(subways)))

    return [
        {'編號': code + '號', '單價': price.text, '地鐵': subway.text}
        for code, price, subway in zip(house_codes, unit_prices, subways)
    ]


def main(page_limit=35, output_path='D:/dst8.csv'):
    """Download the listing pages and write all listings to a CSV file.

    Args:
        page_limit: Exclusive upper bound passed to generate_allurl; the
            default 35 fetches pages 1 through 34.
        output_path: Destination path of the CSV file.
    """
    # Imported here so generate_allurl stays importable without requests.
    import requests

    records = []
    for page_url in generate_allurl(page_limit):
        print("頁碼" + page_url)
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            res = requests.get(page_url, timeout=10)
        except requests.RequestException as exc:
            print('request failed, skipping page: {}'.format(exc))
            continue
        if res.status_code != 200:
            print('unexpected HTTP status {}, skipping page'
                  .format(res.status_code))
            continue
        records.extend(_parse_listings(res.text))

    print(records)
    print(len(records))
    # Write the collected rows to CSV without the DataFrame index column.
    pd.DataFrame(records).to_csv(output_path, index=False)


if __name__ == '__main__':
    main()