爬取鏈家網北京房源及房價分析
阿新 • • 發佈:2018-12-30
爬取鏈家網北京房源及房價分析
文章開始把我喜歡的這句話送個大家:這個世界上還有什麼比自己寫的程式碼執行在一億人的電腦上更酷的事情嗎,如果有那就是讓這個數字再擴大十倍
1.資料獲取
# 獲取某市區域的所有連結 def get_areas(url): print('start grabing areas') headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'} resposne = requests.get(url, headers=headers) content = etree.HTML(resposne.text) areas = content.xpath("//dd[@data-index = '0']//div[@class='option-list']/a/text()") areas_link = content.xpath("//dd[@data-index = '0']//div[@class='option-list']/a/@href") for i in range(1,len(areas)): area = areas[i] area_link = areas_link[i] link = 'https://bj.lianjia.com' + area_link print("開始抓取頁面") get_pages(area, link) #通過獲取某一區域的頁數,來拼接某一頁的連結 def get_pages(area,area_link): headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'} resposne = requests.get(area_link, headers=headers) pages = int(re.findall("page-data=\'{\"totalPage\":(\d+),\"curPage\"", resposne.text)[0]) print("這個區域有" + str(pages) + "頁") for page in range(1,pages+1): url = 'https://bj.lianjia.com/zufang/dongcheng/pg' + str(page) print("開始抓取" + str(page) +"的資訊") get_house_info(area,url) #獲取某一區域某一頁的詳細房租資訊 def get_house_info(area, url): headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'} time.sleep(2) try: resposne = requests.get(url, headers=headers) content = etree.HTML(resposne.text) info=[] for i in range(30): title = content.xpath("//div[@class='where']/a/span/text()")[i] room_type = content.xpath("//div[@class='where']/span[1]/span/text()")[i] square = re.findall("(\d+)",content.xpath("//div[@class='where']/span[2]/text()")[i])[0] position = content.xpath("//div[@class='where']/span[3]/text()")[i].replace(" ", "") try: detail_place = re.findall("([\u4E00-\u9FA5]+)租房", content.xpath("//div[@class='other']/div/a/text()")[i])[0] except Exception as e: detail_place = "" floor =re.findall("([\u4E00-\u9FA5]+)\(", content.xpath("//div[@class='other']/div/text()[1]")[i])[0] total_floor = re.findall("(\d+)",content.xpath("//div[@class='other']/div/text()[1]")[i])[0] try: house_year = re.findall("(\d+)",content.xpath("//div[@class='other']/div/text()[2]")[i])[0] except Exception as e: house_year = "" price = content.xpath("//div[@class='col-3']/div/span/text()")[i] with open('鏈家北京租房.txt','a',encoding='utf-8') as f: f.write(area + ',' + title + ',' + room_type + ',' + square + ',' +position+ ','+ detail_place+','+floor+','+total_floor+','+price+','+house_year+'\n') print('writing work has done!continue the next page') except Exception as e: print( 'ooops! connecting error, retrying.....') time.sleep(20) return get_house_info(area, url) def main(): print('start!') url = 'https://bj.lianjia.com/zufang' get_areas(url) if __name__ == '__main__': main()
2.資料清理與視覺化
detail_place = df.groupby(['detail_place']) house_com = detail_place['price'].agg(['mean','count'])#agg為聚合函式 house_com.reset_index(inplace=True)#set_index(某列)設定某列為索引,reset_index還原回去 detail_place_main = house_com.sort_values('count',ascending=False)[0:20]#sort_values()按某一列或幾列排序 attr = detail_place_main['detail_place'] v1 = detail_place_main['count'] v2 = detail_place_main['mean'] line = Line("北京主要路段房租均價") line.add("路段",attr,v2,is_stack=True,xaxis_rotate=30,yaxix_min=4.2, mark_point=['min','max'],xaxis_interval=0,line_color='lightblue', line_width=4,mark_point_textcolor='black',mark_point_color='lightblue', is_splitline_show=False) bar = Bar("北京主要路段房屋數量") bar.add("路段",attr,v1,is_stack=True,xaxis_rotate=30,yaxix_min=4.2, xaxis_interval=0,is_splitline_show=False) overlap = Overlap() overlap.add(bar) overlap.add(line,yaxis_index=1,is_add_yaxis=True) overlap.render('北京路段_房屋均價分佈圖.html') #房源價格區間分佈圖 price_info = df[['area', 'price']] #對價格分割槽 bins = [0,1000,1500,2000,2500,3000,4000,5000,6000,8000,10000] level = ['0-1000','1000-1500', '1500-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-8000', '8000-1000','10000以上'] price_stage = pd.cut(price_info['price'], bins = bins,labels = level).value_counts().sort_index() attr = price_stage.index v1 = price_stage.values bar = Bar("價格區間&房源數量分佈") bar.add("",attr,v1,is_stack=True,xaxis_rotate=30,yaxix_min=4.2, xaxis_interval=0,is_splitline_show=False) overlap = Overlap() overlap.add(bar) overlap.render('價格區間&房源數量分佈.html') #房屋面積分佈 bins =[0,30,60,90,120,150,200,300,400,700] level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200', '200-300','300-400','400+'] df['square_level'] = pd.cut(df['square'],bins = bins,labels = level) df_digit= df[['area', 'room_type', 'square', 'position', 'total_floor', 'floor', 'house_year', 'price', 'square_level']] s = df_digit['square_level'].value_counts() attr = s.index v1 = s.values pie = Pie("房屋面積分佈",title_pos='center') pie.add( "", attr, v1, radius=[40, 75], label_text_color=None, is_label_show=True, legend_orient="vertical", legend_pos="left", ) overlap = Overlap() overlap.add(pie) overlap.render('房屋面積分佈.html') #房屋面積&價位分佈 bins =[0,30,60,90,120,150,200,300,400,700] level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200', '200-300','300-400','400+'] df['square_level'] = pd.cut(df['square'],bins = bins,labels = level) df_digit= df[['area', 'room_type', 'square', 'position', 'total_floor', 'floor', 'house_year', 'price', 'square_level']] square = df_digit[['square_level','price']] prices = square.groupby('square_level').mean().reset_index() amount = square.groupby('square_level').count().reset_index() attr = prices['square_level'] v1 = prices['price'] pie = Bar("房屋面積&價位分佈布") pie.add("", attr, v1, is_label_show=True) pie.render() bar = Bar("房屋面積&價位分佈") bar.add("",attr,v1,is_stack=True,xaxis_rotate=30,yaxix_min=4.2, xaxis_interval=0,is_splitline_show=False) overlap = Overlap() overlap.add(bar) overlap.render('房屋面積&價位分佈.html')
3.視覺化部分截圖