1. 程式人生 > >數據采集分析

數據采集分析

ram encoding visual dataframe amp default tde 表示 dex

#!/bin/env python
# -*- coding: utf-8 -*-
#auth:tyk
#date:2019-2-3
#function:Exploratory Visualization
###################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sys
import warnings
import re
from data_1 import ZcSummary
from scipy.stats import norm
# Module-level setup: configure plotting, load the listings CSV and print a
# statistical summary. Everything below runs at import time.

# Keep library warnings out of the script output.
warnings.filterwarnings("ignore")

# Use the SimHei font for the Chinese axis labels and titles used by the
# plotting functions, and draw the minus sign with the ASCII hyphen.
mpl.rcParams['axes.unicode_minus'] = False
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Python 2 only: force the implicit str<->unicode codec to UTF-8.
# ``reload`` and ``sys.setdefaultencoding`` do not exist on Python 3, where
# text is already unicode, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('UTF-8')

hit_df = pd.read_csv('hit.csv', encoding='UTF-8')
print(">>==================data info=====================<<")
#hit_df.info()
#print (hit_df.shape)
#print (hit_df.dtypes)
#print ">>==================data describe=================<<"

# Best effort: a failure to summarise the data is reported but must not stop
# the script.
try:
    print(hit_df.describe())
except Exception as e:
    print(e)

# Working copy used by main(); the raw frame in ``hit_df`` stays untouched.
df = hit_df.copy()
#df['floor']=re.findall("\d+",df['floor'])
#df['PerPrice'] = hit_df['Price']/hit_df['Size']

#columns = ['fang_key', 'fang_desc', 'price', 'price_pre', 'community', 'housetype', 'area', 'region', 'plate', 'floor', 'direction', 'age','address','updated_date']
#df = pd.DataFrame(df, columns = columns)
def variables_analyse(data):
    """Plot the distribution of the ``price`` column and show the figure.

    Args:
        data: Table with a ``price`` column (presumably a pandas
            DataFrame - confirm against the caller).
    """
    # Plot the frame that was passed in. The previous version ignored the
    # ``data`` argument and always read the module-level ``df``.
    sns.distplot(data['price'])
    plt.show()
def region_analyse(data):
    """Compare regions by mean unit price, listing count and total price.

    Draws three stacked subplots, saves the figure to ``region.png`` and
    then shows it:

    1. bar chart of the mean ``price_pre`` per ``region``;
    2. bar chart of the number of listings per ``region``;
    3. box plot of ``price`` per ``region``.

    Args:
        data: DataFrame with ``region``, ``price`` and ``price_pre``
            columns. It is copied, not modified.
    """
    df = data.copy()

    # Listings per region, largest first. The count lands in a column that is
    # still called ``price`` because that is the column being counted.
    df_house_count = (
        df.groupby('region')['price']
        .count()
        .sort_values(ascending=False)
        .to_frame()
        .reset_index()
    )
    # Mean price per square metre per region, highest first.
    df_house_mean = (
        df.groupby('region')['price_pre']
        .mean()
        .sort_values(ascending=False)
        .to_frame()
        .reset_index()
    )

    f, [ax1, ax2, ax3] = plt.subplots(3, 1, figsize=(30, 45), dpi=100)

    sns.barplot(x='region', y='price_pre', palette="Blues_d", data=df_house_mean, ax=ax1)
    ax1.set_title('深圳各大區二手房每平米單價對比', fontsize=5)
    ax1.set_xlabel('區域')
    ax1.set_ylabel('每平方米單價')

    sns.barplot(x='region', y='price', palette="Greens_d", data=df_house_count, ax=ax2)
    ax2.set_title('深圳各大區二手房數量對比', fontsize=5)
    ax2.set_xlabel('區域')
    ax2.set_ylabel('數量')

    sns.boxplot(x='region', y='price', data=df, ax=ax3)
    ax3.set_title('深圳各大區二手房房屋總價', fontsize=5)
    ax3.set_xlabel('區域')
    ax3.set_ylabel('房屋總價')

    # Save before show(): the figure may be gone once the window is closed.
    plt.savefig("region.png", dpi=100)
    plt.show()
def area_analyse(data):
    """Plot the distribution of floor area and its relation to price.

    The first figure, saved to ``area.png``, has two panels: a histogram
    plus KDE of ``area``, and a regression plot of ``price`` against
    ``area``. A second figure shows a 50-bin histogram of ``area`` with a
    fitted normal curve. Both figures are then shown.

    Args:
        data: DataFrame with ``area``, ``price`` and ``housetype`` columns.
            It is copied, not modified.
    """
    df = data.copy()

    # Filter outliers: drop stacked-villa listings and anything of 1000 or
    # more in ``area``.
    # NOTE(review): the original filtered on ``Layout`` / ``Size``, columns
    # that no other code in this file uses (everything else reads
    # ``housetype`` / ``area``), so it would raise KeyError. Confirm that
    # ``housetype`` is the intended layout column.
    df = df[(df['housetype'] != '疊拼別墅') & (df['area'] < 1000)]

    f, [ax1, ax2] = plt.subplots(1, 2, figsize=(15, 5), dpi=100)
    sns.distplot(df['area'], bins=20, ax=ax1, color='r')
    sns.kdeplot(df['area'], shade=True, ax=ax1)
    sns.regplot(x='area', y='price', data=df, ax=ax2)
    plt.savefig("area.png", dpi=100)

    # Alternative presentation: histogram with a fitted normal curve.
    # This one uses the unfiltered ``data``, as in the original.
    fig, axes = plt.subplots()
    sns.distplot(data['area'], bins=50, kde=False, fit=norm, ax=axes)
    axes.set(xlabel='面積/平米', ylabel='概率密度', title='面積頻率分布直方圖')
    plt.show()

def housetype_analyse(data):
    """Draw a horizontal count plot of listings per ``housetype``.

    The figure is saved to ``housetype.png`` and then shown.

    Args:
        data: DataFrame with a ``housetype`` column. It is copied, not
            modified.
    """
    df = data.copy()
    f, ax1 = plt.subplots(figsize=(20, 20))
    # y='housetype' puts the categories on the vertical axis, so the counts
    # run along the x axis.
    sns.countplot(y='housetype', data=df, ax=ax1)
    ax1.set_title('房屋戶型', fontsize=15)
    ax1.set_xlabel('數量')
    ax1.set_ylabel('戶型')
    plt.savefig("housetype.png", dpi=100)
    plt.show()
def year_analyse(data):
    """Plot how many listings were built in each year and in each period.

    Left panel: line chart of the listing count per distinct ``age`` value.
    Right panel: pie chart of the share of listings in five construction
    periods (up to 2000, 2001-2005, 2006-2010, 2011-2015, 2016 onwards).

    Args:
        data: DataFrame with an ``age`` column. It is copied, not modified.
            The label slices below assume ``age`` holds the construction
            year as a number - TODO confirm against the preprocessing step.
    """
    new_data = data.copy()

    # Count listings per construction year. groupby sorts the keys by
    # default, which the label-based slices below depend on.
    count_by_create_time = new_data['age'].groupby(new_data['age']).count()

    # Split the construction years into five periods and total each one.
    # ``.loc`` label slices include both end points.
    count_by_create_time1 = count_by_create_time.loc[:2000].sum()
    count_by_create_time2 = count_by_create_time.loc[2001:2005].sum()
    count_by_create_time3 = count_by_create_time.loc[2006:2010].sum()
    count_by_create_time4 = count_by_create_time.loc[2011:2015].sum()
    count_by_create_time5 = count_by_create_time.loc[2016:].sum()
    new_count_by_create_time = pd.Series(
        [
            count_by_create_time1,
            count_by_create_time2,
            count_by_create_time3,
            count_by_create_time4,
            count_by_create_time5,
        ],
        index=['2000年及以前', '2001-2005年', '2006-2010年', '2011-2015年', '2016年及以後'],
    )

    # Line chart.
    fig, axes = plt.subplots(1, 2)
    count_by_create_time.plot(kind='line', ax=axes[0])

    # Title, axis labels and the x-axis tick values of the line chart.
    axes[0].set(
        xlabel='房屋建造時間/年',
        ylabel='房數量/套',
        title='房數量隨房屋建造時間變化折線圖',
        xticks=[1970, 1980, 1990, 2000, 2010, 2018],
    )

    # Pie chart.
    # autopct='%.1f%%' labels each wedge as a percentage with one decimal.
    # startangle=90 starts the first wedge 90 degrees from the x axis;
    # wedges are drawn counter-clockwise.
    # label='' stops the Series name being shown to the left of the pie.
    new_count_by_create_time.plot(
        kind='pie', ax=axes[1], autopct='%.1f%%', startangle=90, label=''
    )
    axes[1].set_title('不同建造時間範圍內房屋占比餅形圖')
    axes[1].set_aspect('equal')  # equal aspect ratio keeps the pie circular
    plt.subplots_adjust(wspace=0.5)  # horizontal gap between the two panels
    plt.show()

def floor_analyse(data):
    """Draw a count plot of listings per ``floor`` value.

    The figure is saved to ``floor.png`` and then shown.

    Args:
        data: DataFrame with a ``floor`` column. It is copied, not
            modified.
    """
    df = data.copy()
    f, ax1 = plt.subplots(figsize=(20, 5))
    sns.countplot(x='floor', data=df, ax=ax1)
    # Labels now describe what is plotted (floor on x, count on y). The
    # previous title and labels were copied from the house-type chart and
    # read "house type" / "count" / "price".
    ax1.set_title('房屋樓層分布', fontsize=15)
    ax1.set_xlabel('樓層')
    ax1.set_ylabel('數量')
    plt.savefig("floor.png", dpi=100)
    plt.show()
def price_analyse(data):
    """Plot the distributions of total price and unit price.

    Draws 50-bin histograms of ``price`` and ``price_pre`` side by side,
    each with a fitted normal curve, saves the figure to ``price.png`` and
    shows it. It then prints the (mean - std, mean + std) range of each
    column.

    Args:
        data: DataFrame with ``price`` and ``price_pre`` columns. It is
            copied, not modified.
    """
    new_data = data.copy()

    fig, axes = plt.subplots(1, 2)
    sns.distplot(new_data['price'], bins=50, kde=False, fit=norm, ax=axes[0])
    sns.distplot(new_data['price_pre'], bins=50, kde=False, fit=norm, ax=axes[1])
    axes[0].set(xlabel='售價/萬', ylabel='概率密度')
    axes[1].set(xlabel='單價/元每平米', ylabel='概率密度')
    fig.suptitle('二手房的售價和單價頻率分布直方圖')
    plt.subplots_adjust(wspace=0.4)  # horizontal gap between the two panels
    plt.savefig('price.png')

    plt.show()

    # One standard deviation either side of the mean, for each column.
    min_price = new_data['price'].mean() - new_data['price'].std()
    max_price = new_data['price'].mean() + new_data['price'].std()
    print(min_price,max_price)
    min_average_price = new_data['price_pre'].mean() - new_data['price_pre'].std()
    max_average_price = new_data['price_pre'].mean() + new_data['price_pre'].std()
    print(min_average_price,max_average_price)

def main():
    """Preprocess the module-level ``df`` and run the selected analysis.

    The frame is passed through ``ZcSummary.preprocess_features`` together
    with a list of column names; what that step does to those columns is
    defined in ``data_1`` and is not visible here. Only ``price_analyse`` is
    enabled; uncomment the other calls to produce the remaining charts.
    """
    parm = ["price_pre", "area", 'age', 'floor']
    data_init = ZcSummary()
    data = data_init.preprocess_features(df, parm)
    #region_analyse(data)
    #area_analyse(data)
    #housetype_analyse(data)
    #year_analyse(data)
    #variables_analyse(data)
    #floor_analyse(data)####has####
    price_analyse(data)
# Run main() only when the file is executed as a script. The dunder names are
# restored: ``name == 'main'`` referenced an undefined variable and raised
# NameError.
if __name__ == '__main__':
    # Top-level boundary: print the error message rather than a traceback,
    # as the original script did.
    try:
        main()
    except Exception as e:
        print(e)

數據采集分析