1. 程式人生 > 專案一:醫療費用預估

專案一:醫療費用預估

今天開始準備記錄自己看過或者參加的專案,並把其中的一些細節在程式碼中寫出來。

開始,載入需要用的庫和資料:

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics

# Load the insurance dataset from the working directory.
# dropna(how='all') removes only rows in which every column is missing;
# rows with some values present are kept.
df = pd.read_csv('./insurance.csv').dropna(how='all')

# Summary statistics of the columns (displayed when run in a notebook cell).
df.describe()

得到資料分佈情況:

df.head()

# Pairwise correlation between the numeric columns.
# The frame still contains string columns (sex, smoker, region) at this
# point, so restrict to numeric dtypes explicitly: recent pandas versions
# raise instead of silently skipping non-numeric columns.
corrMatrix = df.select_dtypes(include='number').corr()

sns.set(font_scale=1.10)
plt.figure(figsize=(8, 8))
sns.heatmap(corrMatrix, vmax=.8, linewidths=0.01,
            square=True, annot=True, cmap='viridis', linecolor="white")
plt.title('Correlation between numeric features')
# Render the figure now, as every other plotting section does.
plt.show()

# Select the feature columns used by the exploratory plots.
# bmi_int is bmi truncated to a whole number.
df['bmi_int'] = df['bmi'].map(int)
variables = [
    'sex', 'smoker', 'region',
    'age', 'bmi_int', 'children',
]
# BMI (body mass index) = weight (kg) / height (m) ** 2

# Distribution analysis: one bar chart of value frequencies per feature.
print('資料分佈分析:')
for column in variables:
    # Re-order the frame by the current feature before counting.
    df = df.sort_values(by=[column])
    frequencies = df[column].value_counts()
    frequencies.plot.bar()
    plt.title(column)
    plt.show()

# Average medical charges per value of each feature, one bar chart each.
print('平均醫療開銷分析:')
for v in variables:
    # Aggregate only `charges`: the frame still holds string columns
    # (sex, smoker, region) here, and averaging those raises on recent
    # pandas versions. groupby returns the groups ordered by key, so no
    # extra sort is needed.
    group_df = df.groupby(v)[['charges']].mean()
    group_df.plot(y=['charges'], kind='bar')
    plt.show()

# Pairwise analysis: scatter-plot matrix of the features plus the label.
print('兩兩變數分析:')
variables = [
    'sex', 'smoker', 'region',
    'age', 'bmi_int', 'children',
    'charges',
]
pairplot_frame = df[variables]
sns_plot = sns.pairplot(pairplot_frame)
plt.show()

print('建模與評估\n\n')

# Encode the categorical columns as integers.
# One encoder per column is kept so that new samples can later be encoded
# with the same mapping.
le_sex, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()

# fit() only learns the mapping from the data; transform() applies a learned
# mapping; fit_transform() does both in one call on the same data.
# See also https://blog.csdn.net/weixin_38278334/article/details/82971752
for column, encoder in (
    ('sex', le_sex),
    ('smoker', le_smoker),
    ('region', le_region),
):
    df[column] = encoder.fit_transform(df[column])

# Feature columns fed to the model (raw bmi rather than the truncated bmi_int).
variables = ['sex','smoker','region','age','bmi','children']

X = df[variables]
Y = df['charges']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Fit the scaler on the training split only, then apply the same scaling to
# the test split. Fitting on the full data before splitting would let
# test-set statistics leak into preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train a random forest with 200 trees on the training split
# (fit() returns the fitted estimator itself).
regressor = RandomForestRegressor(n_estimators=200).fit(X_train, y_train)

# Predict on both splits so that train and test errors can be compared.
y_train_pred, y_test_pred = regressor.predict(X_train), regressor.predict(X_test)

# MAE:  Mean Absolute Error
# RMSE: Root Mean Square Error (square root of the mean squared error)
train_mae = sklearn.metrics.mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_train, y_train_pred))
test_mae = sklearn.metrics.mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_test_pred))

print('RandomForestRegressor evaluating result:')
print("Train MAE: ", train_mae)
print("Train RMSE: ", train_rmse)
print("Test MAE: ", test_mae)
print("Test RMSE: ", test_rmse)

print('特徵重要度排序\n')
importances = regressor.feature_importances_

# Standard deviation of each feature's importance across the individual
# trees; regressor.estimators_ is the list of fitted trees in the forest.
per_tree_importances = [tree.feature_importances_ for tree in regressor.estimators_]
std = np.std(per_tree_importances, axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Feature names in ranked order, printed with their importance scores.
importance_list = [variables[position] for position in indices]
for rank, (variable, score) in enumerate(zip(importance_list, importances[indices]), start=1):
    print("%d.%s(%f)" % (rank, variable, score))

# Bar chart of the ranked importances, with the per-tree spread as error bars.
plt.figure()
plt.title("feature importance")
plt.bar(
    importance_list,
    importances[indices],
    color="r",
    yerr=std[indices],
    align="center",
)
plt.show()

print('在新資料上進行預測\n\n')


def _predict_charges(profile):
    """Return the model's predicted charges for one raw sample.

    `profile` is a sequence [sex, smoker, region, age, bmi, children], with
    the three categorical fields given as their original string labels.
    The categorical fields are encoded with the fitted label encoders, the
    row is scaled with the fitted scaler, and the regressor's prediction for
    that single row is returned. The input sequence is not modified.
    """
    sex, smoker, region, age, bmi, children = profile
    encoded_row = [
        le_sex.transform([sex])[0],
        le_smoker.transform([smoker])[0],
        le_region.transform([region])[0],
        age,
        bmi,
        children,
    ]
    return regressor.predict(sc.transform([encoded_row]))[0]


billy = ['male','yes','southeast',25,30.5,2]
print('Billy - ',str(billy))
cost_for_billy = _predict_charges(billy)
print('Billy的醫療開銷 = ',cost_for_billy,'\n\n')


dennis = ['female','no','southeast',45,19,0]
print('Dennis - ',str(dennis))
cost_for_dennis = _predict_charges(dennis)

print('Dennis的醫療開銷 = ',cost_for_dennis)


完。