通過使用各種演算法(線性迴歸,邏輯迴歸,隨機森林,整合演算法)預測泰坦尼克號上的某個人是否獲救
阿新 • • 發佈:2018-11-28
Python原始碼:
#!/usr/bin/env python
# encoding: utf-8
"""
Predict whether a passenger of the Titanic survived, using several algorithms.

A model is built from each passenger's features.  This first part of the
script loads and cleans the training data and then collects 3-fold
cross-validated linear-regression predictions.

@Company: Institute of Fusion and Plasma Research, School of Electrical
          Engineering, Huazhong University of Science and Technology
@version: V1.0
@author: Victor
@contact: [email protected]
@software: PyCharm
@file: Taitannic.py
@time: 2018/11/20 19:39
"""
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split  # not used in this part of the script

# --- Load and clean the training data ---------------------------------------
taitannic = pd.read_csv("taitannic_train.csv")

# The author's note says describe() reveals missing values in Age; fill them
# with the column median.
taitannic['Age'] = taitannic['Age'].fillna(taitannic['Age'].median())

# Encode the string-valued Sex column as numbers (male -> 0, female -> 1),
# because the estimators below need numeric inputs.
print(taitannic['Sex'].unique())
taitannic.loc[taitannic['Sex'] == 'male', 'Sex'] = 0
taitannic.loc[taitannic['Sex'] == 'female', 'Sex'] = 1
print(taitannic['Sex'].unique())

# Encode Embarked the same way (S -> 0, C -> 1, Q -> 2).  Missing ports are
# filled with 'S' first, which the author states is the most frequent value.
print(taitannic["Embarked"].unique())
taitannic["Embarked"] = taitannic["Embarked"].fillna('S')
taitannic.loc[taitannic["Embarked"] == "S", "Embarked"] = 0
taitannic.loc[taitannic["Embarked"] == "C", "Embarked"] = 1
taitannic.loc[taitannic["Embarked"] == "Q", "Embarked"] = 2
print(taitannic["Embarked"].unique())

# --- Linear regression with 3-fold cross-validation -------------------------
# Feature columns fed to the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

alg = LinearRegression()

# Three contiguous, unshuffled folds.  Arguments are passed by keyword and
# random_state is omitted: it has no effect without shuffling, and recent
# scikit-learn releases reject that combination.
kf = KFold(n_splits=3, shuffle=False)
print(kf)

# One array of raw regression outputs per fold, in fold order.
predictions = []
# split() yields positional row indices for the train and test part of a fold.
for train, test in kf.split(taitannic[predictors]):
    # Select the training rows by position, restricted to the chosen features.
    train_predictors = taitannic[predictors].iloc[train, :]
    train_target = taitannic['Survived'].iloc[train]
    # Fit on the training part of the fold ...
    alg.fit(train_predictors, train_target)
    # ... and predict the held-out part.
    test_predictions = alg.predict(taitannic[predictors].iloc[test, :])
    predictions.append(test_predictions)
import re

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# --- Score the linear-regression predictions --------------------------------
# Stitch the per-fold outputs into one array.
# NOTE(review): this assumes the folds were produced in row order (unshuffled
# K-fold), so that position i lines up with row i of the training frame.
predictions = np.concatenate(predictions, axis=0)

# Threshold the regression output at 0.5 to obtain a 0/1 class label.
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0

# Accuracy is the fraction of rows whose predicted label equals the true one.
# (Summing the *values* of the matching predictions would only count correctly
# predicted survivors, because a correct "0" adds nothing to the sum.)
accuracy = np.mean(predictions == taitannic['Survived'].values)
print(accuracy)

# --- Logistic regression ----------------------------------------------------
alg = LogisticRegression(random_state=1)
# Accuracy for each of the 3 cross-validation folds, then their mean.
scores = cross_val_score(alg, taitannic[predictors], taitannic["Survived"], cv=3)
print(scores.mean())

# --- Clean the test data set the same way as the training data --------------
taitannic_test = pd.read_csv("test.csv")
# Age gaps are filled with the training-set median, Fare gaps with the
# test-set median.
taitannic_test["Age"] = taitannic_test["Age"].fillna(taitannic["Age"].median())
taitannic_test["Fare"] = taitannic_test["Fare"].fillna(taitannic_test["Fare"].median())
taitannic_test.loc[taitannic_test["Sex"] == "male", "Sex"] = 0
taitannic_test.loc[taitannic_test["Sex"] == "female", "Sex"] = 1
taitannic_test["Embarked"] = taitannic_test["Embarked"].fillna("S")
taitannic_test.loc[taitannic_test["Embarked"] == "S", "Embarked"] = 0
taitannic_test.loc[taitannic_test["Embarked"] == "C", "Embarked"] = 1
taitannic_test.loc[taitannic_test["Embarked"] == "Q", "Embarked"] = 2

# --- Random forest classification -------------------------------------------
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

alg = RandomForestClassifier(random_state=1, n_estimators=10,
                             min_samples_split=2, min_samples_leaf=1)
# Three unshuffled folds; no random_state because nothing is shuffled.
kf = KFold(n_splits=3)
scores = cross_val_score(alg, taitannic[predictors], taitannic["Survived"], cv=kf)
print(scores.mean())

# Same evaluation with more trees and larger split/leaf minimums.
alg = RandomForestClassifier(random_state=1, n_estimators=60,
                             min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, taitannic[predictors], taitannic["Survived"], cv=kf)
print(scores.mean())

# --- Engineer extra features to improve the model ---------------------------
# Family size: siblings/spouses plus parents/children.
taitannic["FamilySize"] = taitannic["SibSp"] + taitannic["Parch"]
# Length of the passenger's name.
taitannic["NameLength"] = taitannic["Name"].apply(len)


def get_title(name):
    """Return the title found in *name*, or "" if there is none.

    The title is the first run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen".
    """
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


titles = taitannic["Name"].apply(get_title)
print(titles.value_counts())

# Replace each title by an integer code, to check whether the form of address
# has an influence on survival.  Related titles share one code.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9,
                 "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9,
                 "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v
print(titles.value_counts())
taitannic["Title"] = titles

# --- Explore how informative each feature is --------------------------------
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
              "FamilySize", "Title", "NameLength"]
# Univariate F-test of every feature against the survival label.
selector = SelectKBest(f_classif, k=5)
selector.fit(taitannic[predictors], taitannic["Survived"])
# Turn the p-values into scores: the smaller the p-value, the taller the bar.
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Taller bars mark the more important features; keep only these four.
predictors = ["Pclass", "Sex", "Fare", "Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50,
                             min_samples_split=8, min_samples_leaf=4)

# --- Ensemble classification ------------------------------------------------
# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression, and
# everything with the gradient boosting classifier.
# NOTE(review): as written, both feature lists below contain the same seven
# columns, only in a different order.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]

# Initialize the cross validation folds: three contiguous, unshuffled folds.
# KFold takes the number of splits (not the number of samples) and yields the
# row indices from split(); random_state is omitted because nothing is shuffled.
kf = KFold(n_splits=3)

# One array of 0/1 ensemble predictions per fold, in fold (= row) order.
predictions = []
for train, test in kf.split(taitannic):
    train_target = taitannic["Survived"].iloc[train]
    # Survival probability predicted by each algorithm for the held-out rows.
    full_test_predictions = []
    for alg, predictors in algorithms:
        # Cast once so that fitting and predicting see the same float data.
        features = taitannic[predictors].astype(float)
        alg.fit(features.iloc[train, :], train_target)
        # Column 1 of predict_proba is the probability of class 1 (survived).
        test_predictions = alg.predict_proba(features.iloc[test, :])[:, 1]
        full_test_predictions.append(test_predictions)
    # Average the probabilities over all algorithms, however many are listed,
    # and call a passenger "survived" when the average exceeds 0.5.
    test_predictions = np.mean(full_test_predictions, axis=0)
    test_predictions = (test_predictions > .5).astype(float)
    predictions.append(test_predictions)

# Put all the fold predictions together into one array.
predictions = np.concatenate(predictions, axis=0)

# Accuracy is the fraction of rows whose predicted label equals the true one
# (summing the values of the matching predictions would ignore correct zeros).
accuracy = np.mean(predictions == taitannic["Survived"].values)
print(accuracy)