1. 程式人生 > >機器學習實戰之Titanic(Kaggle)

機器學習實戰之Titanic(Kaggle)

一、乘客資料分析


  1. PassengerId :每一個乘客的標誌符
  2. Survived:Lable值,代表是否獲救
  3. Pclass:乘客艙位等級(1=頭等、2=二等、3=三等)
  4. Name:姓名
  5. Sex:性別
  6. Age:年齡 
  7. SibSp:同船兄弟姐妹及配偶的數量
  8. Parch:同船父母與子女的數量
  9. Ticket:船票的編號
  10. Fare:船票價格
  11. Cabin:船艙位置,此列出現大量缺失,可以不要
  12. Embarked:上船地點

二、資料預處理

1.匯入需要的包

import pandas as pa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

2.觀察資料的前幾行

filename = "train.csv"
titanic = pa.read_csv(filename)
titanic.head()
結果:

3.觀察資料的簡單資料特徵

print titanic.describe()

結果:

    PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000         NaN    0.000000   
50%     446.000000    0.000000    3.000000         NaN    0.000000   
75%     668.500000    1.000000    3.000000         NaN    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
  • 可以看到Age列資料只有714個,其餘列均有891個,因此此列需要對缺失值進行填充
    titanic["Age"]=titanic["Age"].fillna(titanic["Age"].median())
    print titanic.describe()

    結果:

     PassengerId    Survived      Pclass         Age       SibSp  \
    count   891.000000  891.000000  891.000000  891.000000  891.000000   
    mean    446.000000    0.383838    2.308642   29.361582    0.523008   
    std     257.353842    0.486592    0.836071   13.019697    1.102743   
    min       1.000000    0.000000    1.000000    0.420000    0.000000   
    25%     223.500000    0.000000    2.000000   22.000000    0.000000   
    50%     446.000000    0.000000    3.000000   28.000000    0.000000   
    75%     668.500000    1.000000    3.000000   35.000000    1.000000   
    max     891.000000    1.000000    3.000000   80.000000    8.000000   
    
                Parch        Fare  
    count  891.000000  891.000000  
    mean     0.381594   32.204208  
    std      0.806057   49.693429  
    min      0.000000    0.000000  
    25%      0.000000    7.910400  
    50%      0.000000   14.454200  
    75%      0.000000   31.000000  
    max      6.000000  512.329200 
  • 將string值轉為int/float值
            1) 首先,觀察相應列有幾種字串
    • print titanic["Sex"].unique()
      print titanic["Embarked"].unique()

      結果:

      ['male' 'female']
      ['S' 'C' 'Q' nan]
      2) 然後,將相應字串的位置附上對應的Int/float值

      titanic.loc[titanic["Sex"]=="male","Sex"] = 0; 
      titanic.loc[titanic["Sex"]=="female","Sex"] = 1;
      titanic.loc[titanic["Embarked"]=="S","Embarked"] = 0; 
      titanic.loc[titanic["Embarked"]=="C","Embarked"] = 1;
      titanic.loc[titanic["Embarked"]=="Q","Embarked"] = 2;
      titanic.head()

      結果:


      替換成功

三、分類

def data_proprocess(filename="train.csv"):
    """Load the Titanic CSV at *filename* and return a cleaned DataFrame.

    Cleaning steps:
      - missing ``Age`` values are filled with the column median
      - missing ``Embarked`` values are filled with 'S' (the most common port)
      - ``Sex`` is encoded as int: male -> 0, female -> 1
      - ``Embarked`` is encoded as int: S -> 0, C -> 1, Q -> 2

    The ``filename`` parameter (default ``"train.csv"``, matching the old
    hard-coded behavior) lets the same routine preprocess both the train
    and the test split, as the driver code below requires.
    """
    import pandas as pa

    titanic = pa.read_csv(filename)

    # Age has missing values (714 of 891 in the train split); fill with
    # the median so every row has a numeric age.
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    # 'S' (Southampton) is the most frequent embarkation port.
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    # Encode the categorical string columns as integers so sklearn
    # estimators can consume them directly.
    titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
    titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
    titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
    titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
    return titanic

def classify_LinearRegression(titanic):
    """3-fold cross-validated linear-regression baseline.

    Fits LinearRegression on each training fold, thresholds the predicted
    scores at 0.5 into 0/1 survival labels, and returns the overall
    accuracy against the true ``Survived`` column.
    """
    import numpy as np
    from sklearn.cross_validation import KFold
    from sklearn.linear_model import LinearRegression

    predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]  # feature columns

    alg = LinearRegression()
    kf = KFold(titanic.shape[0], n_folds=3, random_state=1)  # 3-fold CV splits
    predictions = []
    for train, test in kf:
        train_predictors = titanic[predictors].iloc[train, :]
        train_target = titanic["Survived"].iloc[train]
        alg.fit(train_predictors, train_target)
        predictions.append(alg.predict(titanic[predictors].iloc[test, :]))

    predictions = np.concatenate(predictions, axis=0)
    # Threshold the regression scores into hard 0/1 class labels.
    predictions[predictions > 0.5] = 1
    predictions[predictions <= 0.5] = 0

    # BUG FIX: the original summed the *values* of the matching predictions
    # (sum(predictions[predictions == titanic['Survived']])), which counts
    # only correctly-predicted survivors and grossly under-reports accuracy
    # (hence the article's 0.26 figure).  Accuracy is the fraction of
    # predictions equal to the true label.
    accuracy = np.mean(predictions == titanic["Survived"])
    return accuracy

def classify_LogisticRegression(titanic):
    """Return the mean 3-fold cross-validation accuracy of a logistic-regression model."""
    from sklearn import cross_validation
    from sklearn.linear_model import LogisticRegression

    # Feature columns fed to the model.
    feature_cols = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
    model = LogisticRegression(random_state=1)
    fold_scores = cross_validation.cross_val_score(
        model, titanic[feature_cols], titanic["Survived"], cv=3)
    return fold_scores.mean()
# Run both baseline classifiers on the preprocessed training data and
# report their cross-validated accuracy.
# (Python 2 print statements — this article predates Python 3 syntax.)
print "LinearRegression Classification result is :"
print classify_LinearRegression(data_proprocess())
print "LogisticRegression Classification result is :"
print classify_LogisticRegression(data_proprocess())
結果:
LinearRegression Classification result is :
0.261503928171
LogisticRegression Classification result is :
0.787878787879

從結果可以看出,還是用邏輯迴歸做分類問題精度更高。

四、使用隨機森林提高分類精度並將結果傳到kaggle

def classify_RandomForestClassifier(train_data, test_data):
    """Train a random forest on *train_data* and predict *test_data*.

    Returns the mean 3-fold cross-validation accuracy on the training set.
    Side effect: writes a Kaggle submission CSV (PassengerId, Survived)
    for the test set to the working directory.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import cross_validation
    import pandas as pa
    import numpy as np

    predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
    clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                 min_samples_split=2, random_state=0)
    # Cross-validate first to estimate generalization accuracy...
    scores = cross_validation.cross_val_score(
        clf, train_data[predictors], train_data["Survived"], cv=3)
    # ...then refit on the full training set for the final submission.
    clf.fit(train_data[predictors], train_data["Survived"])
    predict_result = clf.predict(test_data[predictors])
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0; ``.values`` is the long-supported equivalent.
    result = pa.DataFrame({"PassengerId": test_data["PassengerId"].values,
                           "Survived": predict_result.astype(np.int32)})
    # NOTE(review): the file name says "logistic_regression" but these are
    # random-forest predictions — kept byte-identical for compatibility
    # with any workflow that expects this name.
    result.to_csv("logistic_regression_predictions.csv", index=False)
    return scores.mean()
# Preprocess both splits and produce the Kaggle submission file.
# NOTE(review): these calls require data_proprocess to accept a filename
# argument — verify its signature matches.
print "train"
titanic_train=data_proprocess("train.csv")
print "test"
titanic_test=data_proprocess("test.csv")

classify_RandomForestClassifier(titanic_train,titanic_test)