
Binary Classification (Titanic Survival Prediction)

# -*- coding: utf-8 -*-


# @Time    : 2018/12/13 10:46
# @Author  : WenZhao
# @Email   : [email protected]
# @File    : tt.py
# @Software: PyCharm
'''
    Titanic survival prediction (binary classification with linear regression)
    1. Data preprocessing with pandas
    2. NumPy arrays as the TensorFlow input
    3. A sigmoid on top of linear regression to turn it into a binary classifier
    4. Cross-entropy loss
    5. Mini-batch training (more batches per epoch give more stable results)
    6. Training speed
    7. Visualizing the training results
'''

import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv('./data/tt/train.csv')
data=data[[ 'Survived', 'Pclass','Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked']]
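# Fill missing ages with the mean age, encode Cabin strings as integer codes,
# and replace any remaining NaNs with 0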
data['Age']=data['Age'].fillna(data['Age'].mean())
data['Cabin']=pd.factorize(data.Cabin)[0]
data.fillna(0,inplace=True)
# print(data['Pclass'])
data['Sex']=[1 if x=='male' else 0 for x in data.Sex]
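# One-hot encode Pclass into three indicator columns p1/p2/p3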
data['p1']=np.array(data['Pclass']==1).astype(np.int32)
data['p2']=np.array(data['Pclass']==2).astype(np.int32)
data['p3']=np.array(data['Pclass']==3).astype(np.int32)
del data['Pclass']
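# One-hot encode the embarkation port (S/C/Q) into e1/e2/e3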
data['e1']=np.array(data['Embarked']=='S').astype(np.int32)
data['e2']=np.array(data['Embarked']=='C').astype(np.int32)
data['e3']=np.array(data['Embarked']=='Q').astype(np.int32)
del data['Embarked']
data_train=data[['Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'p1','p2','p3','e1','e2','e3']].values
data_target=data['Survived'].values.reshape(len(data),1)

# Build the network
x=tf.placeholder("float",shape=[None,12])
y=tf.placeholder("float",shape=[None,1])
# output=weight*x+bias
weight=tf.Variable(tf.random_normal([12,1]))
bias=tf.Variable(tf.random_normal([1]))
output=tf.matmul(x,weight)+bias
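# Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction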
pred=tf.cast(tf.sigmoid(output)>0.5,tf.float32)

# Loss function
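# sigmoid_cross_entropy_with_logits fuses the sigmoid and the cross-entropy,
# so it is fed the raw logits (output), not the sigmoid probabilities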
loss=tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y,logits=output))

# Train with gradient descent
train_step=tf.train.GradientDescentOptimizer(0.0003).minimize(loss)

# Compute accuracy
accuracy=tf.reduce_mean(tf.cast(tf.equal(pred,y),tf.float32))

# Prepare the test set (same preprocessing as the training data)

data_test=pd.read_csv('./data/tt/test.csv')
data_test=data_test[[ 'Pclass','Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked']]
data_test['Age']=data_test['Age'].fillna(data_test['Age'].mean())
data_test['Cabin']=pd.factorize(data_test.Cabin)[0]
data_test.fillna(0,inplace=True)
# print(data['Pclass'])
data_test['Sex']=[1 if x=='male' else 0 for x in data_test.Sex]
data_test['p1']=np.array(data_test['Pclass']==1).astype(np.int32)
data_test['p2']=np.array(data_test['Pclass']==2).astype(np.int32)
data_test['p3']=np.array(data_test['Pclass']==3).astype(np.int32)
del data_test['Pclass']
data_test['e1']=np.array(data_test['Embarked']=='S').astype(np.int32)
data_test['e2']=np.array(data_test['Embarked']=='C').astype(np.int32)
data_test['e3']=np.array(data_test['Embarked']=='Q').astype(np.int32)
del data_test['Embarked']


test_label=pd.read_csv('./data/tt/gender.csv')
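# The reference labels come as a flat column; reshape them to (418, 1) to match the y placeholder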
test_label=np.reshape(test_label.Survived.values.astype(np.int32),(418,1))

sess=tf.Session()
sess.run(tf.global_variables_initializer())

loss_train=[]
train_acc=[]
test_acc=[]

# Start training
for i in range(25000):
       # Reshuffle the training data at the start of each epoch
       index=np.random.permutation(range(data_target.shape[0]))
       data_train=data_train[index]
       data_target=data_target[index]
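       # Iterate over the shuffled data in mini-batches of 100 rows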
       for n in range(len(data_target)//100+1):
              batch_xs=data_train[n*100:n*100+100]
              batch_ys=data_target[n*100:n*100+100]
              sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})

       if i%1000==0:
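              # Every 1000 epochs, record the loss on the last mini-batch and the train/test accuracy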
              loss_temp=sess.run(loss,feed_dict={x:batch_xs,y:batch_ys})
              loss_train.append(loss_temp)
              train_acc_temp=sess.run(accuracy,feed_dict={x:batch_xs,y:batch_ys})
              train_acc.append(train_acc_temp)
              test_acc_temp=sess.run(accuracy,feed_dict={x:data_test.values,y:test_label})
              test_acc.append(test_acc_temp)
              print(loss_temp,train_acc_temp,test_acc_temp)


plt.plot(train_acc,'b-',label='train_acc')
plt.plot(test_acc,'r--',label='test_acc')

plt.title('train and test accuracy')
plt.legend()
plt.show()
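# The losses collected in loss_train (one value every 1000 epochs) are not plotted above.
# A minimal sketch for plotting them as well, assuming the training loop above has already run:
plt.figure()
plt.plot(loss_train,'k-',label='train_loss')
plt.title('train loss')
plt.legend()
plt.show()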