1. 程式人生 > 10.邏輯迴歸-下采樣、過取樣、交叉驗證

10.邏輯迴歸-下采樣、過取樣、交叉驗證

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
from imblearn.over_sampling import SMOTE


# Load the transactions; the 'Class' column holds the label
# (the rest of this script treats Class == 1 as fraud, Class == 0 as normal).
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)

# Bar chart of the number of rows per class, to visualise the class imbalance.
# Series.value_counts is used because the top-level pd.value_counts helper
# is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

# Standardise 'Amount' with StandardScaler and store it as a new column.
# fit_transform expects a 2-D array, hence the reshape to a single column.
amount_scaler = StandardScaler()
data['new_Amount'] = amount_scaler.fit_transform(
    data['Amount'].values.reshape(-1, 1)
)
# Drop the columns that are not used as features.
data = data.drop(columns=['Time', 'Amount'])

# Feature matrix: every column except the label.
X = data.drop(columns='Class')
# Target: the 'Class' column, kept as a one-column DataFrame.
y = data[['Class']]

# Number of fraud rows (Class == 1) and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_index = np.array(data[data.Class == 1].index)

# Number of normal rows (Class == 0) and their index labels.
number_records_normal = len(data[data.Class == 0])
normal_index = data[data.Class == 0].index

# Under-sampling: randomly pick, without replacement, as many normal rows as
# there are fraud rows. np.random.choice already returns an ndarray.
# (The original author noted 492 picked rows for their copy of the dataset.)
random_normal_index = np.random.choice(normal_index, number_records_fraud, replace=False)

# Join the sampled normal labels and the fraud labels into one array
# (984 labels in the original author's run).
under_sample_index = np.concatenate([random_normal_index, fraud_index])

# Select the under-sampled rows. The values above are index LABELS (taken from
# .index), so label-based .loc is the correct indexer; positional .iloc only
# gives the same rows while the frame still has its default 0..n-1 index.
under_sample_data = data.loc[under_sample_index]

# Split into features (all columns but 'Class') and target ('Class' only, kept
# as a one-column DataFrame). Note the leading ':' — the boolean mask selects
# columns, not rows.
X_under_sample_data = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample_data = under_sample_data.loc[:, under_sample_data.columns == 'Class']



# Complete dataset: hold out 30% of the rows for testing (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Under-sampled dataset: same 30% hold-out with the same random_state.
(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_under_sample_data, y_under_sample_data, test_size=0.3, random_state=0
)



def printing_Kfold_scores(x_train_data: pd.DataFrame, y_train_data: pd.DataFrame):
    """Print 5-fold cross-validated recall of L1 logistic regression per C value.

    For each candidate C, a fresh model is trained on four folds and its recall
    is measured on the remaining fold; the per-fold scores and their mean are
    printed.

    Args:
        x_train_data: Feature rows; indexed positionally with ``.iloc``.
        y_train_data: Labels as a one-column DataFrame with the same row order
            as ``x_train_data`` (it is indexed with ``.iloc[rows, :]``).

    Returns:
        The C value with the highest mean recall (the earliest candidate wins
        ties), or ``None`` if no mean recall was comparable (e.g. all NaN).
    """
    # Contiguous, unshuffled folds; split() yields (train_rows, valid_rows)
    # arrays of row positions.
    kfold = KFold(n_splits=5, shuffle=False)
    # Candidate values for LogisticRegression's C (inverse regularization
    # strength in scikit-learn).
    c_param_range = [0.01, 0.1, 1, 10, 100]

    best_c = None
    best_mean_recall = -1.0
    for c_param in c_param_range:
        print('-----------------------------------')
        print('C Parameter:', c_param)
        print('-----------------------------------')
        print('')
        recall_accs = []
        for iteration, (train_rows, valid_rows) in enumerate(kfold.split(x_train_data), start=1):
            # The solver is pinned to liblinear because the L1 penalty is not
            # supported by the default solver of newer scikit-learn releases.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit on the training folds; ravel() turns the (n, 1) label column
            # into the 1-D array that fit() expects.
            lr.fit(
                x_train_data.iloc[train_rows, :].values,
                y_train_data.iloc[train_rows, :].values.ravel(),
            )
            # Predict the held-out fold and score recall on it.
            y_predicted_undersample = lr.predict(x_train_data.iloc[valid_rows, :].values)
            recall_acc = recall_score(
                y_train_data.iloc[valid_rows, :].values.ravel(),
                y_predicted_undersample,
            )
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, ': Recall Score = ', recall_acc)

        mean_recall = np.mean(recall_accs)
        print('Mean Recall Score:', mean_recall)
        if mean_recall > best_mean_recall:
            best_mean_recall = mean_recall
            best_c = c_param

    return best_c


# Optional: print the per-C cross-validation recall report for the
# under-sampled training split.
# printing_Kfold_scores(X_train_undersample, y_train_undersample)

# Train one model per fold on the under-sampled training split and measure
# its recall on the test split of the complete dataset.
kfold = KFold(n_splits=5, shuffle=False)
recall_accs = []
# The evaluation set does not change between folds, so convert it once.
X_test_values = X_test.values
for train_rows, _ in kfold.split(X_train_undersample):
    # The solver is pinned to liblinear because the L1 penalty is not
    # supported by the default solver of newer scikit-learn releases.
    lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    lr.fit(
        X_train_undersample.iloc[train_rows, :].values,
        y_train_undersample.iloc[train_rows, :].values.ravel(),
    )

    # Alternative 1: evaluate on the under-sampled test split instead.
    # y_predicted_labels = lr.predict(X_test_undersample.values)
    # recall_acc = recall_score(y_test_undersample, y_predicted_labels)

    # Evaluate on the complete dataset's test split.
    y_predicted_labels = lr.predict(X_test_values)
    recall_acc = recall_score(y_test, y_predicted_labels)

    # Alternative 2 (sketch): over-sample the full training split with SMOTE.
    # Newer imbalanced-learn releases name the method fit_resample.
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # over_sampler = SMOTE(random_state=0)
    # os_X, os_y = over_sampler.fit_sample(X_train, y_train)
    # y_predicted_labels = lr.predict(X_test.values)
    # recall_acc = recall_score(y_test, y_predicted_labels)

    print('Recall:', recall_acc)
    recall_accs.append(recall_acc)
print('Recall Means:', np.mean(recall_accs))