Hands-on: Training data with the logistic regression model in Python's sklearn library --- building the model

This case study builds a model on a highly imbalanced dataset of roughly 280,000 records to detect and predict fraudulent users. Logistic regression is used (the sigmoid function maps the linear score to a probability, and the log-loss is what gets minimized), together with cross-validation and L1 regularization. The recall of the model is compared across different values of the penalty parameter, and a confusion matrix built from predicted vs. actual values gives a more intuitive view of the different prediction outcomes.
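For reference, sklearn's LogisticRegression with penalty='l1' minimizes roughly the following objective (this is the library's documented formulation); C is the inverse of the regularization strength, so a smaller C means a stronger L1 penalty:

$$\min_{w,\,b}\;\|w\|_1 \;+\; C\sum_{i=1}^{n}\log\!\bigl(1+e^{-y_i\,(x_i^{\top}w+b)}\bigr),\qquad y_i\in\{-1,+1\}$$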

It also compares the precision and recall of the model's predictions at different thresholds applied to the predicted probability (the sigmoid output).
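Both metrics can be read off a confusion matrix. A minimal sketch (the helper name below is just for illustration, it is not part of the script) that turns predicted probabilities into labels at a given threshold and prints recall and precision:

from sklearn.metrics import confusion_matrix

def report_threshold_metrics(y_true, proba_positive, threshold=0.5):
    # Label a sample as fraud (1) when its predicted probability exceeds the threshold
    y_pred = proba_positive > threshold
    # For binary labels, ravel() returns the counts in the order TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print("recall:   ", tp / (tp + fn))   # share of true frauds that were caught
    print("precision:", tp / (tp + fp))   # share of flagged transactions that are really fraud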

Part of the data format is shown below, and the code explains in detail how and why each step is done. Note that some library calls may raise errors under different library versions.
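As one concrete example of those version differences (a sketch, with the import that most often breaks when running this script), sklearn moved its splitting utilities from sklearn.cross_validation to sklearn.model_selection, and a few other calls have renamed counterparts:

# Newer sklearn releases removed sklearn.cross_validation; fall back for very old installs
try:
    from sklearn.model_selection import train_test_split, KFold
except ImportError:
    from sklearn.cross_validation import train_test_split, KFold

# Other version-dependent spots in the script below:
#   - pandas removed DataFrame.ix, use .loc / .iloc instead
#   - pandas removed Series.reshape, use .values.reshape(-1, 1)
#   - newer imblearn renamed SMOTE.fit_sample() to fit_resample()
#   - newer sklearn needs solver='liblinear' (or 'saga') for penalty='l1'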

Data format:
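(The sample rows themselves are not reproduced here. The code below assumes the layout of the standard Kaggle credit-card fraud dataset: a Time column, anonymized PCA feature columns V1 to V28, the transaction Amount, and the Class label in the last column, where 1 marks a fraudulent transaction and 0 a normal one.)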

Python source code:

#!/usr/bin/env python
# encoding: utf-8
"""
@Company: 華中科技大學電氣學院聚變與等離子研究所
@version: V1.0
@author: Victor
@contact: [email protected] or [email protected] 2018--2020
@software: PyCharm
@file: LG.py
@time: 2018/11/16 16:32
@Desc:
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv("creditcard.csv")
data.head()

## In this case study, Class is the label: 1 means the user is fraudulent, 0 means a normal user
count_classes = pd.value_counts(data['Class'], sort=True).sort_index()  ## count each distinct value in the Class column and sort
print(count_classes)
plt.figure(1)
count_classes.plot(kind='bar')  ## use pandas' own plot to draw the bar chart
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

from sklearn.preprocessing import StandardScaler
## Standardize Amount: otherwise its large magnitude would be mistaken for feature importance and bias the model
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
data.head(3)

X = data.loc[:, data.columns != 'Class']  ## all columns except Class
y = data.loc[:, data.columns == 'Class']
#X.head()
#y.head()

number_records_fraud = len(data[data.Class == 1])  ## number of rows with Class == 1
fraud_indices = np.array(data[data.Class == 1].index)  ## index numbers of all Class == 1 rows in the original file
##print(fraud_indices)
normal_indices = data[data.Class == 0].index

## Undersampling: keep only as many normal samples as there are fraud samples
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
#print(random_normal_indices)
random_normal_indices = np.array(random_normal_indices)  ## convert to an array for convenience
#print(random_normal_indices)

## Merge the index numbers of the equally sized Class 0 and Class 1 samples
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
#print(under_sample_indices)
## Fetch the actual rows by index
under_sample_data = data.iloc[under_sample_indices, :]
under_sample_data.head()

### Split the undersampled data into features and label
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))  ## total sample count

## Start training the data and building the model
from sklearn.model_selection import train_test_split  ## (sklearn.cross_validation in old sklearn versions)

## whole dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  ## split the original data: 30% for testing, 70% for model building
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

## undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample,
                                                                                                     y_undersample,
                                                                                                     test_size=0.3,
                                                                                                     random_state=0)
print("==============================================")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample) + len(X_test_undersample))

## Recall = TP / (TP + FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score  ## (sklearn.cross_validation in old sklearn versions)
from sklearn.metrics import confusion_matrix, recall_score, classification_report

### Cross-validation
def printing_Kfold_socres(x_train_data, y_train_data):
    ## Split the training set into 5 folds, use each fold in turn as the validation set, then average the results
    fold = KFold(n_splits=5, shuffle=False)

    ## Different penalty parameters: the regularization strength strongly affects the fitted coefficients,
    ## so a parameter is needed to control the penalty
    c_param_range = [0.01, 0.1, 1, 10, 100]

    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=['C_parameter', 'Mean recall score'])  ## store the results
    results_table['C_parameter'] = c_param_range
    # print(results_table)

    ### each k-fold split gives 2 lists: train_indices = indices[0], test_indices = indices[1]
    j = 0  ## row counter for the parameter currently being evaluated
    ## loop over the cross-validation results for each penalty parameter
    for c_param in c_param_range:
        print("=====================")
        print("current c parameter:", c_param)
        print("=====================")
        print("\n")

        recall_accs = []  ## store the recall of each fold for this penalty parameter
        ##### enumerate the folds starting from 1 (the default is 0)
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            ### iteration is the fold number, indices are just row positions
            # >>> seasons = ['Spring', 'Summer', 'Fall', 'Winter']
            # >>> list(enumerate(seasons))
            # [(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
            # >>> list(enumerate(seasons, start=1))   # index starts from 1
            # [(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]

            ## build a logistic regression model with this penalty parameter
            ## lr is the model instance, penalty selects the regularization type
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            ## train the model
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            ## predict on the validation fold with the model trained above
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            ## recall_score computes the recall; collect the result
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print("iteration:", iteration, " recall score:", recall_acc)

        ## mean recall for this penalty parameter, stored in row j of the 'Mean recall score' column
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('=======')
        print("Mean recall score:", np.mean(recall_accs))
        print("=============================")

    ## the table shows which penalty parameter gives the highest mean recall
    return results_table


############ Cross-validation on the undersampled data #############
results_table = printing_Kfold_socres(X_train_undersample, y_train_undersample)
print(results_table)
#print("**********************************")

############ Cross-validation on the original dataset ########
### the recall is very, very low
#results_table1 = printing_Kfold_socres(X_train, y_train)
#print(results_table1)


############## Confusion matrix from the model predictions ##############
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


### Fit the model, predict, and draw the confusion matrix from the predictions
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)  ### predict() returns the class labels 0/1 directly

# Compute the confusion matrix from predicted and true values
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure(2)
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
#plt.show()


##### The decision threshold on the predicted probability (the sigmoid output) strongly affects the predictions,
##### so look at the recall and precision at each threshold to find the most suitable one,
##### using the confusion matrices
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()


'''Oversampling: make the imbalanced classes equally large, usually with the SMOTE algorithm
(pick a point among the minority-class samples, compute its distances to the other minority samples,
and interpolate along them to generate new synthetic samples).
Compared with undersampling, oversampling can lower the recall somewhat, raise the precision,
and reduce the number of normal transactions wrongly flagged.'''
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards = pd.read_csv('creditcard.csv')
columns = credit_cards.columns
# The labels are in the last column ('Class'). Simply remove it to obtain the feature columns
features_columns = columns.delete(len(columns) - 1)

features = credit_cards[features_columns]
labels = credit_cards['Class']

features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                             labels,
                                                                             test_size=0.2,
                                                                             random_state=0)
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)  ## fit_sample() in older imblearn versions
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

best_c = printing_Kfold_socres(os_features, os_labels)

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
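A small follow-up sketch (not part of the script above): printing_Kfold_socres returns the whole results table rather than a single value, so if you want to pull out the C with the highest mean recall automatically, something like this works; the astype call is needed because the column starts out with object dtype.

# Pick the penalty parameter whose mean recall is highest in the returned table
best_row = results_table['Mean recall score'].astype('float64').idxmax()
best_c = results_table.loc[best_row, 'C_parameter']
print("C with the best mean recall:", best_c)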
Because there are many result figures, they are not shown here.