【機器學習】Python sklearn包的使用示例以及引數調優示例
阿新 • • 發佈:2019-01-07
# coding=utf-8
#!/usr/bin/env python
"""Usage examples for scikit-learn classifiers plus a grid-search tuning example.

Notes
-----
1. Written against scikit-learn 0.18.

2. Sample of the iris data set bundled with scikit-learn.

   (1) Feature matrix (type: numpy.ndarray)::

           [[ 6.7  3.   5.2  2.3]
            [ 6.3  2.5  5.   1.9]
            [ 6.5  3.   5.2  2. ]
            [ 6.2  3.4  5.4  2.3]
            [ 5.9  3.   5.1  1.8]]

       Each row is one sample: number of rows = number of samples,
       number of columns = number of features per sample.

   (2) Class vector (type: numpy.ndarray)::

           [0 0 0 ... 0 1 1 1 ... 1 2 2 2 ... 2]

       Each element is the class label of the corresponding sample.

3. Sample of the local table used by getData_2() / getData_3()::

       class0 p1 p2 p3 p4 p5 p6 p7
       0      0  0  0  1  0  0  0
       0      5  9  10 10 0  1  1
       0      0  1  1  0  0  1  0
       0      0  1  1  0  0  1  0

   Each row is one sample; the first element is the class of the sample and
   the remaining elements are its features.
"""
import os
from time import time

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Column layout applied to the local CSV file: class label first, then features.
_LABEL_COLUMN = 'class0'
_FEATURE_COLUMNS = ['pixel0', 'pixel1', 'pixel2', 'pixel3',
                    'pixel4', 'pixel5', 'pixel6']
_COLUMN_NAMES = [_LABEL_COLUMN] + _FEATURE_COLUMNS


def getData_1():
    """Load the iris data set bundled with scikit-learn.

    Returns:
        A tuple ``(X, y)`` holding ``iris.data`` (feature matrix, one sample
        per row) and ``iris.target`` (one class label per sample).
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # The original computed X and y but never returned them.
    return X, y


def _load_dataset(fPath):
    """Read the local CSV file into a DataFrame, or return None if it is missing."""
    if not os.path.exists(fPath):
        print('No such file or directory!')
        return None
    # The file's own header row is skipped and replaced by _COLUMN_NAMES.
    return pd.read_csv(fPath, header=None, skiprows=1, names=_COLUMN_NAMES)


def getData_2(fPath='F:/cleanData_dropSJS.csv'):
    """Load the local CSV and split it 60% / 40% into training and test sets.

    The split is a single random split with ``random_state=0``; it is not
    stratified per class. Features are min-max normalised with a scaler that
    is fitted on the training set only and then applied to the test set.

    Args:
        fPath: Path of the CSV data file.

    Returns:
        ``None`` when the file does not exist (a message is printed),
        otherwise a tuple of four numpy.ndarray objects:
        (1) normalised training matrix, one training sample per row;
        (2) class label of each training sample;
        (3) normalised test matrix, one test sample per row;
        (4) class label of each test sample.
    """
    data = _load_dataset(fPath)
    if data is None:
        return None

    # Only the feature columns go into X. The original passed the whole frame,
    # so the class label leaked into the features.
    X_train, X_test, y_train, y_test = train_test_split(
        data[_FEATURE_COLUMNS], data[_LABEL_COLUMN],
        test_size=0.4, random_state=0)

    min_max_scaler = preprocessing.MinMaxScaler()
    X_train_minmax = min_max_scaler.fit_transform(np.array(X_train))
    # Reuse the training-set scaling; re-fitting on the test set would scale
    # the two sets inconsistently.
    X_test_minmax = min_max_scaler.transform(np.array(X_test))
    return (X_train_minmax, np.array(y_train),
            X_test_minmax, np.array(y_test))


def getData_3(fPath='F:/cleanData_dropSJS.csv', n_splits=10):
    """Load the local CSV and build stratified K-fold train/test combinations.

    With K folds the samples are cut into K parts; each part is used once as
    the test set while the remaining samples form the training set.

    Args:
        fPath: Path of the CSV data file.
        n_splits: Number of folds (K) passed to StratifiedKFold.

    Returns:
        ``None`` when the file does not exist (a message is printed),
        otherwise a dict keyed by ``'1train'``, ``'1trainclass'``,
        ``'1test'``, ``'1testclass'``, ``'2train'``, ... whose values are
        numpy.ndarray objects: the sample matrices and class-label vectors
        of each fold.
    """
    data = _load_dataset(fPath)
    if data is None:
        return None

    dataMatrix = np.array(data)
    classM = dataMatrix[:, 0]    # first column: class label of every sample
    sampleM = dataMatrix[:, 1:]  # remaining columns: one sample per row

    skf = StratifiedKFold(n_splits=n_splits)
    setDict = {}
    for count, (trainI, testI) in enumerate(skf.split(sampleM, classM), 1):
        prefix = str(count)
        # Index-array selection copies the chosen rows in fold order.
        setDict[prefix + 'train'] = sampleM[trainI]
        setDict[prefix + 'trainclass'] = classM[trainI]
        setDict[prefix + 'test'] = sampleM[testI]
        setDict[prefix + 'testclass'] = classM[testI]
    return setDict


def KNN():
    """Return a K-nearest-neighbours classifier with default settings."""
    return neighbors.KNeighborsClassifier()


def LDA():
    """Return a linear discriminant analysis classifier with default settings."""
    return LinearDiscriminantAnalysis()


def SVM():
    """Return a support vector classifier with default settings."""
    return svm.SVC()


def LR():
    """Return a logistic regression classifier with default settings."""
    return LogisticRegression()


def RF():
    """Return a random forest classifier with default settings."""
    return RandomForestClassifier()


def native_bayes_classifier():
    """Return a multinomial naive Bayes classifier with ``alpha=0.01``."""
    return MultinomialNB(alpha=0.01)


def decision_tree_classifier():
    """Return a decision tree classifier with default settings."""
    return tree.DecisionTreeClassifier()


def gradient_boosting_classifier():
    """Return a gradient boosting classifier with 200 estimators."""
    return GradientBoostingClassifier(n_estimators=200)


def _build_classifiers():
    """Return ``(label, classifier)`` pairs in the order they are reported."""
    return [
        ('K Nearest Neighbor', KNN()),
        ('Linear Discriminant Analysis', LDA()),
        ('Support Vector Machine', SVM()),
        ('Logistic Regression', LR()),
        ('Random Forest', RF()),
        ('Native Bayes Classifier', native_bayes_classifier()),
        ('Decision Tree Classifier', decision_tree_classifier()),
        ('Gradient Boosting Decision Tree', gradient_boosting_classifier()),
    ]


def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true class labels.

    Args:
        testPre: Predicted class of each test sample.
        testClass: True class of each test sample.

    Returns:
        ``matches / len(testPre)`` as a float, or 0.0 when ``testPre`` is
        empty (the original raised ZeroDivisionError in that case).
    """
    testNum = len(testPre)
    if testNum == 0:
        return 0.0
    rightNum = sum(1 for predicted, actual in zip(testPre, testClass)
                   if actual == predicted)
    return float(rightNum) / float(testNum)


def report(results, n_top=5488, fPath='F:/grid_search_rf.txt'):
    """Write the grid-search results to a text file, best rank first.

    Args:
        results: Mapping with the keys ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
            (selectRFParam() passes ``GridSearchCV.cv_results_``).
        n_top: Only parameter combinations ranked 1..n_top are written.
        fPath: Output file path; the file is overwritten.
    """
    ranks = np.asarray(results['rank_test_score'])
    # A stable sort keeps tied candidates in their original order, which is
    # the order the per-rank scan in the original produced.
    order = np.argsort(ranks, kind='mergesort')
    with open(fPath, 'w') as f:
        for candidate in order:
            rank = ranks[candidate]
            if rank > n_top:
                break
            f.write("Model with rank: {0}".format(rank) + '\n')
            f.write("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]) + '\n')
            f.write("Parameters: {0}".format(results['params'][candidate]) + '\n')
            f.write("\n")


def selectRFParam():
    """Grid-search random forest hyper-parameters on the getData_2() training set.

    Prints the elapsed search time and the number of candidate settings, then
    writes the detailed results with report(). Returns without searching when
    the data file is missing.
    """
    clf_RF = RF()
    param_grid = {"max_depth": [3, 15],
                  "min_samples_split": [3, 5, 10],
                  "min_samples_leaf": [3, 5, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": list(range(10, 50, 10))}
    # Further grid entries left disabled by the original author:
    # "class_weight": [{0:1,1:13.24503311,2:1.315789474,3:12.42236025,4:8.163265306,5:31.25,6:4.77326969,7:19.41747573}],
    # "max_features": range(3,10),
    # "warm_start": [True, False],
    # "oob_score": [True, False],
    # "verbose": [True, False]
    grid_search = GridSearchCV(clf_RF, param_grid=param_grid, n_jobs=4)

    T = getData_2()
    if T is None:
        return

    # Start timing after loading so the figure covers the search only.
    start = time()
    grid_search.fit(T[0], T[1])  # training matrix and training labels
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.cv_results_['params'])))
    report(grid_search.cv_results_)


def totalAlgorithm_1():
    """Print each classifier's mean recognition rate over the getData_3() folds.

    Every classifier is re-fitted on each fold's training set and scored on
    the matching test set; the per-fold rates are averaged. Returns without
    output when the data file is missing.
    """
    setDict = getData_3()
    if not setDict:
        return

    classifiers = _build_classifiers()
    # Four dict entries (train, trainclass, test, testclass) exist per fold.
    setNums = len(setDict) // 4
    totals = [0.0] * len(classifiers)

    for i in range(1, setNums + 1):
        prefix = str(i)
        trainMatrix = setDict[prefix + 'train']
        trainClass = setDict[prefix + 'trainclass']
        testMatrix = setDict[prefix + 'test']
        testClass = setDict[prefix + 'testclass']
        for idx, (_, clf) in enumerate(classifiers):
            clf.fit(trainMatrix, trainClass)
            totals[idx] += getRecognitionRate(clf.predict(testMatrix), testClass)

    # Three blank lines separate the results from any fitting output.
    print('')
    print('')
    print('')
    for (label, _), total in zip(classifiers, totals):
        print('%s mean recognition rate: %s' % (label, total / float(setNums)))


def totalAlgorithm_2():
    """Print each classifier's recognition rate on the getData_2() split.

    Every classifier is fitted on the training part and scored on the test
    part. Returns without output when the data file is missing.
    """
    T = getData_2()
    if T is None:
        return
    trainMatrix, trainClass, testMatrix, testClass = T

    for label, clf in _build_classifiers():
        clf.fit(trainMatrix, trainClass)
        rate = getRecognitionRate(clf.predict(testMatrix), testClass)
        print('%s recognition rate: %s' % (label, rate))


if __name__ == '__main__':
    print('K個訓練集和測試集的平均識別率')
    totalAlgorithm_1()
    print('每類前x%訓練,剩餘測試,各個模型的識別率')
    totalAlgorithm_2()
    selectRFParam()
    print('隨機森林引數調優完成!')

# Sample output recorded from the original Python 2 version of this script.
# It predates two fixes in getData_2(): the class label is no longer part of
# the feature matrix, and the test set is scaled with the training-set scaler.
# Expect different figures for the second group and a plain
# "label: value" line format instead of the tuple form shown here.
#
# K個訓練集和測試集的平均識別率
# ('K Nearest Neighbor mean recognition rate: ', 0.48914314291650945)
# ('Linear Discriminant Analysis mean recognition rate: ', 0.5284076063968655)
# ('Support Vector Machine mean recognition rate: ', 0.5271199740575014)
# ('Logistic Regression mean recognition rate: ', 0.5620828985391165)
# ('Random Forest mean recognition rate: ', 0.512993404168108)
# ('Native Bayes Classifier mean recognition rate: ', 0.4467074333715003)
# ('Decision Tree Classifier mean recognition rate: ', 0.47351209424438706)
# ('Gradient Boosting Decision Tree mean recognition rate: ', 0.5603633086892212)
# 每類前x%訓練,剩餘測試,各個模型的識別率
# ('K Nearest Neighbor recognition rate: ', 0.9892818863879957)
# ('Linear Discriminant Analysis recognition rate: ', 1.0)
# ('Support Vector Machine recognition rate: ', 0.8928188638799571)
# ('Logistic Regression recognition rate: ', 0.8494105037513398)
# ('Random Forest recognition rate: ', 0.9801714898177921)
# ('Native Bayes Classifier recognition rate: ', 0.7604501607717041)
# ('Decision Tree Classifier recognition rate: ', 1.0)
# ('Gradient Boosting Decision Tree recognition rate: ', 1.0)
# GridSearchCV took 69.51 seconds for 288 candidate parameter settings.
# 隨機森林引數調優完成!
【總結】如果你直接跑我的程式碼需要修改的地方:
(1)程式碼最前面各種匯入的模組你是否已經正確安裝?
(2)getData_2()和getData_3()函式內的fPath變數,即資料來源檔案路徑
(3)如果需要引數調優,設定儲存結果的檔案路徑,程式碼中在report()函式的第一行