1. 程式人生 > >【機器學習】Python sklearn包的使用示例以及引數調優示例

【機器學習】Python sklearn包的使用示例以及引數調優示例

#!/usr/bin/env python
# coding=utf-8
'''
【說明】
1.當前sklearn版本0.18
2.sklearn自帶的鳶尾花資料集樣例:
(1)樣本特徵矩陣(型別:numpy.ndarray)
 [[ 6.7  3.   5.2  2.3]
 [ 6.3  2.5  5.   1.9]
 [ 6.5  3.   5.2  2. ]
 [ 6.2  3.4  5.4  2.3]
 [ 5.9  3.   5.1  1.8]]
 每行是一個樣本,矩陣行數=樣本總數,矩陣列數=每個樣本特徵數
 (2)樣本類別矩陣(型別:numpy.ndarray)
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
 每個元素對應一個樣本的類標
 3.本地excel表的資料集樣例:
class0  p1  p2  p3  p4  p5  p6  p7
0   0   0   0   1   0   0   0
0   5   9   10  10  0   1   1
0   0   1   1   0   0   1   0
0   0   1   1   0   0   1   0
每行是一個樣本,每行第一個元素是樣本所屬類別,後續元素是樣本的特徵
'''
import os
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier

# Load the iris data set bundled with scikit-learn
def getData_1():
    """Load scikit-learn's built-in iris data set.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``iris.data`` (the sample feature
        matrix, one row per sample) and ``y`` is ``iris.target`` (the class
        label of each sample).
    """
    iris = datasets.load_iris()
    X = iris.data    # feature matrix: one row per sample
    y = iris.target  # class label of each sample
    # The original built X and y but dropped them; hand them to the caller.
    return X, y

# Load the local CSV data set and make a single 60% / 40% train/test split
def getData_2(fPath='F:/cleanData_dropSJS.csv'):
    """Read the CSV data set and split it into min-max scaled train/test sets.

    The first row of the file is skipped as a header. The first column
    (``class0``) is used as the class label and the remaining seven columns
    (``pixel0`` .. ``pixel6``) as features. 60% of the rows go to the training
    set and 40% to the test set (``random_state=0``; the split is random, it
    is not stratified per class).

    Args:
        fPath: Path of the CSV file to read.

    Returns:
        A tuple of four ``numpy.ndarray`` objects
        ``(train_features, train_labels, test_features, test_labels)``,
        or ``None`` (after printing a message) when ``fPath`` does not exist.
    """
    if not os.path.exists(fPath):
        print('No such file or directory!')
        return None
    columns = ['class0', 'pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4',
               'pixel5', 'pixel6']
    data = pd.read_csv(fPath, header=None, skiprows=1, names=columns)
    labels = data['class0']
    # Keep the label column out of the feature matrix; leaving it in lets
    # every classifier read the answer straight from its input.
    features = data.drop('class0', axis=1)
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        features, labels, test_size=0.4, random_state=0)
    min_max_scaler = preprocessing.MinMaxScaler()  # min-max normalisation
    X_train_minmax = min_max_scaler.fit_transform(np.array(X_train1))
    # Reuse the ranges learned from the training set; re-fitting on the test
    # set would scale the two sets differently.
    X_test_minmax = min_max_scaler.transform(np.array(X_test1))
    return (X_train_minmax, np.array(y_train1),
            X_test_minmax, np.array(y_test1))

# Load the local CSV data set and build K stratified train/test combinations
def getData_3(fPath='F:/cleanData_dropSJS.csv'):
    """Read the CSV data set and split it into 10 stratified folds.

    The first row of the file is skipped as a header. The first column is the
    class label and the remaining columns are the features. Each of the 10
    folds produced by ``StratifiedKFold`` is used once as the test set while
    the other rows form the matching training set.

    Args:
        fPath: Path of the CSV file to read.

    Returns:
        A dict with four entries per fold, keyed ``'<k>train'``,
        ``'<k>trainclass'``, ``'<k>test'`` and ``'<k>testclass'`` for
        ``k = 1 .. 10``; every value is a ``numpy.ndarray``. Returns ``None``
        (after printing a message) when ``fPath`` does not exist.
    """
    if not os.path.exists(fPath):
        print('No such file or directory!')
        return None
    columns = ['class0', 'pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4',
               'pixel5', 'pixel6']
    dataMatrix = np.array(
        pd.read_csv(fPath, header=None, skiprows=1, names=columns))
    classM = dataMatrix[:, 0]    # 1-D array: class label of every sample
    sampleM = dataMatrix[:, 1:]  # 2-D array: one row of features per sample
    skf = StratifiedKFold(n_splits=10)
    setDict = {}
    for count, (trainI, testI) in enumerate(skf.split(sampleM, classM), 1):
        prefix = str(count)
        # Index arrays select all rows of a fold at once, replacing the
        # per-sample loops that rebuilt list(classM) on every iteration.
        setDict[prefix + 'train'] = sampleM[trainI]
        setDict[prefix + 'trainclass'] = classM[trainI]
        setDict[prefix + 'test'] = sampleM[testI]
        setDict[prefix + 'testclass'] = classM[testI]
    return setDict

# K Nearest Neighbor
def KNN():
    """Return a new ``KNeighborsClassifier`` created with no arguments."""
    return neighbors.KNeighborsClassifier()

# Linear Discriminant Analysis
def LDA():
    """Return a new ``LinearDiscriminantAnalysis`` created with no arguments."""
    return LinearDiscriminantAnalysis()

# Support Vector Machine
def SVM():
    """Return a new ``svm.SVC`` classifier created with no arguments."""
    return svm.SVC()

# Logistic Regression
def LR():
    """Return a new ``LogisticRegression`` classifier created with no arguments."""
    return LogisticRegression()

# Random Forest
def RF():
    """Return a new ``RandomForestClassifier`` created with no arguments."""
    return RandomForestClassifier()

# Multinomial naive Bayes classifier
def native_bayes_classifier():
    """Return a new ``MultinomialNB`` classifier with ``alpha`` set to 0.01."""
    return MultinomialNB(alpha=0.01)

# Decision tree
def decision_tree_classifier():
    """Return a new ``tree.DecisionTreeClassifier`` created with no arguments."""
    return tree.DecisionTreeClassifier()

# GBDT (gradient boosting decision trees)
def gradient_boosting_classifier():
    """Return a new ``GradientBoostingClassifier`` with 200 estimators."""
    return GradientBoostingClassifier(n_estimators=200)

# Compute the recognition rate
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true class label.

    Args:
        testPre: Predicted labels; its length is the number of samples scored.
        testClass: True labels, indexed in step with ``testPre``.

    Returns:
        The number of positions where both labels agree divided by
        ``len(testPre)``, as a float.
    """
    total = len(testPre)
    hits = sum(1 for idx, predicted in enumerate(testPre)
               if testClass[idx] == predicted)
    return float(hits) / float(total)

# Write the detailed grid-search results to a local text file
def report(results, n_top=5488, fPath='F:/grid_search_rf.txt'):
    """Write the best-ranked parameter combinations of a grid search to a file.

    Candidates are written in ascending rank order; candidates that share a
    rank keep their original relative order.

    Args:
        results: Mapping with the keys ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
            (for example ``GridSearchCV.cv_results_``).
        n_top: Highest rank to include; candidates ranked ``1 .. n_top`` are
            written.
        fPath: Path of the text file to (over)write.
    """
    ranks = np.asarray(results['rank_test_score'])
    # One stable sort replaces rescanning the whole rank array once per rank
    # (5488 scans with the default n_top).
    order = np.argsort(ranks, kind='mergesort')
    # 'with' closes the file even if formatting or writing raises.
    with open(fPath, 'w') as f:
        for candidate in order:
            rank = ranks[candidate]
            if rank < 1:
                continue  # only ranks 1 .. n_top are reported
            if rank > n_top:
                break     # sorted ascending: nothing further qualifies
            f.write("Model with rank: {0}".format(rank) + '\n')
            f.write("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]) + '\n')
            f.write("Parameters: {0}".format(results['params'][candidate]) + '\n')
            f.write("\n")

# Automatic parameter tuning (random forest used as the example)
def selectRFParam():
    """Grid-search random-forest hyper-parameters and save the ranking.

    The training part of the data returned by ``getData_2()`` is fed to
    ``GridSearchCV`` (4 parallel jobs). The grid below has
    2 * 3 * 3 * 2 * 2 * 4 = 288 candidate settings. The elapsed fitting time
    is printed and the results are written to disk by ``report()``.

    Does nothing when ``getData_2()`` returns ``None`` (data file missing).
    """
    T = getData_2()  # load the data set
    if T is None:
        # getData_2 has already printed why; indexing None would only add a
        # confusing TypeError on top of that message.
        return
    clf_RF = RF()
    param_grid = {"max_depth": [3, 15],
                  "min_samples_split": [3, 5, 10],
                  "min_samples_leaf": [3, 5, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": list(range(10, 50, 10))}
                  # Further parameters that were tried at some point:
                  # "class_weight": [{0:1,1:13.24503311,2:1.315789474,3:12.42236025,4:8.163265306,5:31.25,6:4.77326969,7:19.41747573}],
                  # "max_features": range(3,10),
                  # "warm_start": [True, False],
                  # "oob_score": [True, False],
                  # "verbose": [True, False]}
    grid_search = GridSearchCV(clf_RF, param_grid=param_grid, n_jobs=4)
    # Start the clock only now so the reported time covers the search itself
    # and not the CSV loading.
    start = time()
    grid_search.fit(T[0], T[1])  # training matrix and training labels
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.cv_results_['params'])))
    report(grid_search.cv_results_)

# "Main" function 1: mean recognition rate over the K folds from getData_3()
def totalAlgorithm_1():
    """Train and score every classifier on each fold and print the mean rates.

    For every train/test combination returned by ``getData_3()`` each
    classifier is fitted on the training part and scored on the test part
    with ``getRecognitionRate``; the per-classifier rates are averaged over
    all folds and printed.

    Does nothing when ``getData_3()`` returns ``None`` (data file missing).
    """
    setDict = getData_3()
    if setDict is None:
        # getData_3 has already reported the missing file.
        return
    # (label printed with the result, classifier instance)
    classifiers = [
        ('K Nearest Neighbor mean recognition rate: ', KNN()),
        ('Linear Discriminant Analysis mean recognition rate: ', LDA()),
        ('Support Vector Machine mean recognition rate: ', SVM()),
        ('Logistic Regression mean recognition rate: ', LR()),
        ('Random Forest mean recognition rate: ', RF()),
        ('Native Bayes Classifier mean recognition rate: ', native_bayes_classifier()),
        ('Decision Tree Classifier mean recognition rate: ', decision_tree_classifier()),
        ('Gradient Boosting Decision Tree mean recognition rate: ', gradient_boosting_classifier()),
    ]
    # Four dict entries per fold. Floor division keeps this an int on
    # Python 3 as well, where "/" would hand range() a float.
    setNums = len(setDict) // 4
    rateSums = [0.0] * len(classifiers)  # running sum of rates per classifier
    for i in range(1, setNums + 1):
        trainMatrix = setDict[str(i) + 'train']
        trainClass = setDict[str(i) + 'trainclass']
        testMatrix = setDict[str(i) + 'test']
        testClass = setDict[str(i) + 'testclass']
        for k, (_, clf) in enumerate(classifiers):
            clf.fit(trainMatrix, trainClass)
            rateSums[k] += getRecognitionRate(clf.predict(testMatrix), testClass)
    # Three blank lines before the summary.
    for _ in range(3):
        print('')
    # Print the mean rate of every classifier. A single tuple argument gives
    # the same "('label', value)" output on Python 2 and Python 3, matching
    # the sample output recorded at the end of this file.
    for (label, _), total in zip(classifiers, rateSums):
        print((label, total / float(setNums)))

# "Main" function 2: recognition rate on the single split from getData_2()
def totalAlgorithm_2():
    """Train every classifier on the getData_2() split and print its rate.

    Each classifier is fitted on the training part of the data returned by
    ``getData_2()`` and scored on the test part with ``getRecognitionRate``.

    Does nothing when ``getData_2()`` returns ``None`` (data file missing).
    """
    T = getData_2()
    if T is None:
        # getData_2 has already reported the missing file.
        return
    trainMatrix, trainClass, testMatrix, testClass = T[0], T[1], T[2], T[3]
    # (label printed with the result, classifier instance)
    classifiers = [
        ('K Nearest Neighbor recognition rate: ', KNN()),
        ('Linear Discriminant Analysis recognition rate: ', LDA()),
        ('Support Vector Machine recognition rate: ', SVM()),
        ('Logistic Regression recognition rate: ', LR()),
        ('Random Forest recognition rate: ', RF()),
        ('Native Bayes Classifier recognition rate: ', native_bayes_classifier()),
        ('Decision Tree Classifier recognition rate: ', decision_tree_classifier()),
        ('Gradient Boosting Decision Tree recognition rate: ', gradient_boosting_classifier()),
    ]
    # Fit everything first, then score, as two separate passes.
    for _, clf in classifiers:
        clf.fit(trainMatrix, trainClass)
    # A single tuple argument gives the same "('label', value)" output on
    # Python 2 and Python 3, matching the sample output recorded at the end
    # of this file.
    for label, clf in classifiers:
        print((label, getRecognitionRate(clf.predict(testMatrix), testClass)))

# Script entry point: run both evaluations, then tune the random forest.
if __name__ == '__main__':
    # Each evaluation is announced with its banner and then executed.
    for banner, evaluate in (
            ('K個訓練集和測試集的平均識別率', totalAlgorithm_1),
            ('每類前x%訓練,剩餘測試,各個模型的識別率', totalAlgorithm_2)):
        print(banner)
        evaluate()
    selectRFParam()
    print('隨機森林引數調優完成!')

'''
【輸出結果】
K個訓練集和測試集的平均識別率
('K Nearest Neighbor mean recognition rate: ', 0.48914314291650945)
('Linear Discriminant Analysis mean recognition rate: ', 0.5284076063968655)
('Support Vector Machine mean recognition rate: ', 0.5271199740575014)
('Logistic Regression mean recognition rate: ', 0.5620828985391165)
('Random Forest mean recognition rate: ', 0.512993404168108)
('Native Bayes Classifier mean recognition rate: ', 0.4467074333715003)
('Decision Tree Classifier mean recognition rate: ', 0.47351209424438706)
('Gradient Boosting Decision Tree mean recognition rate: ', 0.5603633086892212)
每類前x%訓練,剩餘測試,各個模型的識別率
('K Nearest Neighbor recognition rate: ', 0.9892818863879957)
('Linear Discriminant Analysis recognition rate: ', 1.0)
('Support Vector Machine recognition rate: ', 0.8928188638799571)
('Logistic Regression recognition rate: ', 0.8494105037513398)
('Random Forest recognition rate: ', 0.9801714898177921)
('Native Bayes Classifier recognition rate: ', 0.7604501607717041)
('Decision Tree Classifier recognition rate: ', 1.0)
('Gradient Boosting Decision Tree recognition rate: ', 1.0)
GridSearchCV took 69.51 seconds for 288 candidate parameter settings.
隨機森林引數調優完成!
'''

【總結】如果你直接跑我的程式碼需要修改的地方:
(1)程式碼最前面各種匯入的模組你是否已經正確安裝?
(2)getData_2()和getData_3()函式內的fPath變數,即資料來源檔案路徑
(3)如果需要引數調優,設定儲存結果的檔案路徑,程式碼中在report()函式的第一行