1. 程式人生 > >《Python機器學習及實踐》----監督學習經典模型

《Python機器學習及實踐》----監督學習經典模型

本篇部落格整理自《Python機器學習及實踐》一書中的例項,所有程式碼均已在本地執行通過。資料取自該書指定的百度網盤,或是將sklearn自帶的資料集下載到本地後使用。
程式碼片段:

# Worked examples of classic supervised-learning models with scikit-learn:
# classification (breast cancer, digits, 20 newsgroups, iris, Titanic) followed
# by regression on the Boston housing data.
#
# The statements run top to bottom as one script; later sections deliberately
# rebind names (x_train, X_train, lr, ss, vec, ...) used by earlier sections.
# Bare expressions such as `data.shape` are notebook-style inspections: they
# display a value in an interactive session and do nothing in a plain script.
from __future__ import print_function  # print() works on both Python 2 and 3

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn < 0.18 only ships the older module.
    from sklearn.cross_validation import train_test_split

# Raw strings keep the Windows backslashes literal without relying on
# unrecognised escape sequences such as "\S" or "\m".
BREAST_CANCER_CSV = r'D:\Source Code\machinelearn\breast-cancer-wisconsin.txt'
TITANIC_CSV = r'D:\Source Code\machinelearn\titanic.txt'


# ---------------------------------------------------------------------------
# Logistic regression / SGD classifier on the breast-cancer data
# ---------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

column_names = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion',
    'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
    'Normal Nucleoli', 'Mitoses', 'Class',
]
data = pd.read_csv(BREAST_CANCER_CSV, sep=',', names=column_names)
# Treat '?' entries as missing and drop every row that contains one.
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
data.shape

# Columns 1..9 are the features, column 10 ('Class') is the label.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[column_names[1:10]], data[column_names[10]],
    test_size=0.25, random_state=33)
Y_train.value_counts()
Y_test.value_counts()

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()
lr.fit(X_train, Y_train)
lr_y_predict = lr.predict(X_test)
sgdc.fit(X_train, Y_train)
sgdc_y_predict = sgdc.predict(X_test)

print('Accuracy of LR Classifier:', lr.score(X_test, Y_test))
print(classification_report(Y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
print('Accuracy of SGD Classifier:', sgdc.score(X_test, Y_test))
print(classification_report(Y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))


# ---------------------------------------------------------------------------
# Linear SVC on the handwritten digits data
# ---------------------------------------------------------------------------
from sklearn.datasets import load_digits
from sklearn.svm import LinearSVC

digits = load_digits()
digits.data.shape

x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33)
y_train.shape
y_test.shape

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lsvc = LinearSVC()
lsvc.fit(x_train, y_train)
y_predict = lsvc.predict(x_test)

print('The Accuracy of Linear SVC is ', lsvc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))


# ---------------------------------------------------------------------------
# Multinomial naive Bayes on the 20 newsgroups text data
# ---------------------------------------------------------------------------
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups()
print(len(news.data))
print(news.data[0])

x_train, x_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)

print('The Accuracy of Naive Bayes Classifier is ', mnb.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))


# ---------------------------------------------------------------------------
# K-nearest neighbours on the iris data
# ---------------------------------------------------------------------------
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
iris.data.shape
print(iris.DESCR)

x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

knc = KNeighborsClassifier()
knc.fit(x_train, y_train)
y_predict = knc.predict(x_test)

print('The Accuracy of K-Nearest Neighbor Classifier is ', knc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=iris.target_names))


# ---------------------------------------------------------------------------
# Decision tree on the Titanic data
# ---------------------------------------------------------------------------
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

titanic = pd.read_csv(TITANIC_CSV)
titanic.head()
titanic.info()

# Work on an explicit copy so that filling 'age' is not a chained assignment
# on a slice of `titanic`.
X = titanic[['pclass', 'age', 'sex']].copy()
Y = titanic['survived']
X.info()
# Replace missing ages with the mean age.
X['age'] = X['age'].fillna(X['age'].mean())
X.info()

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

# DictVectorizer turns each row (as a dict) into a numeric feature vector.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
print(vec.feature_names_)
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
y_predict = dtc.predict(x_test)

print(dtc.score(x_test, y_test))
# classification_report expects (y_true, y_pred) in that order.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))


# ---------------------------------------------------------------------------
# Decision tree vs. random forest vs. gradient boosting on the Titanic data
# ---------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

titanic = pd.read_csv(TITANIC_CSV)
X = titanic[['pclass', 'age', 'sex']].copy()
Y = titanic['survived']
X['age'] = X['age'].fillna(X['age'].mean())

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# classification_report expects (y_true, y_pred) in that order.
print('The accuracy of decision tree is', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred))
print('The accuracy of random forest classifier is', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))
print('The accuracy of gradient tree boosting is', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred))


# ---------------------------------------------------------------------------
# Linear regression on the Boston housing data
# ---------------------------------------------------------------------------
# NOTE: load_boston was removed in scikit-learn 1.2, so everything from here on
# needs an older release. The import stays in this section so that the
# classification examples above still run on newer releases.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

boston = load_boston()
print(boston.DESCR)

X = boston.data
Y = boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

print("The max target value is ", np.max(boston.target))
print("The min target value is ", np.min(boston.target))
print("The average target value is", np.mean(boston.target))

ss_X = StandardScaler()
ss_Y = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
# StandardScaler works on 2-D input: scale the targets as a single column and
# flatten again, because the regressors below expect a 1-D target vector.
Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1)).ravel()
Y_test = ss_Y.transform(Y_test.reshape(-1, 1)).ravel()


def _to_original_scale(y):
    """Map standardised target values back to the original target units.

    Args:
        y: 1-D array-like of values in the scale produced by ``ss_Y``.

    Returns:
        1-D numpy array with the ``ss_Y`` scaling undone.
    """
    return ss_Y.inverse_transform(np.asarray(y).reshape(-1, 1)).ravel()


# MSE / MAE below are reported in the original target units.
_Y_test_original = _to_original_scale(Y_test)

lr = LinearRegression()
lr.fit(X_train, Y_train)
lr_y_predict = lr.predict(X_test)

sgdr = SGDRegressor()
sgdr.fit(X_train, Y_train)
sgdr_y_predict = sgdr.predict(X_test)

print('The value of default measurement of LinearRegression is', lr.score(X_test, Y_test))
print('The value of R-squared of LinearRegression is', r2_score(Y_test, lr_y_predict))
print('The mean squared error of LinearRegression is',
      mean_squared_error(_Y_test_original, _to_original_scale(lr_y_predict)))
print('The mean absoluate error of LinearRegression is',
      mean_absolute_error(_Y_test_original, _to_original_scale(lr_y_predict)))

print('The value of default measurement of SGDRegressor is', sgdr.score(X_test, Y_test))
print('The value of R-squared of SGDRegressor is', r2_score(Y_test, sgdr_y_predict))
print('The mean squared error of SGDRegressor is',
      mean_squared_error(_Y_test_original, _to_original_scale(sgdr_y_predict)))
print('The mean absoluate error of SGDRegressor is',
      mean_absolute_error(_Y_test_original, _to_original_scale(sgdr_y_predict)))


# ---------------------------------------------------------------------------
# Support vector regression with three kernels
# ---------------------------------------------------------------------------
from sklearn.svm import SVR

linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, Y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, Y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, Y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

print('R-squared value of linear SVR is', linear_svr.score(X_test, Y_test))
print('The mean squared error of linear SVR is',
      mean_squared_error(_Y_test_original, _to_original_scale(linear_svr_y_predict)))
print('The mean absoluate error of linear SVR is',
      mean_absolute_error(_Y_test_original, _to_original_scale(linear_svr_y_predict)))

print('R-squared value of Poly SVR is', poly_svr.score(X_test, Y_test))
print('The mean squared error of Poly SVR is',
      mean_squared_error(_Y_test_original, _to_original_scale(poly_svr_y_predict)))
print('The mean absoluate error of Poly SVR is',
      mean_absolute_error(_Y_test_original, _to_original_scale(poly_svr_y_predict)))

print('R-squared value of RBF SVR is', rbf_svr.score(X_test, Y_test))
print('The mean squared error of RBF SVR is',
      mean_squared_error(_Y_test_original, _to_original_scale(rbf_svr_y_predict)))
print('The mean absoluate error of RBF SVR is',
      mean_absolute_error(_Y_test_original, _to_original_scale(rbf_svr_y_predict)))


# ---------------------------------------------------------------------------
# K-nearest-neighbour regression (uniform vs. distance weighting)
# ---------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsRegressor

uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, Y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, Y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

print('R-squared value of uniform-weighted KNeighorRegression:', uni_knr.score(X_test, Y_test))
print('The mean squared error of uniform-weighted KNeighorRegression:',
      mean_squared_error(_Y_test_original, _to_original_scale(uni_knr_y_predict)))
print('The mean absoluate error of uniform-weighted KNeighorRegression',
      mean_absolute_error(_Y_test_original, _to_original_scale(uni_knr_y_predict)))

print('R-squared value of distance-weighted KNeighorRegression:', dis_knr.score(X_test, Y_test))
print('The mean squared error of distance-weighted KNeighorRegression:',
      mean_squared_error(_Y_test_original, _to_original_scale(dis_knr_y_predict)))
print('The mean absoluate error of distance-weighted KNeighorRegression:',
      mean_absolute_error(_Y_test_original, _to_original_scale(dis_knr_y_predict)))


# ---------------------------------------------------------------------------
# Decision-tree regression
# ---------------------------------------------------------------------------
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, Y_train)
dtr_y_predict = dtr.predict(X_test)

print('R-squared value of DecisionTreeRegressor:', dtr.score(X_test, Y_test))
print('The mean squared error of DecisionTreeRegressor:',
      mean_squared_error(_Y_test_original, _to_original_scale(dtr_y_predict)))
print('The mean absoluate error of DecisionTreeRegressor:',
      mean_absolute_error(_Y_test_original, _to_original_scale(dtr_y_predict)))


# ---------------------------------------------------------------------------
# Ensemble regressors
# ---------------------------------------------------------------------------
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# Train a RandomForestRegressor and predict on the test data; the predictions
# are stored in rfr_y_predict.
rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)
rfr_y_predict = rfr.predict(X_test)

# Train an ExtraTreesRegressor and predict on the test data; the predictions
# are stored in etr_y_predict.
etr = ExtraTreesRegressor()
etr.fit(X_train, Y_train)
etr_y_predict = etr.predict(X_test)

# Train a GradientBoostingRegressor and predict on the test data; the
# predictions are stored in gbr_y_predict.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train)
gbr_y_predict = gbr.predict(X_test)

# Evaluate the default-configured random forest regressor on the test set with
# R-squared, MSE and MAE.
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, Y_test))
print('The mean squared error of RandomForestRegressor:',
      mean_squared_error(_Y_test_original, _to_original_scale(rfr_y_predict)))
print('The mean absoluate error of RandomForestRegressor:',
      mean_absolute_error(_Y_test_original, _to_original_scale(rfr_y_predict)))

# Evaluate the default-configured extra-trees regressor on the test set with
# R-squared, MSE and MAE.
print('R-squared value of ExtraTreesRegessor:', etr.score(X_test, Y_test))
print('The mean squared error of ExtraTreesRegessor:',
      mean_squared_error(_Y_test_original, _to_original_scale(etr_y_predict)))
print('The mean absoluate error of ExtraTreesRegessor:',
      mean_absolute_error(_Y_test_original, _to_original_scale(etr_y_predict)))

# Use the trained extra-trees model to print each feature's contribution to
# the prediction, in ascending order of importance. Sorting the
# (importance, name) pairs keeps every importance attached to its own feature
# name; sorting a 2-D array along axis 0 would sort each column independently.
for importance, feature in sorted(zip(etr.feature_importances_, boston.feature_names)):
    print(importance, feature)

# Evaluate the default-configured gradient boosting regressor on the test set
# with R-squared, MSE and MAE.
print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, Y_test))
print('The mean squared error of GradientBoostingRegressor:',
      mean_squared_error(_Y_test_original, _to_original_scale(gbr_y_predict)))
print('The mean absoluate error of GradientBoostingRegressor:',
      mean_absolute_error(_Y_test_original, _to_original_scale(gbr_y_predict)))

其中讀取 20newsgroups 資料集(fetch_20newsgroups)時有一處會出問題,報錯如下:

No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"

解決方法如下:
首先手動下載 http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz 這個包,
找到 ……\賬戶名\scikit_learn_data 目錄,將下載的包放在該目錄下,
之後找到twenty_newsgroups.py檔案,該檔案主要通過這個包來線上下載,我們這裡手動下載,然後註釋相關程式碼即可。
找到這個函式 download_20newsgroups,註釋以下程式碼

if os.path.exists(archive_path):
        # Download is not complete as the .tar.gz file is removed after
        # download.
        logger.warn("Download was incomplete, downloading again.")
        os.remove(archive_path)

    logger.warn("Downloading dataset from %s (14 MB)", URL)
    opener = urlopen(URL)
    open(archive_path, 'wb').write(opener.read())

觀察以下程式碼,是將下載的檔案解壓,所以我們註釋掉上面線上下載即可

logger.info("Decompressing %s", archive_path)
tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
os.remove(archive_path)

儲存後,重新執行匯入資料集的相關程式,最終可以發現 ……\賬戶名\scikit_learn_data 目錄下只剩下
20news-bydate.pkz檔案,以後再執行程式就不需要重新下載了。