《Python機器學習及實踐-從零開始通往Kaggle競賽之路(程式碼Python 3.6 版)》chapter1.1
阿新 • • 發佈:2019-01-25
# Chapter 1.1 walkthrough: load the breast-cancer train/test CSVs, plot the
# test samples on two features, then overlay (1) a random line, (2) the
# decision boundary of a logistic regression fitted on the first 10 training
# rows and (3) the boundary of one fitted on all training rows.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# The two feature columns used for every plot and model in this script.
FEATURES = ['Clump Thickness', 'Cell Size']


def boundary_line_y(intercept, coef, x):
    """Return y for each x on the line intercept + coef[0]*x + coef[1]*y = 0.

    Args:
        intercept: Scalar or length-1 array holding the line's constant term.
        coef: Indexable pair of coefficients for the x and y axes.
        x: Array of x positions at which to evaluate the line.

    Returns:
        Array of y values, one per entry of ``x``.
    """
    return (-intercept - x * coef[0]) / coef[1]


def show_test_scatter(negative, positive, *, negative_style, positive_style):
    """Scatter both test subsets on the current figure, label axes, and show.

    Anything already drawn on the current figure (e.g. a boundary line) is
    kept, because the figure is only displayed at the end of this call.

    Args:
        negative: DataFrame of the ``Type == 0`` rows with the FEATURES columns.
        positive: DataFrame of the ``Type == 1`` rows with the FEATURES columns.
        negative_style: Extra ``plt.scatter`` keyword arguments (size, colour)
            for the negative samples, which are drawn with the 'o' marker.
        positive_style: Extra ``plt.scatter`` keyword arguments (size, colour)
            for the positive samples, which are drawn with the 'x' marker.
    """
    plt.scatter(negative['Clump Thickness'], negative['Cell Size'],
                marker='o', **negative_style)
    plt.scatter(positive['Clump Thickness'], positive['Cell Size'],
                marker='x', **positive_style)
    plt.xlabel('Clump Thickness')
    plt.ylabel('Cell Size')
    plt.show()


# Paths are relative to the working directory; adjust them if the script is
# not run from a location where '../Datasets' resolves.
df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv')
df_test = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-test.csv')

# Peek at the first five rows of each set to see the general layout.
print(df_train.head(5))
print(df_test.head(5))

# Split the test set on the 'Type' column and keep only the two feature
# columns. A single .loc[rows, cols] call avoids chained indexing.
df_test_negative = df_test.loc[df_test['Type'] == 0, FEATURES]
df_test_positive = df_test.loc[df_test['Type'] == 1, FEATURES]
print(df_test_negative.head())
print(df_test_positive.head())

# Figure 1: the raw test samples only.
show_test_scatter(
    df_test_negative, df_test_positive,
    negative_style={'s': 20, 'c': 'green'},
    positive_style={'s': 10, 'c': 'red'},
)

# Marker styles shared by the three figures that include a line.
NEGATIVE_STYLE = {'s': 200, 'c': 'red'}
POSITIVE_STYLE = {'s': 150, 'c': 'black'}

# Figure 2: a line with random intercept and coefficients.
intercept = np.random.random([1])
coef = np.random.random([2])
lx = np.arange(0, 12)
ly = boundary_line_y(intercept, coef, lx)
plt.plot(lx, ly, c='yellow')
show_test_scatter(
    df_test_negative, df_test_positive,
    negative_style=NEGATIVE_STYLE,
    positive_style=POSITIVE_STYLE,
)

# Figure 3: logistic regression fitted on the first 10 training rows only.
lr = LogisticRegression()
lr.fit(df_train[FEATURES][:10], df_train['Type'][:10])
print('Testing accuracy (10 training samples):',
      lr.score(df_test[FEATURES], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = boundary_line_y(intercept, coef, lx)
plt.plot(lx, ly, c='green')
show_test_scatter(
    df_test_negative, df_test_positive,
    negative_style=NEGATIVE_STYLE,
    positive_style=POSITIVE_STYLE,
)

# Figure 4: logistic regression fitted on every training row.
lr = LogisticRegression()
lr.fit(df_train[FEATURES], df_train['Type'])
print('Testing accuracy (all training samples):',
      lr.score(df_test[FEATURES], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = boundary_line_y(intercept, coef, lx)
plt.plot(lx, ly, c='blue')
show_test_scatter(
    df_test_negative, df_test_positive,
    negative_style=NEGATIVE_STYLE,
    positive_style=POSITIVE_STYLE,
)
釋出修改程式碼已經過作者同意,如果有疑問,可以留言給我。