1. 程式人生 > >《python機器學習及實踐-從零開始通往kaggle競賽之路(程式碼Python 3.6 版)》chapter1.1

《python機器學習及實踐-從零開始通往kaggle競賽之路(程式碼Python 3.6 版)》chapter1.1

import pandas as pd  #匯入pandas 庫
df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv') #讀取目錄下的資料,如果程式碼與檔案路徑不在一起,則需要另行設定

df_test = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-test.csv')

print(df_train.head(5)) #顯示df_train 前列5行資料,瞭解資料大概樣式
print(df_test.head(5))

df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size']] #先對test 的“Type”行進行判斷,然後切分其他兩列資料
df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size']]
print(df_test_negative.head())
print(df_test_positive.head())

import matplotlib.pyplot as plt

plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'],marker = 'o', s=20, c='green')
plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=10, c='red')

plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

plt.show()

import numpy as np

intercept = np.random.random([1])
coef = np.random.random([2])

lx = np.arange(0, 12)
ly = (-intercept - lx * coef[0]) / coef[1]

plt.plot(lx, ly, c='yellow')


plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])
print ('Testing accuracy (10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]

ly = (-intercept - lx * coef[0]) / coef[1]

plt.plot(lx, ly, c='green')
plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

lr = LogisticRegression()

lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
print ('Testing accuracy (all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - lx * coef[0]) / coef[1]

plt.plot(lx, ly, c='blue')
plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

釋出修改程式碼已經過作者同意,如果有疑問,可以留言給我。