1. 程式人生 > >python良\惡性腫瘤預測-LogisticRegression以及SGDClassifier

Python 良/惡性腫瘤預測 - LogisticRegression 以及 SGDClassifier

# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 16:56:56 2018

@author: fengjuan
"""

import pandas as pd
import numpy as np

# train_test_split lives in sklearn.model_selection. The older
# sklearn.cross_validation module was removed in scikit-learn 0.20, so it is
# kept only as a fallback for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Column labels for the data file, which comes without a header row.
column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                'Bare Nulclei', 'Bland Chromatin', 'Nomal Nucleoli', 'Mitoses', 'Class']

# Read the data set directly from the web.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names)

# '?' marks a missing value in the file; turn it into the standard NaN marker.
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that has at least one missing value.
data = data.dropna(how='any')
data.info()

# The source data has no separate test set, so hold out a random 25% of the
# rows for testing. Features are columns 1..9; the label is the last column.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33)

# Inspect the size and class distribution of both splits.
print(y_train.value_counts())
print(y_test.value_counts())

'''
輸出的結果是:

2    344
4    168
Name: Class, dtype: int64
2    100
4     71
Name: Class, dtype: int64
'''

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split and
# the same fitted scaler is then applied to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()

# Train the logistic-regression model, then keep its test-set predictions.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

# Train the SGD classifier, then keep its test-set predictions.
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

# Performance report: accuracy followed by the per-class metrics, first for
# the logistic-regression model and then for the SGD classifier.
class_labels = ['Benign', 'Malignant']
for heading, model, predicted in (
        ('Accuracy of LR Classifier:', lr, lr_y_predict),
        ('Accuracy of SGD Classifier:', sgdc, sgdc_y_predict)):
    print(heading, model.score(X_test, y_test))
    print(classification_report(y_test, predicted, target_names=class_labels))

'''結果:
Accuracy of LR Classifier: 0.9883040935672515
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171

Accuracy of SGD Classifier: 0.9766081871345029
             precision    recall  f1-score   support

     Benign       0.99      0.97      0.98       100
  Malignant       0.96      0.99      0.97        71

avg / total       0.98      0.98      0.98       171
'''