# Study notes on "Introduction to Machine Learning with Python"
# Python 3.0+
# ===== Chapter One =====
from preamble import *
# IPython magic; it is a syntax error in a plain .py file, so it is kept as a
# comment. Uncomment it when running inside a Jupyter notebook.
# %matplotlib inline
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]])
print("x:\n{}".format(x))

from scipy import sparse

# create a 2d NumPy array with a diagonal of ones, and zeros everywhere else
eye = np.eye(4)
print("NumPy array:\n{}".format(eye))

# convert the NumPy array to a SciPy sparse matrix in CSR format
# only the non-zero entries are stored
sparse_matrix = sparse.csr_matrix(eye)
print("\nSciPy sparse CSR matrix:\n{}".format(sparse_matrix))

data = np.ones(4)
row_indices = np.arange(4)
col_indices = np.arange(4)
# coo_matrix, a type of sparse matrix
eye_coo = sparse.coo_matrix((data, (row_indices, col_indices)))
print("COO representation:\n{}".format(eye_coo))

# %matplotlib inline
import matplotlib.pyplot as plt

# Generate a sequence numbers from -10 to 10 with 100 steps in between
x = np.linspace(-10, 10, 100)
# create a second array using sinus
y = np.sin(x)
# The plot function makes a line chart of one array against another
plt.plot(x, y, marker="x")

import pandas as pd
from IPython.display import display  # the function is spelled "display"

# create a simple dataset of people
data = {'Name': ["John", "Anna", "Peter", "Linda"],
        'Location': ["New York", "Paris", "Berlin", "London"],
        'Age': [24, 13, 53, 33]
        }
data_pandas = pd.DataFrame(data)
# IPython.display allows "pretty printing" of dataframes
# in the Jupyter notebook
display(data_pandas)  # Notice: use IPython's display function

# ----- A First Application: Classifying iris species -----
from sklearn.datasets import load_iris

iris_dataset = load_iris()
print("Keys of iris_dataset: {}".format(iris_dataset.keys()))
print(iris_dataset['DESCR'][:193] + "\n...")
print("Type of data: {}".format(type(iris_dataset['data'])))
print("First five rows of data:\n{}".format(iris_dataset['data'][:5]))

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)
# create dataframe from data in
# X_train
# label the columns using the strings in iris_dataset.feature_names
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
# use scatter_matrix to create the pair plot (Python 3.0+)
# NOTE(review): mglearn is only imported explicitly further down; presumably
# `from preamble import *` already provides it at this point -- verify.
pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
                           hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))

prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(
    iris_dataset['target_names'][prediction]))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

# ===== Chapter Two =====
# ---
# ===== Chapter Three =====
# ---
# ===== Chapter Four: Representing Data and Engineering Features =====
# ----- Categorical Variables -----
import os
import pandas as pd
import mglearn

# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data = pd.read_csv(
    adult_path, header=None, index_col=False,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'gender',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
           'income'])
# For illustration purposes, we only select some of the columns:
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week',
             'occupation', 'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
display(data.head())  # was misspelled "disply", which raised NameError

print(data.gender.value_counts())

print("Original features:\n", list(data.columns), "\n")
data_dummies = pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))

display(data_dummies.head(n=2))

# Get only the columns containing features
# that is
# all columns from 'age' to 'occupation_ Transport-moving'
# This range contains all the features but not the target
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# extract NumPy arrays
X = features.values
y = data_dummies['income_ >50K'].values
print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))

# ----- Numbers can encode categoricals -----
# create a DataFrame with an integer feature and a categorical string feature
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
display(demo_df)

# cast the integer column to str before encoding it together with the string column
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
display(pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature']))

# ----- Binning, Discretization, Linear Models and Trees -----
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

X, y = mglearn.datasets.make_wave(n_samples=100)
# 1000 evenly spaced points in [-3, 3), shaped as a single column
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
plt.plot(line, reg.predict(line), label="decision tree")

reg = LinearRegression().fit(X, y)
plt.plot(line, reg.predict(line), label="linear regression")

plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

import numpy as np

# 11 evenly spaced edges between -3 and 3
bins = np.linspace(-3, 3, 11)
print("bins: {}".format(bins))

which_bin = np.digitize(X, bins=bins)
print("\nData points:\n", X[:5])
print("\nBin membership for data points:\n", which_bin[:5])

from sklearn.preprocessing import OneHotEncoder

# transform using the OneHotEncoder
# NOTE(review): newer scikit-learn releases appear to have renamed `sparse=`
# to `sparse_output=` -- confirm against the installed version.
encoder = OneHotEncoder(sparse=False)
# encoder.fit finds the unique values that appear in
# which_bin
encoder.fit(which_bin)
# transform creates the one-hot encoding
X_binned = encoder.transform(which_bin)
print(X_binned[:5])

print("X_binned.shape: {}".format(X_binned.shape))

# encode the dense evaluation grid with the same bin edges as the training data
line_binned = encoder.transform(np.digitize(line, bins=bins))

reg = LinearRegression().fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='linear regression binned')

reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='decision tree binned')
plt.plot(X[:, 0], y, 'o', c='k')
plt.vlines(bins, -3, 3, linewidth=1, alpha=.2)
plt.legend(loc="best")
plt.ylabel("Regression output")
plt.xlabel("Input feature")

# ----- Interactions and Polynomials -----
# original feature plus its one-hot bin indicators
X_combined = np.hstack([X, X_binned])
print(X_combined.shape)

reg = LinearRegression().fit(X_combined, y)

line_combined = np.hstack([line, line_binned])
plt.plot(line, reg.predict(line_combined), label='linear regression combined')

# draw a dotted vertical line at every bin edge
# (loop variable renamed from `bin`, which shadowed the builtin)
for bin_edge in bins:
    plt.plot([bin_edge, bin_edge], [-3, 3], ':', c='k', linewidth=1)
plt.legend(loc="best")
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.plot(X[:, 0], y, 'o', c='k')

# bin indicators plus (feature * indicator) interaction columns
X_product = np.hstack([X_binned, X * X_binned])
print(X_product.shape)

# bare expressions: shown as cell output in a notebook, no effect in a script
X_binned.shape
X.shape

from sklearn.preprocessing import PolynomialFeatures

# include polynomials up to x ** 10:
# the default "include_bias=True" adds a feature that's constantly 1
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)

# NOTE(review): newer scikit-learn releases appear to expose this as
# get_feature_names_out() -- confirm against the installed version.
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

reg = LinearRegression().fit(X_poly, y)

line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

from sklearn.svm import SVR

for gamma in [1, 10]:
    svr = SVR(gamma=gamma).fit(X, y)
    plt.plot(line, svr.predict(line), label='SVR gamma={}'.format(gamma))

plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

# NOTE(review): load_boston appears to have been removed from recent
# scikit-learn releases -- confirm the installed version still provides it.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)

# rescale data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit the degree-2 expansion on the scaled training data, then apply it to both splits
poly = PolynomialFeatures(degree=2).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_poly.shape: {}".format(X_train_poly.shape))

print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

from sklearn.linear_model import Ridge

# Ridge on the scaled features vs. on the polynomial expansion
ridge = Ridge().fit(X_train_scaled, y_train)
print("Score without interactions: {:.3f}".format(
    ridge.score(X_test_scaled, y_test)))
ridge = Ridge().fit(X_train_poly, y_train)
print("Score with interactions: {:.3f}".format(
    ridge.score(X_test_poly, y_test)))

from sklearn.ensemble import RandomForestRegressor

# same comparison with a random forest
rf = RandomForestRegressor(n_estimators=100).fit(X_train_scaled, y_train)
print("Score without interactions: {:.3f}".format(
    rf.score(X_test_scaled, y_test)))
rf = RandomForestRegressor(n_estimators=100).fit(X_train_poly, y_train)
print("Score with interactions: {:.3f}".format(rf.score(X_test_poly, y_test)))

# ----- Univariate Non-linear transformations -----
rnd = np.random.RandomState(0)
X_org = rnd.normal(size=(1000, 3))
w = rnd.normal(size=3)

# Poisson-distributed counts derived from the Gaussian features
X = rnd.poisson(10 * np.exp(X_org))
y = np.dot(X_org, w)

print("Number of feature appearances:\n{}".format(np.bincount(X[:, 0])))

# np.bincount counts how many times each non-negative integer value occurs
bins = np.bincount(X[:, 0])
plt.bar(range(len(bins)), bins, color='grey')
plt.ylabel("Number of appearances")
plt.xlabel("Value")

from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
score = Ridge().fit(X_train, y_train).score(X_test, y_test)
print("Test score: {:.3f}".format(score))

# log(x + 1) so that zero counts stay finite
X_train_log = np.log(X_train + 1)
X_test_log = np.log(X_test + 1)

plt.hist(X_train_log[:, 0], bins=25, color='gray')
plt.ylabel("Number of appearances")
plt.xlabel("Value")

score = Ridge().fit(X_train_log, y_train).score(X_test_log, y_test)
print("Test score: {:.3f}".format(score))

# ----- Automatic Feature Selection -----
# ----- Univariate statistics -----
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)
# use f_classif (the default) and SelectPercentile to select 50% of features
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set
X_train_selected = select.transform(X_train)

print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

mask = select.get_support()
print(mask)
# visualize the mask.
black is True, white is False plt.matshow(mask.reshape(1, -1), cmap='gray_r') plt.xlabel("Sample index") plt.yticks(()) Model-based Feature Selection from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier select = SelectFromModel( RandomForestClassifier(n_estimators=100, random_state=42), threshold="median") select.fit(X_train, y_train) X_train_l1 = select.transform(X_train) print("X_train.shape: {}".format(X_train.shape)) print("X_train_l1.shape: {}".format(X_train_l1.shape)) Iterative feature selection from sklearn.feature_selection import RFE select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40) select.fit(X_train, y_train) # visualize the selected features: mask = select.get_support() plt.matshow(mask.reshape(1, -1), cmap='gray_r') plt.xlabel("Sample index") plt.yticks(()) X_train_rfe = select.transform(X_train) X_test_rfe = select.transform(X_test) score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test) print("Test score: {:.3f}".format(score)) print("Test score: {:.3f}".format(select.score(X_test, y_test))) Utilizing Expert Knowledge citibike = mglearn.datasets.load_citibike() print("Citibike data:\n{}".format(citibike.head())) plt.figure(figsize=(10, 3)) xticks = pd.date_range(start=citibike.index.min(), end=citibike.index.max(), freq='D') plt.xticks(xticks, xticks.strftime("%a %m-%d"), rotation=90, ha="left") plt.plot(citibike, linewidth=1) plt.xlabel("Date") plt.ylabel("Rentals") # extract the target values (number of rentals) y = citibike.values # convert to POSIX time by dividing by 10**9 X = citibike.index.astype("int64").values.reshape(-1, 1) // 10**9 ##reshape (-1,1)轉化成矩陣 # use the first 184 data points for training, the rest for testing n_train = 184 # function to evaluate and plot a regressor on a given feature set def eval_on_features(features, target, regressor): # split the given features into a training and a test set X_train, 
X_test = features[:n_train], features[n_train:] # also split the target array y_train, y_test = target[:n_train], target[n_train:] regressor.fit(X_train, y_train) print("Test-set R^2: {:.2f}".format(regressor.score(X_test, y_test))) y_pred = regressor.predict(X_test) y_pred_train = regressor.predict(X_train) plt.figure(figsize=(10, 3)) plt.xticks(range(0, len(X), 8), xticks.strftime("%a %m-%d"), rotation=90, ha="left") plt.plot(range(n_train), y_train, label="train") plt.plot(range(n_train, len(y_test) + n_train), y_test, '-', label="test") plt.plot(range(n_train), y_pred_train, '--', label="prediction train") plt.plot(range(n_train, len(y_test) + n_train), y_pred, '--', label="prediction test") plt.legend(loc=(1.01, 0)) plt.xlabel("Date") plt.ylabel("Rentals") from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(n_estimators=100, random_state=0) eval_on_features(X, y, regressor) X_hour = citibike.index.hour.values.reshape(-1, 1) eval_on_features(X_hour, y, regressor) X_hour_week = np.hstack([citibike.index.dayofweek.values.reshape(-1, 1), citibike.index.hour.values.reshape(-1, 1)]) eval_on_features(X_hour_week, y, regressor) from sklearn.linear_model import LinearRegression eval_on_features(X_hour_week, y, LinearRegression()) enc = OneHotEncoder() X_hour_week_onehot = enc.fit_transform(X_hour_week).toarray() poly_transformer = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) X_hour_week_onehot_poly = poly_transformer.fit_transform(X_hour_week_onehot) lr = Ridge() eval_on_features(X_hour_week_onehot_poly, y, lr) hour = ["%02d:00" % i for i in range(0, 24, 3)] day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] features = day + hour features_poly = poly_transformer.get_feature_names(features) features_nonzero = np.array(features_poly)[lr.coef_ != 0] coef_nonzero = lr.coef_[lr.coef_ != 0] plt.figure(figsize=(15, 2)) plt.plot(coef_nonzero, 'o') plt.xticks(np.arange(len(coef_nonzero)), features_nonzero, 
rotation=90) plt.xlabel("Feature name") plt.ylabel("Feature magnitude") Capture 5 Model Evaluation and Improvement from preamble import * %matplotlib inline from sklearn.datasets import make_blobs from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split # create a synthetic dataset X, y = make_blobs(random_state=0) # split data and labels into a training and a test set X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # instantiate a model and fit it to the training set logreg = LogisticRegression().fit(X_train, y_train) # evaluate the model on the test set print("Test set score: {:.2f}".format(logreg.score(X_test, y_test))) Cross-Validation from sklearn.model_selection import cross_val_score from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression iris = load_iris() logreg = LogisticRegression() scores = cross_val_score(logreg, iris.data, iris.target) print("Cross-validation scores: {}".format(scores)) scores = cross_val_score(logreg, iris.data, iris.target, cv=5) print("Cross-validation scores: {}".format(scores)) rint("Average cross-validation score: {:.2f}".format(scores.mean())) Stratified K-Fold cross-validation and other strategies from sklearn.datasets import load_iris iris = load_iris() print("Iris labels:\n{}".format(iris.target)) More control over cross-validation from sklearn.model_selection import KFold kfold = KFold(n_splits=5) print("Cross-validation scores:\n{}".format( cross_val_score(logreg, iris.data, iris.target, cv=kfold))) kfold = KFold(n_splits=3) print("Cross-validation scores:\n{}".format( cross_val_score(logreg, iris.data, iris.target, cv=kfold))) Leave-one-out cross-validation from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() scores = cross_val_score(logreg, iris.data, iris.target, cv=loo) print("Number of cv iterations: ", len(scores)) print("Mean accuracy: {:.2f}".format(scores.mean())) Leave-one-out 
cross-validation from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() scores = cross_val_score(logreg, iris.data, iris.target, cv=loo) print("Number of cv iterations: ", len(scores)) print("Mean accuracy: {:.2f}".format(scores.mean())) Shuffle-split cross-validation from sklearn.model_selection import ShuffleSplit shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10) scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split) print("Cross-validation scores:\n{}".format(scores)) Cross-validation with groups from sklearn.model_selection import GroupKFold # create synthetic dataset X, y = make_blobs(n_samples=12, random_state=0) # assume the first three samples belong to the same group, # then the next four, etc groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3] scores = cross_val_score(logreg, X, y, groups, cv=GroupKFold(n_splits=3)) print("Cross-validation scores:\n{}".format(scores)) Grid Search # naive grid search implementation from sklearn.svm import SVC X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0) print("Size of training set: {} size of test set: {}".format( X_train.shape[0], X_test.shape[0])) best_score = 0 for gamma in [0.001, 0.01, 0.1, 1, 10, 100]: for C in [0.001, 0.01, 0.1, 1, 10, 100]: # for each combination of parameters, train an SVC svm = SVC(gamma=gamma, C=C) svm.fit(X_train, y_train) # evaluate the SVC on the test set score = svm.score(X_test, y_test) # if we got a better score, store the score and parameters if score > best_score: best_score = score best_parameters = {'C': C, 'gamma': gamma} print("Best score: {:.2f}".format(best_score)) print("Best parameters: {}".format(best_parameters)) from sklearn.svm import SVC # split data into train+validation set and test set X_trainval, X_test, y_trainval, y_test = train_test_split( iris.data, iris.target, random_state=0) # split train+validation set into training and validation sets X_train, X_valid, y_train, y_valid 
= train_test_split( X_trainval, y_trainval, random_state=1) print("Size of training set: {} size of validation set: {} size of test set:" " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0])) best_score = 0 for gamma in [0.001, 0.01, 0.1, 1, 10, 100]: for C in [0.001, 0.01, 0.1, 1, 10, 100]: # for each combination of parameters train an SVC svm = SVC(gamma=gamma, C=C) svm.fit(X_train, y_train) # evaluate the SVC on the validation set score = svm.score(X_valid, y_valid) # if we got a better score, store the score and parameters if score > best_score: best_score = score best_parameters = {'C': C, 'gamma': gamma} # rebuild a model on the combined training and validation set, # and evaluate it on the test set svm = SVC(**best_parameters) svm.fit(X_trainval, y_trainval) test_score = svm.score(X_test, y_test) print("Best score on validation set: {:.2f}".format(best_score)) print("Best parameters: ", best_parameters) print("Test set score with best parameters: {:.2f}".format(test_score)) Grid-search with cross-validation # reference: manual_grid_search_cv for gamma in [0.001, 0.01, 0.1, 1, 10, 100]: for C in [0.001, 0.01, 0.1, 1, 10, 100]: # for each combination of parameters, # train an SVC svm = SVC(gamma=gamma, C=C) # perform cross-validation scores = cross_val_score(svm, X_trainval, y_trainval, cv=5) # compute mean cross-validation accuracy score = np.mean(scores) # if we got a better score, store the score and parameters if score > best_score: best_score = score best_parameters = {'C': C, 'gamma': gamma} # rebuild a model on the combined training and validation set svm = SVC(**best_parameters) svm.fit(X_trainval, y_trainval) from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC grid_search = GridSearchCV(SVC(), param_grid, cv=5) grid_search.fit(X_train, y_train) print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test))) print("Best parameters: {}".format(grid_search.best_params_)) print("Best cross-validation 
score: {:.2f}".format(grid_search.best_score_)) print("Best estimator:\n{}".format(grid_search.best_estimator_)) Analyzing the result of cross-validation import pandas as pd # convert to Dataframe results = pd.DataFrame(grid_search.cv_results_) # show the first 5 rows display(results.head()) scores = np.array(results.mean_test_score).reshape(6, 6) # plot the mean cross-validation scores mglearn.tools.heatmap(scores, xlabel='gamma', xticklabels=param_grid['gamma'], ylabel='C', yticklabels=param_grid['C'], cmap="viridis") fig, axes = plt.subplots(1, 3, figsize=(13, 5)) param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)} param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)} param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)} for param_grid, ax in zip([param_grid_linear, param_grid_one_log, param_grid_range], axes): grid_search = GridSearchCV(SVC(), param_grid, cv=5) grid_search.fit(X_train, y_train) scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6) # plot the mean cross-validation scores scores_image = mglearn.tools.heatmap( scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'], yticklabels=param_grid['C'], cmap="viridis", ax=ax) plt.colorbar(scores_image, ax=axes.tolist()) Nested cross-validation param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]} scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5) print("Cross-validation scores: ", scores) print("Mean cross-validation score: ", scores.mean()) def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid): outer_scores = [] # for each split of the data in the outer cross-validation # (split method returns indices of training and test part) for training_samples, test_samples in outer_cv.split(X, y): # find best parameter using inner cross-validation best_parms = {} best_score = -np.inf # iterate over parameters for 
parameters in parameter_grid: # accumulate score over inner splits cv_scores = [] # iterate over inner cross-validation for inner_train, inner_test in inner_cv.split( # 注意StratifiedKFold(5) 等有split方法 X[training_samples], y[training_samples]): # build classifier given parameters and training data clf = Classifier(**parameters) clf.fit(X[inner_train], y[inner_train]) # evaluate on inner test set score = clf.score(X[inner_test], y[inner_test]) cv_scores.append(score) # compute mean score over inner folds mean_score = np.mean(cv_scores) if mean_score > best_score: # if better than so far, remember parameters best_score = mean_score best_params = parameters # build classifier on best parameters using outer training set clf = Classifier(**best_params) clf.fit(X[training_samples], y[training_samples]) # evaluate outer_scores.append(clf.score(X[test_samples], y[test_samples])) return np.array(outer_scores) from sklearn.model_selection import ParameterGrid, StratifiedKFold scores = nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid)) print("Cross-validation scores: {}".format(scores)) Evaluation Metrics and Scoring Imbalanced datasets from sklearn.datasets import load_digits digits = load_digits() y = digits.target == 9 X_train, X_test, y_train, y_test = train_test_split( digits.data, y, random_state=0) from sklearn.dummy import DummyClassifier dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train) pred_most_frequent = dummy_majority.predict(X_test) print("Unique predicted labels: {}".format(np.unique(pred_most_frequent))) print("Test score: {:.2f}".format(dummy_majority.score(X_test, y_test))) Confusion matrices from sklearn.metrics import confusion_matrix confusion = confusion_matrix(y_test, pred_logreg) print("Confusion matrix:\n{}".format(confusion)) from sklearn.metrics import f1_score print("f1 score most frequent: {:.2f}".format( f1_score(y_test, pred_most_frequent))) print("f1 score 
dummy: {:.2f}".format(f1_score(y_test, pred_dummy))) print("f1 score tree: {:.2f}".format(f1_score(y_test, pred_tree))) print("f1 score logistic regression: {:.2f}".format( f1_score(y_test, pred_logreg))) Taking uncertainty into account from mglearn.datasets import make_blobs X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2], random_state=22) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) svc = SVC(gamma=.05).fit(X_train, y_train) print(classification_report(y_test, svc.predict(X_test))) y_pred_lower_threshold = svc.decision_function(X_test) > -.8 print(classification_report(y_test, y_pred_lower_threshold)) Precision-Recall curves and ROC curves from sklearn.metrics import precision_recall_curve precision, recall, thresholds = precision_recall_curve( y_test, svc.decision_function(X_test)) # create a similar dataset as before, but with more samples # to get a smoother curve X, y = make_blobs(n_samples=(4000, 500), centers=2, cluster_std=[7.0, 2], random_state=22) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) svc = SVC(gamma=.05).fit(X_train, y_train) precision, recall, thresholds = precision_recall_curve( y_test, svc.decision_function(X_test)) # find threshold closest to zero close_zero = np.argmin(np.abs(thresholds)) plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10, label="threshold zero", fillstyle="none", c='k', mew=2) plt.plot(precision, recall, label="precision recall curve") plt.xlabel("Precision") plt.ylabel("Recall") plt.legend(loc="best") from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2) rf.fit(X_train, y_train) # RandomForestClassifier has predict_proba, but not decision_function precision_rf, recall_rf, thresholds_rf = precision_recall_curve( y_test, rf.predict_proba(X_test)[:, 1]) plt.plot(precision, recall, label="svc") plt.plot(precision[close_zero], recall[close_zero], 'o', 
markersize=10, label="threshold zero svc", fillstyle="none", c='k', mew=2)

plt.plot(precision_rf, recall_rf, label="rf")

# index of the random-forest threshold closest to 0.5
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^', c='k',
         markersize=10, label="threshold 0.5 rf", fillstyle="none", mew=2)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")

print("f1_score of random forest: {:.3f}".format(
    f1_score(y_test, rf.predict(X_test))))
print("f1_score of svc: {:.3f}".format(f1_score(y_test, svc.predict(X_test))))

from sklearn.metrics import average_precision_score

ap_rf = average_precision_score(y_test, rf.predict_proba(X_test)[:, 1])
ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
print("Average precision of random forest: {:.3f}".format(ap_rf))
print("Average precision of svc: {:.3f}".format(ap_svc))

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))

plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
# find threshold closest to zero
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)

from sklearn.metrics import roc_auc_score

rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
print("AUC for Random Forest: {:.3f}".format(rf_auc))
print("AUC for SVC: {:.3f}".format(svc_auc))

# back to the digits data: binary task "is this digit a nine?"
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=0)

plt.figure()

# one ROC curve per gamma value, all drawn on the same figure
for gamma in [1, 0.05, 0.01]:
    svc = SVC(gamma=gamma).fit(X_train, y_train)
    accuracy = svc.score(X_test, y_test)
    auc = roc_auc_score(y_test, svc.decision_function(X_test))
    fpr, tpr, _ = roc_curve(y_test , svc.decision_function(X_test))
    print("gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}".format(
        gamma, accuracy, auc))
    plt.plot(fpr, tpr, label="gamma={:.3f}".format(gamma))
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.xlim(-0.01, 1)
plt.ylim(0, 1.02)
plt.legend(loc="best")

# ----- Multi-class classification -----
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_test, pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, pred)))

scores_image = mglearn.tools.heatmap(
    confusion_matrix(y_test, pred), xlabel='Predicted label',
    ylabel='True label', xticklabels=digits.target_names,
    yticklabels=digits.target_names, cmap=plt.cm.gray_r, fmt="%d")
plt.title("Confusion matrix")
plt.gca().invert_yaxis()

# ----- Using evaluation metrics in model selection -----
# default scoring for classification is accuracy
print("Default scoring: {}".format(
    cross_val_score(SVC(), digits.data, digits.target == 9)))
# providing scoring="accuracy" doesn't change the results
explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
                                    scoring="accuracy")
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc")
print("AUC scoring: {}".format(roc_auc))

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)

# we provide a somewhat bad grid to illustrate the point:
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
# using the default scoring of accuracy:
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid-Search with accuracy")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (accuracy)): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

# using AUC scoring
# instead:
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
# With scoring="roc_auc", grid.score() returns the AUC, so the original line
# printed the AUC under the label "accuracy". Compute accuracy explicitly
# from the predictions instead.
from sklearn.metrics import accuracy_score

print("Test set accuracy: {:.3f}".format(
    accuracy_score(y_test, grid.predict(X_test))))

# Prefer the public helper where it exists. Fall back to the import path the
# notes originally used so that older scikit-learn installs behave as before.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics.scorer import SCORERS
    scorer_names = SCORERS.keys()
print("Available scorers:\n{}".format(sorted(scorer_names)))

# ===== Chapter Six =====
from preamble import *
# IPython magic; kept as a comment because it is not valid in a plain .py file.
# %matplotlib inline
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# compute minimum and maximum on the training data
scaler = MinMaxScaler().fit(X_train)

# rescale the training data
X_train_scaled = scaler.transform(X_train)

svm = SVC()
# learn an SVM on the scaled training data
svm.fit(X_train_scaled, y_train)
# scale the test data and score the scaled data
X_test_scaled = scaler.transform(X_test)
print("Test score: {:.2f}".format(svm.score(X_test_scaled, y_test)))

# ----- Parameter Selection with Preprocessing -----
from sklearn.model_selection import GridSearchCV

# for illustration purposes only, don't use this code!
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]} grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5) grid.fit(X_train_scaled, y_train) print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_)) print("Best parameters: ", grid.best_params_) print("Test set accuracy: {:.2f}".format(grid.score(X_test_scaled, y_test))) Building Pipelines from sklearn.pipeline import Pipeline pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())]) pipe.fit(X_train, y_train) print("Test score: {:.2f}".format(pipe.score(X_test, y_test))) Using Pipelines in Grid-searches param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100], 'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]} grid = GridSearchCV(pipe, param_grid=param_grid, cv=5) grid.fit(X_train, y_train) print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_)) print("Test set score: {:.2f}".format(grid.score(X_test, y_test))) print("Best parameters: {}".format(grid.best_params_)) rnd = np.random.RandomState(seed=0) X = rnd.normal(size=(100, 10000)) y = rnd.normal(size=(100,)) from sklearn.feature_selection import SelectPercentile, f_regression select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y) X_selected = select.transform(X) print("X_selected.shape: {}".format(X_selected.shape)) from sklearn.model_selection import cross_val_score from sklearn.linear_model import Ridge print("Cross-validation accuracy (cv only on ridge): {:.2f}".format( np.mean(cross_val_score(Ridge(), X_selected, y, cv=5)))) pipe = Pipeline([("select", SelectPercentile(score_func=f_regression, percentile=5)), ("ridge", Ridge())]) print("Cross-validation accuracy (pipeline): {:.2f}".format( np.mean(cross_val_score(pipe, X, y, cv=5)))) The General Pipeline Interface def fit(self, X, y): X_transformed = X for name, estimator in self.steps[:-1]: # iterate over all but the final step # fit and transform the data X_transformed = estimator.fit_transform(X_transformed, y) # fit 
the last step self.steps[-1][1].fit(X_transformed, y) return self def predict(self, X): X_transformed = X for step in self.steps[:-1]: # iterate over all but the final step # transform the data X_transformed = step[1].transform(X_transformed) # predict using the last step return self.steps[-1][1].predict(X_transformed) Convenient Pipeline creation with make_pipeline from sklearn.pipeline import make_pipeline # standard syntax pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))]) # abbreviated syntax pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100)) print("Pipeline steps:\n{}".format(pipe_short.steps)) from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA pipe = make_pipeline(StandardScaler(), PCA(n_components=2), StandardScaler()) print("Pipeline steps:\n{}".format(pipe.steps)) Accessing step attributes # fit the pipeline defined before to the cancer dataset pipe.fit(cancer.data) # extract the first two principal components from the "pca" step components = pipe.named_steps["pca"].components_ print("components.shape: {}".format(components.shape)) Accessing Attributes in a Pipeline inside GridSearchCV from sklearn.linear_model import LogisticRegression pipe = make_pipeline(StandardScaler(), LogisticRegression()) param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]} X_train, X_test, y_train, y_test = train_test_split( cancer.data, cancer.target, random_state=4) grid = GridSearchCV(pipe, param_grid, cv=5) grid.fit(X_train, y_train) print("Best estimator:\n{}".format(grid.best_estimator_)) print("Logistic regression step:\n{}".format( grid.best_estimator_.named_steps["logisticregression"])) Grid-searching preprocessing steps and model parameters from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import Ridge boston = load_boston() X_train, X_test, y_train, y_test = 
train_test_split(boston.data, boston.target, random_state=0) from sklearn.preprocessing import PolynomialFeatures pipe = make_pipeline( StandardScaler(), PolynomialFeatures(), Ridge()) param_grid = {'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]} grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1) grid.fit(X_train, y_train) print("Best parameters: {}".format(grid.best_params_)) print("Test-set score: {:.2f}".format(grid.score(X_test, y_test))) param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]} pipe = make_pipeline(StandardScaler(), Ridge()) grid = GridSearchCV(pipe, param_grid, cv=5) grid.fit(X_train, y_train) print("Score without poly features: {:.2f}".format(grid.score(X_test, y_test))) pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())]) from sklearn.ensemble import RandomForestClassifier param_grid = [ {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None], 'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}, {'classifier': [RandomForestClassifier(n_estimators=100)], 'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}] X_train, X_test, y_train, y_test = train_test_split( cancer.data, cancer.target, random_state=0) grid = GridSearchCV(pipe, param_grid, cv=5) grid.fit(X_train, y_train) print("Best params:\n{}\n".format(grid.best_params_)) print("Best cross-validation score: {:.2f}".format(grid.best_score_)) print("Test-set score: {:.2f}".format(grid.score(X_test, y_test))) Chapter 8
相關推薦
Introduction.to.Machine.Learning.with.Python 筆記
Python 3.0+ Chapter One from preamble import * %matplotlib inline import numpy as np x = np.array([[1, 2, 3], [4, 5, 6]]) print("x:\
Introduction to Machine Learning with Python/Python機器學習基礎教程_程式碼修改與更新
2.3.1樣本資料集 --程式碼bug及修改意見 import matplotlib.pyplot as plt import mglearn X,y=mglearn.datasets.make_forge() mglearn.discrete_scatter(X[:,0
Introduction to Machine Learning with IBM Watson Studio
After logging into Watson Studio, select New Modeler Flow. Enter a name, keep the default settings, and then click Create. Next expand the Import menu, dra
How to Clean Text for Machine Learning with Python
Tweet Share Share Google Plus You cannot go straight from raw text to fitting a machine learning
Andrew NG機器學習課程筆記系列之——Introduction to Machine Learning
引言 本系列文章是本人對Andrew NG的機器學習課程的一些筆記,如有錯誤,請讀者以課程為準。 在現實生活中,我們每天都可能在不知不覺中使用了各種各樣的機器學習演算法。 例如,當你每一次使用 Google 時,它之所以可以執行良好,其中一個重要原因便是由 Google 實
Book Review: Machine Learning with Python Cookbook
Additional Considerations The only criticism I can place is that I wish there were more topics covered in the content. Some specific areas I would have li
[Machine Learning with Python] Cross Validation and Grid Search: An Example of KNN
Train model: from sklearn.model_selection import GridSearchCV param_grid = [ # try 6 (3×2) combinations of hyperparameters {'n_neighbors': [3,
[Machine Learning with Python] Data Preparation by Pandas and Scikit-Learn
In this article, we discuss some main steps in data preparation. Drop Labels Firstly, we drop labels for the train set. Here we use the drop() method in Pandas li
[Machine Learning with Python] My First Data Preprocessing Pipeline with Titanic Dataset
The Dataset was acquired from https://www.kaggle.com/c/titanic For data preprocessing, I firstly defined three transformers: DataFrameSelector: S
Introduction to Random Number Generators for Machine Learning in Python
Tweet Share Share Google Plus Randomness is a big part of machine learning. Randomness is used a
How to Get Started with Machine Learning in Python
Tweet Share Share Google Plus The Python conference PyCon2014 was held recently and the videos f
論文筆記-Sequence to Sequence Learning with Neural Networks
map tran between work down all 9.png ever onf 大體思想和RNN encoder-decoder是一樣的,只是用來LSTM來實現。 paper提到三個important point: 1)encoder和decoder的LSTM
【DeepLearning學習筆記】Coursera課程《Neural Networks and Deep Learning》——Week1 Introduction to deep learning課堂筆記
決定 如同 樣本 理解 你是 水平 包含 rod spa Coursera課程《Neural Networks and Deep Learning》 deeplearning.ai Week1 Introduction to deep learning What is a
OReilly.Hands-On.Machine.Learning.with.Scikit-Learn.and.TensorFlow學習筆記彙總
其中用到的知識點我都記錄在部落格中了:https://blog.csdn.net/dss_dssssd 第一章知識點總結: supervised learning k-Nearest Neighbors Linear Regression
Hands-on Machine Learning with Scikit-Learn and TensorFlow(中文版)和深度學習原理與TensorFlow實踐-學習筆記
監督學習:新增標籤。學習的目標是求出輸入與輸出之間的關係函式y=f(x)。樸素貝葉斯、邏輯迴歸和神經網路等都屬於監督學習的方法。 監督學習主要解決兩類核心問題,即迴歸和分類。 迴歸和分類的區別在於強調一個是連續的,一個是離散的。 非監督學習:不新增標籤。學習目標是為了探索樣本資料之間是否
Nvidia looks to transform machine learning with GPUs
Nvidia is no stranger to data crunching applications of its GPU architecture. It's been dominating the AI deep learning development space for years and sat
Removing Obstacles to Production Machine Learning with OpnIDS and Dragonfly MLE
Machine learning promises to address many of the challenges faced by network security analysts; however, there are still many obstacles that prevent widesp
《Hands-On Machine Learning with Scikit-Learn & TensorFlow》讀書筆記 第一章 機器學習概覽
一、機器學習概覽 為什麼使用機器學習? 機器學習善於: 需要進行大量手工調整或需要擁有長串規則才能解決的問題:機器學習演算法通常可以簡化程式碼、提高效能。 問題複雜,傳統方法難以解決:最好的機器學習方法可以找到解決方案。 環境有波動:機器學習演算法可以適
Hands on Machine Learning with Sklearn and TensorFlow學習筆記——機器學習概覽
一、什麼是機器學習? 計算機程式利用經驗E(訓練資料)學習任務T(要做什麼,即目標),效能是P(效能指標),如果針對任務T的效能P隨著經驗E不斷增長,稱為機器學習。【這是湯姆米切爾在1997年定義】 大白話:類比於學生學習考試,你先練習一套又一套的模擬卷 (這就相當於訓練資料),在這幾
Rescaling Data for Machine Learning in Python with Scikit
Tweet Share Share Google Plus Your data must be prepared before you can build models. The data p