【Python例項第8講】模型複雜度影響
阿新 • • 發佈:2018-11-09
機器學習訓練營——機器學習愛好者的自由交流空間(qq 群號:696721295)
本講介紹模型複雜度怎樣影響預測精度和計算效能。我們使用的資料集仍然是波士頓房價資料集。對於模型的每一類,我們通過選擇有關的模型引數,度量計算效能和預測功效的影響,以此考察模型的複雜度。下面,我們用Python程式碼解釋原理。
程式碼詳解
首先,載入必須的Python函式庫。
print(__doc__) # Author: Eustache Diemert <[email protected]> # License: BSD 3 clause import time import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.axes_grid1.parasite_axes import host_subplot from mpl_toolkits.axisartist.axislines import Axes from scipy.sparse.csr import csr_matrix from sklearn import datasets from sklearn.utils import shuffle from sklearn.metrics import mean_squared_error from sklearn.svm.classes import NuSVR from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor from sklearn.linear_model.stochastic_gradient import SGDClassifier from sklearn.metrics import hamming_loss
為了產生模擬資料集,初始化隨機數生成器。
np.random.seed(0)
定義函式generate_data
, 用來產生用於迴歸或分類問題的模擬資料。該函式有兩個輸入引數,其中,case
指定迴歸還是分類問題,sparse
是一個邏輯引數,用來指定資料是否是稀疏的,該引數預設值為False
. 函式輸出模擬的資料集。
def generate_data(case, sparse=False): """Generate regression/classification data.""" bunch = None if case == 'regression': bunch = datasets.load_boston() elif case == 'classification': bunch = datasets.fetch_20newsgroups_vectorized(subset='all') X, y = shuffle(bunch.data, bunch.target) offset = int(X.shape[0] * 0.8) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] if sparse: X_train = csr_matrix(X_train) X_test = csr_matrix(X_test) else: X_train = np.array(X_train) X_test = np.array(X_test) y_test = np.array(y_test) y_train = np.array(y_train) data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test} return data
定義函式benchmark_influence
, 它的作用是確立一個影響標準,用以評價模型複雜度。在這裡,我們用latency
度量計算效能,用MSE
度量預測功效。改變這兩個引數的值,評價模型的複雜度影響。因此,該函式有一個字典型輸入引數conf
, 指定要改變的引數型別和引數值。函式輸出預測功效prediction_powers
, 程式執行時間prediction_times
, 模型複雜度complexities
.
def benchmark_influence(conf): """ Benchmark influence of :changing_param: on both MSE and latency. """ prediction_times = [] prediction_powers = [] complexities = [] for param_value in conf['changing_param_values']: conf['tuned_params'][conf['changing_param']] = param_value estimator = conf['estimator'](**conf['tuned_params']) print("Benchmarking %s" % estimator) estimator.fit(conf['data']['X_train'], conf['data']['y_train']) conf['postfit_hook'](estimator) complexity = conf['complexity_computer'](estimator) complexities.append(complexity) start_time = time.time() for _ in range(conf['n_samples']): y_pred = estimator.predict(conf['data']['X_test']) elapsed_time = (time.time() - start_time) / float(conf['n_samples']) prediction_times.append(elapsed_time) pred_score = conf['prediction_performance_computer']( conf['data']['y_test'], y_pred) prediction_powers.append(pred_score) print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % ( complexity, conf['prediction_performance_label'], pred_score, elapsed_time)) return prediction_powers, prediction_times, complexities
函式plot_influence
以accuracy
, latency
為變數,視覺化模型複雜度。
def plot_influence(conf, mse_values, prediction_times, complexities):
"""
Plot influence of model complexity on both accuracy and latency.
"""
plt.figure(figsize=(12, 6))
host = host_subplot(111, axes_class=Axes)
plt.subplots_adjust(right=0.75)
par1 = host.twinx()
host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
y1_label = conf['prediction_performance_label']
y2_label = "Time (s)"
host.set_ylabel(y1_label)
par1.set_ylabel(y2_label)
p1, = host.plot(complexities, mse_values, 'b-', label="prediction error")
p2, = par1.plot(complexities, prediction_times, 'r-',
label="latency")
host.legend(loc='upper right')
host.axis["left"].label.set_color(p1.get_color())
par1.axis["right"].label.set_color(p2.get_color())
plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
plt.show()
函式_count_nonzero_coefficients
統計估計量的非零係數。
def _count_nonzero_coefficients(estimator):
a = estimator.coef_.toarray()
return np.count_nonzero(a)
下面,我們分別模擬波士頓房價迴歸和分類的資料集,評價迴歸和分類預測模型的複雜度影響。
regression_data = generate_data('regression')
classification_data = generate_data('classification', sparse=True)
configurations = [
{'estimator': SGDClassifier,
'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
'modified_huber', 'fit_intercept': True, 'tol': 1e-3},
'changing_param': 'l1_ratio',
'changing_param_values': [0.25, 0.5, 0.75, 0.9],
'complexity_label': 'non_zero coefficients',
'complexity_computer': _count_nonzero_coefficients,
'prediction_performance_computer': hamming_loss,
'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
'postfit_hook': lambda x: x.sparsify(),
'data': classification_data,
'n_samples': 30},
{'estimator': NuSVR,
'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
'changing_param': 'nu',
'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
'complexity_label': 'n_support_vectors',
'complexity_computer': lambda x: len(x.support_vectors_),
'data': regression_data,
'postfit_hook': lambda x: x,
'prediction_performance_computer': mean_squared_error,
'prediction_performance_label': 'MSE',
'n_samples': 30},
{'estimator': GradientBoostingRegressor,
'tuned_params': {'loss': 'ls'},
'changing_param': 'n_estimators',
'changing_param_values': [10, 50, 100, 200, 500],
'complexity_label': 'n_trees',
'complexity_computer': lambda x: x.n_estimators,
'data': regression_data,
'postfit_hook': lambda x: x,
'prediction_performance_computer': mean_squared_error,
'prediction_performance_label': 'MSE',
'n_samples': 30},
]
for conf in configurations:
prediction_performances, prediction_times, complexities = \
benchmark_influence(conf)
plot_influence(conf, prediction_performances, prediction_times,
complexities)
閱讀更多精彩內容,請關注微信公眾號:統計學習與大資料