1. 程式人生 > >原始碼解讀----之-----k_means相關方法(被k_means呼叫)

原始碼解讀----之-----k_means相關方法(被k_means呼叫)

本文是個人的理解,由於剛接觸並且自身能力也有限,也許會存在誤解,歡迎留言指正,本人一定虛心請教,謝謝
def _tolerance(X, tol):
"""Return a tolerance which is independent of the dataset"""
    #判斷是否是一個稀疏矩陣,如果是則,則計算X的縱軸(axis=0標識縱軸,axis=1表示橫軸)均值和方差,得到方差
if sp.issparse(X):
from sklearn.utils.sparsefuncs import mean_variance_axis
        variances = mean_variance_axis(X
, axis=0)[1] #否則,直接計算各列的方差 else: variances = np.var(X, axis=0) #返回方差的均值*tol return np.mean(variances) * tol def as_float_array(X, copy=True, force_all_finite=True): '''檢驗隨機數生成器 random_state:None | int | RandomState例項 如果為None,則返回np.random的RandomState的一個例項 如果為int,則返回一個以int為種子的新RandomState例項
如果為RandomState例項,則返回該例項 否則,ValueError''' if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, copy=copy, force_all_finite=force_all_finite, ensure_2d=False) elif
sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X else: if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4: return_dtype = np.float32 else: return_dtype = np.float64 return X.astype(return_dtype) def check_array(array, accept_sparse=False, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None): ''' 將輸入的驗證陣列,列表,稀疏矩陣等轉換為至少為2維的numpy array. ''' if accept_sparse is None: warnings.warn( "Passing 'None' to parameter 'accept_sparse' in methods " "check_array and check_X_y is deprecated in version 0.19 " "and will be removed in 0.21. Use 'accept_sparse=False' " " instead.", DeprecationWarning) accept_sparse = False # store whether originally we wanted numeric dtype dtype_numeric = isinstance(dtype, six.string_types) and dtype == "numeric" dtype_orig = getattr(array, "dtype", None) if not hasattr(dtype_orig, 'kind'): # not a data type (e.g. a column named dtype in a pandas DataFrame) dtype_orig = None if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": # if input is object, convert to float. dtype = np.float64 else: dtype = None if isinstance(dtype, (list, tuple)): if dtype_orig is not None and dtype_orig in dtype: # no dtype conversion required dtype = None else: # dtype conversion required. Let's select the first element of the # list of accepted types. dtype = dtype[0] if estimator is not None: if isinstance(estimator, six.string_types): estimator_name = estimator else: estimator_name = estimator.__class__.__name__ else: estimator_name = "Estimator" context = " by %s" % estimator_name if estimator is not None else "" if sp.issparse(array):#如果是稀疏矩陣則轉換稀疏矩陣為指定格式 array = _ensure_sparse_format(array, accept_sparse, dtype, copy, force_all_finite) else: array = np.array(array, dtype=dtype, order=order, copy=copy) if ensure_2d: if array.ndim == 1: raise ValueError( "Expected 2D array, got 1D array instead:\narray={}.\n" "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " "if it contains a single sample.".format(array)) array = np.atleast_2d(array) # To ensure that array flags are maintained array = np.array(array, dtype=dtype, order=order, copy=copy) # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind == "O": array = array.astype(np.float64) if not allow_nd and array.ndim >= 3: raise ValueError("Found array with dim %d. %s expected <= 2." % (array.ndim, estimator_name)) if force_all_finite: _assert_all_finite(array) shape_repr = _shape_repr(array.shape) if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: raise ValueError("Found array with %d sample(s) (shape=%s) while a" " minimum of %d is required%s." % (n_samples, shape_repr, ensure_min_samples, context)) if ensure_min_features > 0 and array.ndim == 2: n_features = array.shape[1] if n_features < ensure_min_features: raise ValueError("Found array with %d feature(s) (shape=%s) while" " a minimum of %d is required%s." % (n_features, shape_repr, ensure_min_features, context)) if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig: msg = ("Data with input dtype %s was converted to %s%s." % (dtype_orig, array.dtype, context)) warnings.warn(msg, DataConversionWarning) return array def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite): """稀疏矩陣轉換為一個給定的格式 """ if dtype is None: dtype = spmatrix.dtype changed_format = False if isinstance(accept_sparse, six.string_types): accept_sparse = [accept_sparse] if accept_sparse is False: raise TypeError('A sparse matrix was passed, but dense ' 'data is required. Use X.toarray() to ' 'convert to a dense numpy array.') elif isinstance(accept_sparse, (list, tuple)): if len(accept_sparse) == 0: raise ValueError("When providing 'accept_sparse' " "as a tuple or list, it must contain at " "least one string value.") # ensure correct sparse format if spmatrix.format not in accept_sparse: # create new with correct sparse spmatrix = spmatrix.asformat(accept_sparse[0]) changed_format = True elif accept_sparse is not True: # any other type raise ValueError("Parameter 'accept_sparse' should be a string, " "boolean or list of strings. You provided " "'accept_sparse={}'.".format(accept_sparse)) if dtype != spmatrix.dtype: # convert dtype spmatrix = spmatrix.astype(dtype) elif copy and not changed_format: # force copy spmatrix = spmatrix.copy() if force_all_finite: if not hasattr(spmatrix, "data"): warnings.warn("Can't check %s sparse matrix for nan or inf." % spmatrix.format) else:#否則檢驗spmatrix.data,如果spmatrix.data包含NaN或無窮就丟擲一個ValueError _assert_all_finite(spmatrix.data) return spmatrix def _assert_all_finite(X): """如果X包NaA或無窮則丟擲一個ValueError,和assert_all_finite,但只是針對ndarray.""" if _get_config()['assume_finite']:#獲取配置檔案中assume_finite的值 return X = np.asanyarray(X) if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) and not np.isfinite(X).all()): raise ValueError("Input contains NaN, infinity" " or a value too large for %r." % X.dtype) def _validate_center_shape(X, n_centers, centers): """Check if centers is compatible with X and n_centers""" if len(centers) != n_centers: raise ValueError('The shape of the initial centers (%s) ' 'does not match the number of clusters %i' % (centers.shape, n_centers)) if centers.shape[1] != X.shape[1]: raise ValueError( "The number of features of the initial centers %s " "does not match the number of features of the data %s." % (centers.shape[1], X.shape[1])) def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++', verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, precompute_distances=True): """A single run of k-means, assumes preparation completed prior. """ random_state = check_random_state(random_state) best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment is also called the E-step of EM labels, inertia = \ _labels_inertia(X, x_squared_norms, centers, precompute_distances=precompute_distances, distances=distances) # computation of the means is also called the M-step of EM if sp.issparse(X): centers = _k_means._centers_sparse(X, labels, n_clusters, distances) else: centers = _k_means._centers_dense(X, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia center_shift_total = squared_norm(centers_old - centers) if center_shift_total <= tol: if verbose: print("Converged at iteration %d: " "center shift %e within tolerance %e" % (i, center_shift_total, tol)) break if center_shift_total > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_inertia(X, x_squared_norms, best_centers, precompute_distances=precompute_distances, distances=distances) return best_labels, best_inertia, best_centers, i + 1

參考地址:https://github.com/scikit-learn/scikit-learn