原始碼解讀----之-----k_means相關方法（被k_means呼叫）

阿新 • • 發佈：2018-12-29

本文僅是個人的理解。由於剛接觸這部分原始碼，自身能力也有限，文中也許存在誤解，歡迎留言指正，本人一定虛心受教，謝謝。

def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset.

    The relative tolerance ``tol`` is scaled by the mean of the per-column
    (``axis=0``) variances of ``X``.

    Parameters
    ----------
    X : dense array or scipy sparse matrix
        Input data.
    tol : float
        Relative tolerance.

    Returns
    -------
    float
        ``np.mean(variances) * tol``.
    """
    if sp.issparse(X):
        # Sparse input: use the sparse-aware helper and keep the second
        # element of its result (the variances along axis 0).
        from sklearn.utils.sparsefuncs import mean_variance_axis
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        # Dense input: compute the variance of every column directly.
        variances = np.var(X, axis=0)
    # Mean of the column variances, scaled by the relative tolerance.
    return np.mean(variances) * tol
def as_float_array(X, copy=True, force_all_finite=True):
    """Convert ``X`` to an array (or sparse matrix) of floats.

    Parameters
    ----------
    X : array-like, numpy.matrix or scipy sparse matrix
        Data to convert.
    copy : bool, default True
        If True, inputs that already have a float32/float64 dtype are
        copied; if False they are returned as-is. Inputs that need a dtype
        conversion go through ``astype`` regardless of this flag.
    force_all_finite : bool, default True
        Forwarded to ``check_array``; only used when ``X`` is a
        ``numpy.matrix`` or is neither an ndarray nor a sparse matrix.

    Returns
    -------
    The float version of ``X``:

    * ``numpy.matrix`` / non-array, non-sparse input is delegated to
      ``check_array`` with ``dtype=np.float64`` and ``ensure_2d=False``;
    * float32/float64 sparse matrices and ndarrays are returned unchanged
      (or copied when ``copy`` is True);
    * anything else is cast to float32 when its dtype kind is unsigned int,
      signed int or bool with an item size of at most 4 bytes, and to
      float64 otherwise.
    """
    if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray)
                                    and not sp.issparse(X)):
        # Lists, matrices, etc.: let check_array do the conversion.
        return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64,
                           copy=copy, force_all_finite=force_all_finite,
                           ensure_2d=False)
    elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
        # Sparse matrix that is already floating point.
        return X.copy() if copy else X
    elif X.dtype in [np.float32, np.float64]:  # is numpy array
        # Preserve the memory layout (Fortran vs C) when copying.
        return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X
    else:
        # Small integer/bool types fit in float32; everything else is
        # promoted to float64.
        if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4:
            return_dtype = np.float32
        else:
            return_dtype = np.float64
        return X.astype(return_dtype)


def check_array(array, accept_sparse=False, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=1, ensure_min_features=1,
                warn_on_dtype=False, estimator=None):
    """Validate an array, list, sparse matrix or similar input.

    Dense input is converted to a numpy array (at least 2D when
    ``ensure_2d`` is True); sparse input is passed through
    ``_ensure_sparse_format``.

    Parameters
    ----------
    array : object
        Input to check / convert.
    accept_sparse : str, bool or list/tuple of str, default False
        Forwarded to ``_ensure_sparse_format`` for sparse input. ``None``
        is deprecated and treated as False.
    dtype : "numeric", type, list/tuple of types or None
        Target dtype. ``"numeric"`` keeps the input dtype unless it is
        ``object``, in which case float64 is used. For a list/tuple the
        input dtype is kept if it is in the list, otherwise the first
        element is used.
    order : passed to numpy as the ``order`` argument of the conversion.
    copy : bool, default False
        Whether to force a copy.
    force_all_finite : bool, default True
        Whether to run ``_assert_all_finite`` on the result.
    ensure_2d : bool, default True
        Whether to reject 1D dense input and promote to at least 2D.
    allow_nd : bool, default False
        Whether dense arrays with 3 or more dimensions are accepted.
    ensure_min_samples : int, default 1
        Minimum number of samples (as reported by ``_num_samples``);
        the check is skipped when 0 or negative.
    ensure_min_features : int, default 1
        Minimum size of the second axis; only checked for 2D results and
        skipped when 0 or negative.
    warn_on_dtype : bool, default False
        Whether to emit a ``DataConversionWarning`` when the dtype of the
        result differs from the dtype of the input.
    estimator : str, estimator instance or None
        Used to build the context part of error/warning messages.

    Returns
    -------
    The converted and validated array (or sparse matrix).

    Raises
    ------
    ValueError
        For 1D input when ``ensure_2d`` is True, for arrays with too many
        dimensions, or for too few samples/features.
    """
    if accept_sparse is None:
        warnings.warn(
            "Passing 'None' to parameter 'accept_sparse' in methods "
            "check_array and check_X_y is deprecated in version 0.19 "
            "and will be removed in 0.21. Use 'accept_sparse=False' "
            " instead.", DeprecationWarning)
        accept_sparse = False

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, six.string_types) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if estimator is not None:
        if isinstance(estimator, six.string_types):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = " by %s" % estimator_name if estimator is not None else ""

    if sp.issparse(array):
        # Sparse matrix: convert it to one of the accepted formats.
        array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
                                      force_all_finite)
    else:
        # np.array(..., copy=False) means "never copy" on NumPy >= 2.0 and
        # raises when a copy is unavoidable (e.g. list input). np.asarray
        # keeps the intended "copy only if needed" semantics on every
        # NumPy version.
        if copy:
            array = np.array(array, dtype=dtype, order=order, copy=True)
        else:
            array = np.asarray(array, dtype=dtype, order=order)

        if ensure_2d:
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))
            array = np.atleast_2d(array)
            # To ensure that array flags are maintained
            if copy:
                array = np.array(array, dtype=dtype, order=order, copy=True)
            else:
                array = np.asarray(array, dtype=dtype, order=order)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2."
                             % (array.ndim, estimator_name))
        if force_all_finite:
            _assert_all_finite(array)

    shape_repr = _shape_repr(array.shape)
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required%s."
                             % (n_samples, shape_repr, ensure_min_samples,
                                context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required%s."
                             % (n_features, shape_repr, ensure_min_features,
                                context))

    if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig:
        msg = ("Data with input dtype %s was converted to %s%s."
               % (dtype_orig, array.dtype, context))
        warnings.warn(msg, DataConversionWarning)
    return array
def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
                          force_all_finite):
    """Convert a sparse matrix to a given format.

    Parameters
    ----------
    spmatrix : scipy sparse matrix
        Input to validate and convert.
    accept_sparse : str, bool or list/tuple of str
        Accepted sparse formats. False rejects sparse input, True accepts
        any format, and a string is treated as a one-element list. When
        the matrix format is not in the list it is converted to the first
        listed format.
    dtype : type or None
        Target dtype; None keeps ``spmatrix.dtype``.
    copy : bool
        Whether to force a copy. A copy is only made explicitly when neither
        the dtype nor the format had to be changed.
    force_all_finite : bool
        Whether to check ``spmatrix.data`` with ``_assert_all_finite``.

    Returns
    -------
    The converted and validated sparse matrix.

    Raises
    ------
    TypeError
        If ``accept_sparse`` is False.
    ValueError
        If ``accept_sparse`` is an empty list/tuple or of an unsupported
        type.
    """
    if dtype is None:
        dtype = spmatrix.dtype

    changed_format = False

    if isinstance(accept_sparse, six.string_types):
        accept_sparse = [accept_sparse]

    if accept_sparse is False:
        raise TypeError('A sparse matrix was passed, but dense '
                        'data is required. Use X.toarray() to '
                        'convert to a dense numpy array.')
    elif isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError("When providing 'accept_sparse' "
                             "as a tuple or list, it must contain at "
                             "least one string value.")
        # ensure correct sparse format
        if spmatrix.format not in accept_sparse:
            # create new with correct sparse
            spmatrix = spmatrix.asformat(accept_sparse[0])
            changed_format = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError("Parameter 'accept_sparse' should be a string, "
                         "boolean or list of strings. You provided "
                         "'accept_sparse={}'.".format(accept_sparse))

    if dtype != spmatrix.dtype:
        # convert dtype
        spmatrix = spmatrix.astype(dtype)
    elif copy and not changed_format:
        # force copy
        spmatrix = spmatrix.copy()

    if force_all_finite:
        if not hasattr(spmatrix, "data"):
            # Formats without a ``data`` attribute cannot be checked here.
            warnings.warn("Can't check %s sparse matrix for nan or inf."
                          % spmatrix.format)
        else:
            # Otherwise validate the stored values: a ValueError is raised
            # if spmatrix.data contains NaN or infinity.
            _assert_all_finite(spmatrix.data)
    return spmatrix
def _assert_all_finite(X):
    """Raise a ValueError if X contains NaN or infinity.

    Like ``assert_all_finite``, but only for ndarray input.

    The check is skipped entirely when the ``assume_finite`` configuration
    value returned by ``_get_config()`` is truthy.

    Raises
    ------
    ValueError
        If ``X`` has a floating dtype and contains a non-finite value.
    """
    if _get_config()['assume_finite']:
        # Validation disabled through the configuration.
        return
    X = np.asanyarray(X)
    # Only floating dtypes are checked. The scalar ``X.sum()`` test runs
    # first; the element-wise ``np.isfinite(X).all()`` is only evaluated
    # when the sum itself is not finite (short-circuit ``and``).
    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
            and not np.isfinite(X).all()):
        raise ValueError("Input contains NaN, infinity"
                         " or a value too large for %r." % X.dtype)


def _validate_center_shape(X, n_centers, centers):
    """Check if centers is compatible with X and n_centers.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        Data; only ``X.shape[1]`` is read.
    n_centers : int
        Expected number of centers.
    centers : array with a ``shape`` attribute
        Initial centers; ``len(centers)`` must equal ``n_centers`` and
        ``centers.shape[1]`` must equal ``X.shape[1]``.

    Raises
    ------
    ValueError
        If the number of centers or their number of features does not
        match.
    """
    if len(centers) != n_centers:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i'
                         % (centers.shape, n_centers))
    if centers.shape[1] != X.shape[1]:
        raise ValueError(
            "The number of features of the initial centers %s "
            "does not match the number of features of the data %s."
            % (centers.shape[1], X.shape[1]))


def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : dense array or scipy sparse matrix
        Data to cluster.
    n_clusters : int
        Number of clusters; forwarded to the initialisation and to the
        center-update helpers.
    max_iter : int, default 300
        Maximum number of iterations of the main loop.
    init : default 'k-means++'
        Initialisation specification, forwarded to ``_init_centroids``.
    verbose : bool, default False
        Whether to print progress messages.
    x_squared_norms : default None
        Forwarded to ``_init_centroids`` and ``_labels_inertia``.
    random_state : default None
        Normalised with ``check_random_state`` and used for initialisation.
    tol : float, default 1e-4
        The loop stops once the squared norm of the center shift is
        ``<= tol``.
    precompute_distances : bool, default True
        Forwarded to ``_labels_inertia``.

    Returns
    -------
    best_labels, best_inertia, best_centers, n_iter
        Labels, inertia and centers of the best iteration (lowest inertia),
        and the number of iterations run (``i + 1``).

    Notes
    -----
    NOTE(review): assumes ``max_iter >= 1``. With ``max_iter == 0`` the loop
    body never runs and ``center_shift_total`` / ``i`` are unbound when they
    are read after the loop -- confirm callers validate ``max_iter``.
    """
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        # Keep the state of the iteration with the lowest inertia so far.
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1

參考地址：https://github.com/scikit-learn/scikit-learn

原始碼解讀----之-----k_means相關方法（被k_means呼叫）

本文是個人的理解，由於剛接觸並且自身能力也有限，也許會存在誤解，歡迎留言指正，本人一定虛心請教，謝謝 def _tolerance(X, tol): """Return a tolerance which is independent of the dataset""

原始碼解讀----之_k-means++初始化質心的方法(被k_means呼叫)

本文是個人的理解，由於剛接觸並且自身能力也有限，也許會存在誤解，歡迎留言指正，本人一定虛心請教，謝謝 def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): """根

PyTorch原始碼解讀之torchvision.transforms（轉）

原文地址：https://blog.csdn.net/u014380165/article/details/79167753 PyTorch框架中有一個非常重要且好用的包：torchvision，該包主要由3個子包組成，分別是：torchvision.dat

Spring原始碼解讀之Spring MVC HandlerMapping元件（二）

一、HandlerMapping HandlerMapping作用是根據request找到相應的處理器Handler和Interceptors，並將Handler和Interceptors封裝成HandlerExecutionChain 物件返回。Handler

HashMap原始碼註解之靜態工具方法hash()、tableSizeFor()（四）

注意，本文基於JDK 1.8 HashMap#hash() 為什麼要有HashMap的hash()方法，難道不能直接使用KV中K原有的hash值嗎？在HashMap的put、get操作時為什麼不能直接使用K中原有的hash值。 /*

IPFS（三）原始碼解讀之-add

package commands import ( “errors” “fmt” “io” “os” “strings” //塊服務提供的介面 blockservice “github.com/ipfs/go-ipfs/blockservice” //核心api core “github

Mysql 之編譯安裝方法（Mysql5.7）

select eas base sql_mod work names ase 下載安裝 eating 參考本博客文章：http://blog.51cto.com/12965094/2129267 1. 下載安裝包 wget http://downloads.sourcefo

【1】pytorch torchvision原始碼解讀之Alexnet

最近開始學習一個新的深度學習框架PyTorch。框架中有一個非常重要且好用的包：torchvision，顧名思義這個包主要是關於計算機視覺cv的。這個包主要由3個子包組成，分別是：torchvision.datasets、torchvision.models、torchvision.trans

java原始碼解讀之HashMap

1:首先下載openjdk(http://pan.baidu.com/s/1dFMZXg1),把原始碼匯入eclipse,以便看到jdk原始碼 Windows-Prefe

少說話多寫程式碼之Python學習017——字典的方法（items、pop）

items方法將字典的所有項以列表方式返回，列表中每一項都表示為（鍵，值）對形式，但對字典的項的次序沒有什麼規律。 #items方法 d={'title':'繞口令：喇嘛和啞巴', 'content':'打南邊來了個啞巴，腰裡別了個喇叭；打北邊來了個喇

少說話多寫程式碼之Python學習019——字典的方法（update、values）

update方法利用一個字典A去更新另一個字典B。A的項會新增到B中，如果存在相同鍵，則A會覆蓋B的這個鍵。 #update 方法 d={ 'name':'楊友山', 'blog地址':'https://blog.csdn.net/y

少說話多寫程式碼之Python學習018——字典的方法（popitem、setdefault）

popitem方法 popitem其實和pop方法沒什麼兩樣，雖然解釋說pop是彈出字典的最後一項，popitem彈出的是字典的隨機項。但是字典是一個連結串列結構，哪裡有最後一項和第一項呢？不管怎麼說，我們可以看看popitem的用法。 #popitem d={} d={'詩仙':'李白',

JDK原始碼-HashMap-remove方法（JDK7和JDK8）

remove方法原始碼相應簡單很多測試程式碼 /** * 測試remove操作的區別 */ @Test public void remove(){ HashMap<String, String> map = new HashMap();

JDK原始碼-HashMap-put方法（JDK7和JDK8）

下面是對HashMap中put方法的原始碼進行註釋測試程式碼 /** * 測試put操作的區別 */ @Test public void put(){ HashMap<String, String&

spring原始碼學習之路---IOC初探（一）

首先把spring原始碼匯入，怎麼匯入百度下。首先我們來說一下IOC，IOC是spring最核心的理念，包括AOP也要屈居第二，那麼IOC到底是什麼呢，四個字，控制反轉。網上有不少是這麼解釋IOC的，說IOC是將物件的建立和依賴關係交給容器，這句話我相信不少人都知道，在我個人的理解

PyTorch原始碼解讀之torch.utils.data.DataLoader(轉)

原文連結 https://blog.csdn.net/u014380165/article/details/79058479 寫得特別好！最近正好在學習pytorch，學習一下！ PyTorch中資料讀取的一個重要介面是torch.utils.data.DataLoade

PyTorch原始碼解讀之torchvision.models(轉)

原文地址：https://blog.csdn.net/u014380165/article/details/79119664 PyTorch框架中有一個非常重要且好用的包：torchvision，該包主要由3個子包組成，分別是：torchvision.datasets、torchvision.mode

jQuery原始碼解讀之init函式

jQuery的構造方法： // 直接new了一個物件。同時根據jQuery.fn = jQuery.prototype，jQuery.fn相當於jQuery.prototype。 jQuery = function( selector, context ) { return

PyTorch原始碼解讀之torchvision.transforms

PyTorch框架中有一個非常重要且好用的包：torchvision，該包主要由3個子包組成，分別是：torchvision.datasets、torchvision.models、torchvision.transforms。這3個子包的具體介紹可以參考

eureka原始碼解讀之服務端

剖析eureka服務端啟動流程服務端啟動類-入口處 @EnableEurekaServer @SpringBootApplication public class EurekaServerApplication { public static void main(Strin

原始碼解讀----之-----k_means相關方法（被k_means呼叫）

相關推薦