# 监督学习分类之最近质心(Nearest Centroid)

2022/05/18 16:51

NearestCentroid.py

import warnings
import numpy as np
from functools import partial
import itertools
import numbers

from scipy.spatial import distance

_VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock',
'braycurtis', 'canberra', 'chebyshev', 'correlation',
'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener',
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski",
'nan_euclidean', 'haversine']

def gen_batches(n, batch_size, *, min_batch_size=0):
"""Generator to create slices containing batch_size elements, from 0 to n.

The last slice may contain less than batch_size elements, when batch_size
does not divide n.

Parameters
----------
n : int
batch_size : int
Number of element in each batch
min_batch_size : int, default=0
Minimum batch size to produce.

Yields
------
slice of batch_size elements

Examples
--------
>>> from sklearn.utils import gen_batches
>>> list(gen_batches(7, 3))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(6, 3))
[slice(0, 3, None), slice(3, 6, None)]
>>> list(gen_batches(2, 3))
[slice(0, 2, None)]
>>> list(gen_batches(7, 3, min_batch_size=0))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(7, 3, min_batch_size=2))
[slice(0, 3, None), slice(3, 7, None)]
"""
if not isinstance(batch_size, numbers.Integral):
raise TypeError("gen_batches got batch_size=%s, must be an"
" integer" % batch_size)
if batch_size <= 0:
raise ValueError("gen_batches got batch_size=%s, must be"
" positive" % batch_size)
start = 0
for _ in range(int(n // batch_size)):
end = start + batch_size
if end + min_batch_size > n:
continue
yield slice(start, end)
start = end
if start < n:
yield slice(start, n)

def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None):
"""Euclidean distances between X and Y

Assumes X and Y have float32 dtype.
Assumes XX and YY have float64 dtype or are None.

X and Y are upcast to float64 by chunks, which size is chosen to limit
memory increase by approximately 10% (at least 10MiB).
"""
n_samples_X = X.shape[0]
n_samples_Y = Y.shape[0]
n_features = X.shape[1]

distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)

if batch_size is None:
x_density = 1
y_density = 1

# Allow 10% more memory than X, Y and the distance matrix take (at
# least 10MiB)
maxmem = max(
((x_density * n_samples_X + y_density * n_samples_Y) * n_features
+ (x_density * n_samples_X * y_density * n_samples_Y)) / 10,
10 * 2 ** 17)

# The increase amount of memory in 8-byte blocks is:
# - x_density * batch_size * n_features (copy of chunk of X)
# - y_density * batch_size * n_features (copy of chunk of Y)
# - batch_size * batch_size (chunk of distance matrix)
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem
#                                 xd=x_density and yd=y_density
tmp = (x_density + y_density) * n_features
batch_size = (-tmp + np.sqrt(tmp ** 2 + 4 * maxmem)) / 2
batch_size = max(int(batch_size), 1)

x_batches = gen_batches(n_samples_X, batch_size)

for i, x_slice in enumerate(x_batches):
X_chunk = X[x_slice].astype(np.float64)
if XX is None:
XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis]
else:
XX_chunk = XX[x_slice]

y_batches = gen_batches(n_samples_Y, batch_size)

for j, y_slice in enumerate(y_batches):
if X is Y and j < i:
# when X is Y the distance matrix is symmetric so we only need
# to compute half of it.
d = distances[y_slice, x_slice].T

else:
Y_chunk = Y[y_slice].astype(np.float64)
if YY is None:
YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :]
else:
YY_chunk = YY[:, y_slice]

d = -2 * safe_dot(X_chunk, Y_chunk.T, dense_output=True)
d += XX_chunk
d += YY_chunk

distances[x_slice, y_slice] = d.astype(np.float32, copy=False)

return distances

def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False,
X_norm_squared=None):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.

For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::

dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

This formulation has two advantages over other ways of computing distances.
First, it is computationally efficient when dealing with sparse data.
Second, if one argument varies but the other remains unchanged, then
dot(x, x) and/or dot(y, y) can be pre-computed.

However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
symmetric as required by, e.g., scipy.spatial.distance functions.

Read more in the :ref:User Guide <metrics>.

Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples_1, n_features)

Y : {array-like, sparse matrix}, shape (n_samples_2, n_features)

Y_norm_squared : array-like, shape (n_samples_2, ), optional
Pre-computed dot-products of vectors in Y (e.g.,
(Y**2).sum(axis=1))
May be ignored in some cases, see the note below.

squared : boolean, optional
Return squared Euclidean distances.

X_norm_squared : array-like of shape (n_samples,), optional
Pre-computed dot-products of vectors in X (e.g.,
(X**2).sum(axis=1))
May be ignored in some cases, see the note below.

Notes
-----
To achieve better accuracy, X_norm_squared and Y_norm_squared may be
unused if they are passed as float32.

Returns
-------
distances : array, shape (n_samples_1, n_samples_2)

Examples
--------
>>> from sklearn.metrics.pairwise import euclidean_distances
>>> X = [[0, 1], [1, 1]]
>>> # distance between rows of X
>>> euclidean_distances(X, X)
array([[0., 1.],
[1., 0.]])
>>> # get distance to origin
>>> euclidean_distances(X, [[0, 0]])
array([[1.        ],
[1.41421356]])

--------
paired_distances : distances betweens pairs of elements of X and Y.
"""
# If norms are passed as float32, they are unused. If arrays are passed as
# float32, norms needs to be recomputed on upcast chunks.
# TODO: use a float64 accumulator in row_norms to avoid the latter.
if X_norm_squared is not None:
XX = X_norm_squared
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
if XX.dtype == np.float32:
XX = None
elif X.dtype == np.float32:
XX = None
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y and XX is not None:
# shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)

if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
if YY.dtype == np.float32:
YY = None
elif Y.dtype == np.float32:
YY = None
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

if X.dtype == np.float32:
# To minimize precision issues with float32, we compute the distance
# matrix on chunks of X and Y upcast to float64
distances = _euclidean_distances_upcast(X, XX, Y, YY)
else:
# if dtype is already float64, no need to chunk and upcast
distances = - 2 * safe_dot(X, Y.T, dense_output=True)
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)

# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
if X is Y:
np.fill_diagonal(distances, 0)

return distances if squared else np.sqrt(distances, out=distances)

def _object_dtype_isnan(X):
return X != X

if X.dtype.kind == "f":
return np.isnan(X)
elif X.dtype.kind in ("i", "u"):
# can't have NaNs in integer array.
return np.zeros(X.shape, dtype=bool)
else:
# np.isnan does not work on object dtypes.
return _object_dtype_isnan(X)
else:

def is_scalar_nan(x):
"""Tests if x is NaN

This function is meant to overcome the issue that np.isnan does not allow
non-numerical types as input, and that np.nan is not np.float('nan').

Parameters
----------
x : any type

Returns
-------
boolean

Examples
--------
>>> is_scalar_nan(np.nan)
True
>>> is_scalar_nan(float("nan"))
True
>>> is_scalar_nan(None)
False
>>> is_scalar_nan("")
False
>>> is_scalar_nan([np.nan])
False
"""
# convert from numpy.bool_ to python bool to ensure that testing
# is_scalar_nan(x) is True does not fail.
return bool(isinstance(x, numbers.Real) and np.isnan(x))

def nan_euclidean_distances(X, Y=None, *, squared=False,
missing_values=np.nan, copy=True):
"""Calculate the euclidean distances in the presence of missing values.

Compute the euclidean distance between each pair of samples in X and Y,
where Y=X is assumed if Y=None. When calculating the distance between a
pair of samples, this formulation ignores feature coordinates with a
missing value in either sample and scales up the weight of the remaining
coordinates:

dist(x,y) = sqrt(weight * sq. distance from present coordinates)
where,
weight = Total # of coordinates / # of present coordinates

For example, the distance between [3, na, na, 6] and [1, na, 4, 5]
is:

.. math::
\\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}

If all the coordinates are missing or if there are no common present
coordinates then NaN is returned for that pair.

Read more in the :ref:User Guide <metrics>.

Parameters
----------
X : array-like, shape=(n_samples_1, n_features)

Y : array-like, shape=(n_samples_2, n_features)

squared : bool, default=False
Return squared Euclidean distances.

missing_values : np.nan or int, default=np.nan
Representation of missing value

copy : boolean, default=True
Make and use a deep copy of X and Y (if Y exists)

Returns
-------
distances : array, shape (n_samples_1, n_samples_2)

Examples
--------
>>> from sklearn.metrics.pairwise import nan_euclidean_distances
>>> nan = float("NaN")
>>> X = [[0, 1], [1, nan]]
>>> nan_euclidean_distances(X, X) # distance between rows of X
array([[0.        , 1.41421356],
[1.41421356, 0.        ]])

>>> # get distance to origin
>>> nan_euclidean_distances(X, [[0, 0]])
array([[1.        ],
[1.41421356]])

References
----------
* John K. Dixon, "Pattern Recognition with Partly Missing Data",
IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
10, pp. 617 - 621, Oct. 1979.
http://ieeexplore.ieee.org/abstract/document/4310090/

--------
paired_distances : distances between pairs of elements of X and Y.
"""
# Get missing mask for X

# Get missing mask for Y
missing_Y = missing_X if Y is X else _get_mask(Y, missing_values)

# set missing values to zero
X[missing_X] = 0
Y[missing_Y] = 0

distances = euclidean_distances(X, Y, squared=True)

# Adjust distances for missing values
XX = X * X
YY = Y * Y
distances -= np.dot(XX, missing_Y.T)
distances -= np.dot(missing_X, YY.T)

np.clip(distances, 0, None, out=distances)

if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
np.fill_diagonal(distances, 0.0)

present_X = 1 - missing_X
present_Y = present_X if Y is X else ~missing_Y
present_count = np.dot(present_X, present_Y.T)
distances[present_count == 0] = np.nan
# avoid divide by zero
np.maximum(1, present_count, out=present_count)
distances /= present_count
distances *= X.shape[1]

if not squared:
np.sqrt(distances, out=distances)

return distances

def haversine_distances(X, Y=None):
"""Compute the Haversine distance between samples in X and Y

The Haversine (or great circle) distance is the angular distance between
two points on the surface of a sphere. The first distance of each point is
assumed to be the latitude, the second is the longitude, given in radians.
The dimension of the data must be 2.

.. math::
D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)
+ \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]

Parameters
----------
X : array_like, shape (n_samples_1, 2)

Y : array_like, shape (n_samples_2, 2), optional

Returns
-------
distance : {array}, shape (n_samples_1, n_samples_2)

Notes
-----
As the Earth is nearly spherical, the haversine formula provides a good
approximation of the distance between two points of the Earth surface, with
a less than 1% error on average.

Examples
--------
We want to calculate the distance between the Ezeiza Airport
(Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, France)

>>> from sklearn.metrics.pairwise import haversine_distances
>>> bsas = [-34.83333, -58.5166646]
>>> paris = [49.0083899664, 2.53844117956]
>>> result * 6371000/1000  # multiply by Earth radius to get kilometers
array([[    0.        , 11099.54035582],
[11099.54035582,     0.        ]])
"""
from sklearn.neighbors import DistanceMetric
return DistanceMetric.get_metric('haversine').pairwise(X, Y)

def manhattan_distances(X, Y=None, *, sum_over_features=True):
""" Compute the L1 distances between the vectors in X and Y.

With sum_over_features equal to False it returns the componentwise
distances.

Read more in the :ref:User Guide <metrics>.

Parameters
----------
X : array_like
An array with shape (n_samples_X, n_features).

Y : array_like, optional
An array with shape (n_samples_Y, n_features).

sum_over_features : bool, default=True
If True the function returns the pairwise distance matrix
else it returns the componentwise L1 pairwise-distances.
Not supported for sparse matrix inputs.

Returns
-------
D : array
If sum_over_features is False shape is
(n_samples_X * n_samples_Y, n_features) and D contains the
componentwise L1 pairwise-distances (ie. absolute difference),
else shape is (n_samples_X, n_samples_Y) and D contains
the pairwise L1 distances.

Notes
--------
When X and/or Y are CSR sparse matrices and they are not already
in canonical format, this function modifies them in-place to
make them canonical.

Examples
--------
>>> from sklearn.metrics.pairwise import manhattan_distances
>>> manhattan_distances([[3]], [[3]])
array([[0.]])
>>> manhattan_distances([[3]], [[2]])
array([[1.]])
>>> manhattan_distances([[2]], [[3]])
array([[1.]])
>>> manhattan_distances([[1, 2], [3, 4]],\
[[1, 2], [0, 3]])
array([[0., 2.],
[4., 4.]])
>>> import numpy as np
>>> X = np.ones((1, 2))
>>> y = np.full((2, 2), 2.)
>>> manhattan_distances(X, y, sum_over_features=False)
array([[1., 1.],
[1., 1.]])
"""
if sum_over_features:
return distance.cdist(X, Y, 'cityblock')

D = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
D = np.abs(D, D)
return D.reshape((-1, X.shape[1]))

def _handle_zeros_in_scale(scale, copy=True):
''' Makes sure that whenever scale is zero, we handle it correctly.

This happens in most scalers when we have constant features.'''

# if we are fitting on 1D arrays, scale might be a scalar
if np.isscalar(scale):
if scale == .0:
scale = 1.
return scale
elif isinstance(scale, np.ndarray):
if copy:
# New array to avoid side-effects
scale = scale.copy()
scale[scale == 0.0] = 1.0
return scale

def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.

Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.

Performs no input validation.

Parameters
----------
X : array_like
The input array
squared : bool, optional (default = False)
If True, return squared norms.

Returns
-------
array_like
The row-wise (squared) Euclidean norm of X.
"""
norms = np.einsum('ij,ij->i', X, X)

if not squared:
np.sqrt(norms, norms)
return norms

def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False):
"""Scale input vectors individually to unit norm (vector length).

Read more in the :ref:User Guide <preprocessing_normalization>.

Parameters
----------
X : {array-like, sparse matrix}, shape [n_samples, n_features]
The data to normalize, element by element.
scipy.sparse matrices should be in CSR format to avoid an
un-necessary copy.

norm : 'l1', 'l2', or 'max', optional ('l2' by default)
The norm to use to normalize each non zero sample (or each non-zero
feature if axis is 0).

axis : 0 or 1, optional (1 by default)
axis used to normalize the data along. If 1, independently normalize
each sample, otherwise (if 0) normalize each feature.

copy : boolean, optional, default True
set to False to perform inplace row normalization and avoid a
copy (if the input is already a numpy array or a scipy.sparse
CSR matrix and if axis is 1).

return_norm : boolean, default False
whether to return the computed norms

Returns
-------
X : {array-like, sparse matrix}, shape [n_samples, n_features]
Normalized input X.

norms : array, shape [n_samples] if axis=1 else [n_features]
An array of norms along given axis for X.
When X is sparse, a NotImplementedError will be raised
for norm 'l1' or 'l2'.

--------
Normalizer: Performs normalization using the Transformer API
(e.g. as part of a preprocessing :class:sklearn.pipeline.Pipeline).

Notes
-----
For a comparison of the different scalers, transformers, and normalizers,
see :ref:examples/preprocessing/plot_all_scaling.py
<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>.

"""
if axis == 0:
X = X.T

if norm == 'l1':
norms = np.abs(X).sum(axis=1)
elif norm == 'l2':
norms = row_norms(X)
elif norm == 'max':
norms = np.max(abs(X), axis=1)
norms = _handle_zeros_in_scale(norms, copy=False)
X /= norms[:, np.newaxis]

if axis == 0:
X = X.T

if return_norm:
return X, norms
else:
return X

def safe_dot(a, b, *, dense_output=False):
"""Dot product that handle the sparse matrix case correctly

Parameters
----------
a : array or sparse matrix
b : array or sparse matrix
dense_output : boolean, (default=False)
When False, a and b both being sparse will yield sparse output.
When True, output will always be a dense array.

Returns
-------
dot_product : array or sparse matrix
sparse if a and b are sparse and dense_output=False.
"""
if a.ndim > 2 or b.ndim > 2:
ret = np.dot(a, b)
else:
ret = a @ b

return ret

def cosine_similarity(X, Y=None, dense_output=True):
"""Compute cosine similarity between samples in X and Y.

Cosine similarity, or the cosine kernel, computes similarity as the
normalized dot product of X and Y:

K(X, Y) = <X, Y> / (||X||*||Y||)

On L2-normalized data, this function is equivalent to linear_kernel.

Read more in the :ref:User Guide <cosine_similarity>.

Parameters
----------
X : ndarray or sparse array, shape: (n_samples_X, n_features)
Input data.

Y : ndarray or sparse array, shape: (n_samples_Y, n_features)
Input data. If None, the output will be the pairwise
similarities between all samples in X.

dense_output : boolean (optional), default True
Whether to return dense output even when the input is sparse. If
False, the output is sparse if both input arrays are sparse.

parameter dense_output for dense output.

Returns
-------
kernel matrix : array
An array with shape (n_samples_X, n_samples_Y).
"""
# to avoid recursive import
X_normalized = normalize(X, copy=True)
if X is Y:
Y_normalized = X_normalized
else:
Y_normalized = normalize(Y, copy=True)

K = safe_dot(X_normalized, Y_normalized.T,
dense_output=dense_output)

return K

def cosine_distances(X, Y=None):
"""Compute cosine distance between samples in X and Y.

Cosine distance is defined as 1.0 minus the cosine similarity.

Read more in the :ref:User Guide <metrics>.

Parameters
----------
X : array_like, sparse matrix
with shape (n_samples_X, n_features).

Y : array_like, sparse matrix (optional)
with shape (n_samples_Y, n_features).

Returns
-------
distance matrix : array
An array with shape (n_samples_X, n_samples_Y).

--------
sklearn.metrics.pairwise.cosine_similarity
scipy.spatial.distance.cosine : dense matrices only
"""
# 1.0 - cosine_similarity(X, Y) without copy
S = cosine_similarity(X, Y)
S *= -1
S += 1
np.clip(S, 0, 2, out=S)
if X is Y or Y is None:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
S[np.diag_indices_from(S)] = 0.0
return S

PAIRWISE_DISTANCE_FUNCTIONS = {
# If updating this dictionary, update the doc in both distance_metrics()
# and also in pairwise_distances()!
'cityblock': manhattan_distances,
'cosine': cosine_distances,
'euclidean': euclidean_distances,
'haversine': haversine_distances,
'l2': euclidean_distances,
'l1': manhattan_distances,
'manhattan': manhattan_distances,
'precomputed': None,  # HACK: precomputed is always allowed, never called
'nan_euclidean': nan_euclidean_distances,
}

def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds):
"""Handle the callable case for pairwise_{distances,kernels}
"""
if X is Y:
# Only calculate metric for upper triangle
out = np.zeros((X.shape[0], Y.shape[0]), dtype='float')
iterator = itertools.combinations(range(X.shape[0]), 2)
for i, j in iterator:
out[i, j] = metric(X[i], Y[j], **kwds)

# Make symmetric
# NB: out += out.T will produce incorrect results
out = out + out.T

# Calculate diagonal
# NB: nonzero diagonals are allowed for both metrics and kernels
for i in range(X.shape[0]):
x = X[i]
out[i, i] = metric(x, x, **kwds)

else:
# Calculate all cells
out = np.empty((X.shape[0], Y.shape[0]), dtype='float')
iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
for i, j in iterator:
out[i, j] = metric(X[i], Y[j], **kwds)

return out

def _pairwise(X, Y, func, n_jobs, **kwds):
"""Break the pairwise matrix in n_jobs even slices
and compute them in parallel"""

if Y is None:
Y = X
return func(X, Y, **kwds)

def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None,
force_all_finite=True, **kwds):
""" Compute the distance matrix from a vector array X and optional Y.

This method takes either a vector array or a distance matrix, and returns
a distance matrix. If the input is a vector array, the distances are
computed. If the input is a distances matrix, it is returned instead.

This method provides a safe way to take a distance matrix as input, while
preserving compatibility with many other algorithms that take a vector
array.

If Y is given (default is None), then the returned matrix is the pairwise
distance between the arrays from both X and Y.

Valid values for metric are:

- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']. These metrics support sparse matrix
inputs.
['nan_euclidean'] but it does not yet support sparse matrices.

- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
See the documentation for scipy.spatial.distance for details on these
metrics. These metrics do not support sparse matrix inputs.

Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
valid scipy.spatial.distance metrics), the scikit-learn implementation
will be used, which is faster and has support for sparse matrices (except
for 'cityblock'). For a verbose description of the metrics from
scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
function.

Read more in the :ref:User Guide <metrics>.

Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.

Y : array [n_samples_b, n_features], optional
An optional second feature array. Only allowed if
metric != "precomputed".

metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by scipy.spatial.distance.pdist for its metric parameter, or
a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
If metric is "precomputed", X is assumed to be a distance matrix.
Alternatively, if metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays from X as input and return a value indicating
the distance between them.

n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation. This works by breaking
down the pairwise matrix into n_jobs even slices and computing them in
parallel.

None means 1 unless in a :obj:joblib.parallel_backend context.
-1 means using all processors. See :term:Glossary <n_jobs>
for more details.

force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
possibilities are:

- True: Force all values of array to be finite.
- False: accepts np.inf, np.nan, pd.NA in array.
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
cannot be infinite.

force_all_finite accepts the string 'allow-nan'.

.. versionchanged:: 0.23
Accepts pd.NA and converts it into np.nan

**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.

Returns
-------
D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
A distance matrix D such that D_{i, j} is the distance between the
ith and jth vectors of the given matrix X, if Y is None.
If Y is not None, then D_{i, j} is the distance between the ith array
from X and the jth array from Y.

--------
pairwise_distances_chunked : performs the same calculation as this
function, but returns a generator of chunks of the distance matrix, in
order to limit memory usage.
paired_distances : Computes the distances between corresponding
elements of two arrays
"""
if (metric not in _VALID_METRICS and
not callable(metric) and metric != "precomputed"):
raise ValueError("Unknown metric %s. "
"Valid metrics are %s, or 'precomputed', or a "
"callable" % (metric, _VALID_METRICS))

if metric == "precomputed":
return X
elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
elif callable(metric):
func = partial(_pairwise_callable, metric=metric,
force_all_finite=force_all_finite, **kwds)
else:
if X is Y:
return distance.squareform(distance.pdist(X, metric=metric,
**kwds))
func = partial(distance.cdist, metric=metric, **kwds)

return _pairwise(X, Y, func, n_jobs, **kwds)

def column_or_1d(y, *, warn=False):
""" Ravel column or 1d numpy array, else raises an error

Parameters
----------
y : array-like

warn : boolean, default False
To control display of warnings.

Returns
-------
y : array

"""
y = np.asarray(y)
shape = np.shape(y)
if len(shape) == 1:
return np.ravel(y)
if len(shape) == 2 and shape[1] == 1:
return np.ravel(y)

raise ValueError(
"y should be a 1d array, "
"got an array of shape {} instead.".format(shape))

"""
Helper function to check for unknowns in values to be encoded.

Uses pure python method for object dtype, and numpy method for
all other dtypes.

Parameters
----------
values : array
Values to check for unknowns.
uniques : array
Allowed uniques values.
If True, return a mask of the same shape as values indicating
the valid values.

Returns
-------
diff : list
The unique values present in values and not in uniques (the
unknown values).
Additionally returned if return_mask=True.

"""
if values.dtype == object:
uniques_set = set(uniques)
diff = list(set(values) - uniques_set)
if diff:
valid_mask = np.array([val in uniques_set for val in values])
else:
else:
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
if diff:
else:
else:
return diff

def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
# only used in _encode below, see docstring there for details
if uniques is None:
if encode:
uniques, encoded = np.unique(values, return_inverse=True)
return uniques, encoded
else:
# unique sorts
return np.unique(values)
if encode:
if check_unknown:
diff = _encode_check_unknown(values, uniques)
if diff:
raise ValueError("y contains previously unseen labels: %s"
% str(diff))
encoded = np.searchsorted(uniques, values)
return uniques, encoded
else:
return uniques

def _encode_python(values, uniques=None, encode=False):
# only used in _encode below, see docstring there for details
if uniques is None:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
if encode:
table = {val: i for i, val in enumerate(uniques)}
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
return uniques, encoded
else:
return uniques

def _encode(values, uniques=None, encode=False, check_unknown=True):
"""Helper function to factorize (find uniques) and encode values.

Uses pure python method for object dtype, and numpy method for
all other dtypes.
The numpy method has the limitation that the uniques need to
be sorted. Importantly, this is not checked but assumed to already be
the case. The calling method needs to ensure this for all non-object
values.

Parameters
----------
values : array
Values to factorize or encode.
uniques : array, optional
If passed, uniques are not determined from passed values (this
can be because the user specified categories, or because they
already have been determined in fit).
encode : bool, default False
If True, also encode the values into integer codes based on uniques.
check_unknown : bool, default True
If True, check for values in values that are not in unique
and raise an error. This is ignored for object dtype, and treated as
True in this case. This parameter is useful for
_BaseEncoder._transform() to avoid calling _encode_check_unknown()
twice.

Returns
-------
uniques
If encode=False. The unique values are sorted if the uniques
parameter was None (and thus inferred from the data).
(uniques, encoded)
If encode=True.

"""
if values.dtype == object:
try:
res = _encode_python(values, uniques, encode)
except TypeError:
types = sorted(t.__qualname__
for t in set(type(v) for v in values))
raise TypeError("Encoders require their input to be uniformly "
f"strings or numbers. Got {types}")
return res
else:
return _encode_numpy(values, uniques, encode,
check_unknown=check_unknown)

def _num_samples(x):
"""Return number of samples in array-like x."""
message = 'Expected sequence or array-like, got %s' % type(x)
if hasattr(x, 'fit') and callable(x.fit):
# Don't get num_samples from an ensembles length!
raise TypeError(message)

if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
if hasattr(x, '__array__'):
x = np.asarray(x)
else:
raise TypeError(message)

if hasattr(x, 'shape') and x.shape is not None:
if len(x.shape) == 0:
raise TypeError("Singleton array %r cannot be considered"
" a valid collection." % x)
# Check that shape is returning an integer or default to len
# Dask dataframes may not return numeric shape[0] value
if isinstance(x.shape[0], numbers.Integral):
return x.shape[0]

try:
return len(x)
except TypeError:
raise TypeError(message)

class LabelEncoder:
"""Encode target labels with value between 0 and n_classes-1.

This transformer should be used to encode target values, *i.e.* y, and
not the input X.

Read more in the :ref:User Guide <preprocessing_targets>.

Attributes
----------
classes_ : array of shape (n_class,)
Holds the label for each class.

Examples
--------
LabelEncoder can be used to normalize labels.

>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])

It can also be used to transform non-numerical labels (as long as they are
hashable and comparable) to numerical labels.

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

--------
sklearn.preprocessing.OrdinalEncoder : Encode categorical features
using an ordinal encoding scheme.

sklearn.preprocessing.OneHotEncoder : Encode categorical features
as a one-hot numeric array.
"""

def fit(self, y):
"""Fit label encoder

Parameters
----------
y : array-like of shape (n_samples,)
Target values.

Returns
-------
self : returns an instance of self.
"""
y = column_or_1d(y, warn=True)
self.classes_ = _encode(y)
return self

def fit_transform(self, y):
"""Fit label encoder and return encoded labels

Parameters
----------
y : array-like of shape [n_samples]
Target values.

Returns
-------
y : array-like of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
self.classes_, y = _encode(y, encode=True)
return y

def transform(self, y):
"""Transform labels to normalized encoding.

Parameters
----------
y : array-like of shape [n_samples]
Target values.

Returns
-------
y : array-like of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
# transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])

_, y = _encode(y, uniques=self.classes_, encode=True)
return y

def inverse_transform(self, y):
"""Transform labels back to original encoding.

Parameters
----------
y : numpy array of shape [n_samples]
Target values.

Returns
-------
y : numpy array of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
# inverse transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])

diff = np.setdiff1d(y, np.arange(len(self.classes_)))
if len(diff):
raise ValueError(
"y contains previously unseen labels: %s" % str(diff))
y = np.asarray(y)
return self.classes_[y]

def _more_tags(self):
return {'X_types': ['1dlabels']}

class NearestCentroid:
"""Nearest centroid classifier.

Each class is represented by its centroid, with test samples classified to
the class with the nearest centroid.

Read more in the :ref:User Guide <nearest_centroid_classifier>.

Parameters
----------
metric : str or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.pairwise_distances for its
metric parameter.
The centroids for the samples corresponding to each class is the point
from which the sum of the distances (according to the metric) of all
samples that belong to that particular class are minimized.
If the "manhattan" metric is provided, this centroid is the median and
for all other metrics, the centroid is now set to be the mean.

.. versionchanged:: 0.19
metric='precomputed' was deprecated and now raises an error

shrink_threshold : float, default=None
Threshold for shrinking centroids to remove features.

Attributes
----------
centroids_ : array-like of shape (n_classes, n_features)
Centroid of each class.

classes_ : array of shape (n_classes,)
The unique classes labels.

Examples
--------
>>> from sklearn.neighbors import NearestCentroid
>>> import numpy as np
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> y = np.array([1, 1, 1, 2, 2, 2])
>>> clf = NearestCentroid()
>>> clf.fit(X, y)
NearestCentroid()
>>> print(clf.predict([[-0.8, -1]]))
[1]

--------
sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

Notes
-----
When used for text classification with tf-idf vectors, this classifier is
also known as the Rocchio classifier.

References
----------
Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
multiple cancer types by shrunken centroids of gene expression. Proceedings
of the National Academy of Sciences of the United States of America,
99(10), 6567-6572. The National Academy of Sciences.

"""
def __init__(self, metric='euclidean', *, shrink_threshold=None):
self.metric = metric
self.shrink_threshold = shrink_threshold

def fit(self, X, y):
"""
Fit the NearestCentroid model according to the given training data.

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
Note that centroid shrinking cannot be used with sparse matrices.
y : array-like of shape (n_samples,)
Target values (integers)
"""
if self.metric == 'precomputed':
raise ValueError("Precomputed is not supported.")

n_samples, n_features = X.shape
le = LabelEncoder()
y_ind = le.fit_transform(y)
self.classes_ = classes = le.classes_
n_classes = classes.size
if n_classes < 2:
raise ValueError('The number of classes has to be greater than'
' one; got %d class' % (n_classes))

# Mask mapping each class to its members.
self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
# Number of clusters in each class.
nk = np.zeros(n_classes)

for cur_class in range(n_classes):

# XXX: Update other averaging methods according to the metrics.
if self.metric == "manhattan":
# NumPy does not calculate median of sparse matrices.
else:
if self.metric != 'euclidean':
warnings.warn("Averaging for metrics other than "
"euclidean and manhattan not supported. "
"The average is set to be the mean."
)

if self.shrink_threshold:
dataset_centroid_ = np.mean(X, axis=0)

# m parameter for determining deviation
m = np.sqrt((1. / nk) - (1. / n_samples))
# Calculate deviation using the standard deviation of centroids.
variance = (X - self.centroids_[y_ind]) ** 2
variance = variance.sum(axis=0)
s = np.sqrt(variance / (n_samples - n_classes))
s += np.median(s)  # To deter outliers from affecting the results.
mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
ms = mm * s
deviation = ((self.centroids_ - dataset_centroid_) / ms)
# Soft thresholding: if the deviation crosses 0 during shrinking,
# it becomes zero.
signs = np.sign(deviation)
deviation = (np.abs(deviation) - self.shrink_threshold)
np.clip(deviation, 0, None, out=deviation)
deviation *= signs
# Now adjust the centroids using the deviation
msd = ms * deviation
self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
return self

def predict(self, X):
"""Perform classification on an array of test vectors X.

The predicted class C for each sample in X is returned.

Parameters
----------
X : array-like of shape (n_samples, n_features)

Returns
-------
C : ndarray of shape (n_samples,)

Notes
-----
If the metric constructor parameter is "precomputed", X is assumed to
be the distance matrix between the data to be predicted and
self.centroids_.
"""
idx = pairwise_distances(
X, self.centroids_, metric=self.metric).argmin(axis=1)
return self.classes_[idx]

knc_test.py

import numpy as np
from NearestCentroid import NearestCentroid

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print("[-0.8, -1] is classified as ", clf.predict(np.array([[-0.8, -1]])))

0 评论
0 收藏
0