# 聚类(Clustering)之DBSCAN

2022/04/22 11:47

DBSCAN算法实际上定义了核心样点(core sample)，其周围至少有min_samples个样点与其距离小于eps，这些样点称为核心样点的邻点，表明该核心样点所处的空间属于某个稠密区。通过找到一个核心样点a，然后考察其邻点是否有新的核心样点b，如果有，则继续考察b的邻点是否有新的核心样点c…… 如此往复，即可找到一系列核心样点及其邻点，从而形成一个类(cluster)。

DBSCAN算法是确定性的，意味着如果同一组采样点以同样的顺序输入，则聚类结果也是一样的。但如果输入顺序不同，则结果可能不同。

dbscan.py

import numpy as np

import warnings
import itertools
from scipy.spatial import distance
from functools import partial
from KMeans import euclidean_distances
#from kd_tree import KDTree
#from ball_tree import BallTree

# Helper functions - distance
PAIRWISE_DISTANCE_FUNCTIONS = {
# If updating this dictionary, update the doc in both distance_metrics()
# and also in pairwise_distances()!
#    'cityblock': manhattan_distances,
#    'cosine': cosine_distances,
'euclidean': euclidean_distances,
'l2': euclidean_distances,
#    'l1': manhattan_distances,
#    'manhattan': manhattan_distances,
'precomputed': None,  # HACK: precomputed is always allowed, never called
}

VALID_METRICS = dict(#ball_tree=BallTree.valid_metrics,
#kd_tree=KDTree.valid_metrics,
# The following list comes from the
# sklearn.metrics.pairwise doc string
brute=(list(PAIRWISE_DISTANCE_FUNCTIONS.keys()) +
['braycurtis', 'canberra', 'chebyshev',
'correlation', 'cosine', 'dice', 'hamming',
'jaccard', 'kulsinski', 'mahalanobis',
'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener',
'sokalsneath', 'sqeuclidean',
'yule', 'wminkowski']))

def _pairwise(X, Y, func, **kwds):
"""Break the pairwise matrix in n_jobs even slices
and compute them in parallel"""
if Y is None:
Y = X

return func(X, Y, **kwds)

def _pairwise_callable(X, Y, metric, **kwds):
"""Handle the callable case for pairwise_{distances,kernels}
"""
if X is Y:
# Only calculate metric for upper triangle
out = np.zeros((X.shape[0], Y.shape[0]), dtype='float')
iterator = itertools.combinations(range(X.shape[0]), 2)
for i, j in iterator:
out[i, j] = metric(X[i], Y[j], **kwds)

# Make symmetric
# NB: out += out.T will produce incorrect results
out = out + out.T

# Calculate diagonal
# NB: nonzero diagonals are allowed for both metrics and kernels
for i in range(X.shape[0]):
x = X[i]
out[i, i] = metric(x, x, **kwds)

else:
# Calculate all cells
out = np.empty((X.shape[0], Y.shape[0]), dtype='float')
iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
for i, j in iterator:
out[i, j] = metric(X[i], Y[j], **kwds)

return out

def pairwise_distances(X, Y=None, metric="euclidean", **kwds):
""" Compute the distance matrix from a vector array X and optional Y.

This method takes either a vector array or a distance matrix, and returns
a distance matrix. If the input is a vector array, the distances are
computed. If the input is a distances matrix, it is returned instead.

This method provides a safe way to take a distance matrix as input, while
preserving compatibility with many other algorithms that take a vector
array.

If Y is given (default is None), then the returned matrix is the pairwise
distance between the arrays from both X and Y.

Valid values for metric are:

- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan'].

- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.

Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
valid scipy.spatial.distance metrics), the scikit-learn implementation
will be used, which is faster and has support for sparse matrices (except
for 'cityblock'). For a verbose description of the metrics from
scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
function.

Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.

Y : array [n_samples_b, n_features], optional
An optional second feature array. Only allowed if metric != "precomputed".

metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by scipy.spatial.distance.pdist for its metric parameter, or
a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
If metric is "precomputed", X is assumed to be a distance matrix.
Alternatively, if metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays from X as input and return a value indicating
the distance between them.

**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.

Returns
-------
D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
A distance matrix D such that D_{i, j} is the distance between the
ith and jth vectors of the given matrix X, if Y is None.
If Y is not None, then D_{i, j} is the distance between the ith array
from X and the jth array from Y.

"""
if (metric not in VALID_METRICS['brute'] and
not callable(metric) and metric != "precomputed"):
raise ValueError("Unknown metric %s. "
"Valid metrics are %s, or 'precomputed', or a "
"callable" % (metric, VALID_METRICS))

if metric == "precomputed":
return X
elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
elif callable(metric):
func = partial(_pairwise_callable, metric=metric, **kwds)
else:
if X is Y:
return distance.squareform(distance.pdist(X, metric=metric,
**kwds))
func = partial(distance.cdist, metric=metric, **kwds)

return _pairwise(X, Y, func, **kwds)

class NeighborsBase:
"""Base class for nearest neighbors estimators."""

def __init__(self):
pass

algorithm='auto', leaf_size=30, metric='minkowski',
p=2, metric_params=None):

if metric_params is None:
metric_params = {}

self.n_neighbors = n_neighbors
self.algorithm = algorithm
self.leaf_size = leaf_size
self.metric = metric
self.metric_params = metric_params
self.p = p

if algorithm not in ['auto', 'brute',
'kd_tree', 'ball_tree']:
raise ValueError("unrecognized algorithm: '%s'" % algorithm)

if algorithm == 'auto':
if metric == 'precomputed':
alg_check = 'brute'
else:
alg_check = 'ball_tree'
else:
alg_check = algorithm

if callable(metric):
if algorithm == 'kd_tree':
# callable metric is only valid for brute force and ball_tree
raise ValueError(
"kd_tree algorithm does not support callable metric '%s'"
% metric)
elif metric not in VALID_METRICS[alg_check]:
raise ValueError("Metric '%s' not valid for algorithm '%s'"
% (metric, algorithm))

if self.metric_params is not None and 'p' in self.metric_params:
warnings.warn("Parameter p is found in metric_params. "
"The corresponding parameter from __init__ "
"is ignored.", SyntaxWarning, stacklevel=3)
effective_p = metric_params['p']
else:
effective_p = self.p

if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1:
raise ValueError("p must be greater than one for minkowski metric")

self._fit_X = None
self._tree = None
self._fit_method = None

def fit(self, X):
if self.metric_params is None:
self.effective_metric_params_ = {}
else:
self.effective_metric_params_ = self.metric_params.copy()

effective_p = self.effective_metric_params_.get('p', self.p)
if self.metric in ['wminkowski', 'minkowski']:
self.effective_metric_params_['p'] = effective_p

self.effective_metric_ = self.metric
# For minkowski distance, use more efficient methods where available
if self.metric == 'minkowski':
p = self.effective_metric_params_.pop('p', 2)
if p < 1:
raise ValueError("p must be greater than one "
"for minkowski metric")
elif p == 1:
self.effective_metric_ = 'manhattan'
elif p == 2:
self.effective_metric_ = 'euclidean'
elif p == np.inf:
self.effective_metric_ = 'chebyshev'
else:
self.effective_metric_params_['p'] = p

if isinstance(X, NeighborsBase):
self._fit_X = X._fit_X
self._tree = X._tree
self._fit_method = X._fit_method
return self

#        elif isinstance(X, BallTree):
#            self._fit_X = X.data
#            self._tree = X
#            self._fit_method = 'ball_tree'
#            return self

#        elif isinstance(X, KDTree):
#            self._fit_X = X.data
#            self._tree = X
#            self._fit_method = 'kd_tree'
#            return self

n_samples = X.shape[0]
if n_samples == 0:
raise ValueError("n_samples must be greater than 0")

self._fit_method = self.algorithm
self._fit_X = X

if self._fit_method == 'auto':
# A tree approach is better for small number of neighbors,
# and KDTree is generally faster when available
if ((self.n_neighbors is None or
self.n_neighbors < self._fit_X.shape[0] // 2) and
self.metric != 'precomputed'):
if self.effective_metric_ in VALID_METRICS['kd_tree']:
self._fit_method = 'kd_tree'
else:
self._fit_method = 'ball_tree'
else:
self._fit_method = 'brute'

#        if self._fit_method == 'ball_tree':
#            self._tree = BallTree(X, self.leaf_size,
#                                  metric=self.effective_metric_,
#                                  **self.effective_metric_params_)
#        elif self._fit_method == 'kd_tree':
#            self._tree = KDTree(X, self.leaf_size,
#                                metric=self.effective_metric_,
#                                **self.effective_metric_params_)
#        elif self._fit_method == 'brute':
if self._fit_method == 'brute':
self._tree = None
else:
raise ValueError("algorithm = '%s' not recognized"
% self.algorithm)

if self.n_neighbors is not None:
if self.n_neighbors <= 0:
raise ValueError(
"Expected n_neighbors > 0. Got %d" %
self.n_neighbors
)

return self

"""Finds the neighbors within a given radius of a point or points.

Return the indices and distances of each point from the dataset
lying in a ball with size radius around the points of the query
array. Points lying on the boundary are included in the results.

The result points are *not* necessarily sorted by distance to their
query point.

Parameters
----------
X : array-like, (n_samples, n_features), optional
The query point or points.
If not provided, neighbors of each indexed point are returned.
In this case, the query point is not considered its own neighbor.

Limiting distance of neighbors to return.
(default is the value passed to the constructor).

return_distance : boolean, optional. Defaults to True.
If False, distances will not be returned

Returns
-------
dist : array, shape (n_samples,) of arrays
Array representing the distances to each point, only present if
return_distance=True. The distance values are computed according
to the metric constructor parameter.

ind : array, shape (n_samples,) of arrays
An array of arrays of indices of the approximate nearest points
from the population matrix that lie within a ball of size
radius around the query points.

Examples
--------
In the following example, we construct a NeighborsClassifier
class from an array representing our data set and ask who's
the closest point to [1, 1, 1]:

>>> import numpy as np
>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
>>> from sklearn.neighbors import NearestNeighbors
>>> neigh.fit(samples) # doctest: +ELLIPSIS
NearestNeighbors(algorithm='auto', leaf_size=30, ...)
>>> rng = neigh.radius_neighbors([[1., 1., 1.]])
>>> print(np.asarray(rng[0][0])) # doctest: +ELLIPSIS
[ 1.5  0.5]
>>> print(np.asarray(rng[1][0])) # doctest: +ELLIPSIS
[1 2]

The first array returned contains the distances to all points which
are closer than 1.6, while the second array returned contains their
indices.  In general, multiple points can be queried at the same time.

Notes
-----
Because the number of neighbors of each point is not necessarily
equal, the results for multiple query points cannot be fit in a
standard data array.
For efficiency, radius_neighbors returns arrays of objects, where
each object is a 1D array of indices or distances.
"""
if X is not None:
query_is_train = False
else:
query_is_train = True
X = self._fit_X

n_samples = X.shape[0]
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
squared=True)
else:
dist = pairwise_distances(X, self._fit_X,
self.effective_metric_,
**self.effective_metric_params_)

neigh_ind_list = [np.where(d <= radius)[0] for d in dist]

# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
neigh_ind = np.empty(n_samples, dtype='object')
neigh_ind[:] = neigh_ind_list

if return_distance:
dist_array = np.empty(n_samples, dtype='object')
if self.effective_metric_ == 'euclidean':
dist_list = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist_list = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
dist_array[:] = dist_list

results = dist_array, neigh_ind
else:
results = neigh_ind

elif self._fit_method in ['ball_tree', 'kd_tree']:
return_distance=return_distance)
if return_distance:
results = results[::-1]
else:
raise ValueError("internal: _fit_method not recognized")

if not query_is_train:
return results
else:
# If the query data is the same as the indexed data, we would like
# to ignore the first nearest neighbor of every sample, i.e
# the sample itself.
if return_distance:
dist, neigh_ind = results
else:
neigh_ind = results

for ind, ind_neighbor in enumerate(neigh_ind):

if return_distance:

if return_distance:
return dist, neigh_ind
return neigh_ind

"""Unsupervised learner for implementing neighbor searches.

Parameters
----------
n_neighbors : int, optional (default = 5)
Number of neighbors to use by default for :meth:k_neighbors queries.

radius : float, optional (default = 1.0)
Range of parameter space to use by default for :methradius_neighbors
queries.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:

- 'ball_tree' will use :class:BallTree
- 'kd_tree' will use :class:KDtree
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:fit method.

Note: fitting on sparse input will override the setting of
this parameter, using brute force.

leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or KDTree.  This can affect the
speed of the construction and query, as well as the memory
required to store the tree.  The optimal value depends on the
nature of the problem.

p: integer, optional (default = 2)
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

metric : string or callable, default 'minkowski'
metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.

If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.

Distance matrices are not supported.

Valid values for metric are:

- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']

- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
'sqeuclidean', 'yule']

See the documentation for scipy.spatial.distance for details on these
metrics.

metric_params : dict, optional (default = None)
Additional keyword arguments for the metric function.

Examples
--------
>>> import numpy as np
>>> from sklearn.neighbors import NearestNeighbors
>>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]

>>> neigh = NearestNeighbors(2, 0.4)
>>> neigh.fit(samples)  #doctest: +ELLIPSIS
NearestNeighbors(...)

>>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
... #doctest: +ELLIPSIS
array([[2, 0]]...)

>>> nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
>>> np.asarray(nbrs[0][0])
array(2)

--------
KNeighborsClassifier
KNeighborsRegressor
BallTree

Notes
-----
See :ref:Nearest Neighbors <neighbors> in the online documentation
for a discussion of the choice of algorithm and leaf_size.

http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
"""

algorithm='auto', leaf_size=30, metric='minkowski',
p=2, metric_params=None):
self._init_params(n_neighbors=n_neighbors,
algorithm=algorithm,
leaf_size=leaf_size, metric=metric, p=p,
metric_params=metric_params)

def dbscan_inner(is_core,  neighborhoods, labels):
label_num = 0
stack = []

for i in range(labels.shape[0]):
if labels[i] != -1 or not is_core[i]:
continue

# Depth-first search starting from i, ending at the non-core points.
# This is very similar to the classic algorithm for computing connected
# components, the difference being that we label non-core points as
# part of a cluster (component), but don't expand their neighborhoods.
while True:
if labels[i] == -1:
labels[i] = label_num
if is_core[i]:
neighb = neighborhoods[i]
for j in range(neighb.shape[0]):
v = neighb[j]
if labels[v] == -1:
stack.append(v)

if len(stack) == 0:
break
i = stack.pop()

label_num += 1

def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
algorithm='auto', leaf_size=30, p=2, sample_weight=None):
"""Perform DBSCAN clustering from vector array or distance matrix.

Parameters
----------
X : array of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
metric='precomputed'.

eps : float, optional
The maximum distance between two samples for them to be considered
as in the same neighborhood.

min_samples : int, optional
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.

metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.pairwise_distances for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.

leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or KDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.

p : float, optional
The power of the Minkowski metric to be used to calculate distance
between points.

sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with a weight of at least
min_samples is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

Returns
-------
core_samples : array [n_core_samples]
Indices of core samples.

labels : array [n_samples]
Cluster labels for each point.  Noisy samples are given the label -1.

Notes
-----
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while original DBSCAN had memory complexity O(n).

References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
"""
if not eps > 0.0:
raise ValueError("eps must be positive.")

# Calculate neighborhood for all samples. This leaves the original point
# in, which needs to be considered later (i.e. point i is in the
# neighborhood of point i. While True, its useless information)
leaf_size=leaf_size,
metric=metric, p=p)
neighbors_model.fit(X)
# This has worst case O(n^2) memory complexity
return_distance=False)

if sample_weight is None:
n_neighbors = np.array([len(neighbors)
for neighbors in neighborhoods])
else:
n_neighbors = np.array([np.sum(sample_weight[neighbors])
for neighbors in neighborhoods])

# Initially, all samples are noise.
labels = -np.ones(X.shape[0], dtype=np.intp)

# A list of all core samples found.
core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
dbscan_inner(core_samples, neighborhoods, labels)
return np.where(core_samples)[0], labels

class DBSCAN:
"""Perform DBSCAN clustering from vector array or distance matrix.

DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
Finds core samples of high density and expands clusters from them.
Good for data which contains clusters of similar density.

Parameters
----------
eps : float, optional
The maximum distance between two samples for them to be considered
as in the same neighborhood.
min_samples : int, optional
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.calculate_distance for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.
leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or KDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.

Attributes
----------
core_sample_indices_ : array, shape = [n_core_samples]
Indices of core samples.

components_ : array, shape = [n_core_samples, n_features]
Copy of each core sample found by training.

labels_ : array, shape = [n_samples]
Cluster labels for each point in the dataset given to fit().
Noisy samples are given the label -1.

Notes
-----
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while original DBSCAN had memory complexity O(n).

References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
"""

def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
algorithm='brute', leaf_size=30, p=None):
self.eps = eps
self.min_samples = min_samples
self.metric = metric
self.algorithm = algorithm
self.leaf_size = leaf_size
self.p = p

def fit(self, X, y=None, sample_weight=None):
"""Perform DBSCAN clustering from features or distance matrix.

Parameters
----------
X : array of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
metric='precomputed'.
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with a weight of at least
min_samples is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
"""
clust = dbscan(X, sample_weight=sample_weight,
eps=self.eps, min_samples=self.min_samples,
metric=self.metric, algorithm=self.algorithm,
leaf_size=self.leaf_size, p=self.p)
self.core_sample_indices_, self.labels_ = clust
if len(self.core_sample_indices_):
# fix for scipy sparse indexing issue
self.components_ = X[self.core_sample_indices_].copy()
else:
# no core samples
self.components_ = np.empty((0, X.shape[1]))
return self

def fit_predict(self, X, y=None, sample_weight=None):
"""Performs clustering on X and returns cluster labels.

Parameters
----------
X : array of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
metric='precomputed'.
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with a weight of at least
min_samples is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

Returns
-------
y : ndarray, shape (n_samples,)
cluster labels
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_

dbscan_test.py

import numpy as np

from dbscan import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
n_samples=750, centers=centers, cluster_std=0.4, random_state=0
)

X = StandardScaler().fit_transform(X)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print(
)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]

plt.plot(
xy[:, 0],
xy[:, 1],
"o",
markerfacecolor=tuple(col),
markeredgecolor="k",
markersize=14,
)

plt.plot(
xy[:, 0],
xy[:, 1],
"o",
markerfacecolor=tuple(col),
markeredgecolor="k",
markersize=6,
)

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()

test.py

import time

import numpy as np
import matplotlib.pyplot as plt

from KMeans import MiniBatchKMeans
from gmm import GMM
from dbscan import DBSCAN
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

clustering_names = [
'MiniBatchKMeans', 'DBSCAN', 'GaussianMixture']

plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
hspace=.01)

plot_num = 1

datasets = [noisy_circles, noisy_moons, blobs, no_structure]
for i_dataset, dataset in enumerate(datasets):
X, y = dataset
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)

# create clustering estimators
two_means = MiniBatchKMeans(n_clusters=2)
dbscan = DBSCAN(eps=.2)
gmm = GMM(
n_components=2, covariance_type="full"
)

clustering_algorithms = [two_means, dbscan, gmm]

for name, algorithm in zip(clustering_names, clustering_algorithms):
# predict cluster memberships
t0 = time.time()
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
y_pred = algorithm.labels_.astype(np.int)
else:
y_pred = algorithm.predict(X)

# plot
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

if hasattr(algorithm, 'cluster_centers_'):
centers = algorithm.cluster_centers_
center_colors = colors[:len(centers)]
plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
plt.xlim(-2, 2)
plt.ylim(-2, 2)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1

plt.show()

0 评论
0 收藏
0