# 最简单的聚类——胜者为王

2022/04/24 17:21

1. 找到n个初始点作为中心点；

2. 计算每个采样点和这n个中心点的距离，并把该采样点归属到距离最短的中心点ci代表的类；

3. 将ci向着该采样点移动一个距离，移动的距离正比于两者之间的距离。

import numpy
from matplotlib import pyplot
from sklearn.metrics import silhouette_score

def distance(x, y):
    """Return the Euclidean distance between vectors x and y."""
    return numpy.linalg.norm(x - y)

class SimpleClustering:
    """Winner-take-all (competitive learning) clustering.

    Each sample is assigned to its nearest center, and that winning center
    is immediately pulled toward the sample by a fraction (learning_rate)
    of the remaining gap. This is an online variant of k-means.
    """

    def __init__(self, n_cluster, learning_rate=0.1, n_iter=50):
        # n_cluster: number of cluster centers to fit
        # learning_rate: fraction of the center-to-sample gap moved per update
        # n_iter: number of full passes over the data set
        self.n_cluster = n_cluster
        self.learning_rate = learning_rate
        self.n_iter = n_iter

    # Evenly space the initial centers in the data's parameter space.
    def _init_uniform(self, X, n_samples, n_features):
        self.centers = numpy.zeros((self.n_cluster, n_features))
        max_value = numpy.amax(X, axis=0)
        min_value = numpy.amin(X, axis=0)
        if self.n_cluster == 1:
            # Single center: use the midpoint. (The original divided by
            # n_cluster - 1, which is a division by zero in this case.)
            self.centers[0] = (min_value + max_value) / 2.0
            return
        step = (max_value - min_value) / (self.n_cluster - 1)
        for i in range(self.n_cluster):
            self.centers[i] = min_value + i * step

    # Pick random samples as the initial centers.
    def _init_randint(self, X, n_samples, n_features):
        # randint's upper bound is exclusive, so valid indices are
        # [0, n_samples). The original passed n_samples + 1, which could
        # produce the out-of-range index n_samples and raise IndexError.
        center_indices = numpy.random.randint(0, n_samples, self.n_cluster)
        self.centers = numpy.zeros((self.n_cluster, n_features))
        for i in range(self.n_cluster):
            self.centers[i] = X[center_indices[i]].copy()

    # Draw the initial centers uniformly at random from the data's bounding box.
    def _init_random(self, X, n_samples, n_features):
        rand_fac = numpy.random.rand(self.n_cluster, n_features)
        self.centers = numpy.zeros((self.n_cluster, n_features))
        max_value = numpy.amax(X, axis=0)
        min_value = numpy.amin(X, axis=0)
        step = max_value - min_value
        for i in range(self.n_cluster):
            self.centers[i] = min_value + rand_fac[i] * step

    def fit(self, X):
        """Fit the centers with online winner-take-all updates.

        X: array of shape (n_samples, n_features).
        Sets self.centers (n_cluster, n_features) and self.labels (n_samples,).
        """
        n_samples = X.shape[0]
        n_features = X.shape[1]
        self._init_uniform(X, n_samples, n_features)
        self.labels = numpy.zeros(n_samples, dtype=numpy.int64)
        dist = numpy.zeros(self.n_cluster)
        for _ in range(self.n_iter):  # iterate over the data n_iter times
            for sample in range(n_samples):
                # Assign the sample to its nearest center ...
                for idx in range(self.n_cluster):
                    dist[idx] = distance(X[sample], self.centers[idx])
                winner = numpy.argmin(dist)
                self.labels[sample] = winner
                # ... then move that center toward the sample, proportionally
                # to the remaining distance.
                self.centers[winner] += self.learning_rate \
                    * (X[sample] - self.centers[winner])

    def predict(self, X):
        """Assign each sample in X to its nearest fitted center.

        Returns (inertia, labels) where inertia is the sum of squared
        distances from each sample to its assigned center.
        """
        n_samples = X.shape[0]
        inertia = 0.0
        # Bug fix: the original referenced the undefined alias `np`;
        # this module imports numpy un-aliased.
        self.labels = numpy.zeros(n_samples, dtype=numpy.int64)

        for sample_idx in range(n_samples):
            min_dist = -1.0  # sentinel: no distance computed yet
            for center_idx in range(self.n_cluster):
                diff = X[sample_idx] - self.centers[center_idx]
                dist = numpy.dot(diff, diff)
                if min_dist < 0 or dist < min_dist:
                    min_dist = dist
                    self.labels[sample_idx] = center_idx

            inertia += min_dist

        return (inertia, self.labels)

if __name__ == "__main__":
    # Three well-separated groups of 2-D points (height/weight-like data).
    samples = [
        [130.0, 38], [130.5, 36], [131.0, 35], [131.5, 33], [132.0, 36],
        [132.5, 38], [133.0, 32], [133.5, 34], [134.0, 37], [134.5, 35],
        [150.0, 48], [150.5, 49], [151.0, 46], [151.5, 52], [152.0, 48],
        [152.5, 50], [153.0, 51], [153.5, 53], [154.0, 49], [154.5, 52],
        [170.0, 65], [170.5, 62], [171.0, 64], [171.5, 70], [172.0, 66],
        [172.5, 71], [173.0, 68], [173.5, 72], [174.0, 69], [174.5, 63],
    ]
    X = numpy.array(samples)

    model = SimpleClustering(n_cluster=3)
    model.fit(X)
    print(model.centers)
    print(model.labels)

    # Evaluate cluster quality with the silhouette coefficient.
    score = silhouette_score(X, model.labels)
    print("score: ", score)

    pyplot.scatter(X[:, 0], X[:, 1], c=model.labels)
    pyplot.show()

1. K-means存在的问题，该方法依然存在：即只能处理凸数据的聚类，且维数太高之后欧氏距离会变得太大；

2. 代码中实现了三种初始化方法，可以根据情况使用。但数据并未做处理，如果某一维很大而另一维很小，则可能效果不好。需要对数据进行归一化后再聚类，效果可能会得到改善。

0 评论
0 收藏
0