1.我们后期补充数学推导

我们算法的原理如下:

1
对于一批没有任何标签的样本点,我们先随机选取 k 个样本点作为初始质心;然后对每个样本点计算它到各个质心的距离,并把它归入距离最近的质心所在的类,这样就得到了一个初始的分类。接着对每个类重新计算质心(取该类内所有样本的均值),再用新的质心对所有样本重新计算距离并重新分类。如此迭代多次(或直到质心不再变化),就完成了聚类。

我们来实现一下伪代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
init_center[] = select_random_sample_point(k)#所以为k-means算法
for i in range(dataset):
    for j in range(init_center):
        sample_to_center_distances[i][j] = calculate_distance(dataset[i], init_center[j])
    sample_belongs_to_center[i] = argmin(sample_to_center_distances[i])#取距离最小的那个质心的下标

int class[][]
#将每个样本点归类
for j in range(init_center):
    k=0#每个类单独计数
    for i in range(dataset):
        if sample_belongs_to_center[i] == j:
            class[j][k] = dataset[i]
            k++
        else:
            continue
#最后重新计算质心
for i in range(class):
    center[i] = sum(class[i]) / number of samples in class[i]

2.算法实施如下

这里我们取了3个质心,并且默认样本维度为2。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy as np
import random

class Kmeans:
    """Minimal k-means clustering helper for small numeric data sets.

    The instance loads its samples from a text file on construction and
    exposes the individual k-means steps (centroid initialisation, sample
    assignment, grouping, centroid update) as separate methods so that a
    driver loop can chain them.
    """

    def __init__(self, filename="kmeans.txt", k=3):
        """Load the samples from *filename* and remember the cluster count *k*.

        Both parameters are optional; the defaults reproduce the original
        hard-coded behaviour (``kmeans.txt`` and 3 clusters).
        """
        self.dataSet = self.load_dataSet(filename)
        self.k = k

    def load_dataSet(self, filename="kmeans.txt"):
        """Read a whitespace-separated text file into an ``(n, 2)`` float array.

        Only the first two columns of every line are used. Blank lines are
        skipped, and any run of whitespace (spaces or tabs) separates columns.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a column is not a valid float.
            IndexError: if a non-blank line has fewer than two columns.
        """
        dataSet = []
        # The context manager guarantees the handle is closed even if parsing
        # fails half-way through the file.
        with open(filename) as file:
            for line in file:
                lineArray = line.split()
                if not lineArray:
                    continue
                dataSet.append([float(lineArray[0]), float(lineArray[1])])
        # reshape keeps the result two-dimensional even for an empty file.
        dataSet = np.array(dataSet, dtype=np.float64).reshape(-1, 2)
        print("初始化数据:", dataSet)
        return dataSet

    def exam_equal(self, center, dataSet):
        """Replace duplicated centroids so that all rows of *center* differ.

        A centroid equal to an earlier one is overwritten, in place, with a
        randomly chosen sample that differs from every current centroid. If
        *dataSet* has no such sample left (fewer distinct samples than
        centroids) the duplicate is left unchanged.

        Returns:
            The same *center* array, after the in-place replacements.
        """
        for j in range(1, len(center)):
            duplicate_of = next(
                (i for i in range(j) if np.array_equal(center[i], center[j])),
                None,
            )
            if duplicate_of is None:
                continue
            print("元素", center[duplicate_of], "和元素", center[j], "相等")
            # Only samples that collide with no current centroid are eligible,
            # so a replacement can never introduce a new duplicate.
            candidates = [
                row for row in dataSet
                if not any(np.array_equal(row, c) for c in center)
            ]
            if candidates:
                center[j] = random.choice(candidates)
        return center

    def init_center(self, dataSet, k):
        """Pick *k* initial centroids at random from the rows of *dataSet*.

        Returns:
            A ``(k, n_features)`` float64 array of centroids.

        Raises:
            ValueError: if *dataSet* contains no samples.
        """
        number_of_samples = dataSet.shape[0]
        column_feature = dataSet.shape[1]
        print(f"输入数据格式为{number_of_samples}x{column_feature}")
        if number_of_samples == 0:
            raise ValueError("cannot initialise centroids from an empty data set")
        center = np.zeros((k, column_feature), dtype=np.float64)
        for i in range(k):
            center[i] = dataSet[random.randint(0, number_of_samples - 1)]
        # Independent draws may hit the same sample twice; de-duplicate.
        center = self.exam_equal(center, dataSet)
        print("初始化中心矩阵:", center)
        return center

    def distance(self, x1, x2):
        """Return the Euclidean distance between the points *x1* and *x2*."""
        squared_sum = np.sum(np.square(x1 - x2))
        return np.sqrt(squared_sum)

    def cluster(self, dataSet, center):
        """Assign every sample to its nearest centroid.

        Returns:
            An int32 array with one entry per sample holding the 0-based
            index of the closest row in *center*.
        """
        sample_corresp_class = np.zeros(dataSet.shape[0], dtype=np.int32)
        for sample_idx, sample in enumerate(dataSet):
            sample_to_center_distance = np.array(
                [self.distance(sample, centroid) for centroid in center],
                dtype=np.float64,
            )
            # argmin returns the first NaN it meets, so an undefined (NaN)
            # centroid would otherwise attract every sample; treat it as
            # infinitely far away instead.
            sample_to_center_distance[np.isnan(sample_to_center_distance)] = np.inf
            sample_corresp_class[sample_idx] = np.argmin(sample_to_center_distance)
        return sample_corresp_class

    def class_center(self, sample_corresp_class, center, dataSet):
        """Group the samples by the cluster index assigned to them.

        *center* is only used for its length (the number of clusters).

        Returns:
            A list with one list of samples per cluster; a cluster that
            received no sample is an empty list.
        """
        center_class = [[] for _ in range(len(center))]
        for sample_idx, cluster_idx in enumerate(sample_corresp_class):
            center_class[cluster_idx].append(dataSet[sample_idx])
        print("归类数组:", center_class)
        return center_class

    def recalcu_center(self, center_class, previous_center=None):
        """Compute the centroid (mean) of every cluster.

        Args:
            center_class: list of clusters as returned by ``class_center``.
            previous_center: optional centroids of the previous round. When
                given, an empty cluster keeps its previous centroid;
                otherwise its row is filled with NaN.

        Returns:
            A ``(n_clusters, n_features)`` float64 array.

        Raises:
            ValueError: if every cluster is empty and *previous_center* is
                not given, so the feature dimension cannot be determined.
        """
        # Take the dimension from the first non-empty cluster rather than
        # assuming cluster 0 has members.
        dimension = next(
            (len(members[0]) for members in center_class if len(members) > 0),
            None,
        )
        if dimension is None:
            if previous_center is None:
                raise ValueError("cannot recompute centroids: every cluster is empty")
            dimension = np.asarray(previous_center).shape[1]
        center = np.full((len(center_class), dimension), np.nan, dtype=np.float64)
        for idx, members in enumerate(center_class):
            if len(members) > 0:
                center[idx] = np.mean(members, axis=0)
            elif previous_center is not None:
                center[idx] = previous_center[idx]
        print("不同集群质心坐标:", center)
        return center

# Script entry point: load the samples, pick random initial centroids and
# refine them with at most five k-means rounds.
if __name__ == '__main__':
    kmeans = Kmeans()
    center = kmeans.init_center(kmeans.dataSet, kmeans.k)
    for _ in range(5):
        sample_corresp_class = kmeans.cluster(kmeans.dataSet, center)
        center_class = kmeans.class_center(sample_corresp_class, center, kmeans.dataSet)
        new_center = kmeans.recalcu_center(center_class)
        # A cluster that received no sample has a NaN mean; keep its previous
        # centroid so the NaN does not propagate into the next round.
        new_center = np.where(np.isnan(new_center), center, new_center)
        # Stop early once the centroids no longer move: further rounds would
        # reproduce exactly the same assignment.
        if np.array_equal(new_center, center):
            break
        center = new_center
    # TODO: add a plot of the resulting clusters.

我们所取用的数据文件

1
2
3
4
5
6
7
8
9
10
11
12
-1.1 1.234
-2.2 2.256
-3.3 3.379
-4.4 4.683
1.1 1.345
2.3 2.234
3.5 3.652
4.11 4.254
1.34 -1.242
2.21 -2.222
3.012 -3.011
3.432 -3.624