# 菜鸟之路——机器学习之Kmeans聚类个人理解及Python实现

K-means 聚类的基本思路是：先选出 K 个中心点，然后反复迭代更新它们的位置。每一轮中，样本离哪个中心点最近就归入哪一类；分配完成后，再把每一类所有样本的均值作为该类新的中心点。当中心点不再变化或达到最大迭代次数时停止。

```python
import numpy as np

def kmeans(X, k, maxIt):
    """Cluster the rows of ``X`` into ``k`` groups with the k-means algorithm.

    Args:
        X: 2-D array-like of shape (numPoints, numDim); one sample per row.
        k: Number of clusters. Must satisfy ``1 <= k <= numPoints``.
        maxIt: Iteration budget handed to ``SholdStop``.

    Returns:
        A float array of shape (numPoints, numDim + 1): the samples from
        ``X`` with one extra last column holding each sample's cluster
        label (1..k).

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples.
    """
    X = np.asarray(X, dtype=float)
    numPoints, numDim = X.shape
    if not 1 <= k <= numPoints:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (numPoints, k)
        )

    # Working copy of the samples with one extra column for the labels.
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X

    # Sample *without* replacement: drawing the same row twice would start
    # two identical centroids, leaving one of them with an empty cluster.
    # Fancy indexing returns a copy, so editing centroids leaves dataSet intact.
    centroids = dataSet[np.random.choice(numPoints, size=k, replace=False)]
    # The last column of each centroid stores its cluster label, 1..k.
    centroids[:, -1] = np.arange(1, k + 1)

    iterations = 0
    oldCentroids = None

    while not SholdStop(oldCentroids, centroids, iterations, maxIt):
        print("iterations:", iterations)
        print("centroids:", centroids)
        # A real copy, not an alias, so the next comparison sees the old values.
        oldCentroids = np.copy(centroids)
        iterations += 1
        updateLabels(dataSet, centroids)      # refresh the label column
        centroids = getCentroids(dataSet, k)  # recompute centres from labels
        # A cluster can still end up empty (e.g. duplicate samples chosen as
        # seeds); its mean is NaN. Keep the previous centroid for such
        # clusters so NaN does not poison the convergence check.
        emptyClusters = np.isnan(centroids[:, :-1]).any(axis=1)
        centroids[emptyClusters] = oldCentroids[emptyClusters]
    return dataSet
22
def SholdStop(oldCentroids, centroids, iterations, maxIt):
    """Decide whether the k-means loop should terminate.

    Args:
        oldCentroids: Centroid array from before the latest update, or
            ``None`` when no update has happened yet.
        centroids: Current centroid array.
        iterations: Number of iterations completed so far.
        maxIt: Maximum number of iterations allowed.

    Returns:
        True if ``maxIt`` iterations have been completed or the centroids
        did not change in the latest update; False otherwise.
    """
    # ">=" rather than ">": once `iterations` reaches `maxIt` the budget is
    # spent; a strict ">" would allow one extra pass.
    if iterations >= maxIt:
        return True
    # Nothing to compare against before the first update.
    if oldCentroids is None:
        return False
    return bool(np.array_equal(oldCentroids, centroids))
27
def updateLabels(dataSet, centroids):
    """Assign every sample the label of its nearest centroid, in place.

    Args:
        dataSet: 2-D NumPy array; each row holds a sample's features
            followed by its label in the last column. The last column is
            overwritten.
        centroids: 2-D NumPy array in the same layout (features, then the
            centroid's label in the last column).
    """
    # Iterating a 2-D ndarray yields row views, so writing row[-1] updates
    # dataSet itself.
    for row in dataSet:
        row[-1] = getLabelFromClosestCentroid(row[:-1], centroids)
32
def getLabelFromClosestCentroid(dataSetRow, centroids):
    """Return the label of the centroid nearest to one sample.

    Args:
        dataSetRow: 1-D array with the sample's features (no label column).
        centroids: 2-D array; each row holds a centroid's features followed
            by its label in the last column.

    Returns:
        The label (last-column value) of the centroid with the smallest
        Euclidean distance to ``dataSetRow``. On a tie the centroid that
        appears first in ``centroids`` wins.
    """
    # One vectorised pass: distance from the sample to every centroid.
    distances = np.linalg.norm(centroids[:, :-1] - dataSetRow, axis=1)
    # argmin returns the first minimum, which matches a strict "<" scan.
    closest = np.argmin(distances)
    print("minDist", distances[closest])
    return centroids[closest, -1]
43
def getCentroids(dataSet, k):
    """Recompute the k centroids as the mean of each cluster's samples.

    Args:
        dataSet: 2-D array; each row holds a sample's features followed by
            its cluster label (1..k) in the last column.
        k: Number of clusters.

    Returns:
        Array of shape (k, dataSet.shape[1]). Row ``i - 1`` holds the
        column-wise mean of the samples labelled ``i``, with ``i`` itself in
        the last column. A cluster with no samples gets NaN features, so
        the caller can detect it.
    """
    # Start from NaN so an empty cluster is reported explicitly instead of
    # via np.mean's "Mean of empty slice" RuntimeWarning.
    result = np.full((k, dataSet.shape[1]), np.nan)
    result[:, -1] = np.arange(1, k + 1)
    for label in range(1, k + 1):
        members = dataSet[dataSet[:, -1] == label, :-1]
        if members.shape[0]:
            result[label - 1, :-1] = members.mean(axis=0)  # column-wise mean
    return result
51
52
# Four 2-D sample points used as a small demonstration input.
x1 = np.array([1, 1])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
# Stack the points row-wise into one matrix. The original stacked x3 twice
# and never used x2.
testX = np.vstack((x1, x2, x3, x4))

result = kmeans(testX, 2, 10)
print("result",result)
```