

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
def GetDistance(v1, v2):
    """Return the Euclidean distance between two equally-shaped vectors.

    Args:
        v1: Array-like of numbers (NumPy array, list, tuple, ...).
        v2: Array-like of numbers broadcastable against ``v1``.

    Returns:
        The square root of the summed squared element-wise differences,
        as a NumPy float.
    """
    # Convert to float first: this accepts plain sequences and prevents
    # wrap-around when the inputs are unsigned-integer arrays.
    delta = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return np.sqrt(np.sum(np.square(delta)))
def kmeans(data, k):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Starting from ``k`` distinct, randomly chosen rows as centroids, the
    function alternates between assigning every sample to its nearest
    centroid (Euclidean distance, ties go to the lowest cluster index) and
    moving each centroid to the mean of its members, until an assignment
    pass changes nothing. The result is plotted with ``ShowCluster`` before
    returning.

    Args:
        data: 2-D array-like of shape (nSamples, dim); converted to float.
        k: Number of clusters, ``1 <= k <= nSamples``.

    Returns:
        A tuple ``(Centroids, ClusterData)`` where ``Centroids`` is a
        (k, dim) float array and ``ClusterData`` is an (nSamples,) integer
        array holding the cluster index of each sample.

    Raises:
        ValueError: If ``data`` is not 2-D or ``k`` is outside
            ``1..nSamples``.
    """
    # Work in float so integer input does not truncate the centroid means.
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (nSamples, dim)")
    nSamples, dim = data.shape
    if not 1 <= k <= nSamples:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (nSamples, k)
        )

    # Sample without replacement: duplicate seeds would leave one cluster
    # empty from the start and turn its centroid into NaN.
    seed_rows = np.random.choice(nSamples, size=k, replace=False)
    Centroids = data[seed_rows, :]  # fancy indexing already yields a copy

    # -1 marks "not assigned yet", so the first pass always registers as a
    # change and the centroids are updated at least once (a zero-initialised
    # label array made the loop exit immediately when everything landed in
    # cluster 0, e.g. for k == 1).
    ClusterData = np.full(nSamples, -1, dtype=int)

    while True:
        # (nSamples, k) matrix of sample-to-centroid distances.
        offsets = data[:, np.newaxis, :] - Centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(np.square(offsets), axis=2))
        # argmin returns the first minimum, matching lowest-index tie-breaks.
        nearest = np.argmin(distances, axis=1)

        if np.array_equal(nearest, ClusterData):
            # Centroids already equal the means of this assignment.
            break
        ClusterData = nearest

        for j in range(k):
            members = data[ClusterData == j]
            # Keep the previous centroid when a cluster lost all its
            # members; the mean of an empty set would be NaN.
            if len(members) > 0:
                Centroids[j] = members.mean(axis=0)

    # NOTE(review): plotting is assumed to happen once after convergence,
    # not on every iteration - confirm against the intended behaviour.
    ShowCluster(data, Centroids, ClusterData, k)
    return Centroids, ClusterData
def ShowCluster(data, Centroids, ClusterData, k):
    """Plot the first two columns of ``data`` styled by cluster label.

    Every sample is drawn with the marker assigned to its cluster and each
    of the ``k`` centroids is overlaid with a larger marker. The function
    only adds to the current matplotlib figure; it does not call
    ``plt.show()``.

    Args:
        data: 2-D array-like; columns 0 and 1 are used as x and y.
        Centroids: Array of shape (k, >=2) with the centroid coordinates.
        ClusterData: Integer cluster label of each row of ``data``.
        k: Number of clusters; at most 10 marker styles are available.

    Returns:
        0 if ``k`` exceeds the number of available marker styles (nothing
        is plotted), otherwise None.
    """
    markdata = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    # Entry 4 used to repeat '*b' from entry 1, which made centroids 1 and 4
    # indistinguishable; '^b' follows the same-shape/blue pattern of the
    # entries after it.
    markcentroid = ['*r', '*b', '*g', '*k', '^b', '+b', 'sb', 'db', '<b', 'pb']
    if k > min(len(markdata), len(markcentroid)):
        print("类别超出索引")
        return 0

    points = np.asarray(data)
    labels = np.asarray(ClusterData)

    # One plot call per cluster instead of one per sample: same markers,
    # far fewer matplotlib artists.
    for label in np.unique(labels):
        members = points[labels == label]
        plt.plot(members[:, 0], members[:, 1], markdata[int(label)])

    # Centroids are drawn last so they sit on top of the samples.
    for j in range(k):
        plt.plot(Centroids[j, 0], Centroids[j, 1], markcentroid[j], markersize=20)
if __name__ == "__main__":
    # Raw string: the Windows path contains backslash sequences (\Z, \s, \P,
    # \k) that are invalid escapes in a normal literal and trigger a
    # warning on current Python versions. The resulting value is unchanged.
    filepath = r"D:\ZS\study\PythonWorkPlace\聚类\kmeans.txt"
    # Space-separated numeric table, one sample per line.
    data = np.genfromtxt(filepath, delimiter=" ")
    Centroids, clusterData = kmeans(data, 5)
    # kmeans only draws onto the current figure; display it explicitly so
    # the script produces visible output outside interactive environments.
    plt.show()