k-均值聚类算法;二分k均值聚类算法

时间:2022-02-23 22:05:46

根据《机器学习实战》一书第十章学习k均值聚类算法和二分k均值聚类算法,自己把代码边敲边理解了一下,修正了一些原书中代码的细微差错。目前代码有时会出现如下4种报错信息,这有待继续探究和完善。

报错信息:

Warning (from warnings module):
  File "F:\Python2.7.6\lib\site-packages\numpy\core\_methods.py", line 55
    warnings.warn("Mean of empty slice.", RuntimeWarning)
RuntimeWarning: Mean of empty slice.

Warning (from warnings module):
  File "F:\Python2.7.6\lib\site-packages\numpy\core\_methods.py", line 65
    ret, rcount, out=ret, casting='unsafe', subok=False)
RuntimeWarning: invalid value encountered in true_divide

Warning (from warnings module):
  File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 18
    return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
RuntimeWarning: invalid value encountered in power

Traceback (most recent call last):
  File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 147, in <module>
    centList,myNewAssments = biKmeans(newDataMat,i)
  File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 62, in biKmeans
    centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
  File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 33, in kMeans
    centroids = createCent(dataSet, k)
  File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 24, in randCent
    minJ = min(dataSet[:,j])
ValueError: min() arg is an empty sequence

k均值聚类算法、二分k均值聚类算法的python代码如下:

# -*- coding: utf-8 -*-

'''
kMeans聚类算法
基本思想:
基于距离和预定义好的类簇数目k,
首先,随机选定k个初始类簇中心(不同的类簇中心会导致收敛速度和聚类结果有差别,有可能会陷入局部最优.)
其次,计算每个点到每个类簇中心的距离,并将其分配到最近的类簇中
第三,重新计算每个类簇的中心
第四,重复第二步和第三步直到类簇中心不再发生变化,聚类停止

二分k-均值聚类算法
基本思想
该算法首先将所有点作为一个簇,然后将该簇一分为二.之后选择其中一个簇继续进行划分,
选择哪一个簇进行划分取决于对"其划分是否可以最大程度降低SSE的值.上述基于SSE的划分过程不断重复,
直到得到用户指定的簇数目为止.

本代码对原书代码稍作微调,对有的小细节错误进行改正

Author: <jianzhang.zhang@foxmail.com>
Data  : 2016-06-11
'''


import pickle,matplotlib
import matplotlib.pyplot as plt
from numpy import *
from pprint import pprint




def loadDataSet(filename):
    '''
    从文件中加载数据矩阵
    param filename: 保存数据矩阵的文件名 str
    return dataSet: 数据矩阵   [[],[],...]  list(list)
    '''
    dataMat = []
    with open(filename) as f:
        l = f.readlines()
    for line in l:
        curLine = line.strip().split('\t')
        fltLine = map(float,curLine)
        dataMat.append(fltLine)
    return dataMat


def distEclud(vecA,vecB):
    '''
    计算两个向量的欧氏距离
    param vecA,vecB: 两个待计算距离的向量  numpy.ndarray
    return : 两个向量的欧氏距离
    '''
    return sqrt(sum(power(vecA - vecB, 2)))

def randCent(dataSet,k):
    '''
    随机挑选k个初始簇中心
    param dataSet: 数据矩阵
    param k: 簇数
    return : k个初始簇中心
    '''
    n = shape(dataSet)[1]
    centroids = mat(zeros((k,n)))
    for j in range(n):
        try:
            minJ = min(dataSet[:,j])
        except:
            print dataSet
        maxJ = max(dataSet[:,j])
        rangeJ = float(maxJ - minJ)
        centroids[:,j] = minJ + rangeJ * random.rand(k,1)
    return centroids

def kMeans(dataSet,k,distMeas = distEclud,createCent = randCent):
    '''
    K-均值聚类算法
    param dataSet: 数据集  numpy.matrix
    param k: 类别数        int
    param distMeas: 计算距离的方法,默认为欧氏距离  function
    param createCent: 产生初始簇中心的方式,默认是随机生成 function
    return centroids: 聚类结果的类簇中心 numpy.matrix
    return clusterAssment: 每条记录的聚类结果 numpy.matrix matrix([[clusterTag,欧氏距离平方],
                                                                [clusterTag,欧氏距离平方],...])
    '''
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    centroids = createCent(dataSet,k)
    clusterChanged = True
    # 记录迭代次数
    iteration = 1
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j,:],dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI;minIndex = j
            # 判断所属类别是否变化,进而判断是否停止类中心变换,只有每个点的类别均不再发生变化,才认为聚类可以终止
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True
            clusterAssment[i,:] = minIndex,minDist**2
##        print "Iter:",iteration
##        iteration += 1
##        print centroids
        for cent in range(k):
            '''mean(matrix,axis = 0)表示沿矩阵列方向进行均值计算
               clusterAssment[:,0] == cent返回bool型矩阵
               nonzero(clusterAssment[:,0] == cent)返回True值的坐标(array(x1,x2,..),array(y1,y2,...)),(第一维坐标,第二维坐标)
               nonzero(clusterAssment[:,0] == cent)[0]返回第一维坐标array(x1,x2,..)
            '''
            ptsInClust = dataSet[nonzero(clusterAssment[:,0].A == cent)[0]]
            centroids[cent,:] = mean(ptsInClust,axis = 0)
    return centroids,clusterAssment

def biKmeans(dataSet,k,distMeas = distEclud):
    '''
    二分K-均值聚类算法,基本思路:
    param dataSet: 数据集  numpy.matrix
    param k: 类簇数目
    param distMeas: 计算距离的方法,默认为欧氏距离  function
    return mat(centList): 聚类结果的类簇中心 numpy.matrix
    return clusterAssment: 每条记录的聚类结果 numpy.matrix matrix([[clusterTag,欧氏距离平方],
                                                                [clusterTag,欧氏距离平方],...])
    '''
    m = shape(dataSet)[0] # 记录数
    clusterAssment = mat(zeros((m,2))) # 初始化聚类结果为零矩阵
    centroid0 = mean(dataSet,axis = 0).tolist()[0] # 初始化第一个类簇中心,全部记录的中心,结果是一个列表
    centList = [centroid0] # centList用来保存类簇中心,首先将初始化类簇中心加入其中,centList的长度亦即类簇数目
    # 计算每一条记录到初始类簇中心的欧氏距离平方
    for j in range(m): 
        clusterAssment[j,1] = distMeas(dataSet[j,:],mat(centroid0))**2        
    while (len(centList) < k): # 当聚类数目达到k时停止
        lowestSSE = inf # 初始化误差平方和为正无穷
        # 尝试划分已有的每一个簇,寻找使得SSE降幅最大的那个簇,然后对其进行2-Means聚类划分
        for i in range(len(centList)):
            # 取出属于类簇i的记录
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
            # 对类簇i进行2-Means聚类划分
            centroidMat,splitClustAss = kMeans(ptsInCurrCluster,2,distMeas)
            # 对类簇i进行2-Means聚类划分后求其SSE
            sseSplit = sum(splitClustAss[:,1])
            # 对除类簇i之外的其他类簇求SSE
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
##            print "sseSplit,sseNotSplit:",sseSplit,sseNotSplit
            # 判断划分类簇i后的总SSE是否比lowestSSE小   
            if sseSplit+sseNotSplit < lowestSSE:
                lowestSSE = sseSplit+sseNotSplit
                bestCentToSplit = i
                bestNewCents = centroidMat.copy()
                bestClustAss = splitClustAss.copy()
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
##        print "The bestCentToSplit is:",bestCentToSplit
##        print "The length of bestClustAss is:",len(bestClustAss)
        # 更新保存类簇中心的列表
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
        # 更新聚类结果
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:] = bestClustAss
    return mat(centList),clusterAssment

def distSLC(vecA,vecB):
    '''
    计算球面上两点间距离的公式
    设所求点A ,纬度β1 ,经度α1 ;点B ,纬度β2 ,经度α2.则距离
    S = R * arc cos[cosβ1*cosβ2*cos(α1-α2)+sinβ1*sinβ2]
    param vecA: A点坐标
    param vecB: B点坐标
    return 球面距离
    '''
    a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180)
    b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * cos(pi * (vecB[0,0] - vecA[0,0])/180)
    return arccos(a+b)*6371.0

def clusterClubs(numClust = 5):
    '''
    对地图上的点按照球面距离聚类,并在地图上展示
    param numClust: 默认类簇数量5  int
    '''
    datList = []
    with open('places.txt') as f:
        l = f.readlines()
    for line in l:
        lineArr = line.strip().split('\t')
        datList.append([float(lineArr[4]),float(lineArr[3])])
    datMat = mat(datList)
    myCentroids,clustAssing = biKmeans(datMat,numClust,distMeas = distSLC)

    fig = plt.figure()
    rect = [0.1,0.1,0.8,0.8]
    scatterMarkers = ['s','o','^','8','p',
                      'd','v','h','>','<',]
    axprops = dict(xticks = [],yticks = [])
    ax0 = fig.add_axes(rect,label = 'ax0',**axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    ax1 = fig.add_axes(rect,label = 'ax1',frameon = False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A == i)[0],:]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0],\
                    ptsInCurrCluster[:,1].flatten().A[0],
                    marker = markerStyle,s = 90)
    ax1.scatter(myCentroids[:,0].flatten().A[0],\
                myCentroids[:,1].flatten().A[0],
                marker = '+', s = 300)
    plt.show()

            
def resuDisp(clusterAssing):
    '''
    输出显示聚类结果
    param clusterAssing: 聚类结果矩阵
    return resuDict: 聚类结果字典
    '''
    resuList = clusterAssing.tolist()
    resuDict = {}
    for i in range(len(resuList)):
        label = resuList[i][0]
        resuDict[label] = resuDict.get(label,[])
        resuDict[label].append(i)
    for key,value in resuDict.iteritems():
        print key,':',value   
    return resuDict


def storeResu(resuDict,filename):
    '''
    存储聚类结果到文件
    '''
    with open(filename,'w') as f:
        pickle.dump(resuDict,f)

def getResu(filename):
    '''
    从文件中读取聚类结果
    '''
    with open(filename) as f:
        resuDict = pickle.load(f)
    return resuDict

def plotCluster(datMat,clustAssing,myCentroids,numClust):
    '''
    画图显示聚类结果
    param datmat: 数据集
    param clustAssing: 聚类结果
    param myCentroids: 类簇中心
    param numClust: 类簇数目
    '''
    fig = plt.figure()
    rect = [0.1,0.1,0.8,0.8]
    scatterMarkers = ['s','o','^','8','p',
                      'd','v','h','>','<',]
    ax1 = fig.add_axes(rect,label = 'ax1',frameon = False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A == i)[0],:]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0],\
                    ptsInCurrCluster[:,1].flatten().A[0],
                    marker = markerStyle,s = 90)
    ax1.scatter(myCentroids[:,0].flatten().A[0],\
                myCentroids[:,1].flatten().A[0],
                marker = '+', s = 300)
    plt.show()
    

if __name__ == "__main__":
##    dataMat = mat(loadDataSet('testSet.txt'))
##    newDataMat = mat(loadDataSet('testSet2.txt'))
##    print "Minimum of 0 column:",min(dataMat[:,0])
##    print "Minimum of 1 column:",min(dataMat[:,1])
##    print "Maximun of 1 column:",max(dataMat[:,1])
##    print "Maximun of 0 column:",max(dataMat[:,0])
##    print "Random centroids: \n"
##    pprint(randCent(dataMat,2))
##    print "Euclidean distance of record_0 and record_1:",distEclud(dataMat[0],dataMat[1])
##    # 由于初始类中心的选取是随机的,所以每次运行收敛次数也是不一致的
##    myCentroids,clusterAssing = kMeans(dataMat,3)
##    resuDict = resuDisp(clusterAssing)
##    storeResu(resuDict,'clusterResult')
##    result = getResu('clusterResult')
##    centList,myNewAssments = biKmeans(newDataMat,3)
##    plotCluster(newDataMat,myNewAssments,centList,3)
##    resuDict = resuDisp(myNewAssments)
##    totalSSE = sum(myNewAssments[:,1])
##    clusterClubs(numClust = 5)


不完善之处希望大家评论指正,互相交流。