K均值算法-python实现

时间:2022-03-31 03:14:48

 

 

测试数据展示:

K均值算法-python实现

 

 
 
#coding:utf-8
__author__ = 'similarface'
'''
实现K均值算法

算法摘要:
-----------------------------
输入:所有数据点A,聚类个数k
输出:k个聚类的中心点

随机选取k个初始的中心点
repeat:
计算每个点和中心点的距离,将点分配给最近的中心簇中
计算每个簇Ck中所有点的均值,并以该均值更新簇的中心点
until 中心点稳定

-----------------------------
'''
import sys
import random
import math
from collections import defaultdict
data=[[ -1.97869191 ,-10.41036729], [ -5.98733291 , 1.09319552], [ -3.32042501 , -9.43521984], [ -6.95990143 , 1.48643384], [ -0.51581431 , -2.95589335], [ -6.98576681 , 2.3764377 ], [ -5.80752805 , -0.01880673], [ -2.3875164 , -3.79858985], [ -1.50834952 ,-11.55108763], [ -0.31932001 , -8.72310502], [ -0.10604775 , -1.97508279], [ -6.3698932 , 2.17096658], [ 0.98564842 , -2.22738281], [ -0.9889725 , -3.47291703], [ -1.03011438 , -1.02557957], [ -0.45064353 , -8.8508534 ], [ 0.72986148 , -1.97091264], [ -0.24707289 , -9.8104778 ], [ -3.30332765 , 3.32133548], [ 1.02581503 , -2.93411237], [ -0.77207963 , -3.88246058], [ 1.11435514 , -2.1403809 ], [ -3.01806146 , -8.44065141], [ 2.07905596 , -0.39137275], [ -1.3132102 , -5.21553485], [ 2.37703059 , -1.1452029 ], [ -3.71486953 , -9.36874886], [ -0.44554402 , -3.2977466 ], [ -1.07589398 , -8.07477912], [ -7.2774513 , 3.49894912], [ -1.29923245 , -8.30647414], [ -1.39638106 , -4.26919995], [ -2.23638886 , -5.40208811], [ -2.50153112 , -7.67699872], [ -0.72081785 , -1.37019171], [ -5.89699295 , 2.86422394], [ -0.66995787 , -9.05797846], [ -6.2936531 , 1.50084162], [ -3.14819261 , -7.7502907 ], [ 0.77192861 , -2.46670777], [ -1.38115313 , -1.23503221], [ -5.21920316 , 2.77710219], [ -2.18234803 ,-10.07244764], [ -1.80391665 , -5.05103832], [ 0.67755635 , -2.64355425], [ -0.52813711 ,-10.47523635], [ -4.22948326 , 1.0542576 ], [ 1.7408485 , -1.52644915], [ -0.6172497 , -0.79361238], [ -6.17304838 , 1.40648868], [ -2.4369522 , -3.95312369], [ -2.33805418 ,-10.39048298], [ -0.89823572 , -8.91519992], [ -5.49292357 , 1.36243111], [ -0.4075959 , -0.98602662], [ -1.46335853 , -8.46162063], [ 1.0505005 , -1.13446366], [ -6.54744623 , 2.29309604], [ -6.41410112 , 2.1273699 ], [ 2.49020135 , -2.82466326], [ -2.5027096 , -2.21684939], [ 1.17331624 , -1.36048319], [ 2.92640652 , -2.36434847], [ -1.57138311 , -2.48238607], [ 0.14325405 , -2.14589394], [ -2.64755963 , -4.57606078], [ 1.77939563 , -1.09938345], [ -1.74705392 , 
-5.14259238], [ -0.65895073 , -1.96921394], [ -1.40612244 , -3.5517328 ], [ -2.20539552 , -4.7867456 ], [ 0.88989739 , -0.44520158], [ -6.21503005 , 1.05650418], [ -1.61030464 , -3.15726266], [ -6.21496271 , 2.16858806], [ -1.01573424 , -2.67549788], [ -0.4681611 , -4.89114339], [ 1.36979547 , -1.42903191], [ -0.90965742 , -8.1218415 ], [ -2.57921362 , -1.84976724], [ -7.01537899 , 1.79532873], [ -4.51668131 , 0.73373973], [ -6.55623248 , -0.04283413], [ 0.37487407 , -0.91475768], [ 0.38115481 , -1.64481461], [ -4.11222325 , -2.00214115], [ -1.46957122 , -9.55869403], [ -6.87835953 , 3.37557201], [ -6.49999403 , 2.69702331], [ -2.9219904 , -4.13889999], [ 1.62861332 , 0.80867712], [ -2.13652734 , -3.20900184], [ 0.08713347 , -8.26358973], [ -0.61588054 , -8.7465907 ], [ -1.91357867 , -3.14379003], [ -1.51220857 , 0.53244231], [ 0.99104311 , -1.43284403], [ -2.70008268 , -3.56958972], [ -5.8267567 , 3.17894392], [ 1.10320057 , -3.20707537], [ 1.70531079 , -3.09426819], [ 0.89454062 , -3.84466463], [ -1.34578645 , -4.86207938], [ -0.58498235 ,-11.51494191], [ 1.05937597 , -0.7579938 ], [ -5.94171269 , 3.08161308], [ -5.39980072 , 4.37525462], [ 0.73374694 , -4.02735671], [ -3.74456491 , 3.04297057], [ 1.70084242 , -1.17949827], [ -6.44717333 , 2.13090812], [ -4.61625936 , 2.74952795], [ 0.42186795 , -1.00112008], [ -2.48625317 , -2.64140122], [ -0.16344961 , -0.05951747], [ 0.82017839 , -1.68889855], [ 1.8084556 , -3.36847451], [ 0.30428829 , -4.02238273], [ -0.45539895 , -8.23326244], [ -0.17095868 ,-11.24639309], [ -1.47484741 , -3.92998889], [ -4.84939275 , 3.22778867], [ -1.77602069 , -4.55311048], [ 0.30937327 , -3.97368662], [ 0.5124909 , -9.91048868], [ -0.80962387 , -3.80036663], [ -0.40311582 , -3.37210203], [ -1.28940953 , -4.22317842], [ -3.4964651 , -4.59144396], [ -0.90788399 , -5.10084263], [ -0.74104364 ,-10.07763506], [ -1.2800922 , -8.93912279], [ -1.66664693 , -4.37979994], [ -0.27973607 , -9.86256788], [ 0.39010877 , -1.25159452], [ 0.55523077 , 
-9.04078549], [ 2.60842583 , -0.86794594], [ -0.84660563 , -8.20309613], [ -6.42496164 , 2.96670557], [ 1.92513692 , 0.17477999], [ -2.15713739 , -5.48111104], [ -5.82420484 , 1.29802453], [ -2.19316436 , -1.77843034], [ -1.87385754 , -8.3319748 ], [ -2.53552918 , -8.54331169], [ -5.04349522 , 1.48053745], [ -0.32431771 , -3.31914574], [ -1.60368203 , -9.05765066], [ -3.93955141 , -9.57292799], [ -2.4311049 , -9.90799783], [ -1.66161844 , -9.41498635], [ -0.8118896 , -4.44914322], [ -1.02353151 ,-10.47025441], [ 1.32915795 , 0.61082376], [ -4.85863866 , 2.71818185], [ -1.1656265 , -2.98631583], [ -2.57539962 , -1.30662085], [ -5.14115986 , 2.11918259], [ 2.24169986 , -2.5392787 ], [ -2.44130996 , -3.06176393], [ 1.99964344 , -8.51061404], [ -1.69941844 , -9.61380426], [ -1.15527831 , -8.72497322], [ -1.65805364 , -2.62718768], [ -3.12203531 , -3.70129132], [ -1.91775697 ,-10.66908765], [ -3.52654286 , 1.509838 ], [ 0.20550002 , -0.86879848], [ -1.39452325 , -9.97964956], [ -0.8835818 , -9.7960928 ], [ -5.47503834 , 1.3675566 ], [ 0.29507856 , -1.05360095], [ 0.81825271 , -2.73069558], [ -1.65577138 , -9.55594613], [ -0.10657046 ,-11.82507855], [ -6.43716673 , 1.12621231], [ -2.68080553 , -4.15115913], [ -5.46682052 , 4.6280828 ], [ 1.57945366 , -1.31393944], [ 2.44441946 , -2.35848003], [ -2.25041232 , -4.64475199], [ -5.93596316 , 3.55810189], [ -6.08327339 , 1.26745748], [ 1.506501 , -1.62839627], [ 1.78230921 , -2.91522595], [ 2.82859067 , -8.66035449], [ -0.66876118 , -9.43350477], [ 0.91100418 , -1.59821873], [ -0.16221522 , -9.75571745], [ 0.15873435 , -2.29051183], [ -7.5806633 , 2.88460368], [ -2.97040189 , -2.2072549 ], [ -5.95789399 , 1.05100704], [ -2.41323523 , -9.04838281], [ -5.05820587 , 1.75215814], [ -1.30140995 , -3.59063453], [ -1.29343329 , -2.7140364 ], [ -6.15517065 , 2.47899111], [ -1.59361015 , -8.71046363], [ 0.82608078 , -1.87016308], [ 1.71860282 , -1.65302661], [ 0.31995672 , -3.43856678], [ -1.9993558 ,-10.06488996], [ -5.20501379 , 
2.0292834 ], [ 1.13908963 , -1.6936582 ], [ -6.16415229 , 2.24844103], [ -0.41050376 ,-10.56856594], [ -7.29419673 , 2.10875296], [ 0.68097889 , 0.85112594], [ 1.1848232 , -1.95576116], [ -6.13784033 , 3.27454164], [ -4.94592301 , 3.50193532], [ -0.89962999 , -9.69861063], [ -5.31271816 , 2.30731199], [ -1.27736788 , -1.52709537], [ 0.33970811 , 0.11528184], [ -1.74223531 , -3.63574418], [ -5.11750476 , 2.30467137], [ -1.60515159 ,-10.04170987], [ -1.75660679 , -3.02168142], [ -1.09969215 , -8.92831109], [ -8.09999402 , 2.98488494], [ -5.4498388 , 2.21471778], [ -1.77514158 , -5.22156992], [ -1.06398595 ,-11.34008775], [ -1.07153453 , -4.10149796], [ -7.24043131 , 1.91557865], [ -6.33736287 , 0.43514226], [ 0.62173043 , 1.86741382], [ -2.11753563 , -3.98311226], [ 0.46171023 , -9.92897624], [ 2.82419621 , -0.35337615], [ -7.72527978 , 4.42206927], [ -0.49463392 , -4.41118163], [ -7.36970566 , 1.76857486], [ -8.07564582 , 1.72023916], [ -2.10923725 , -9.39376515], [ -0.91504844 , -8.70739333], [ 2.6351642 , -0.98185444], [ -2.41442044 ,-10.18889625], [ 2.02143446 , -2.01543187], [ -0.92096863 , -8.85925495], [ -2.17903191 , -1.65878724], [ -6.25233557 , 2.33764219], [ -1.60598371 , -4.16162683], [ -1.64458105 ,-10.35745484], [ -1.03866233 , -8.98404971], [ -2.76256743 , -8.63516347], [ 0.80420551 , -1.74288075], [ -0.03026543 , -1.74172697], [ -1.93726763 , -5.39538281], [ -3.3712446 , -3.89409507], [ -1.61892392 , -9.71765939], [ -5.69386864 , 3.93793276], [ -5.34498618 , 2.0693253 ], [ -0.77824475 ,-10.32568907], [ -1.80769409 , -4.46833214], [ -1.68399423 ,-10.86599403], [ -1.3196722 , -9.15547193], [ -0.06811619 , -1.40206897], [ -1.07371903 , -3.88629849], [ -1.73432981 , -8.96710465], [ -2.18736646 , -3.70811542], [ -7.1865842 , 3.11806934], [ -2.90291449 , -3.02986961], [ -1.93061611 , -3.05009085], [ 1.27033628 ,-10.95464861], [ -2.30151669 , -9.04907966], [ 0.21944157 , -5.44956932], [ -4.95790559 , 2.43632632], [ -5.09335092 , 2.47355038], [ -5.33075221 , 
2.57934775], [ -4.91352172 , 0.70785394], [ 1.07861399 , -0.62821787], [ -5.61777478 , 2.78571681], [ -0.75580553 , -8.74619579], [ -6.84289623 , 3.12082979], [ -1.69560499 , -2.72407455], [ 0.18757605 , -9.8171527 ], [ -2.34090099 , -8.88980884], [ -1.02993907 , -3.97537434], [ -7.952894 , 2.87991319], [ -5.90898023 , 1.15375484], [ -0.20175034 , -1.78148269], [ -5.03133839 , 2.37464369], [ -5.35976552 , 2.11910146], [ 1.38354601 , -0.31804274], [ -6.29083717 , 3.60726959], [ -2.41154316 , -3.72594284], [ 1.70677401 , 1.23618273], [ 1.83086535 , -1.42523455], [ -1.09860809 , -1.84702593], [ -2.67919211 , -3.62422108], [ -6.77223728 , 4.13723749], [ -1.74210731 , -9.13058687], [ -2.67557352 , -4.3499291 ], [ -2.45517504 , -5.13617648], [ -1.54915892 , -7.25010857], [ 1.81313467 , -1.92467083], [ -1.5841884 , -6.8961805 ], [ -1.19769074 , -4.59711705], [ -5.40166242 , 3.12407116], [ -0.67858614 , -9.47781587], [ 0.83352543 , -0.74460559], [ -2.47535278 , -2.50855939], [ -1.42824915 , -7.98003845], [ -8.01058566 , 1.63404449], [ 2.0119666 , 1.00882614], [ -4.81816885 , 3.72073108], [ -6.27164232 , 0.74780494], [ -5.65408139 , 1.0799859 ], [ 2.13810493 , -1.44566983], [ 2.61434254 , -0.58086887], [ -4.6059069 , 1.38411417], [ 0.44030012 , -0.99402533], [ -1.91478126 , -8.97307912], [ -6.36433615 , 1.6497788 ], [ -0.07381757 , -1.94648329], [ -0.72864791 , -7.18926735], [ -5.59867106 , 1.66313127], [ 0.70973004 , -0.75512788], [ -1.33633557 , -2.6256785 ], [ -2.15820985 , -9.63790953], [ -4.33013714 , 0.97871974], [ -1.74844822 ,-10.73619567], [ -0.16767692 , -3.84016148], [ -2.02797291 , -9.47245011], [ -6.19473103 , 2.41547938], [ -2.73346631 , -9.81949314], [ 1.05371201 , -2.63214103], [ 1.59306999 , -0.74416768], [ -6.4721467 , 2.95054106], [ -5.02266832 , 2.53430552], [ -1.20943949 , -3.81029773], [ -2.39099269 , -3.38764578], [ 1.86409032 , -0.70074535], [ -0.8131639 , -2.36670563], [ -1.55628145 , -9.99835926], [ -1.9233198 , -3.10609538], [ -6.09795188 , 
1.76016581], [ -0.13265422 , -0.80505548], [ 1.40927131 , -1.35139941], [ -6.06728988 , 2.43844581], [ -6.77797943 , 2.21185794], [ -3.09368405 , -5.90874304], [ 1.55591864 , -1.4580672 ], [ 2.54154025 , -1.60938019], [ -3.50927448 , -5.62064487], [ -5.61928015 , 3.08987021], [ -1.07380783 , -3.02885557], [ 0.15881217 , -1.95127059], [ -1.93335222 , -8.63901908], [ 1.62504848 , -0.87595942], [ 1.8230386 , -0.9574862 ], [ -0.43711337 , -3.64783404], [ 0.110124 , 0.25183468], [ -0.48092196 , -8.2188617 ], [ -1.95777753 ,-10.92091439], [ 1.3203166 , -2.74891159], [ 1.54591325 , -2.13399516], [ -0.11858047 , -9.09539732], [ -8.03689652 , 1.51829382], [ 2.12599563 , -0.9232473 ], [ -6.6466344 , 1.43966762], [ -2.53164296 , -2.76452777], [ -2.82677657 ,-11.5641273 ], [ -2.60710702 , -4.50856754], [ 0.05546421 , -9.17884603], [ -2.3788409 , -7.89698831], [ -5.90466798 , 3.09117187], [ -6.37783409 , 2.28944986], [ -6.3933459 , 2.05685086], [ 0.68989568 , -3.88908243], [ 0.51326445 , -2.31320125], [ -1.44144678 , -2.6282341 ], [ -2.69891251 , -9.73324948], [ -0.28764562 , -0.98370587], [ 0.25165836 ,-11.46732114], [ -4.95384992 , 2.04017736], [ -0.85715442 , -0.96306408], [ 2.10389484 , -1.66689096], [ -6.81038823 , 2.37777702], [ 0.78454593 , -1.15593416], [ -1.45368824 , -3.27385342], [ -0.37638912 , 1.3767851 ], [ -5.06831433 , 2.53524728], [ -2.70131918 , -9.63497056], [ -1.23856256 ,-10.59940081], [ -1.93958449 , -2.98186006], [ -0.30387455 , -3.25837812], [ -4.98980684 , 3.66124623], [ -2.84011639 , -3.47084983], [ -1.68584182 , -3.25767216], [ -0.58390398 , -8.78405909], [ -0.78216181 , -9.35497119], [ -5.58569152 , 1.43897246], [ -1.65427904 , -4.34620073], [ -1.38595406 , -3.46417994], [ 0.57884096 , -1.22623874], [ 2.03872755 , 0.07546388]]

def calc_geometric_distance(pointA,pointB):
    '''
    Compute the Euclidean distance between two points of any dimension.

    :param pointA: sequence of numeric coordinates
    :param pointB: sequence of numeric coordinates
    :return: the distance as a float, or None when the two points do not
             have the same number of coordinates
    '''
    # Points of different dimensionality have no defined distance.
    if len(pointA) != len(pointB):
        return None
    squared_sum = sum((a - b) ** 2 for a, b in zip(pointA, pointB))
    return math.sqrt(squared_sum)

def getInitCenter(data,k):
    '''
    Pick k points of the data set at random to serve as initial centers.

    Whenever the data set holds at least k points, the picks come from
    distinct positions: two identical starting centers would leave one of
    them without any assigned point.

    :param data: sequence of points
    :param k: number of centers to pick
    :return: list of k points (the very objects stored in data); an empty
             list when k is not positive
    '''
    if k <= 0:
        return []
    if k <= len(data):
        # random.sample never repeats a position and never indexes past
        # the end (random.randint(0, len(data)) could return len(data)).
        return random.sample(data, k)
    # More centers requested than points available: repeats are unavoidable,
    # so draw with replacement.
    return [random.choice(data) for _ in range(k)]

def cluser(dataset):
    '''
    Assign every point to its nearest center.

    :param dataset: mapping of point index -> list of distances from that
                    point to each center, in center order
    :return: defaultdict(list) mapping 1-based center number -> list of the
             point indices closest to that center; a center that attracts
             no point has no entry
    '''
    k_dataset=defaultdict(list)
    for point_idx, dists in dataset.items():
        # list.index returns the first minimum, so a tie goes to the
        # earlier center.
        nearest = dists.index(min(dists))
        k_dataset[nearest + 1].append(point_idx)
    return k_dataset

def cluser_data(data,data_idx):
    '''
    Turn clusters of point indices back into clusters of actual points.

    :param data: sequence of points
    :param data_idx: mapping of cluster label -> list of indices into data
    :return: dict mapping each cluster label -> list of the points that the
             indices refer to
    '''
    return {
        label: [data[idx] for idx in members]
        for label, members in data_idx.items()
    }

def calc_center(clusterdataset):
    '''
    Compute the center (per-coordinate mean) of every cluster.

    :param clusterdataset: mapping of cluster label -> list of points
    :return: list of centers, one per non-empty cluster, ordered by
             ascending cluster label
    '''
    centers=[]
    # Visit labels in sorted order so the position of a center in the
    # result does not depend on dict iteration order.
    for label in sorted(clusterdataset):
        points = clusterdataset[label]
        if not points:
            # A cluster without points has no mean; skip it.
            continue
        numct = float(len(points))
        # zip(*points) regroups the points by coordinate, so each column
        # holds one feature of every point in the cluster.
        centers.append([sum(column) / numct for column in zip(*points)])
    return centers



def k_means(data,initcenter,i,max_iter=100):
    '''
    Run K-means until the centers stop moving or the iteration cap is hit.

    Every iteration assigns each point to its nearest center, recomputes
    the centers as the mean of the assigned points, and prints the new
    centers followed by the iteration counter.

    :param data: sequence of points
    :param initcenter: list of starting centers
    :param i: value of the iteration counter for the first iteration
    :param max_iter: last counter value for which an iteration is run
    :return: the centers of the last iteration that was run (initcenter
             itself when no iteration was run)
    '''
    centers = initcenter
    while i <= max_iter:
        # Distance from every point to every current center, by point index.
        distances = {}
        for idx_data, item in enumerate(data):
            distances[idx_data] = [
                calc_geometric_distance(item, center) for center in centers
            ]
        new_centers = calc_center(cluser_data(data, cluser(distances)))
        # Formatted explicitly so the line reads the same on Python 2 and 3.
        print("%s %s" % (new_centers, i))
        if new_centers == centers:
            # No center moved: the assignment is stable.
            break
        centers = new_centers
        i += 1
    return centers

if __name__=='__main__':
    # Number of clusters to look for in the sample data.
    k=4
    # Iteration counter starts at zero.
    i=0
    initcenter=getInitCenter(data,k)
    k_means(data,initcenter,i)

'''
数据在第5次迭代收敛(第6次迭代得到的中心点与第5次完全相同):
[[-1.090354490561798, -9.544300641123598], [-3.4009604193589746, -3.448081201923078], [-0.19541685830769231, -3.3116750070769227], [-2.446104015535713, 0.7224692077976191]] 0
[[-1.2749446563000004, -9.409093471199999], [-2.470575770652174, -3.807719888695652], [0.35515101615942024, -2.3586021581159424], [-5.119257947586205, 1.9765036246551722]] 1
[[-1.2749446563000004, -9.409093471199999], [-1.967171268292682, -3.7557153614634147], [0.8185451994915249, -1.6050642896610166], [-5.937498566799999, 2.2599845357999997]] 2
[[-1.2749446563000004, -9.409093471199999], [-1.7570419110638291, -3.732020352340426], [0.9475681998113205, -1.3826069122641511], [-5.937498566799999, 2.2599845357999997]] 3
[[-1.2749446563000004, -9.409093471199999], [-1.6700011342424235, -3.7007944505050507], [0.9961425923762373, -1.2969068833663366], [-5.937498566799999, 2.2599845357999997]] 4
[[-1.2749446563000004, -9.409093471199999], [-1.6660748016999996, -3.6790574597], [1.0188776970999995, -1.2946049985], [-5.937498566799999, 2.2599845357999997]] 5
[[-1.2749446563000004, -9.409093471199999], [-1.6660748016999996, -3.6790574597], [1.0188776970999995, -1.2946049985], [-5.937498566799999, 2.2599845357999997]] 6
'''