=================================版权声明=================================
版权声明:原创文章 禁止转载
请通过右侧公告中的“联系邮箱(wlsandwho@foxmail.com)”联系我
勿用于学术性引用。
勿用于商业出版、商业印刷、商业引用以及其他商业用途。
本文不定期修正完善。
本文链接:http://www.cnblogs.com/wlsandwho/p/7587203.html
耻辱墙:http://www.cnblogs.com/wlsandwho/p/4206472.html
=======================================================================
这个示例实际上是对kNN的练习,区别是使用来自文件的数据。
=======================================================================
1 从文件中读取数据,并转换为指定的格式(特征矩阵和标签列表)。
文件为file2matrix.py
from numpy import *
# Earlier draft of the loader (file2matrix1), disabled by wrapping it in a
# string literal. It opens the file without a context manager and hard-codes
# three feature columns.
'''
def file2matrix1(filename):
f=open(filename)
arrlines=f.readlines()
rows=len(arrlines)
retmat=zeros((rows,3))
vctlabels=[]
index=0 for line in arrlines:
line=line.strip()
list=line.split("\t")
retmat[index,0:3]=list[0:3]
vctlabels.append(int(list[-1]))
index+=1 return retmat,vctlabels mat1,labels1=file2matrix1("datingTestSet2.txt")
print(mat1)
print(labels1)
'''
# Text labels accepted by file2matrix2 and the integer class each one maps to.
_DOSE_LABELS = {"didntLike": 1, "smallDoses": 2, "largeDoses": 3}


def _dose_label(text):
    """Translate a textual label into its integer class, rejecting unknown text."""
    try:
        return _DOSE_LABELS[text]
    except KeyError:
        # Silently skipping the label would leave the label list shorter than
        # the matrix and misalign every following row.
        raise ValueError("unknown label: {!r}".format(text)) from None


def _load_matrix(filename, parse_label):
    """Read a whitespace-delimited table into a feature matrix and a label list.

    Every non-blank line is one sample: all fields but the last are numeric
    features, and the last field is passed to ``parse_label``.  The number of
    feature columns is taken from the first non-blank line.

    Returns:
        (retmat, labels): a float array with one row per sample and a list
        with one parsed label per sample.  A file without samples yields an
        empty ``(0, 0)`` array and an empty list.

    Raises:
        ValueError: if a line has fewer fields than the first line, a feature
            is not numeric, or ``parse_label`` rejects the label.
    """
    with open(filename) as source:
        # Blank lines (e.g. a trailing newline at end of file) hold no sample.
        records = [line.split() for line in source if line.strip()]

    if not records:
        return zeros((0, 0)), []

    feature_count = len(records[0]) - 1
    retmat = zeros((len(records), feature_count))
    labels = []
    for row, fields in enumerate(records):
        if len(fields) <= feature_count:
            raise ValueError(
                "{}: record {} has {} fields, expected at least {}".format(
                    filename, row + 1, len(fields), feature_count + 1))
        retmat[row, :] = [float(value) for value in fields[:feature_count]]
        labels.append(parse_label(fields[-1]))
    return retmat, labels


def file2matrix(filename):
    """Load a data file whose last column is an integer class label.

    Returns:
        (retmat, labels): the float feature matrix and the list of int labels.
    """
    return _load_matrix(filename, int)


def file2matrix2(filename):
    """Load a data file whose last column is a textual class label.

    ``didntLike``, ``smallDoses`` and ``largeDoses`` map to 1, 2 and 3.

    Returns:
        (retmat, labels): the float feature matrix and the list of int labels.

    Raises:
        ValueError: if a label is not one of the three known strings.
    """
    return _load_matrix(filename, _dose_label)
测试一下
文件为test_file2matrix.py
from file2matrix import *

# file2matrix returns (matrix, labels); print each part in that order.
for loaded in file2matrix("datingTestSet2.txt"):
    print(loaded)
结果(红框是因为我用的虚拟机,不要在意这些细节)
=======================================================================
2 做个图看看相关性/趋势。实际上这里并没有做数学上的讨论,就是画个图看看臆测一下。
文件是drawapicture1.py
import matplotlib
import matplotlib.pyplot as plt
import numpy
from file2matrix import *
# Load the feature matrix and the integer class labels from the data file.
mat, labels = file2matrix("datingTestSet2.txt")

# Scale the labels once and reuse them for both marker size and colour, so
# each class is drawn with its own size and colour.  numpy.array is referenced
# explicitly rather than relying on `array` leaking in through
# `from file2matrix import *`.
scaled_labels = 15.0 * numpy.array(labels)

fig = plt.figure()
ax = fig.add_subplot(111)
# Scatter column 1 of the feature matrix against column 2.
ax.scatter(mat[:, 1], mat[:, 2], s=scaled_labels, c=scaled_labels)
plt.show()
结果
文件是drawapicture2.py
import matplotlib
import matplotlib.pyplot as plt
import numpy
from file2matrix import *

# Load the feature matrix and the integer class labels from the data file.
mat, labels = file2matrix("datingTestSet2.txt")
label_values = numpy.array(labels)

fig = plt.figure()
ax = fig.add_subplot(111)

# Draw one scatter series per class (column 0 against column 1), selecting the
# rows of each class with a boolean mask instead of building lists by hand.
handles = []
for label, colour in ((1, 'red'), (2, 'green'), (3, 'blue')):
    selected = label_values == label
    handles.append(
        ax.scatter(mat[selected, 0], mat[selected, 1], s=20, c=colour))

# `loc` is passed by keyword: positional arguments beyond handles and labels
# are deprecated in newer Matplotlib releases.
fig.legend(handles, ('yiban', 'xihuan', 'milian'), loc="upper left")
plt.show()
结果是
=======================================================================
3 由于不同意义的数据的取值范围差异很大(取值范围大的特征会在距离计算中占主导),所以需要归一化。
文件是matrixnormalization.py
from numpy import *


def autonorm(mat):
    """Rescale every column of ``mat`` linearly to the range [0, 1].

    Args:
        mat: 2-D numpy array with one sample per row and one feature per column.

    Returns:
        (normmat, minv, maxv, diff): the normalized matrix, the per-column
        minima, the per-column maxima, and the per-column ranges
        (``maxv - minv``).  ``diff`` is returned unmodified, so it is 0 for a
        constant column.
    """
    minv = mat.min(0)
    maxv = mat.max(0)
    diff = maxv - minv
    # A constant column has zero range; divide by 1 there so that the column
    # normalizes to 0 instead of producing NaN from 0/0.
    safe_diff = where(diff == 0, 1, diff)
    # Broadcasting applies the per-column vectors to every row, so no explicit
    # tiling is needed.
    normmat = (mat - minv) / safe_diff
    return normmat, minv, maxv, diff
测试归一化
文件是test_matrixnormalization.py
from matrixnormalization import *

mat = array([[1, 20, 3000], [5, 60, 7000], [2, 30, 3], [6, 60, 6000]])
# autonorm returns (normalized matrix, column minima, column maxima, column
# ranges); unpack it so only the normalized matrix is printed rather than the
# whole tuple.
normmat, minv, maxv, diff = autonorm(mat)
print(normmat)
结果(不知道为何,跟书上的结果不太一样。)
=======================================================================
4 测试一下这个分类器的效果
文件名test_dating.py
from file2matrix import *
from matrixnormalization import *
from kNN import *


def testdating():
    """Print the classifier's error rate on a hold-out slice of the data set.

    The first ``ratio`` fraction of the normalized rows is classified with
    ``classify_kNN`` against the remaining rows, and every prediction is
    printed next to the real label.  The error rate is the number of wrong
    predictions divided by the number of rows that were actually tested.
    """
    ratio = 0.1  # fraction of the rows held out for testing

    mat, lab = file2matrix2("datingTestSet.txt")
    normmat, minv, maxv, diff = autonorm(mat)
    allrows = normmat.shape[0]
    rowsforTest = int(allrows * ratio)

    # Rows after the hold-out slice serve as the reference set.
    reference_rows = normmat[rowsforTest:allrows, :]
    reference_labels = lab[rowsforTest:allrows]

    countforerr = 0
    for i in range(rowsforTest):
        # NOTE(review): the trailing 3 is presumably k, the neighbour count;
        # confirm against kNN.classify_kNN.
        res = classify_kNN(normmat[i, :], reference_rows, reference_labels, 3)
        print('the result is', res, 'the real is', lab[i])
        if res != lab[i]:
            countforerr += 1

    # Divide by the number of tested rows, not by the size of the whole data
    # set; guard against a data set too small to yield any test row.
    errorrate = countforerr / float(rowsforTest) if rowsforTest else 0.0
    print("the error rate is", errorrate)


testdating()
结果
=======================================================================
5 支持手工输入
文件名test_dating2.py
from file2matrix import *
from matrixnormalization import *
from kNN import *


def testdating():
    """Ask for three feature values on stdin and print the predicted label.

    The data set is loaded and normalized, the entered sample is normalized
    with the same column minima and ranges, and ``classify_kNN`` is run
    against every labelled row.  The integer result (1, 2 or 3) is printed as
    "not at all", "a little" or "very well".
    """
    reslist = ["not at all", "a little", "very well"]

    mat, lab = file2matrix2("datingTestSet.txt")
    normmat, minv, maxv, diff = autonorm(mat)

    percenttats = float(input("percentage of time spent playing video games?"))
    flymiles = float(input("frequent filer miles earned per year?"))
    icecream = float(input("liters of ice cream consumed per year?"))

    # NOTE(review): the sample is assembled as miles, game time, ice cream,
    # presumably to match the column order of the data file; confirm.
    desarr = array([flymiles, percenttats, icecream])
    normdesarr = (desarr - minv) / diff

    # No hold-out is needed when classifying a new sample, so all labelled
    # rows are used as the reference set.
    # NOTE(review): the trailing 3 is presumably k; confirm against kNN.
    res = classify_kNN(normdesarr, normmat, lab, 3)
    print("You will probably like this person:", reslist[res - 1])


testdating()
结果
=======================================================================
这一节更像是用一个小算法领着学习python。