# K-NN classification
import os
import sys
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf

# NOTE(review): the credentials are hard-coded in the connect string;
# consider loading them from configuration instead.
conn = cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

# Dataset loading
def _knn_fetch_features(table, surgery, surgeryChest):
    """Load the five feature columns and the label column from ``table``.

    Args:
        table: Name of the table to read. Must be a trusted constant: it is
            spliced into the statement because table names cannot be bound.
        surgery: Value matched against the ``surgery`` column.
        surgeryChest: Value matched against the ``surgeryChest`` column.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays built from the fetched
        rows (columns 0-4 are features, column 5 is the label).
    """
    # Filter values are passed as bind variables rather than being formatted
    # into the SQL text, so quotes in the values cannot break the statement.
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from " + table + " "
        "where surgery=:surgery and surgeryChest=:surgeryChest"
    )
    cursor.execute(sql, {"surgery": surgery, "surgeryChest": surgeryChest})
    rows = cursor.fetchall()
    features = [list(row[:5]) for row in rows]
    labels = [row[5] for row in rows]
    return np.array(features), np.array(labels)


def getdata(surgery, surgeryChest):
    """Return the training ``(features, labels)`` arrays read from ``menzhenZ``."""
    return _knn_fetch_features("menzhenZ", surgery, surgeryChest)


def gettestdata(surgery, surgeryChest):
    """Return the test ``(features, labels)`` arrays read from ``testZ``."""
    return _knn_fetch_features("testZ", surgery, surgeryChest)


# K-NN classification
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its nearest rows of ``dataSet``.

    Args:
        inX: Feature vector to classify (same width as a ``dataSet`` row).
        dataSet: 2-D NumPy array of training feature rows.
        labels: Sequence of labels, one per row of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number of
            training rows are clamped to that number.

    Returns:
        The label with the most votes; on a tie, the tied label that was
        encountered first when walking neighbours from nearest to farthest.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is less than 1.
    """
    sample_count = dataSet.shape[0]
    if sample_count == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    deltas = dataSet - np.asarray(inX)
    distances = (deltas ** 2).sum(axis=1) ** 0.5
    nearest = distances.argsort()[:min(k, sample_count)]

    votes = {}
    for idx in nearest:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    # max() returns the first key with the highest count, i.e. the label
    # seen earliest among the nearest neighbours.
    return max(votes, key=votes.get)


# Normalization
def autoNorm(dataSet):
    """Min-max scale each column of ``dataSet``.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` column by column, ``ranges`` is the
        per-column ``max - min`` and ``minVals`` is the per-column minimum.
        A column whose values are all equal (range 0) is mapped to 0 instead
        of producing NaN; the returned ``ranges`` still reports 0 for it.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than 0/0 = NaN.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    normDataSet = (dataSet - minVals) / safe_ranges
    return normDataSet, ranges, minVals


# Per-k results filled in by datingClassTest(): error rate (%), accuracy (%)
# and the time value recorded for each k.
erace = []
accuc = []
t = []

# Run and evaluate the model
def datingClassTest(k_values=(3, 5, 7, 9, 11, 13)):
    """Evaluate the K-NN classifier on the test set for several values of k.

    For every k the error rate (%), accuracy (%) and elapsed seconds are
    printed and appended to the module-level lists ``erace``, ``accuc`` and
    ``t`` respectively.

    Args:
        k_values: Iterable of neighbour counts to evaluate.

    Raises:
        ValueError: If the test set is empty.
    """
    datingDataMat, datingLabels = getdata("外科", "胸外科")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    testdataset, testlables = gettestdata("外科", "胸外科")

    test_count = np.shape(testdataset)[0]
    if test_count == 0:
        raise ValueError("test set is empty")

    # Scale the test rows with the TRAINING min/range so both sets live in
    # the same feature space; scaling the test set by its own statistics
    # makes the distances to the training rows inconsistent.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    testnormMat = (testdataset - minVals) / safe_ranges

    for k in k_values:
        # Reset per k: otherwise errors (and time) accumulate across k values
        # and every later k looks worse than it is.
        errorCount = 0.0
        start = time.time()
        for i in range(test_count):
            classifierResult = classify0(testnormMat[i, :], normMat, datingLabels, k)
            print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, testlables[i]))
            if classifierResult != testlables[i]:
                errorCount += 1.0
        elapsed = time.time() - start

        # Record the duration for this k, not the absolute wall-clock time.
        t.append(elapsed)
        erace.append(errorCount / float(test_count) * 100)
        accuc.append((1.0 - errorCount / float(test_count)) * 100)
        print("错误率: %.2f%%" % (errorCount / float(test_count) * 100))
        print("准确率: %.2f%%" % ((1.0 - errorCount / float(test_count)) * 100))
        print("训练和预测一共耗时: %.2f 秒" % elapsed)
datingClassTest()
print(accuc)
print(erace)
print(t)

# Explore how different values of K affect the algorithm.
# The import and the x values below were fused into the comment line above
# them, which left `plt` and `x` undefined.
import matplotlib.pyplot as plt

x = [3, 5, 7, 9, 11, 13]
plt.plot(x, erace, c='r')
plt.plot(x, accuc, c='g')
plt.legend(['error race', 'accuce race'], loc=9)
plt.show()
print(accuc)
print(erace)
#决策树
import os
import sys
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
from math import log
import tensorflow as tf

# NOTE(review): the credentials are hard-coded in the connect string;
# consider loading them from configuration instead.
conn = cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

# Dataset loading
def _tree_fetch_rows(table, surgery, surgeryChest):
    """Fetch the five feature columns plus the label column from ``table``.

    ``table`` must be a trusted constant (table names cannot be bound); the
    two filter values are passed as bind variables.
    """
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from " + table + " "
        "where surgery=:surgery and surgeryChest=:surgeryChest"
    )
    cursor.execute(sql, {"surgery": surgery, "surgeryChest": surgeryChest})
    return cursor.fetchall()


def getdata(surgery, surgeryChest):
    """Return the decision-tree training data read from ``menzhenZ``.

    Returns:
        ``(dataset, lables)`` where each ``dataset`` row is a list of the five
        feature values followed by the class label, and ``lables`` holds the
        five feature names in column order.
    """
    rows = _tree_fetch_rows("menzhenZ", surgery, surgeryChest)
    dataset = [list(row[:6]) for row in rows]
    lables = ["呼吸急促", "持续性脉搏加快", "畏寒", "血压降低", "咳血"]
    return dataset, lables


def gettestdata(surgery, surgeryChest):
    """Return the decision-tree test data read from ``testZ``.

    Returns:
        ``(testdataset, testlables)``: a list of five-value feature rows and
        the list of their class labels.
    """
    rows = _tree_fetch_rows("testZ", surgery, surgeryChest)
    testdataset = [list(row[:5]) for row in rows]
    testlables = [row[5] for row in rows]
    return testdataset, testlables


# Entropy computation
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in ``dataSet``.

    Args:
        dataSet: Sequence of rows whose last element is the class label.

    Returns:
        The entropy as a float; ``0.0`` for an empty data set.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1

    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


# Split the data set on a given feature
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Sequence of rows (lists or tuples).
        axis: Index of the column to test and remove.
        value: Value the column must equal for the row to be kept.

    Returns:
        A new list of new list rows; ``dataSet`` itself is not modified.
    """
    # list(...) on both halves lets tuple rows (e.g. raw DB rows) work too;
    # the original ``.extend`` only worked when rows were lists.
    return [
        list(featVec[:axis]) + list(featVec[axis + 1:])
        for featVec in dataSet
        if featVec[axis] == value
    ]


# Choose the best feature
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split gives the largest information gain.

    Args:
        dataSet: Sequence of rows; the last element of each row is the class
            label, the others are feature values.

    Returns:
        The column index of the best feature, or ``-1`` when the data set is
        empty or no feature yields a strictly positive gain.
    """
    if not dataSet:
        return -1

    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        # Weighted entropy of the partitions produced by splitting on column i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            newEntropy += len(subDataSet) / total * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# Majority vote
def majorityCnt(classList):
    """Return the most frequent item of ``classList``.

    On a tie the item that appears first in ``classList`` wins.

    Raises:
        ValueError: If ``classList`` is empty.
    """
    if not classList:
        raise ValueError("classList must not be empty")
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # max() keeps the first key among equal counts (dicts preserve insertion
    # order), matching a stable descending sort.
    return max(classCount, key=classCount.get)


# Build the decision tree
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: Sequence of rows; the last element of each row is the class
            label, the others are feature values.
        labels: Feature names, aligned with the feature columns of
            ``dataSet``. The caller's list is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    Raises:
        ValueError: If ``dataSet`` is empty.
    """
    if not dataSet:
        raise ValueError("dataSet must not be empty")

    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]          # all rows share one class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left to split on

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature improves purity. Without this guard, -1 would index the
        # LAST label and split on the class column itself.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Drop the used feature by position so the names stay aligned with the
    # columns that splitDataSet removes.
    subLabels = list(labels[:bestFeat]) + list(labels[bestFeat + 1:])

    myTree = {bestFeatLabel: {}}
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels[:]
        )
    return myTree


# Classify with the decision tree
def classify(inputTree, featLabels, testVec, default=None):
    """Walk a tree built by ``createTree`` and return the predicted class.

    Args:
        inputTree: Nested-dict tree, or a bare class label when the tree is a
            single leaf.
        featLabels: Feature names aligned with the positions of ``testVec``.
        testVec: Feature values of the sample to classify.
        default: Value returned when the sample carries a feature value that
            has no branch in the tree (previously a ``KeyError``).

    Returns:
        The class label stored at the reached leaf, or ``default``.
    """
    if not isinstance(inputTree, dict):
        return inputTree  # the whole tree is a single leaf

    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    key = testVec[featLabels.index(firstStr)]
    if key not in secondDict:
        # Feature value never seen during training: no branch to follow.
        return default

    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec, default)
    return valueOfFeat


# Run and evaluate the model
def datingClassTest():
    """Train the decision tree, classify the test set and print the metrics.

    Prints each prediction next to the real label, then the error rate,
    the accuracy and the time spent on training plus prediction.

    Raises:
        ValueError: If the test set is empty.
    """
    dataSet, labels = getdata("外科", "胸外科")
    testdataset, testlables = gettestdata("外科", "胸外科")
    test_count = np.shape(testdataset)[0]
    if test_count == 0:
        raise ValueError("test set is empty")

    # Start the clock before building the tree: the message below reports
    # training AND prediction time.
    start = time.time()
    myTree = createTree(dataSet, labels)
    errorCount = 0.0
    for sample, expected in zip(testdataset, testlables):
        classifierResult = classify(myTree, labels, sample)
        print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    end = time.time()

    print("错误率: %.2f%%" % (errorCount / float(test_count) * 100))
    print("准确率: %.2f%%" % ((1.0 - errorCount / float(test_count)) * 100))
    print("训练和预测一共耗时: %.2f 秒" % (end - start))
datingClassTest()

# Build and print a decision tree from the first 600 records.
dataSet, labels = getdata("外科", "胸外科")
dataSet = dataSet[0:600]
# `labels` holds the five feature names, not per-row values, so it is passed
# through unchanged (slicing it to 600 was a no-op).
myTree = createTree(dataSet, labels)
print(myTree)
# Compare K-NN against the decision tree (accuracy bar chart)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.array([10, 12])
y = [85.6, 87.3]
plt.bar(x, y, edgecolor='yellow')
# Print each bar's value just below its top edge.
for i, j in zip(x, y):
    plt.text(i - 0.2, j - 0.2, '%.2f%%' % j)
plt.text(9.7, 40, 'K-NN right race')
plt.text(11.7, 40, 'Tree right race')
plt.show()
#使用神经网络探索数据集
import sys
import os
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf

# NOTE(review): the credentials are hard-coded in the connect string;
# consider loading them from configuration instead.
conn = cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

# One-hot encoding
def onehot(labels, n_class=None):
    """One-hot encode integer class indices.

    Args:
        labels: Sequence of non-negative integer class indices.
        n_class: Number of columns to produce. When omitted it is inferred as
            ``max(labels) + 1``, which yields too few columns if the highest
            class happens to be absent from ``labels``; pass it explicitly
            when a fixed width is required.

    Returns:
        A float array of shape ``(len(labels), n_class)`` with a single 1 per
        row.

    Raises:
        ValueError: If ``labels`` is empty and ``n_class`` is not given.
    """
    labels = np.asarray(labels, dtype=int)
    n_sample = len(labels)
    if n_class is None:
        n_class = int(labels.max()) + 1
    onehot_labels = np.zeros((n_sample, n_class))
    onehot_labels[np.arange(n_sample), labels] = 1
    return onehot_labels


# Dataset loading
def getdata(surgery, surgeryChest):
    """Load the neural-network training set from the ``menzhen`` table.

    Args:
        surgery: Value matched against the ``surgery`` column.
        surgeryChest: Value matched against the ``surgeryChest`` column.

    Returns:
        ``(dataset, labless)``: a float32 array of shape ``(n, 5)`` and a
        one-hot array of shape ``(n, 3)``. Raw label 3 maps to class 0,
        6 to class 1 and anything else to class 2.
    """
    # Filter values go through bind variables instead of string formatting.
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from menzhen where surgery=:surgery and surgeryChest=:surgeryChest"
    )
    cursor.execute(sql, {"surgery": surgery, "surgeryChest": surgeryChest})
    rows = cursor.fetchall()

    dataset = [list(row[:5]) for row in rows]
    lables = [0 if row[5] == 3 else 1 if row[5] == 6 else 2 for row in rows]

    # reshape keeps the (n, 5) layout even when no rows were returned.
    dataset = np.array(dataset).astype(np.float32).reshape(-1, 5)
    lables = np.array(lables, dtype=int)

    # Always emit 3 columns: the network's label placeholder is [None, 3],
    # and a width inferred from the data shrinks when class 2 is absent.
    labless = np.zeros((len(lables), 3))
    labless[np.arange(len(lables)), lables] = 1
    return dataset, labless


# Test dataset loading
def gettestdata(surgery, surgeryChest):
    """Load the neural-network test set from the ``test`` table.

    Args:
        surgery: Value matched against the ``surgery`` column.
        surgeryChest: Value matched against the ``surgeryChest`` column.

    Returns:
        ``(testdataset, testlabless)``: a float32 array of shape ``(n, 5)``
        and a one-hot array of shape ``(n, 3)``. Raw label 3 maps to class 0,
        6 to class 1 and anything else to class 2.
    """
    # Filter values go through bind variables instead of string formatting.
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from test where surgery=:surgery and surgeryChest=:surgeryChest"
    )
    cursor.execute(sql, {"surgery": surgery, "surgeryChest": surgeryChest})
    rows = cursor.fetchall()

    testdataset = [list(row[:5]) for row in rows]
    testlables = [0 if row[5] == 3 else 1 if row[5] == 6 else 2 for row in rows]

    # reshape keeps the (n, 5) layout even when no rows were returned.
    testdataset = np.array(testdataset).astype(np.float32).reshape(-1, 5)
    testlables = np.array(testlables, dtype=int)

    # Always emit 3 columns so the result matches the [None, 3] placeholder.
    testlabless = np.zeros((len(testlables), 3))
    testlabless[np.arange(len(testlables)), testlables] = 1
    return testdataset, testlabless


dataset, labless = getdata("外科", "胸外科")
testdataset, testlables = gettestdata("外科", "胸外科")

# Train on the first 100 samples only.
dataset = dataset[0:100]
labless = labless[0:100]

x_data = tf.placeholder("float32", [None, 5])
y_data = tf.placeholder("float32", [None, 3])

weight = tf.Variable(tf.ones([5, 3]))
bias = tf.Variable(tf.ones([3]))

# Softmax activation.
# Alternative tried: y_model = tf.nn.relu(tf.matmul(x_data, weight) + bias)
y_model = tf.nn.softmax(tf.matmul(x_data, weight) + bias)

# Cross-entropy loss.
# Alternative tried: loss = tf.reduce_sum(tf.pow((y_model - y_data), 2))
# The probabilities are clipped away from 0 so log() cannot produce NaN.
loss = -tf.reduce_sum(y_data * tf.log(tf.clip_by_value(y_model, 1e-10, 1.0)))

# Adam optimizer.
# Alternatives tried:
#   train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)
#   train_step = tf.train.MomentumOptimizer(1e-4,0.9).minimize(loss)
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

# Model evaluation
correct_prediction = tf.equal(tf.argmax(y_model, 1), tf.argmax(y_data, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# tf.initialize_all_variables() is deprecated in favour of this call.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
start = time.time()
batch_size = 100
for _ in range(10):
    # Step by real row offsets; the original used the batch INDEX as the
    # slice start, which only coincides with the offset for the first batch.
    for offset in range(0, len(dataset), batch_size):
        sess.run(train_step, feed_dict={
            x_data: dataset[offset:offset + batch_size, :],
            y_data: labless[offset:offset + batch_size, :],
        })
print("模型准确率", sess.run(accuracy, feed_dict={x_data: testdataset, y_data: testlables}))
end = time.time()
print("模型训练和测试公耗时:%.2f 秒" % (end - start))
sess.close()
#加深一层神经网络
import sys
import os
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf

# NOTE(review): the credentials are hard-coded in the connect string;
# consider loading them from configuration instead.
conn = cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

# One-hot encoding
def onehot(labels, n_class=None):
    """One-hot encode integer class indices.

    Args:
        labels: Sequence of non-negative integer class indices.
        n_class: Number of columns to produce. When omitted it is inferred as
            ``max(labels) + 1``, which yields too few columns if the highest
            class happens to be absent from ``labels``; pass it explicitly
            when a fixed width is required.

    Returns:
        A float array of shape ``(len(labels), n_class)`` with a single 1 per
        row.

    Raises:
        ValueError: If ``labels`` is empty and ``n_class`` is not given.
    """
    labels = np.asarray(labels, dtype=int)
    n_sample = len(labels)
    if n_class is None:
        n_class = int(labels.max()) + 1
    onehot_labels = np.zeros((n_sample, n_class))
    onehot_labels[np.arange(n_sample), labels] = 1
    return onehot_labels


# Dataset loading
def _nn_load_split(table, surgery, surgeryChest):
    """Load features and 3-class one-hot labels from ``table``.

    ``table`` must be a trusted constant (table names cannot be bound); the
    two filter values are passed as bind variables.

    Returns:
        ``(features, onehot_labels)``: a float32 array of shape ``(n, 5)``
        and a one-hot array of shape ``(n, 3)``. Raw label 3 maps to class 0,
        6 to class 1 and anything else to class 2.
    """
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from " + table + " "
        "where surgery=:surgery and surgeryChest=:surgeryChest"
    )
    cursor.execute(sql, {"surgery": surgery, "surgeryChest": surgeryChest})
    rows = cursor.fetchall()

    features = [list(row[:5]) for row in rows]
    classes = [0 if row[5] == 3 else 1 if row[5] == 6 else 2 for row in rows]

    # reshape keeps the (n, 5) layout even when no rows were returned.
    features = np.array(features).astype(np.float32).reshape(-1, 5)
    classes = np.array(classes, dtype=int)

    # Always emit 3 columns: the network's label placeholder is [None, 3],
    # and a width inferred from the data shrinks when class 2 is absent.
    onehot_labels = np.zeros((len(classes), 3))
    onehot_labels[np.arange(len(classes)), classes] = 1
    return features, onehot_labels


def getdata(surgery, surgeryChest):
    """Return the training ``(dataset, labless)`` pair read from ``menzhen``."""
    return _nn_load_split("menzhen", surgery, surgeryChest)


def gettestdata(surgery, surgeryChest):
    """Return the test ``(testdataset, testlabless)`` pair read from ``test``."""
    return _nn_load_split("test", surgery, surgeryChest)


dataset, labless = getdata("外科", "胸外科")
testdataset, testlables = gettestdata("外科", "胸外科")

# Train on the first 100 samples only.
dataset = dataset[0:100]
labless = labless[0:100]

x_data = tf.placeholder("float32", [None, 5])
y_data = tf.placeholder("float32", [None, 3])

weight1 = tf.Variable(tf.ones([5, 20]))
bias1 = tf.Variable(tf.ones([20]))
# NOTE(review): the hidden layer has no activation, so the two layers
# compose into a single linear map before the softmax — confirm intended.
y_model1 = tf.matmul(x_data, weight1) + bias1

# Second (added) layer
weight2 = tf.Variable(tf.ones([20, 3]))
bias2 = tf.Variable(tf.ones([3]))
y_model = tf.nn.softmax(tf.matmul(y_model1, weight2) + bias2)

# Least-squares loss.
# Alternative tried: loss = -tf.reduce_sum(y_data*tf.log(y_model))
loss = tf.reduce_sum(tf.pow((y_model - y_data), 2))

# Adam optimizer.
# Alternatives tried:
#   train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)
#   train_step = tf.train.MomentumOptimizer(1e-4,0.9).minimize(loss)
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

# Model evaluation. This statement had been fused into the comment line
# above it, leaving `correct_prediction` undefined.
correct_prediction = tf.equal(tf.argmax(y_model, 1), tf.argmax(y_data, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# tf.initialize_all_variables() is deprecated in favour of this call.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
start = time.time()
batch_size = 100
for _ in range(10):
    # Step by real row offsets; the original used the batch INDEX as the
    # slice start, which only coincides with the offset for the first batch.
    for offset in range(0, len(dataset), batch_size):
        sess.run(train_step, feed_dict={
            x_data: dataset[offset:offset + batch_size, :],
            y_data: labless[offset:offset + batch_size, :],
        })
print("模型准确率", sess.run(accuracy, feed_dict={x_data: testdataset, y_data: testlables}))
end = time.time()
print("模型训练和测试公耗时:%.2f 秒" % (end - start))
sess.close()
# Compare the decision tree against the neural network
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.array([10, 12])
y = [87.1, 87.4]
plt.bar(x, y, edgecolor="yellow")
# Print each bar's value just below its top edge.
for i, j in zip(x, y):
    plt.text(i - 0.2, j - 0.2, "%.2f%%" % j)
plt.text(9.7, 40, "Tree right race")
plt.text(11.7, 40, "Net right race")
# Overlay two points (0.05 and 0.36) joined by a line.
plt.scatter([9.7, 11.7], [0.05, 0.36], c="r")
plt.plot([9.7, 11.7], [0.05, 0.36], c="g")
plt.show()
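# Summary of the results obtained with each algorithm
#
# K-NN:
#   For K in [3, 5, 7, 9, 11, 13], respectively:
#   Accuracy (%): [85.6, 72.6, 60.0, 47.4, 34.8, 22.299999999999996]
#   Total time:   [1554119134.435363, 1554119136.6192698,
#                  1554119138.846019, 1554119141.2507513, 1554119143.4782736, 1554119145.5415804]
#   Note: the "total time" values are the raw time.time() readings taken when
#   each K finished (Unix timestamps), not durations. The accuracy figures
#   come from a run whose error counter was not reset between K values, so
#   each figure includes the errors of all smaller K values.
#
# Decision tree:
#   Accuracy: 87.10%
#   Training and prediction took 0.05 s in total
#
# Neural-network designs:
#    1  least squares + softmax + GradientDescentOptimizer
#       accuracy 0.874, training and testing took 0.16 s
#    2  least squares + softmax + AdamOptimizer
#       accuracy 0.874, training and testing took 0.19 s
#    3  least squares + softmax + MomentumOptimizer
#       accuracy 0.874, training and testing took 0.18 s
#    4  least squares + relu + GradientDescentOptimizer
#       accuracy 0.874, training and testing took 0.17 s
#    5  least squares + relu + AdamOptimizer
#       accuracy 0.874, training and testing took 0.15 s
#    6  least squares + relu + MomentumOptimizer
#       accuracy 0.006, training and testing took 0.19 s
#    7  cross-entropy + softmax + GradientDescentOptimizer
#       accuracy 0.874, training and testing took 0.09 s
#    8  cross-entropy + softmax + AdamOptimizer
#       accuracy 0.874, training and testing took 0.08 s
#    9  cross-entropy + softmax + MomentumOptimizer
#       accuracy 0.874, training and testing took 0.06 s
#   10  cross-entropy + relu + GradientDescentOptimizer
#       accuracy 0.874, training and testing took 0.08 s
#   11  cross-entropy + relu + AdamOptimizer
#       accuracy 0.874, training and testing took 0.08 s
#   12  cross-entropy + relu + MomentumOptimizer
#       accuracy 0.874, training and testing took 0.09 s
#
# Of the 12 network designs above, the least squares + relu +
# MomentumOptimizer model reaches an accuracy of only 0.006, so that design
# is a failure.
a = [0.874]*10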
print(a)

# Accuracy-to-time ratio for each successful neural-network design
# (designs 1-5 and 7-12; design 6 is excluded as a failure).
a = [0.874] * 11
# Durations in seconds, in design order. The tenth entry (design 11) is 0.08,
# matching the run output recorded in the results summary; it had been
# entered as 0.09.
b = [0.16, 0.19, 0.18, 0.17, 0.15, 0.09, 0.08, 0.06, 0.08, 0.08, 0.09]
c = [accuracy / seconds for accuracy, seconds in zip(a, b)]
for ratio in c:
    print("准确率与耗时的比值:%.4f" % ratio)
# K-NN
# Pie chart of the accuracy obtained for K = 3, 5, 7, 9, 11, 13
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

acc = [85.6, 72.6, 60.0, 47.4, 34.8, 22.2]
labels = ['K-3', 'K-5', 'K-7', 'K-9', 'K-11', 'K-13']
plt.pie(acc, labels=labels, shadow=True, startangle=90, autopct='%1.4f%%')
plt.axis('equal')
plt.title('K-NN', fontsize=25)
plt.show()
# Scatter plot of the K-NN time values
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

x = np.array([1, 2, 3, 4, 5, 6])
# NOTE(review): these values have the magnitude of Unix timestamps rather
# than durations — confirm what was meant to be plotted.
z = np.array([1554119134.435363, 1554119136.6192698, 1554119138.846019,
              1554119141.2507513, 1554119143.4782736, 1554119145.5415804])
plt.scatter(x, z, c='g')
plt.xticks(x + 0.4, ['KNN-1', 'KNN-2', 'KNN-3', 'KNN-4', 'KNN-5', 'KNN-6'])
plt.show()
# Time taken by each successful neural-network design
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
# Durations in seconds for designs 1-5 and 7-12. The tenth entry (design 11)
# is 0.08, matching the run output recorded in the results summary; it had
# been entered as 0.09.
z = np.array([0.16, 0.19, 0.18, 0.17, 0.15, 0.09, 0.08, 0.06, 0.08, 0.08, 0.09])
plt.scatter(x, z, c='r')
plt.xticks(x + 0.4, ['NET-1', 'NET-2', 'NET-3', 'NET-4', 'NET-5',
                     'NET-6', 'NET-7', 'NET-8', 'NET-9', 'NET-10', 'NET-11'])
plt.show()
# Compare K-NN, the decision tree and the neural networks
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Accuracies in percent, one per label below. NET-1 is 87.4 like the other
# networks; it had been entered as the fraction 0.874, which made its slice
# about a hundred times too small.
acc = [85.6, 72.6, 60.0, 47.4, 34.8, 22.2, 87.10, 87.4,
       87.4, 87.4, 87.4, 87.4, 87.4, 87.4, 87.4, 87.4, 87.4, 87.4]
labels = ['K-3', 'K-5', 'K-7', 'K-9', 'K-11', 'K-13', 'TREE',
          'NET-1', 'NET-2', 'NET-3', 'NET-4', 'NET-5', 'NET-6', 'NET-7',
          'NET-8', 'NET-9', 'NET-10', 'NET-11']
# Pull the NET-8 slice out of the pie.
explode = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 0]
plt.pie(acc, labels=labels, explode=explode, shadow=True, startangle=90, autopct='%1.4f%%')
plt.axis('equal')
plt.title('K-NN AND TREE AND NET', fontsize=25)
plt.show()