Python入门经典学习1-乳腺癌分类问题

时间:2022-12-08 08:13:50

基于肿瘤特征判定是恶性肿瘤还是良性肿瘤:通过研究699个患者的肿瘤属性,找到肿瘤预测模式,再根据肿瘤属性来判定肿瘤性质;对没有见过面的患者,同样根据其属性来判定是否为恶性肿瘤。

用到的数据:链接:http://pan.baidu.com/s/1c26Dbjy 密码:gllb

###########################################
# 分类器:肿瘤良性还是恶性
###########################################



###########################################
# 读入数据集,并得到元组列表
###########################################
def ReadSet(FileName):
    """Read a comma-separated patient file into a list of tuples.

    Every usable line must hold exactly 11 comma-separated fields: a
    patient id, nine integer attributes and a diagnosis code.  A diagnosis
    code of '4' is labelled 'm'; any other code is labelled 'b'.

    Blank lines and lines containing '?' (missing values) are skipped.

    Args:
        FileName: path of the data file to read.

    Returns:
        A list of tuples ``(id, label, a1, ..., a9)`` where ``id`` is kept
        as a string, ``label`` is 'm' or 'b', and ``a1``..``a9`` are ints.

    Raises:
        ValueError: if a usable line does not have exactly 11 fields or an
            attribute is not an integer.
    """
    TrainSet = []
    # The context manager guarantees the file is closed, even on error.
    with open(FileName) as data_file:
        for line_number, line in enumerate(data_file, 1):
            line = line.strip()  # drop the trailing '\n' and padding
            # Skip empty lines (e.g. a trailing newline at end of file)
            # and records with missing values marked by '?'.
            if not line or '?' in line:
                continue
            fields = line.split(',')
            if len(fields) != 11:
                raise ValueError(
                    "line %d of %s: expected 11 comma-separated fields, got %d"
                    % (line_number, FileName, len(fields)))
            patient_id = fields[0]
            diag = fields[10]
            diagMorB = 'm' if diag == '4' else 'b'
            attributes = tuple(int(value) for value in fields[1:10])
            TrainSet.append((patient_id, diagMorB) + attributes)
    return TrainSet
###########################################
# 训练分类器
###########################################
def sumLists(list1, list2):
    """Return a new 9-element list of the pairwise sums of two sequences.

    Only the first nine positions of ``list1`` and ``list2`` are read;
    neither input is modified.
    """
    return [list1[pos] + list2[pos] for pos in range(9)]

def makeAverages(listofsums, total):
    """Return a 9-element list with each of the first nine sums divided by total.

    ``total`` is converted to float before dividing, so the results are
    always floats.  A ``total`` of zero raises ZeroDivisionError.
    """
    return [listofsums[pos] / float(total) for pos in range(9)]

def Classifier(TrainSet):
    """Build the per-attribute decision thresholds from labelled records.

    Each record is a tuple whose element 1 is the label and whose elements
    from index 2 onward are the nine attribute values.  Records labelled
    'b' are accumulated as benign; every other label is accumulated as
    malignant.

    Returns:
        A list of nine floats: for each attribute, the midpoint between the
        benign average and the malignant average.
    """
    benign_totals, benign_n = [0] * 9, 0
    malignant_totals, malignant_n = [0] * 9, 0

    for record in TrainSet:
        is_benign = record[1] == 'b'
        if is_benign:
            benign_totals = sumLists(benign_totals, record[2:])
            benign_n += 1
        else:
            malignant_totals = sumLists(malignant_totals, record[2:])
            malignant_n += 1

    benign_means = makeAverages(benign_totals, benign_n)
    malignant_means = makeAverages(malignant_totals, malignant_n)

    # The threshold for each attribute is the mean of the two class means.
    combined_means = sumLists(benign_means, malignant_means)
    return makeAverages(combined_means, 2)
###########################################
# 测试分类器
###########################################
def Test(TestSet, classifier):
    """Score every patient record against the classifier thresholds.

    For each record, each of the nine attributes (stored from index 2
    onward) casts one vote: malignant when it is strictly greater than the
    matching threshold, benign otherwise.

    Returns:
        A list of tuples ``(id, benign_votes, malignant_votes, label)``,
        where ``id`` and ``label`` are elements 0 and 1 of the record.
    """
    results = []
    for patient in TestSet:
        # Attribute values start two positions after the id and the label.
        malignant_votes = sum(
            1 for pos in range(9) if patient[pos + 2] > classifier[pos]
        )
        benign_votes = 9 - malignant_votes
        results.append((patient[0], benign_votes, malignant_votes, patient[1]))
    return results
###########################################
# 格式化输出测试结果
###########################################
def ShowResult(Result):
    """Print how many patients were tested and how many were misclassified.

    Each entry of ``Result`` holds the benign vote count at index 1, the
    malignant vote count at index 2 and the true label at index 3.  An
    entry is predicted benign when it has strictly more benign votes than
    malignant ones, and malignant otherwise.
    """
    total = 0
    mistakes = 0

    for row in Result:
        total += 1
        predicted_benign = row[1] > row[2]
        # The true label that contradicts the prediction.
        contradicting_label = 'm' if predicted_benign else 'b'
        if row[3] == contradicting_label:
            mistakes += 1
    print("%d patients,there were %d wrong" %(total,mistakes))
###########################################
# 主函数
###########################################
def main(TrainFileName="C:\\Python36\\code\\RuXian\\fullTrainData.txt",
         TestFileName="C:\\Python36\\code\\RuXian\\fullTestData.txt"):
    """Train the classifier on one file, test it on another, print the outcome.

    Args:
        TrainFileName: path of the training data file.  Defaults to the
            location the script was originally written for.
        TestFileName: path of the test data file.  Defaults to the
            location the script was originally written for.
    """
    print("Reading in train data ...")
    TrainSet = ReadSet(TrainFileName)
    print("Read TrainSet Done!")

    print("Begin Training...")
    classifier = Classifier(TrainSet)
    print("Train Classifier Done!")

    print("Reading in test data ...")
    TestSet = ReadSet(TestFileName)
    print("Read TestSet Done!")

    print("Begin Testing...")
    Result = Test(TestSet, classifier)
    print("Test Done!")

    ShowResult(Result)
    print ("program finished.\n")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
参考:《Python入门经典学习书》