机器学习实战_朴素贝叶斯分类器_预测疾病

import numpy as np

"""
函数说明：创建实验样本
Parameters：无
Returns：
    postingList - 实验样本切分的词条
    classList - 类别标签
Modify：
    2019-03-23
"""


def loadDataSet():
    """Build the hard-coded toy training set and echo it to stdout.

    Returns:
        tuple: ``(postingList, classList)`` where ``postingList`` is a list
        of ``[symptom, occupation]`` token pairs and ``classList`` holds the
        disease label of the pair at the same position.
    """
    # One record per patient: (symptom, occupation, diagnosed disease).
    records = (
        ("打喷嚏", "护士", "感冒"),
        ("打喷嚏", "农夫", "过敏"),
        ("头痛", "建筑工人", "脑震荡"),
        ("头痛", "建筑工人", "感冒"),
        ("打喷嚏", "教师", "感冒"),
        ("头痛", "教师", "脑震荡"),
    )
    postingList = [[symptom, job] for symptom, job, _ in records]
    classList = [disease for _, _, disease in records]

    print("词条集合：\n", np.array(postingList))
    print("标签集合：\n", classList)
    return postingList, classList


"""
函数说明：生成“词汇表”
Parameters：
    postingList - 实验样本划分的词条
    classList - 标签
Returns：
    vocabulary - 词汇表
    labels - 标签表
Modify:
    2019-03-23
"""


def _unique_in_order(items):
    """Return the distinct elements of *items* in first-seen order."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def createVocabulary(postingList, classList):
    """Collect the distinct tokens and distinct labels of the training set.

    Both result lists keep first-occurrence order, so a token's position in
    ``vocabulary`` and a label's position in ``labels`` can be used as
    stable indices. Membership is tracked with a set, which keeps the
    de-duplication linear instead of re-scanning the result list for every
    element; tokens and labels must therefore be hashable.

    Parameters:
        postingList: iterable of token sequences (one sequence per sample).
        classList: iterable of class labels.

    Returns:
        tuple: ``(vocabulary, labels)`` as two lists.
    """
    vocabulary = _unique_in_order(
        word for words in postingList for word in words
    )
    print("字典：\n", vocabulary)

    labels = _unique_in_order(classList)
    print("标签表：\n", labels)
    return vocabulary, labels


"""
函数说明：词条向量化、标签向量化
Parameters：
    postingList - 划分好的词条集合
    classList - 标签集合
    vocabulary - 词汇表
    labels - 标签表
Returns：
    postingVec - 向量化词条
    classVec - 向量化标签
Modify：
"""


def vector(postingList, classList, vocabulary, labels):
    """Turn token lists into 0/1 presence vectors and labels into indices.

    Parameters:
        postingList: token sequences, one per sample.
        classList: class label of each sample.
        vocabulary: list of known tokens; a token's position is its column.
        labels: list of known labels; a label's position is its index.

    Returns:
        tuple: ``(postingVecSet, classVec)`` where each row of
        ``postingVecSet`` has ``len(vocabulary)`` entries set to 1 for every
        token present in the sample, and ``classVec`` holds the position of
        each label in ``labels``.

    Tokens missing from ``vocabulary`` are ignored; a label missing from
    ``labels`` raises ``ValueError`` from ``list.index``.
    """
    columns = range(len(vocabulary))
    postingVecSet = []
    for tokens in postingList:
        # Columns switched on by this sample (unknown tokens are skipped).
        active = {vocabulary.index(tok) for tok in tokens if tok in vocabulary}
        postingVecSet.append([1 if col in active else 0 for col in columns])

    classVec = [labels.index(name) for name in classList]

    print("向量化词条；\n", np.array(postingVecSet))
    print("向量化标签：\n", classVec)
    return postingVecSet, classVec


"""
函数说明：朴素贝叶斯分类器训练器
Parameters：
    postingVecSet - 向量化词条
    classVec - 向量化标签
Returns（实际返回顺序为 PA, PB, PC, P0Vector, P1Vector, P2Vector）：
    P0Vector - [ log P(词条| 感冒 ) ]，加一平滑后取自然对数
    P1Vector - [ log P(词条| 过敏 ) ]，加一平滑后取自然对数
    P2Vector - [ log P(词条|脑震荡) ]，加一平滑后取自然对数
    PA - P( 感冒 )
    PB - P( 过敏 )
    PC - P(脑震荡)
Modify：
    2019-03-23
"""


def train(postingVecSet, classVec):
    """Estimate class priors and smoothed per-token log-likelihoods.

    Labels are bucketed as 0, 1, or "anything else" (counted as class 2).
    Token counts start at 1 and each class denominator starts at 2.0, so no
    likelihood is ever zero before the logarithm is taken.

    Parameters:
        postingVecSet: list of equal-length 0/1 presence vectors.
        classVec: class index of each vector.

    Returns:
        tuple: ``(PA, PB, PC, P0Vector, P1Vector, P2Vector)`` - the three
        class priors followed by one NumPy array per class holding
        ``log(token_count / class_total)``.
    """

    def bucket_of(label):
        # Mirror the 0 / 1 / everything-else split used for counting.
        if label == 0:
            return 0
        if label == 1:
            return 1
        return 2

    # Class priors: relative frequency of each bucket.
    tallies = [0, 0, 0]
    for label in classVec:
        tallies[bucket_of(label)] += 1
    sample_total = float(len(classVec))
    PA, PB, PC = [count / sample_total for count in tallies]
    print("感冒概率：", PA, "过敏概率：", PB, "脑震荡概率：", PC)

    # Smoothed token counts and totals, one slot per class.
    width = len(postingVecSet[0])
    numerators = [np.ones(width) for _ in range(3)]
    denominators = [2.0, 2.0, 2.0]
    for position, label in enumerate(classVec):
        row = postingVecSet[position]
        slot = bucket_of(label)
        numerators[slot] += row
        denominators[slot] += sum(row)

    P0Vector, P1Vector, P2Vector = [
        np.log(counts / total)
        for counts, total in zip(numerators, denominators)
    ]
    print("P0Vector:", P0Vector)
    print("P1Vector:", P1Vector)
    print("P2Vector:", P2Vector)
    return PA, PB, PC, P0Vector, P1Vector, P2Vector


"""
函数说明：使用分类器进行分类
Parameters：
    test - 测试用例（向量）
    P0Vector - [ log P(词条| 感冒 ) ]，训练得到的对数条件概率向量
    P1Vector - [ log P(词条| 过敏 ) ]，训练得到的对数条件概率向量
    P2Vector - [ log P(词条|脑震荡) ]，训练得到的对数条件概率向量
    PA - P( 感冒 )
    PB - P( 过敏 )
    PC - P(脑震荡)
Returns：
    三个类别对数得分（对数先验 + 测试向量中出现词条的对数条件概率之和）中的最大值；
    最可能的疾病名称仅通过 print 输出，不作为返回值
    类别编号对应关系：0 - 感冒，1 - 过敏，2 - 脑震荡
Modify：
    2019-03-23
"""


def classification(test, P0Vector, P1Vector, P2Vector, PA, PB, PC):
    """Score a test vector against the three classes and report the winner.

    Each class score is the sum of that class's log-likelihood entries
    selected by ``test`` plus the log of the class prior. The scores and
    the most likely disease are printed; on a tie the earlier class wins.

    Parameters:
        test: 0/1 presence vector aligned with the likelihood vectors.
        P0Vector, P1Vector, P2Vector: per-class log-likelihood arrays.
        PA, PB, PC: class priors matching the three vectors.

    Returns:
        The largest of the three log scores (not the class index).
    """
    score_cold, score_allergy, score_concussion = [
        sum(weights * test) + np.log(prior)
        for weights, prior in (
            (P0Vector, PA),
            (P1Vector, PB),
            (P2Vector, PC),
        )
    ]
    print("感冒的概率：", score_cold, "过敏的概率：", score_allergy, "脑震荡的概率：", score_concussion)

    best = max(score_cold, score_allergy, score_concussion)
    if best == score_cold:
        verdict = "最可能的疾病：感冒"
    elif best == score_allergy:
        verdict = "最可能的疾病：过敏"
    else:
        verdict = "最可能的疾病：脑震荡"
    print(verdict)
    return best


if __name__ == '__main__':
    # Pipeline: load samples -> build vocabulary -> vectorize -> train.
    postingList, classList = loadDataSet()
    vocabulary, labels = createVocabulary(postingList, classList)
    postingVecSet, classVec = vector(postingList, classList, vocabulary, labels)
    PA, PB, PC, P0Vector, P1Vector, P2Vector = train(postingVecSet, classVec)

    # Classify every (symptom, occupation) combination, symptom-major.
    symptoms = ['打喷嚏', '头痛']
    occupations = ['护士', '农夫', '建筑工人', '教师']
    combos = [(s, o) for s in symptoms for o in occupations]
    for zhengZhuang, zhiYe in combos:
        # Presence vector with only the two query tokens switched on.
        test = np.zeros(len(vocabulary))
        test[vocabulary.index(zhengZhuang)] = 1
        test[vocabulary.index(zhiYe)] = 1
        print("症状：", zhengZhuang, "职业：", zhiYe)
        classification(test, P0Vector, P1Vector, P2Vector, PA, PB, PC)
        print("\n")

运行结果：

D:\PyCharm\Projects\MachineLearning\venv\Scripts\python.exe D:/PyCharm/Projects/MachineLearning/BeiYesi_YiYuan.py
词条集合：
 [['打喷嚏' '护士']
 ['打喷嚏' '农夫']
 ['头痛' '建筑工人']
 ['头痛' '建筑工人']
 ['打喷嚏' '教师']
 ['头痛' '教师']]
标签集合：
 ['感冒', '过敏', '脑震荡', '感冒', '感冒', '脑震荡']
字典：
 ['打喷嚏', '护士', '农夫', '头痛', '建筑工人', '教师']
标签表：
 ['感冒', '过敏', '脑震荡']
向量化词条；
 [[1 1 0 0 0 0]
 [1 0 1 0 0 0]
 [0 0 0 1 1 0]
 [0 0 0 1 1 0]
 [1 0 0 0 0 1]
 [0 0 0 1 0 1]]
向量化标签：
 [0, 1, 2, 0, 0, 2]
感冒概率： 0.5 过敏概率： 0.16666666666666666 脑震荡概率： 0.3333333333333333
P0Vector: [-0.98082925 -1.38629436 -2.07944154 -1.38629436 -1.38629436 -1.38629436]
P1Vector: [-0.69314718 -1.38629436 -0.69314718 -1.38629436 -1.38629436 -1.38629436]
P2Vector: [-1.79175947 -1.79175947 -1.79175947 -0.69314718 -1.09861229 -1.09861229]
症状： 打喷嚏 职业： 护士
感冒的概率： -3.0602707946915624 过敏的概率： -3.8712010109078907 脑震荡的概率： -4.68213122712422
最可能的疾病：感冒


症状： 打喷嚏 职业： 农夫
感冒的概率： -3.7534179752515073 过敏的概率： -3.1780538303479453 脑震荡的概率： -4.68213122712422
最可能的疾病：过敏


症状： 打喷嚏 职业： 建筑工人
感冒的概率： -3.0602707946915624 过敏的概率： -3.8712010109078907 脑震荡的概率： -3.9889840465642745
最可能的疾病：感冒


症状： 打喷嚏 职业： 教师
感冒的概率： -3.0602707946915624 过敏的概率： -3.8712010109078907 脑震荡的概率： -3.9889840465642745
最可能的疾病：感冒


症状： 头痛 职业： 护士
感冒的概率： -3.4657359027997265 过敏的概率： -4.564348191467836 脑震荡的概率： -3.58351893845611
最可能的疾病：感冒


症状： 头痛 职业： 农夫
感冒的概率： -4.1588830833596715 过敏的概率： -3.8712010109078907 脑震荡的概率： -3.58351893845611
最可能的疾病：脑震荡


症状： 头痛 职业： 建筑工人
感冒的概率： -3.4657359027997265 过敏的概率： -4.564348191467836 脑震荡的概率： -2.8903717578961645
最可能的疾病：脑震荡


症状： 头痛 职业： 教师
感冒的概率： -3.4657359027997265 过敏的概率： -4.564348191467836 脑震荡的概率： -2.8903717578961645
最可能的疾病：脑震荡



Process finished with exit code 0

秒客网

机器学习实战_朴素贝叶斯分类器_预测疾病

相关文章