Machine Learning in Action (Chapter 5: Logistic Regression, All Code with Detailed Comments and the Related Data Files, Python 3.7)

Date: 2023-02-24 07:56:53



This post adds visualization code that the book does not provide, which clearly shows how each weight evolves over the iterations.

This is one of the most basic algorithms, so its accuracy is not high, but the underlying principles are well worth studying. Theory that I used to forget right after reading now sticks firmly once I have put it into practice!

So, let the main content begin! (All of my code runs as-is, provided the environment is set up correctly.)

1. logRegres01_gradient_ascent.py

'''
Logistic regression: gradient ascent optimization algorithms.
Key pitfall: converting between matrices and arrays. Matrices are convenient
for ordinary matrix multiplication, while arrays operate element-wise, so a
careless conversion easily introduces bugs (a small demo follows this listing).
'''
from numpy import *
import matplotlib.pyplot as plt


# Prepare the data
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('data/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # prepend x0 = 1.0
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

# Analyze the data (labelMat is now passed in explicitly instead of being read as a global)
def analyseData(dataMat, labelMat):
    dataArr = array(dataMat)
    num = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []  # coordinates of the points labeled 1
    xcord0 = []
    ycord0 = []  # coordinates of the points labeled 0
    for i in range(num):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord0.append(dataArr[i, 1])
            ycord0.append(dataArr[i, 2])
    fig = plt.figure()
    fig.set_size_inches(18.5, 18.5)
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord0, ycord0, s=30, c='green')
    # plt.savefig('01_data_analysis')
    plt.show()

# Define the sigmoid function
def sigmoid(x):
    # Handle scalars and arrays/matrices separately
    if isinstance(x, (float, int)):
        # The piecewise form avoids "RuntimeWarning: overflow encountered in exp";
        # the two expressions are mathematically equivalent
        if x >= 0:
            return 1.0 / (1.0 + exp(-x))  # keeps exp(-x) from overflowing
        else:
            return exp(x) / (1 + exp(x))  # keeps exp(x) from overflowing
    else:
        # Element-wise piecewise version. (The original test `x.any() >= 0` is
        # always true, so it did not actually prevent overflow.)
        z = exp(-abs(x))
        return where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

# Train the model: batch gradient ascent
def gradAscent(dataMatIn, classLabels, numIter=150):
    # weightIndex / weightTracks record how the weights evolve across iterations
    weightIndex = 0
    weightTracks = {}

    dataMatrix = mat(dataMatIn)              # mat converts the array into a matrix
    labelMat = mat(classLabels).transpose()  # transpose into a column vector
    m, n = shape(dataMatrix)                 # m = number of samples (100), n = number of features (3)
    alpha = 0.001                            # step size
    weights = ones((n, 1))                   # 3x1 column vector

    weightTracks[weightIndex] = array(weights).flatten()  # store a flat snapshot for plotting

    '''
    Each matrix product below costs m * n = 300 multiplications, so the loop
    performs on the order of 300 * numIter multiplications in total; the cost
    rises sharply as the data grows. Remedy: use stochastic gradient ascent.
    '''
    for k in range(numIter):
        h = sigmoid(dataMatrix * weights)  # predicted probabilities: (100x3) * (3x1) = (100x1)
        error = (labelMat - h)             # actual minus predicted: (100x1); labelMat holds the true labels
        # The gradient of the log-likelihood is dataMatrix^T * error,
        # so this is one ascent step of size alpha along that gradient
        weights = weights + alpha * dataMatrix.transpose() * error

        weightIndex = weightIndex + 1
        weightTracks[weightIndex] = array(weights).flatten()

    return array(weights), weightIndex, weightTracks  # array(weights): shape (3, 1)


'''
Improved training model: stochastic gradient ascent.
Updates the regression coefficients using only one sample at a time.
Because the classifier can be updated incrementally as new samples arrive,
stochastic gradient ascent is an online learning algorithm. Its counterpart
is "batch processing", which processes the whole data set on every update.
'''


def stocGradAscent0(dataMatrix, classLabels):
    # weightIndex / weightTracks record how the weights evolve across iterations
    weightIndex = 0
    weightTracks = {}

    m, n = shape(dataMatrix)     # m = 100, n = 3
    dataArr = array(dataMatrix)
    alpha = 0.01
    weights = ones(n)            # 1-D array of length 3

    weightTracks[weightIndex] = weights

    for i in range(m):
        h = sigmoid(sum(dataArr[i] * weights))  # array * array multiplies element-wise: (3,) * (3,) = (3,)
        error = classLabels[i] - h
        weights = weights + alpha * error * dataArr[i]

        weightIndex = weightIndex + 1
        weightTracks[weightIndex] = weights

    return weights, weightIndex, weightTracks


'''
Further improved stochastic gradient ascent:
alpha is adjusted on every iteration, which damps the high-frequency
oscillation of the coefficients. alpha keeps shrinking as the iterations
proceed but never reaches 0, which guarantees that new data still has some
influence after many iterations. If the problem being modeled is dynamic, the
constant term in the alpha formula can be enlarged so that new samples produce
larger coefficient updates.
When j << max(i), alpha is not strictly decreasing; avoiding strictly
decreasing parameters is also common in other optimization algorithms such as
simulated annealing.
'''


def stocGradAscent1(dataMatrix, classLabels, numIter=150):

    weightIndex = 0
    weightTracks = {}

    dataArr = array(dataMatrix)
    m, n = shape(dataMatrix)  # m = 100, n = 3
    weights = ones(n)         # 1-D array of length 3

    weightTracks[weightIndex] = weights

    # Improvement 1: iterate over the whole data set numIter = 150 times instead of once
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # Improvement 2: adjust alpha on the fly
            alpha = 4 / (1.0 + j + i) + 0.01
            # Improvement 3: pick samples at random to reduce periodic fluctuation,
            # deleting each index once it has been used. (Looking the sample up
            # through dataIndex, rather than using randIndex directly, is what
            # makes the deletion meaningful: each sample is drawn at most once per pass.)
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataArr[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataArr[sampleIndex]
            del(dataIndex[randIndex])

            weightIndex = weightIndex + 1
            weightTracks[weightIndex] = weights

    return weights, weightIndex, weightTracks


# Analyze the data and plot the decision boundary
def plotBestFit(dataMat, labelMat, weights, weightIndex, weightTracks):
    dataArr = array(dataMat)
    num = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []  # coordinates of the points labeled 1
    xcord0 = []
    ycord0 = []  # coordinates of the points labeled 0
    for i in range(num):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord0.append(dataArr[i, 1])
            ycord0.append(dataArr[i, 2])
    fig = plt.figure()
    fig.set_size_inches(18.5, 18.5)
    ax = []
    ax.append(fig.add_subplot(221))
    ax[0].scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax[0].scatter(xcord0, ycord0, s=30, c='green')
    x = array(arange(-3.0, 3.0, 0.1))
    '''
    z = w0*x0 + w1*x1 + w2*x2. The boundary between the two classes lies where
    z = 0, i.e. where the sigmoid equals 0.5. Setting z = 0 with x0 = 1, and
    taking x1 as the horizontal axis and x2 as the vertical axis, gives:
    y = (-w0 - w1 * x) / w2
    '''
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax[0].plot(x, y)
    plt.xlabel('x1')
    plt.ylabel('x2')

    # Plot how each regression coefficient changes over the iterations
    x = []
    y = [[], [], []]
    for i in range(weightIndex + 1):  # + 1 so the final weights are included
        x.append(i)
        y[0].append(weightTracks[i][0])
        y[1].append(weightTracks[i][1])
        y[2].append(weightTracks[i][2])
    for i in range(1, 4):
        ax.append(fig.add_subplot(2, 2, i + 1))
        ax[i].title.set_text('feature%d_weights' % i)
        ax[i].plot(x, y[i - 1])
        plt.xlabel('iter')
        plt.ylabel('number')

    # plt.savefig('01_gradient_ascent_results')
    # plt.savefig('02_stochastic_gradient_ascent_results')
    # plt.savefig('03_improved_stochastic_gradient_ascent_results')
    plt.show()

if __name__ == "__main__":

    # Prepare the data
    dataMat, labelMat = loadDataSet()
    # Analyze the data
    # analyseData(dataMat, labelMat)

    # Regression coefficients trained with batch gradient ascent
    weights, weightIndex, weightTracks = gradAscent(dataMat, labelMat)
    print(weights)
    plotBestFit(dataMat, labelMat, weights, weightIndex, weightTracks)

    # Coefficients trained with plain stochastic gradient ascent; it makes only
    # one pass over the data, so the fit is noticeably worse
    # weights, weightIndex, weightTracks = stocGradAscent0(dataMat, labelMat)
    # print(weights)
    # plotBestFit(dataMat, labelMat, weights, weightIndex, weightTracks)

    # Coefficients trained with the improved stochastic gradient ascent
    # weights, weightIndex, weightTracks = stocGradAscent1(dataMat, labelMat)
    # print(weights)
    # plotBestFit(dataMat, labelMat, weights, weightIndex, weightTracks)
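
As a quick standalone sanity check of the two pitfalls noted in the docstring at the top of this file, matrix vs. array multiplication semantics and the overflow-safe sigmoid, here is a small demo of my own (not part of the book's code):

from numpy import mat, array, exp, where

A = array([[1.0, 2.0], [3.0, 4.0]])
w = array([0.5, 0.5])

# Arrays multiply element-wise: w scales each row of A
print(A * w)                        # [[0.5 1. ] [1.5 2. ]]

# Matrices use true matrix multiplication: (2x2) * (2x1) = (2x1)
print(mat(A) * mat(w).transpose())  # [[1.5] [3.5]]

# The piecewise sigmoid stays silent even on extreme inputs
x = array([-1000.0, 0.0, 1000.0])
z = exp(-abs(x))
print(where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z)))  # [0.  0.5 1. ]

Arrays broadcast element-wise while matrices perform true linear-algebra products; the code above relies on both behaviors, which is why the conversions matter.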

A few result figures follow.

Initial data distribution:

(figure)

Gradient ascent:

(figure)

Stochastic gradient ascent:

(figure)

Improved stochastic gradient ascent:

(figure)

2. logRegres02_livability.py

'''
Use logistic regression with stochastic gradient ascent to predict whether a
horse suffering from colic (severe abdominal pain) will survive.
1. Collect the data
2. Prepare the data: handle the missing values in the data set
   Options: (1) fill missing values with the mean of the available values of that feature
            (2) fill missing values with a special value such as -1
            (3) discard samples that contain missing values
            (4) fill missing values with the mean of similar samples
            (5) predict missing values with another machine learning algorithm
   (a small sketch of option (1) follows this listing)
3. Analyze the data: visualize and inspect it
4. Train the algorithm
5. Test the algorithm
6. Use the algorithm
'''
from logRegres01_gradient_ascent import *


def classifyVector(inX, weights):
    # Classify as 1 if the predicted probability exceeds 0.5, otherwise as 0
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0


def colicTest():
    frTrain = open('data/horseColicTraining.txt')
    frTest = open('data/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        # For each line, the first 21 values are features (appended to trainingSet);
        # the last value is the label (appended to trainingLabels)
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[len(currLine) - 1]))
    trainingWeights, weightIndex, weightTracks = stocGradAscent1(trainingSet, trainingLabels, 500)
    # print(weightIndex)
    errorCount = 0
    testSampleNum = 0.0  # number of test samples
    for line in frTest.readlines():
        testSampleNum += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):
            lineArr.append(float(currLine[i]))
        # float() first so a label stored as e.g. '1.0' also parses
        if int(classifyVector(array(lineArr), trainingWeights)) != int(float(currLine[len(currLine) - 1])):
            errorCount += 1
    errorRate = float(errorCount) / float(testSampleNum)
    print("the error rate of this test is", errorRate)
    return errorRate


if __name__ == "__main__":

    testTimes = 10
    errorSum = 0.0
    for k in range(testTimes):
        errorSum += colicTest()
    averageRate = errorSum / float(testTimes)
    print("the average error rate is", averageRate)

Result output:

(figure)