逻辑回归的分布式实现 [Logistic Regression / Machine Learning / Spark]

时间:2022-06-21 00:48:03

1- 问题提出


2- 逻辑回归


3- 理论推导


4- Python/Spark实现

 # -*- coding: utf-8 -*-
from math import exp

# Batch gradient descent for logistic regression, distributed with Spark.
# The code below is written to run unchanged on Python 2 and Python 3.

theta = [0, 0, 0]  # initial theta values; theta[0] pairs with the constant 1 term
alpha = 0.001  # learning rate


def inner(x, y):
    """Return the dot product of the sequences x and y.

    zip() stops at the shorter argument, so any extra trailing elements of
    the longer sequence are ignored. func() relies on this to skip the label
    stored at the end of each sample.
    """
    return sum(i * j for i, j in zip(x, y))


def _sigmoid(z):
    """Return 1 / (1 + exp(-z)) without overflowing for very negative z."""
    if z >= 0:
        return 1.0 / (1.0 + exp(-z))
    # For z < 0, exp(-z) can exceed the float range and raise OverflowError;
    # the algebraically equivalent form below only evaluates exp(z) <= 1.
    ez = exp(z)
    return ez / (1.0 + ez)


def func(lst):
    """Return one sample's contribution to the gradient with respect to theta.

    lst is a sample laid out as [1, x1, ..., xn, label]. The prediction is
    computed from the module-level theta, and the result holds
    (prediction - label) * feature for every element of lst except the label.
    """
    h = _sigmoid(inner(lst, theta))
    return [(h - lst[-1]) * x for x in lst[:-1]]


def main():
    """Fit theta on the input file with 400 gradient steps and print it."""
    # Imported here so that inner() and func() can be imported and tested
    # on a machine without pyspark installed.
    from pyspark import SparkContext

    sc = SparkContext('local')
    try:
        rdd = (
            sc.textFile('/home/freyr/logisticRegression.txt')
            # Skip blank lines (e.g. a trailing newline); float('') would raise.
            .filter(lambda line: line.strip())
            # Prepend the constant 1 that pairs with theta[0].
            .map(lambda line: [1] + [float(v) for v in line.strip().split(',')])
            # The parsed samples are reused on every iteration; without the
            # cache the file would be re-read and re-parsed each time.
            .cache()
        )

        for _ in range(400):
            # Sum the per-sample gradient contributions element-wise.
            partheta = rdd.map(func).reduce(
                lambda x, y: [i + j for i, j in zip(x, y)]
            )
            # Update in place: func() reads the module-level list, so it must
            # stay the same object.
            theta[:] = [t - alpha * g for t, g in zip(theta, partheta)]

        print('theta = %s' % theta)
    finally:
        sc.stop()


if __name__ == '__main__':
    main()

PS: logisticRegression.txt