Deeplearning原文作者Hinton代码注解

时间:2023-03-09 03:53:16
Deeplearning原文作者Hinton代码注解

[z]Deeplearning原文作者Hinton代码注解

跑Hinton最初的代码时看到这篇注释文章,注释得很细心,留待进一步研究……
原文地址:http://www.cnblogs.com/BeDPS/p/3182725.html

    1. Matlab示例代码为两部分,分别对应不同的论文:
    2. 1. Reducing the Dimensionality of Data with Neural Networks
    3.   mnistdeepauto.m   backprop.m   rbmhidlinear.m
    4. 2. A Fast Learning Algorithm for Deep Belief Nets
    5.   mnistclassify.m   backpropclassify.m
    6. 其余部分代码通用。
    7. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    8. mnistclassify.m
    9. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    10. clear all   % mnistclassify.m: pretrains three stacked RBMs layer by layer, saves their weights, then runs backpropclassify
    11. close all
    12. maxepoch=50;    % number of epochs each rbm run below trains for
    13. numhid=500; numpen=500; numpen2=2000;   % hidden-layer sizes of the three RBMs
    14. fprintf(1,’Converting Raw files into Matlab format \n’);
    15. converter;   % script not shown in this listing; per the message above it converts the raw files to Matlab format
    16. fprintf(1,’Pretraining a deep autoencoder. \n’);
    17. fprintf(1,’The Science paper used 50 epochs. This uses %3i \n’, maxepoch);
    18. makebatches;% split the data into batches (script not shown here; presumably it defines batchdata - confirm)
    19. [numcases numdims numbatches]=size(batchdata); % read the dimensions of batchdata
    20. %%numcases   number of cases in each batch
    21. %%numdims    dimensionality of each case
    22. %%numbatches number of batches
    23. fprintf(1,’Pretraining Layer 1 with RBM: %d-%d \n’,numdims,numhid);% image input layer to the first hidden layer
    24. restart=1;                  % makes rbm (re)initialise its weights, biases and increments
    25. rbm;                % train the RBM on batchdata
    26. hidrecbiases=hidbiases;  % keep the hidden biases of this layer
    27. save mnistvhclassify vishid hidrecbiases visbiases; % store the layer-1 weights and biases
    28. fprintf(1,’\nPretraining Layer 2 with RBM: %d-%d \n’,numhid,numpen);% first hidden layer to the second hidden layer
    29. batchdata=batchposhidprobs;     % hidden-layer output of the previous RBM becomes the input of this RBM
    30. numhid=numpen;% hidden size for this RBM; the input size is given by the data just assigned
    31. restart=1;
    32. rbm;
    33. hidpen=vishid; penrecbiases=hidbiases; hidgenbiases=visbiases; % as above: capture the weights and biases
    34. save mnisthpclassify hidpen penrecbiases hidgenbiases;
    35. fprintf(1,’\nPretraining Layer 3 with RBM: %d-%d \n’,numpen,numpen2);% second hidden layer to the third hidden layer; otherwise as above
    36. batchdata=batchposhidprobs;
    37. numhid=numpen2;
    38. restart=1;
    39. rbm;
    40. hidpen2=vishid; penrecbiases2=hidbiases; hidgenbiases2=visbiases;
    41. save mnisthp2classify hidpen2 penrecbiases2 hidgenbiases2;
    42. backpropclassify;   % discriminative fine-tuning script (listed below)
    43. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    44. backpropclassify.m
    45. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    46. maxepoch=200;   % backpropclassify.m: fine-tunes the pretrained layers plus a 10-unit softmax layer; this is the number of fine-tuning epochs
    47. fprintf(1,’\nTraining discriminative model on MNIST by minimizing cross entropy error. \n’);% objective: minimise cross-entropy
    48. fprintf(1,’60 batches of 1000 cases each. \n’);
    49. load mnistvhclassify% load the saved weights and biases of each layer
    50. load mnisthpclassify
    51. load mnisthp2classify
    52. makebatches;% split the data into batches
    53. [numcases numdims numbatches]=size(batchdata);
    54. N=numcases; % number of cases per batch
    55. %%%% PREINITIALIZE WEIGHTS OF THE DISCRIMINATIVE MODEL%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    56. w1=[vishid; hidrecbiases];% layer-1 weights with the hidden biases appended as the last row
    57. w2=[hidpen; penrecbiases];% likewise for layer 2
    58. w3=[hidpen2; penrecbiases2];% likewise for layer 3
    59. w_class = 0.1*randn(size(w3,2)+1,10);% random matrix with (columns of w3 + 1) rows and 10 columns
    60. %%%%%%%%%% END OF PREINITIALIZATION OF WEIGHTS  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    61. l1=size(w1,1)-1;% unit count of each layer (rows minus the bias row)
    62. l2=size(w2,1)-1;
    63. l3=size(w3,1)-1;
    64. l4=size(w_class,1)-1;% units in the top hidden layer
    65. l5=10; % units in the label layer
    66. test_err=[];% per-epoch count of misclassified test cases
    67. train_err=[];% per-epoch count of misclassified training cases
    68. for epoch = 1:maxepoch
    69. %%%%%%%%%%%%%%%%%%%% COMPUTE TRAINING MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    70. err=0;
    71. err_cr=0;
    72. counter=0;
    73. [numcases numdims numbatches]=size(batchdata);
    74. %%numcases   number of cases in each batch
    75. %%numdims    dimensionality of each case
    76. %%numbatches number of batches
    77. N=numcases;%% number of cases per batch
    78. for batch = 1:numbatches
    79. data = [batchdata(:,:,batch)];% read one batch of inputs
    80. target = [batchtargets(:,:,batch)];% read the targets of the current batch
    81. data = [data ones(N,1)];% append a column of N ones (bias input)
    82. w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs  ones(N,1)];% sigmoid activation of each layer (forward pass), each followed by a ones column
    83. w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
    84. w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs  ones(N,1)];
    85. targetout = exp(w3probs*w_class);% unnormalised label outputs, N rows by 10 columns
    86. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    87. % Processing of the label outputs (the annotator cites "formula 6.1" of an unnamed reference); w3probs*w_class is the input to the label units
    88. % Only one unit is taken as active; it is chosen from the probabilities computed below
    89. % The 10 units form a "softmax" group
    90. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    91. targetout = targetout./repmat(sum(targetout,2),1,10);% divide each of the 10 label outputs by the row total
    92. [I J]=max(targetout,[],2);% maximum of each output row and its column index
    93. [I1 J1]=max(target,[],2);% maximum of each target row and its column index
    94. counter=counter+length(find(J==J1));% count the correctly classified cases
    95. err_cr = err_cr- sum(sum( target(:,1:end).*log(targetout))) ; % accumulate the cross-entropy -sum(target.*log(targetout)) of this batch
    96. end
    97. train_err(epoch)=(numcases*numbatches-counter);% total number of misclassified training cases
    98. train_crerr(epoch)=err_cr/numbatches;% cross-entropy averaged over batches (a loss value, not an error rate)
    99. %%%%%%%%%%%%%% END OF COMPUTING TRAINING MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    100. %%%%%%%%%%%%%%%%%%%% COMPUTE TEST MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    101. err=0;
    102. err_cr=0;
    103. counter=0;
    104. [testnumcases testnumdims testnumbatches]=size(testbatchdata); % dimensions of the test batches
    105. N=testnumcases;
    106. for batch = 1:testnumbatches   % same forward pass and bookkeeping as the training loop above, on the test batches
    107. data = [testbatchdata(:,:,batch)];
    108. target = [testbatchtargets(:,:,batch)];
    109. data = [data ones(N,1)];
    110. w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs  ones(N,1)];
    111. w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
    112. w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs  ones(N,1)];
    113. targetout = exp(w3probs*w_class);
    114. targetout = targetout./repmat(sum(targetout,2),1,10);
    115. [I J]=max(targetout,[],2);
    116. [I1 J1]=max(target,[],2);
    117. counter=counter+length(find(J==J1));
    118. err_cr = err_cr- sum(sum( target(:,1:end).*log(targetout))) ;
    119. end
    120. test_err(epoch)=(testnumcases*testnumbatches-counter);   % total number of misclassified test cases
    121. test_crerr(epoch)=err_cr/testnumbatches;   % test cross-entropy averaged over batches
    122. fprintf(1,’Before epoch %d Train # misclassified: %d (from %d). Test # misclassified: %d (from %d) \t \t \n’,…
    123. epoch,train_err(epoch),numcases*numbatches,test_err(epoch),testnumcases*testnumbatches);
    124. %%%%%%%%%%%%%% END OF COMPUTING TEST MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    125. tt=0;
    126. for batch = 1:numbatches/10   % one iteration per group of 10 mini-batches
    127. fprintf(1,’epoch %d batch %d\r’,epoch,batch);
    128. %%%%%%%%%%% COMBINE 10 MINIBATCHES INTO 1 LARGER MINIBATCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    129. % Merge 10 mini-batches into one batch (1000 cases, per the message printed at the top), then fine-tune on it with conjugate gradient
    130. tt=tt+1;
    131. data=[];
    132. targets=[];
    133. for kk=1:10
    134. data=[data
    135. batchdata(:,:,(tt-1)*10+kk)]; % stack the 10 mini-batches vertically
    136. targets=[targets
    137. batchtargets(:,:,(tt-1)*10+kk)];
    138. end
    139. %%%%%%%%%%%%%%% PERFORM CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    140. max_iter=3;       % number of line searches
    141. if epoch<6                            % First update top-level weights holding other weights fixed.
    142. N = size(data,1);               % number of rows (cases) in the merged batch
    143. XX = [data ones(N,1)];    % append a 1 to every row to carry the bias
    144. w1probs = 1./(1 + exp(-XX*w1)); w1probs = [w1probs  ones(N,1)];
    145. w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
    146. w3probs = 1./(1 + exp(-w2probs*w3)); %w3probs = [w3probs  ones(N,1)];
    147. VV = [w_class(:)']‘;    % flatten the randomly initialised w_class into a single column, the D-by-1 form minimize takes (see the argument notes below)
    148. %
    149. Dim = [l4; l5];             % sizes of the last two layers: the top hidden layer (l4) and the 10-unit label layer (l5)
    150. [X, fX] = minimize(VV,’CG_CLASSIFY_INIT’,max_iter,Dim,w3probs,targets);% train only the top-level weights; see the function definition for details
    151. %minimize is Carl Rasmussen's "minimize" code
    152. %%------ meaning of the arguments ------%%
    153. %VV         flattened weight vector used as the starting point; must be a single column (D by 1)
    154. %X          the optimised parameters returned for f='CG_CLASSIFY_INIT'
    155. %fX         function values recorded as the search progresses (per the minimize.m header, not a derivative) - confirm against your copy of minimize.m
    156. %max_iter   if positive, the maximum number of line searches; if negative, its absolute value is the maximum number of function evaluations (per the minimize.m header)
    157. %%------------------------------------%
    158. w_class = reshape(X,l4+1,l5);% restore the weight-matrix shape
    159. else                      % from epoch 6 on: fine-tune all layers together
    160. VV = [w1(: )' w2(: )' w3(: )' w_class(: )']‘; % flatten all weight matrices column-wise into one column
    161. Dim = [l1; l2; l3; l4; l5]; % unit count of every layer, passed to the objective
    162. [X, fX] = minimize(VV,’CG_CLASSIFY’,max_iter,Dim,data,targets);
    163. w1 = reshape(X(1: (l1+1)*l2),l1+1,l2);   % unpack w1 from the front of X
    164. xxx = (l1+1)*l2;  % running offset into X used while unpacking
    165. w2 = reshape(X (xxx+1: xxx+ (l2+1)*l3),l2+1,l3);
    166. xxx = xxx+(l2+1)*l3;
    167. w3 = reshape(X (xxx+1: xxx+ (l3+1)*l4),l3+1,l4);
    168. xxx = xxx+(l3+1)*l4;
    169. w_class = reshape(X (xxx+1: xxx+ (l4+1)*l5),l4+1,l5);
    170. end
    171. %%%%%%%%%%%%%%% END OF CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    172. end
    173. save mnistclassify_weights w1 w2 w3 w_class
    174. save mnistclassify_error test_err test_crerr train_err train_crerr;
    175. end
    176. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    177. rbm.m
    178. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    179. epsilonw      = 0.1;   % rbm.m: trains one RBM on batchdata with contrastive divergence. Learning rate for weights
    180. epsilonvb     = 0.1;   % Learning rate for biases of visible units
    181. epsilonhb     = 0.1;   % Learning rate for biases of hidden units
    182. weightcost  = 0.0002;   % coefficient of the -weightcost*vishid term in the weight update
    183. initialmomentum  = 0.5;   % momentum used while epoch<=5
    184. finalmomentum    = 0.9;   % momentum used once epoch>5
    185. [numcases numdims numbatches]=size(batchdata);
    186. %%numcases   number of cases in each batch
    187. %%numdims    dimensionality of each case
    188. %%numbatches number of batches
    189. if restart ==1,
    190. restart=0;
    191. epoch=1;
    192. % Initializing symmetric weights and biases.
    193. vishid     = 0.1*randn(numdims, numhid); % random initial visible-to-hidden weights
    194. hidbiases  = zeros(1,numhid);% hidden-unit biases
    195. visbiases  = zeros(1,numdims);% visible-unit biases
    196. poshidprobs = zeros(numcases,numhid); % hidden-unit probabilities, positive phase
    197. neghidprobs = zeros(numcases,numhid);% hidden-unit probabilities, negative phase
    198. posprods    = zeros(numdims,numhid);% positive-phase visible-hidden products (data'*poshidprobs below)
    199. negprods    = zeros(numdims,numhid);% negative-phase visible-hidden products (negdata'*neghidprobs below)
    200. vishidinc  = zeros(numdims,numhid);%%%%% increment of the visible-to-hidden weights
    201. hidbiasinc = zeros(1,numhid);%% increment of the hidden biases
    202. visbiasinc = zeros(1,numdims);%% increment of the visible biases
    203. batchposhidprobs=zeros(numcases,numhid,numbatches);% hidden probabilities of every batch, kept as the input of the next RBM
    204. end
    205. %%%%%%%%%%%%%%%% progress output: epoch and batch being processed %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    206. for epoch = epoch:maxepoch,  % loop over epochs, starting from the current value of epoch
    207. fprintf(1,’epoch %d\r’,epoch);
    208. errsum=0; % reset the accumulated reconstruction error
    209. for batch = 1:numbatches, % process one batch at a time
    210. fprintf(1,’epoch %d batch %d\r’,epoch,batch);
    211. %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    212. data = batchdata(:,:,batch); % all the data of the current batch (v_i)
    213. poshidprobs = 1./(1 + exp(-data*vishid - repmat(hidbiases,numcases,1))); % hidden probabilities given the data (h_i)
    214. batchposhidprobs(:,:,batch)=poshidprobs;% store them for this batch; the values from the last epoch end up as the next layer's input
    215. posprods    = data’ * poshidprobs;% positive statistics <v_i,h_i> for contrastive divergence
    216. poshidact   = sum(poshidprobs);% column sums of the hidden probabilities over the batch (divided by numcases in the bias update)
    217. posvisact = sum(data);% column sums of the visible data over the batch (divided by numcases in the bias update)
    218. %%%%%%%%% END OF POSITIVE PHASE  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    219. poshidstates = poshidprobs > rand(numcases,numhid);% sample binary hidden states (Gibbs sampling step)
    220. %%%%%%%%% START NEGATIVE PHASE  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    221. negdata = 1./(1 + exp(-poshidstates*vishid’ - repmat(visbiases,numcases,1)));% reconstruct the visible units (v_{i+1}) from the sampled hidden states
    222. neghidprobs = 1./(1 + exp(-negdata*vishid - repmat(hidbiases,numcases,1)));   % hidden probabilities (h_{i+1}) given the reconstruction
    223. negprods  = negdata’*neghidprobs;% negative statistics <v_{i+1},h_{i+1}> for contrastive divergence
    224. neghidact = sum(neghidprobs);
    225. negvisact = sum(negdata);
    226. %%%%%%%%% END OF NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    227. err= sum(sum( (data-negdata).^2 )); % squared reconstruction error of this batch
    228. errsum = err + errsum;% accumulate it over the epoch
    229. if epoch>5, % switch momentum after the first 5 epochs
    230. momentum=finalmomentum;
    231. else
    232. momentum=initialmomentum;
    233. end;
    234. %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    235. vishidinc = momentum*vishidinc + …
    236. epsilonw*( (posprods-negprods)/numcases - weightcost*vishid);% weight increment
    237. visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);% visible-bias increment
    238. hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);% hidden-bias increment
    239. vishid = vishid + vishidinc;
    240. visbiases = visbiases + visbiasinc;
    241. hidbiases = hidbiases + hidbiasinc;
    242. %%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    243. end
    244. fprintf(1, ’epoch %4i error %6.1f  \n’, epoch, errsum);
    245. end;
    246. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    247. CG_CLASSIFY_INIT.m
    248. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    249. function [f, df] = CG_CLASSIFY_INIT(VV,Dim,w3probs,target);% returns the cross-entropy f and its gradient df for conjugate-gradient training of the top-level (w_class) weights
    250. l1 = Dim(1);   % first entry of Dim; used below as the row count (minus the bias row) of w_class
    251. l2 = Dim(2);   % second entry of Dim; used below as the column count of w_class
    252. N = size(w3probs,1);   % number of cases
    253. % Do deconversion (unpack the flat parameter vector).
    254. w_class = reshape(VV,l1+1,l2); % restore the weight matrix
    255. w3probs = [w3probs  ones(N,1)];  % append a column of ones for the bias
    256. targetout = exp(w3probs*w_class);  % unnormalised label outputs, one row per case and one column per label
    257. targetout = targetout./repmat(sum(targetout,2),1,10); % normalise each row by its sum; same step as line 91 of the backpropclassify.m listing
    258. f = -sum(sum( target(:,1:end).*log(targetout))) ; % cross-entropy error
    259. IO = (targetout-target(:,1:end));   % difference between the computed outputs and the targets
    260. Ix_class=IO; %
    261. dw_class =  w3probs’*Ix_class;% gradient with respect to w_class: w3probs' times (targetout - target); no sigmoid-derivative factor appears in this expression
    262. df = [dw_class(:)']‘;   % flatten the gradient into a column
    263. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    264. CG_CLASSIFY.m
    265. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    266. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    267. %   This code fine-tunes all the weights together
    268. %   For the individual steps see the annotations of CG_CLASSIFY_INIT.m
    269. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    270. function [f, df] = CG_CLASSIFY(VV,Dim,XX,target);   % only the signature is reproduced in this listing; the body is omitted
    271. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    272. rbmhidlinear.m
    273. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    274. %与rbm.m相比,除了隐藏单元的取值改用线性单元计算之外,其余过程完全相同
    275. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

from: http://jacoxu.com/?p=692