nnet3的并行化训练

时间:2021-09-08 22:44:49

num_epochs=1

num_archives=64

args.num_jobs_initial=3

args.num_jobs_final=8

num_iters=2*num_epochs*num_archives/(args.num_jobs_initial+args.num_jobs_final)

=2*1*64/(3+8)=128/11≈11.6，实际共训练12次迭代（iteration 0～11）

 
 

3,3,4,4,5,5,6,6,6,7,7,8

64 archives

6*24=144

iteration 0, 3 GPUs

# 并行化训练

nnet3-train ... 0.mdl 1.1.raw

nnet3-train ... 0.mdl 1.2.raw

nnet3-train ... 0.mdl 1.3.raw

# 模型平均

nnet3-average exp/nnet3/rsi/1.1.raw exp/nnet3/rsi/1.2.raw exp/nnet3/rsi/1.3.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/0.mdl exp/nnet3/rsi/1.mdl

# 计算训练集和验证集诊断样本上的似然（用于监控训练和交叉验证）

nnet3-compute-prob ... 0.mdl train_diagnostic.egs

nnet3-compute-prob ... 0.mdl valid_diagnostic.egs

iteration 1, 3 GPUs

nnet3-train ... 1.mdl 2.1.raw

nnet3-train ... 1.mdl 2.2.raw

nnet3-train ... 1.mdl 2.3.raw

nnet3-average exp/nnet3/rsi/2.1.raw exp/nnet3/rsi/2.2.raw exp/nnet3/rsi/2.3.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/1.mdl exp/nnet3/rsi/2.mdl

nnet3-compute-prob ... 1.mdl train_diagnostic.egs

nnet3-compute-prob ... 1.mdl valid_diagnostic.egs

iteration 2, 4 GPUs

nnet3-train ... 2.mdl 3.1.raw

nnet3-train ... 2.mdl 3.2.raw

nnet3-train ... 2.mdl 3.3.raw

nnet3-train ... 2.mdl 3.4.raw

nnet3-average exp/nnet3/rsi/3.1.raw exp/nnet3/rsi/3.2.raw exp/nnet3/rsi/3.3.raw exp/nnet3/rsi/3.4.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/2.mdl exp/nnet3/rsi/3.mdl

nnet3-compute-prob ... 2.mdl train_diagnostic.egs

nnet3-compute-prob ... 2.mdl valid_diagnostic.egs

iteration 3, 4 GPUs

nnet3-train ... 3.mdl 4.1.raw

nnet3-train ... 3.mdl 4.2.raw

nnet3-train ... 3.mdl 4.3.raw

nnet3-train ... 3.mdl 4.4.raw

nnet3-average exp/nnet3/rsi/4.1.raw exp/nnet3/rsi/4.2.raw exp/nnet3/rsi/4.3.raw exp/nnet3/rsi/4.4.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/3.mdl exp/nnet3/rsi/4.mdl

nnet3-compute-prob ... 3.mdl train_diagnostic.egs

nnet3-compute-prob ... 3.mdl valid_diagnostic.egs

iteration 4, 5 GPUs

nnet3-train ... 4.mdl 5.1.raw

nnet3-train ... 4.mdl 5.2.raw

nnet3-train ... 4.mdl 5.3.raw

nnet3-train ... 4.mdl 5.4.raw

nnet3-train ... 4.mdl 5.5.raw

nnet3-average exp/nnet3/rsi/5.1.raw exp/nnet3/rsi/5.2.raw exp/nnet3/rsi/5.3.raw exp/nnet3/rsi/5.4.raw exp/nnet3/rsi/5.5.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/4.mdl exp/nnet3/rsi/5.mdl

nnet3-compute-prob ... 4.mdl train_diagnostic.egs

nnet3-compute-prob ... 4.mdl valid_diagnostic.egs

iteration 5, 5 GPUs

nnet3-train ... 5.mdl 6.1.raw

nnet3-train ... 5.mdl 6.2.raw

nnet3-train ... 5.mdl 6.3.raw

nnet3-train ... 5.mdl 6.4.raw

nnet3-train ... 5.mdl 6.5.raw

nnet3-average exp/nnet3/rsi/6.1.raw exp/nnet3/rsi/6.2.raw exp/nnet3/rsi/6.3.raw exp/nnet3/rsi/6.4.raw exp/nnet3/rsi/6.5.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/5.mdl exp/nnet3/rsi/6.mdl

nnet3-compute-prob ... 5.mdl train_diagnostic.egs

nnet3-compute-prob ... 5.mdl valid_diagnostic.egs

iteration 6, 6 GPUs

nnet3-train ... 6.mdl 7.1.raw

nnet3-train ... 6.mdl 7.2.raw

nnet3-train ... 6.mdl 7.3.raw

nnet3-train ... 6.mdl 7.4.raw

nnet3-train ... 6.mdl 7.5.raw

nnet3-train ... 6.mdl 7.6.raw

nnet3-average exp/nnet3/rsi/7.1.raw exp/nnet3/rsi/7.2.raw exp/nnet3/rsi/7.3.raw exp/nnet3/rsi/7.4.raw exp/nnet3/rsi/7.5.raw exp/nnet3/rsi/7.6.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/6.mdl exp/nnet3/rsi/7.mdl

nnet3-compute-prob ... 6.mdl train_diagnostic.egs

nnet3-compute-prob ... 6.mdl valid_diagnostic.egs

iteration 7, 6 GPUs

nnet3-train ... 7.mdl 8.1.raw

nnet3-train ... 7.mdl 8.2.raw

nnet3-train ... 7.mdl 8.3.raw

nnet3-train ... 7.mdl 8.4.raw

nnet3-train ... 7.mdl 8.5.raw

nnet3-train ... 7.mdl 8.6.raw

nnet3-average exp/nnet3/rsi/8.1.raw exp/nnet3/rsi/8.2.raw exp/nnet3/rsi/8.3.raw exp/nnet3/rsi/8.4.raw exp/nnet3/rsi/8.5.raw exp/nnet3/rsi/8.6.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/7.mdl exp/nnet3/rsi/8.mdl

nnet3-compute-prob ... 7.mdl train_diagnostic.egs

nnet3-compute-prob ... 7.mdl valid_diagnostic.egs

iteration 8, 6 GPUs

nnet3-train ... 8.mdl 9.1.raw

nnet3-train ... 8.mdl 9.2.raw

nnet3-train ... 8.mdl 9.3.raw

nnet3-train ... 8.mdl 9.4.raw

nnet3-train ... 8.mdl 9.5.raw

nnet3-train ... 8.mdl 9.6.raw

nnet3-average exp/nnet3/rsi/9.1.raw exp/nnet3/rsi/9.2.raw exp/nnet3/rsi/9.3.raw exp/nnet3/rsi/9.4.raw exp/nnet3/rsi/9.5.raw exp/nnet3/rsi/9.6.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/8.mdl exp/nnet3/rsi/9.mdl

nnet3-compute-prob ... 8.mdl train_diagnostic.egs

nnet3-compute-prob ... 8.mdl valid_diagnostic.egs

iteration 9, 7 GPUs

nnet3-train ... 9.mdl 10.1.raw

nnet3-train ... 9.mdl 10.2.raw

nnet3-train ... 9.mdl 10.3.raw

nnet3-train ... 9.mdl 10.4.raw

nnet3-train ... 9.mdl 10.5.raw

nnet3-train ... 9.mdl 10.6.raw

nnet3-train ... 9.mdl 10.7.raw

nnet3-average exp/nnet3/rsi/10.1.raw exp/nnet3/rsi/10.2.raw exp/nnet3/rsi/10.3.raw exp/nnet3/rsi/10.4.raw exp/nnet3/rsi/10.5.raw exp/nnet3/rsi/10.6.raw exp/nnet3/rsi/10.7.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/9.mdl exp/nnet3/rsi/10.mdl

nnet3-compute-prob ... 9.mdl train_diagnostic.egs

nnet3-compute-prob ... 9.mdl valid_diagnostic.egs

iteration 10, 7 GPUs

nnet3-train ... 10.mdl 11.1.raw

nnet3-train ... 10.mdl 11.2.raw

nnet3-train ... 10.mdl 11.3.raw

nnet3-train ... 10.mdl 11.4.raw

nnet3-train ... 10.mdl 11.5.raw

nnet3-train ... 10.mdl 11.6.raw

nnet3-train ... 10.mdl 11.7.raw

nnet3-average exp/nnet3/rsi/11.1.raw exp/nnet3/rsi/11.2.raw exp/nnet3/rsi/11.3.raw exp/nnet3/rsi/11.4.raw exp/nnet3/rsi/11.5.raw exp/nnet3/rsi/11.6.raw exp/nnet3/rsi/11.7.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/10.mdl exp/nnet3/rsi/11.mdl

nnet3-compute-prob ... 10.mdl train_diagnostic.egs

nnet3-compute-prob ... 10.mdl valid_diagnostic.egs

iteration 11, 8 GPUs

nnet3-train ... 11.mdl 12.1.raw

nnet3-train ... 11.mdl 12.2.raw

nnet3-train ... 11.mdl 12.3.raw

nnet3-train ... 11.mdl 12.4.raw

nnet3-train ... 11.mdl 12.5.raw

nnet3-train ... 11.mdl 12.6.raw

nnet3-train ... 11.mdl 12.7.raw

nnet3-train ... 11.mdl 12.8.raw

nnet3-average exp/nnet3/rsi/12.1.raw exp/nnet3/rsi/12.2.raw exp/nnet3/rsi/12.3.raw exp/nnet3/rsi/12.4.raw exp/nnet3/rsi/12.5.raw exp/nnet3/rsi/12.6.raw exp/nnet3/rsi/12.7.raw exp/nnet3/rsi/12.8.raw - | nnet3-am-copy --set-raw-nnet=- exp/nnet3/rsi/11.mdl exp/nnet3/rsi/12.mdl

nnet3-compute-prob ... 11.mdl train_diagnostic.egs

nnet3-compute-prob ... 11.mdl valid_diagnostic.egs

combine

nnet3-combine ... exp/nnet3/rsi/12.mdl exp/nnet3/rsi/11.mdl exp/nnet3/rsi/10.mdl exp/nnet3/rsi/9.mdl exp/nnet3/rsi/8.mdl exp/nnet3/rsi/7.mdl exp/nnet3/rsi/egs/combine.egs exp/nnet3/rsi/combined.mdl

nnet3-compute-prob ... combined.mdl train_diagnostic.egs

nnet3-compute-prob ... combined.mdl valid_diagnostic.egs

adjust priors

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.1.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.1.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.2.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.2.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.3.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.3.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.4.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.4.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.5.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.5.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.6.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.6.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.7.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.7.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.8.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.8.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.9.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.9.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.10.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.10.vec

nnet3-compute-from-egs --use-gpu=yes --apply-exp=true exp/nnet3/rsi/combined.mdl ark:'... egs.11.ark' ark:- | matrix-sum-rows ark:- ark:- | vector-sum ark:- exp/nnet3/rsi/post.combined.11.vec

 
 

vector-sum exp/nnet3/rsi/post.combined.1.vec exp/nnet3/rsi/post.combined.10.vec exp/nnet3/rsi/post.combined.11.vec exp/nnet3/rsi/post.combined.2.vec exp/nnet3/rsi/post.combined.3.vec exp/nnet3/rsi/post.combined.4.vec exp/nnet3/rsi/post.combined.5.vec exp/nnet3/rsi/post.combined.6.vec exp/nnet3/rsi/post.combined.7.vec exp/nnet3/rsi/post.combined.8.vec exp/nnet3/rsi/post.combined.9.vec exp/nnet3/rsi/post.combined.vec

 
 

nnet3-am-adjust-priors exp/nnet3/rsi/combined.mdl exp/nnet3/rsi/post.combined.vec exp/nnet3/rsi/final.mdl