
Commit b03ef02

trunk,nnet: changing nnet prototype, making the bottleneck networks more stable by reducing learning rates of weights around the bottleneck.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4076 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent: d65e118
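For context: in Kaldi's nnet1 prototypes, the <LearnRateCoef> and <BiasLearnRateCoef> tags act as per-component multipliers on the trainer's global learning rate, so the 0.1 values introduced here make the transforms around the bottleneck train roughly ten times more slowly than the rest of the network. A minimal sketch of that scaling, with hypothetical numbers (none of the values below come from this commit):

# Illustrative only: how a per-component <LearnRateCoef> scales the global
# learning rate in an SGD update (hypothetical values, not from this commit).
global_learn_rate = 0.008      # e.g. a typical --learn-rate passed to the trainer
learn_rate_coef = 0.1          # <LearnRateCoef> written into the bottleneck proto
bias_learn_rate_coef = 0.1     # <BiasLearnRateCoef> for the bias term

effective_weight_lr = global_learn_rate * learn_rate_coef       # 0.0008
effective_bias_lr = global_learn_rate * bias_learn_rate_coef    # 0.0008
print("weights lr: %g  bias lr: %g" % (effective_weight_lr, effective_bias_lr))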

1 file changed: egs/wsj/s5/utils/nnet/make_nnet_proto.py (10 additions, 8 deletions)
@@ -115,18 +115,20 @@ def Glorot(dim1, dim2):
 # Optionaly add bottleneck
 if o.bottleneck_dim != 0:
   assert(o.bottleneck_dim > 0)
-  print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
-   (num_hid_neurons, o.bottleneck_dim, 0.0, 0.0, \
-    (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim)))
-  print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
+  # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate
+  print "<AffineTransformNobias> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" % \
+   (num_hid_neurons, o.bottleneck_dim, \
+    (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim) * 0.75 ), 0.1)
+  # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weigts & biases
+  print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
   (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
-   (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)))
+   (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1)
   print "%s <InputDim> %d <OutputDim> %d" % (o.activation_type, num_hid_neurons, num_hid_neurons)
 
-# Last AffineTransform
-print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
+# Last AffineTransform (10x smaller learning rate on bias)
+print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
  (num_hid_neurons, num_leaves, 0.0, 0.0, \
-  (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)))
+  (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1)
 
 # Optionaly append softmax
 if o.with_softmax:
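To make the effect concrete, here is a small standalone sketch of the bottleneck fragment the script would now emit. The dimensions, param_stddev_factor, bias init values, and the Glorot() helper below are hypothetical stand-ins, not the exact values or helper used by make_nnet_proto.py:

import math

def Glorot(dim1, dim2):
  # simplified stand-in for the script's Glorot-style scaling helper
  return math.sqrt(6.0 / (dim1 + dim2))

num_hid_neurons = 1500      # hypothetical hidden-layer width
bottleneck_dim = 40         # hypothetical --bottleneck-dim
param_stddev_factor = 0.1   # hypothetical --param-stddev-factor
hid_bias_mean, hid_bias_range = -2.0, 4.0   # hypothetical bias init

# Into the bottleneck: no bias, 25% smaller init stddev, 10x smaller learning rate.
print("<AffineTransformNobias> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" %
      (num_hid_neurons, bottleneck_dim,
       param_stddev_factor * Glorot(num_hid_neurons, bottleneck_dim) * 0.75, 0.1))

# Out of the bottleneck: 25% smaller init stddev, 10x smaller rate for weights and biases.
print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f "
      "<ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" %
      (bottleneck_dim, num_hid_neurons, hid_bias_mean, hid_bias_range,
       param_stddev_factor * Glorot(bottleneck_dim, num_hid_neurons) * 0.75, 0.1, 0.1))

The design intent, per the commit message and the in-line comments, is that a low-dimensional bottleneck is sensitive to large updates, so a slightly smaller initialization combined with a 10x smaller per-component learning rate keeps it stable while the rest of the network trains at the normal rate.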
