Skip to content

Commit cefc844

Browse files
naxingyu authored and danpovey committed
[egs][scripts] Add pitch recipe for nnet2/3 online setup (hkust) (kaldi-asr#1280)
1 parent 5d1f524 commit cefc844

31 files changed

+979
-620
lines changed

egs/hkust/README.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,3 @@ LDC2005S15 : http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2005
66
LDC2005T32 : http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2005T32
77

88
s5: The experiments here were based on the above corpus
9-
10-
11-

egs/hkust/s5/RESULTS

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,30 @@
11
# for x in exp/*/decode; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done
2-
%WER 80.89 [ 45422 / 56154, 1530 ins, 11018 del, 32874 sub ] exp/mono0a/decode/cer_9
3-
%WER 60.01 [ 33698 / 56154, 2528 ins, 5961 del, 25209 sub ] exp/tri1/decode/cer_12
4-
%WER 59.68 [ 33514 / 56154, 2574 ins, 5752 del, 25188 sub ] exp/tri2/decode/cer_12
5-
%WER 57.25 [ 32148 / 56154, 2484 ins, 5811 del, 23853 sub ] exp/tri3a/decode/cer_13
6-
%WER 53.47 [ 30026 / 56154, 2789 ins, 5115 del, 22122 sub ] exp/tri4a/decode/cer_13
7-
%WER 49.72 [ 27921 / 56154, 2833 ins, 4568 del, 20520 sub ] exp/tri5a/decode/cer_13
8-
%WER 43.95 [ 24681 / 56154, 2106 ins, 3890 del, 18685 sub ] exp/tri5a_mmi_b0.1/decode/cer_10
9-
%WER 44.60 [ 25044 / 56154, 2121 ins, 4040 del, 18883 sub ] exp/tri5a_mpe/decode/cer_11
10-
%WER 43.81 [ 24602 / 56154, 2843 ins, 3751 del, 18008 sub ] exp/sgmm2_5a/decode/cer_10
11-
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
2+
%WER 80.72 [ 45327 / 56154, 1609 ins, 10856 del, 32862 sub ] exp/mono0a/decode/cer_9
3+
%WER 58.86 [ 33054 / 56154, 2651 ins, 6240 del, 24163 sub ] exp/tri1/decode/cer_13
4+
%WER 58.32 [ 32748 / 56154, 2491 ins, 6279 del, 23978 sub ] exp/tri2/decode/cer_14
5+
%WER 56.49 [ 31719 / 56154, 2601 ins, 5979 del, 23139 sub ] exp/tri3a/decode/cer_13
6+
%WER 51.75 [ 29060 / 56154, 2879 ins, 5088 del, 21093 sub ] exp/tri4a/decode/cer_13
7+
%WER 47.36 [ 26596 / 56154, 2740 ins, 4577 del, 19279 sub ] exp/tri5a/decode/cer_13
8+
%WER 42.55 [ 23894 / 56154, 1877 ins, 4437 del, 17580 sub ] exp/tri5a_mpe/decode/cer_13
9+
%WER 42.19 [ 23693 / 56154, 2138 ins, 3871 del, 17684 sub ] exp/tri5a_mmi_b0.1/decode/cer_10
10+
%WER 41.11 [ 23086 / 56154, 2863 ins, 3608 del, 16615 sub ] exp/sgmm2_5a/decode/cer_10
1211

12+
# nnet2 online results
13+
%WER 38.32 [ 21518 / 56154, 2344 ins, 4273 del, 14901 sub ] exp/nnet2_online/nnet_ms/decode/cer_12
14+
%WER 38.01 [ 21345 / 56154, 2555 ins, 4173 del, 14617 sub ] exp/nnet2_online/nnet_ms_online/decode/cer_12
15+
%WER 37.10 [ 20832 / 56154, 2399 ins, 3936 del, 14497 sub ] exp/nnet2_online/nnet_ms_online/decode_per_utt/cer_12
16+
17+
# nnet3 online results
18+
%WER 32.77 [ 18400 / 56154, 1971 ins, 3525 del, 12904 sub ] exp/nnet3/tdnn_sp/decode/cer_10
19+
%WER 33.02 [ 18540 / 56154, 2335 ins, 3251 del, 12954 sub ] exp/nnet3/tdnn_sp_online/decode/cer_9
20+
%WER 34.01 [ 19098 / 56154, 2195 ins, 3482 del, 13421 sub ] exp/nnet3/tdnn_sp_online/decode_per_utt/cer_10
21+
22+
# chain online results
23+
%WER 28.24 [ 15858 / 56154, 1454 ins, 3415 del, 10989 sub ] exp/chain/tdnn_7h_sp/decode/cer_10
24+
%WER 28.16 [ 15812 / 56154, 1648 ins, 2824 del, 11340 sub ] exp/chain/tdnn_7h_sp_online/decode/cer_9
25+
%WER 29.55 [ 16594 / 56154, 1547 ins, 3437 del, 11610 sub ] exp/chain/tdnn_7h_sp_online/decode_per_utt/cer_10
26+
27+
## results before adding pitch
1328
# nnet1 results
1429
exp/dnn5b_pretrain-dbn_dnn/decode/cer_10:%WER 39.42 [ 22134 / 56154, 2507 ins, 3730 del, 15897 sub ]
1530
exp/dnn5b_pretrain-dbn_dnn_smbr/decode/cer_11:%WER 36.50 [ 20499 / 56154, 1915 ins, 3312 del, 15272 sub ]
@@ -18,11 +33,11 @@ exp/cnn5c/decode/cer_10:%WER 40.13 [ 22536 / 56154, 2329 ins, 3962 del, 16245 su
1833
exp/cnn5c_pretrain-dbn_dnn/decode/cer_10:%WER 38.80 [ 21790 / 56154, 2470 ins, 3582 del, 15738 sub ]
1934
exp/lstm5e/decode/cer_10:%WER 37.61 [ 21121 / 56154, 1829 ins, 3941 del, 15351 sub ]
2035

21-
# nnet2 results
36+
# nnet2 mfcc results
2237
exp/nnet2_5d/decode/cer_10:%WER 38.59 [ 21669 / 56154, 2498 ins, 3581 del, 15590 sub ]
23-
# ConvNet with 2 convolutional layers and 2 ReLU layers
38+
# ConvNet using fbank, with 2 convolutional layers and 2 ReLU layers
2439
exp/nnet2_convnet/decode/cer_10:%WER 41.19 [ 23129 / 56154, 2599 ins, 3782 del, 16748 sub ]
2540

26-
# nnet3 results (using speed perturbed data)
41+
# nnet3 mfcc results (using speed perturbed data)
2742
exp/nnet3/tdnn_sp/decode_dev/cer_10:%WER 33.79 [ 18977 / 56154, 2027 ins, 3485 del, 13465 sub ]
28-
exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ]
43+
exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ]

egs/hkust/s5/conf/mfcc_hires.conf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# config for high-resolution MFCC features, intended for neural network training.
2+
# Note: we keep all cepstra, so it has the same info as filterbank features,
3+
# but MFCC is more easily compressible (because less correlated) which is why
4+
# we prefer this method.
5+
--use-energy=false # use average of log energy, not energy.
6+
--sample-frequency=8000 # HKUST (like Switchboard) is sampled at 8kHz
7+
--num-mel-bins=40 # similar to Google's setup.
8+
--num-ceps=40 # there is no dimensionality reduction.
9+
--low-freq=40 # low cutoff frequency for mel bins
10+
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)

egs/hkust/s5/conf/online_cmvn.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
--sample-frequency=8000

egs/hkust/s5/conf/pitch.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
--sample-frequency=8000
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
#!/bin/bash
2+
3+
# This script is based on run_tdnn_7h.sh in swbd chain recipe.
4+
5+
set -e
6+
7+
# configs for 'chain'
8+
affix=
9+
stage=12
10+
train_stage=-10
11+
get_egs_stage=-10
12+
dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true.
13+
decode_iter=
14+
15+
# training options
16+
num_epochs=4
17+
initial_effective_lrate=0.001
18+
final_effective_lrate=0.0001
19+
leftmost_questions_truncate=-1
20+
max_param_change=2.0
21+
final_layer_normalize_target=0.5
22+
num_jobs_initial=2
23+
num_jobs_final=12
24+
minibatch_size=128
25+
frames_per_eg=150
26+
remove_egs=true
27+
common_egs_dir=
28+
xent_regularize=0.1
29+
30+
# End configuration section.
31+
echo "$0 $@" # Print the command line for logging
32+
33+
. ./cmd.sh
34+
. ./path.sh
35+
. ./utils/parse_options.sh
36+
37+
if ! cuda-compiled; then
38+
cat <<EOF && exit 1
39+
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
40+
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
41+
where "nvcc" is installed.
42+
EOF
43+
fi
44+
45+
# The iVector-extraction and feature-dumping parts are the same as the standard
46+
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
47+
# run those things.
48+
49+
dir=${dir}${affix:+_$affix}_sp
50+
train_set=train_sp
51+
ali_dir=exp/tri5a_sp_ali
52+
treedir=exp/chain/tri6_7d_tree_sp
53+
lang=data/lang_chain
54+
55+
56+
# if we are using the speed-perturbed data we need to generate
57+
# alignments for it.
58+
local/nnet3/run_ivector_common.sh --stage $stage \
59+
--ivector-extractor exp/nnet2_online/extractor || exit 1;
60+
61+
if [ $stage -le 9 ]; then
62+
# Get the alignments as lattices (gives the LF-MMI training more freedom).
63+
# use the same num-jobs as the alignments
64+
nj=$(cat $ali_dir/num_jobs) || exit 1;
65+
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
66+
data/lang exp/tri5a exp/tri5a_sp_lats
67+
rm exp/tri5a_sp_lats/fsts.*.gz # save space
68+
fi
69+
70+
if [ $stage -le 10 ]; then
71+
# Create a version of the lang/ directory that has one state per phone in the
72+
# topo file. [note, it really has two states.. the first one is only repeated
73+
# once, the second one has zero or more repeats.]
74+
rm -rf $lang
75+
cp -r data/lang $lang
76+
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
77+
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
78+
# Use our special topology... note that later on we may have to tune this
79+
# topology.
80+
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
81+
fi
82+
83+
if [ $stage -le 11 ]; then
84+
# Build a tree using our new topology. This is the critically different
85+
# step compared with other recipes.
86+
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
87+
--leftmost-questions-truncate $leftmost_questions_truncate \
88+
--context-opts "--context-width=2 --central-position=1" \
89+
--cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
90+
fi
91+
92+
if [ $stage -le 12 ]; then
93+
echo "$0: creating neural net configs using the xconfig parser";
94+
95+
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
96+
# Learning-rate factor for the xent output layer: 0.5 / xent_regularize.
# The expression is passed to print() as a call rather than a print statement,
# so the one-liner is valid under both Python 2 and Python 3 (the bare
# `print expr` form is a SyntaxError when `python` is Python 3).
learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
97+
98+
mkdir -p $dir/configs
99+
cat <<EOF > $dir/configs/network.xconfig
100+
input dim=100 name=ivector
101+
input dim=43 name=input
102+
103+
# please note that it is important to have input layer with the name=input
104+
# as the layer immediately preceding the fixed-affine-layer to enable
105+
# the use of short notation for the descriptor
106+
fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
107+
108+
# the first splicing is moved before the lda layer, so no splicing here
109+
relu-renorm-layer name=tdnn1 dim=625
110+
relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
111+
relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
112+
relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
113+
relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
114+
relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
115+
116+
## adding the layers for chain branch
117+
relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
118+
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
119+
120+
# adding the layers for xent branch
121+
# This block prints the configs for a separate output that will be
122+
# trained with a cross-entropy objective in the 'chain' models... this
123+
# has the effect of regularizing the hidden parts of the model. we use
124+
# 0.5 / args.xent_regularize as the learning rate factor- the factor of
125+
# 0.5 / args.xent_regularize is suitable as it means the xent
126+
# final-layer learns at a rate independent of the regularization
127+
# constant; and the 0.5 was tuned so as to make the relative progress
128+
# similar in the xent and regular final layers.
129+
relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
130+
output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
131+
132+
EOF
133+
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
134+
fi
135+
136+
if [ $stage -le 13 ]; then
137+
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
138+
utils/create_split_dir.pl \
139+
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
140+
fi
141+
142+
steps/nnet3/chain/train.py --stage $train_stage \
143+
--cmd "$decode_cmd" \
144+
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
145+
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
146+
--chain.xent-regularize $xent_regularize \
147+
--chain.leaky-hmm-coefficient 0.1 \
148+
--chain.l2-regularize 0.00005 \
149+
--chain.apply-deriv-weights false \
150+
--chain.lm-opts="--num-extra-lm-states=2000" \
151+
--egs.dir "$common_egs_dir" \
152+
--egs.stage $get_egs_stage \
153+
--egs.opts "--frames-overlap-per-eg 0" \
154+
--egs.chunk-width $frames_per_eg \
155+
--trainer.num-chunk-per-minibatch $minibatch_size \
156+
--trainer.frames-per-iter 1500000 \
157+
--trainer.num-epochs $num_epochs \
158+
--trainer.optimization.num-jobs-initial $num_jobs_initial \
159+
--trainer.optimization.num-jobs-final $num_jobs_final \
160+
--trainer.optimization.initial-effective-lrate $initial_effective_lrate \
161+
--trainer.optimization.final-effective-lrate $final_effective_lrate \
162+
--trainer.max-param-change $max_param_change \
163+
--cleanup.remove-egs $remove_egs \
164+
--feat-dir data/${train_set}_hires \
165+
--tree-dir $treedir \
166+
--lat-dir exp/tri5a_sp_lats \
167+
--dir $dir || exit 1;
168+
fi
169+
170+
if [ $stage -le 14 ]; then
171+
# Note: it might appear that this $lang directory is mismatched, and it is as
172+
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
173+
# the lang directory.
174+
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
175+
fi
176+
177+
graph_dir=$dir/graph
178+
if [ $stage -le 15 ]; then
179+
# Extra decoder options: forward --iter only when a specific model iteration
# was requested via $decode_iter; otherwise leave iter_opts empty.
iter_opts=
# Quote the expansion so the test stays well-formed when $decode_iter is empty
# or contains whitespace/glob characters.
if [ -n "$decode_iter" ]; then
  iter_opts=" --iter $decode_iter "
fi
183+
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
184+
--nj 10 --cmd "$decode_cmd" $iter_opts \
185+
--online-ivector-dir exp/nnet3/ivectors_dev \
186+
$graph_dir data/dev_hires $dir/decode || exit 1;
187+
fi
188+
189+
if [ $stage -le 16 ]; then
190+
steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
191+
--add-pitch true \
192+
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
193+
fi
194+
195+
if [ $stage -le 17 ]; then
196+
# do the actual online decoding with iVectors, carrying info forward from
197+
# previous utterances of the same speaker.
198+
steps/online/nnet3/decode.sh --config conf/decode.config \
199+
--cmd "$decode_cmd" --nj 10 --acwt 1.0 --post-decode-acwt 10.0 \
200+
"$graph_dir" data/dev_hires \
201+
${dir}_online/decode || exit 1;
202+
fi
203+
204+
if [ $stage -le 18 ]; then
205+
# this version of the decoding treats each utterance separately
206+
# without carrying forward speaker information.
207+
steps/online/nnet3/decode.sh --config conf/decode.config \
208+
--cmd "$decode_cmd" --nj 10 --per-utt true --acwt 1.0 --post-decode-acwt 10.0 \
209+
"$graph_dir" data/dev_hires \
210+
${dir}_online/decode_per_utt || exit 1;
211+
fi

egs/hkust/s5/local/create_oov_char_lexicon.pl

100644100755
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
#!/usr/bin/perl
2-
# Copyright 2016 LeSpeech (Author: Xingyu Na)
1+
#!/usr/bin/env perl
2+
# Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na)
33
#
44
# A script for char-based Chinese OOV lexicon generation.
55
#

0 commit comments

Comments
 (0)