Skip to content

Commit 350d8b4

Browse files
committed
trunk: online-nnet2 decoding setup: adding scripts which make it possible to estimate the iVectors per speaker, excluding silence (so not-truly-online decoding). Some code changes for iVector which allow for scaling up the prior term when the data count exceeds a certain value (this seems to be important, for some reason). And misc. code fixes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4865 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent 7d8ff21 commit 350d8b4

18 files changed

Lines changed: 584 additions & 115 deletions

egs/librispeech/s5/local/online/run_nnet2.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ if [ $stage -le 13 ]; then
147147
done
148148
fi
149149

150-
exit 0;
150+
#exit 0;
151151
###### Comment out the "exit 0" above to run the multi-threaded decoding. #####
152152

153153
if [ $stage -le 14 ]; then
@@ -166,8 +166,8 @@ if [ $stage -le 15 ]; then
166166
test=dev_clean
167167
steps/online/nnet2/decode.sh --threaded true --do-endpointing true \
168168
--config conf/decode.config --cmd "$decode_cmd" --nj 30 \
169-
--per-utt true exp/tri6b/graph_pp_tgsmall data/$test \
170-
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1;
169+
--per-utt true exp/tri6b/graph_tgsmall data/$test \
170+
${dir}_online/decode_${test}_tgsmall_utt_threaded_ep || exit 1;
171171
fi
172172

173173
exit 0;

egs/wsj/s5/local/online/run_nnet2.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,34 @@ if [ $stage -le 13 ]; then
134134
done
135135
fi
136136

137+
if [ $stage -le 14 ]; then
  # Offline decoding as in stage 10, but the iVectors are estimated per
  # speaker, excluding silence (based on alignments from a GMM decoding),
  # using a different script.  This is just to demonstrate that script.

  rm exp/nnet2_online/.error 2>/dev/null
  for test_set in eval92 dev93; do
    steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 8 \
      data/test_${test_set}_hires data/lang exp/nnet2_online/extractor \
      exp/tri4b/decode_tgpr_${test_set} exp/nnet2_online/ivectors_spk_test_${test_set} || touch exp/nnet2_online/.error &
  done
  wait
  [ -f exp/nnet2_online/.error ] && echo "$0: Error getting iVectors" && exit 1;

  for lm in bd_tgpr; do # just use the bd decoding, to avoid wasting time.
    # Re-use graphs that were built earlier.
    graph=exp/tri4b/graph_${lm}
    for test_set in eval92 dev93; do
      steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \
        --online-ivector-dir exp/nnet2_online/ivectors_spk_test_${test_set} \
        ${graph} data/test_${test_set}_hires $dir/decode_${lm}_${test_set}_spk || touch exp/nnet2_online/.error &
    done
  done
  wait
  [ -f exp/nnet2_online/.error ] && echo "$0: Error decoding" && exit 1;
fi
163+
164+
137165

138166

139167
exit 0;
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#!/bin/bash

# Copyright 2013  Daniel Povey
# Apache 2.0.


# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are actually not really computed online: they are first computed
# per speaker and just duplicated many times.
#
# This setup also makes it possible to use a previous decoding or alignment, to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.


# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.  Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and will
                    # tend to produce smaller iVectors where data-counts are
                    # small.  It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
max_count=100 # Interpret this as a number of frames times posterior scale...
              # this config ensures that once the count exceeds this (i.e.
              # 1000 frames, or 10 seconds, by default), we start to scale
              # down the stats, accentuating the prior term.  This seems quite
              # important for some reason.
compress=true # If true, compress the iVectors stored on disk (it's lossy
              # compression, as used for feature matrices).
silence_weight=0.0
acwt=0.1  # used if input is a decode dir, to get best path from lattices.
mdl=final # change this if decode directory did not have ../final.mdl present.

# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 4 ] && [ $# != 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  echo "  --silence-weight <float;default=0.0>             # Weight given to silence frames in the"
  echo "                                                   # iVector stats (if alignments/lattices supplied)"
  echo "  --max-count <int;default=100>                    # Once the data count (frames times posterior"
  echo "                                                   # scale) exceeds this, stats are scaled down."
  exit 1;
fi

if [ $# -eq 4 ]; then
  data=$1
  lang=$2
  srcdir=$3
  dir=$4
  # No alignment/decode dir supplied: silence is not down-weighted.  Set this
  # explicitly so a stray exported variable cannot change the script's behavior.
  ali_or_decode_dir=
else # 5 arguments
  data=$1
  lang=$2
  srcdir=$3
  ali_or_decode_dir=$4
  dir=$5
fi

for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
    $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

mkdir -p $dir/log
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

if [ ! -z "$ali_or_decode_dir" ]; then
  # Derive per-frame silence weights from the supplied alignments or lattices;
  # these will be used to down-weight silence frames in the iVector stats.

  nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;

  if [ -f $ali_or_decode_dir/ali.1.gz ]; then
    # Alignment directory: the model lives directly inside it.
    if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist."
      exit 1;
    fi

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      # Use the same model we just checked for (${mdl}.mdl), rather than
      # hard-coding final.mdl, so --mdl works as documented.
      $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi

  elif [ -f $ali_or_decode_dir/lat.1.gz ]; then
    # Decode directory: the model lives one level up (e.g. exp/tri3/final.mdl).
    if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist."
      exit 1;
    fi

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      # Get the best path from the lattices, then silence weights as above.
      $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi
  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
    exit 1;
  fi

fi

# Now work out the per-speaker iVectors.

sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)

# gmm_feats: features matching how the UBM was trained (online CMVN), used only
# to get Gaussian posteriors; feats: un-normalized features used for the actual
# iVector-extractor stats.
gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"


if [ $stage -le 1 ]; then
  if [ ! -z "$ali_or_decode_dir" ]; then
    # With silence weights: scale the posteriors by the per-frame weights
    # before accumulating iVector stats.
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
        $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
        $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  fi
fi

# get an utterance-level set of iVectors (just duplicate the speaker-level ones).
if [ $stage -le 2 ]; then
  for j in $(seq $nj); do
    utils/apply_map.pl -f 2 $dir/ivectors_spk.$j.ark <$sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1;
  done
fi

# Each text-format line is "<spk> [ v1 ... vN ]"; subtract 3 for the key and
# the two brackets to get the iVector dimension.
ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1;
echo  "$0: iVector dim is $ivector_dim"

base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;

start_dim=$base_feat_dim
end_dim=$[$base_feat_dim+$ivector_dim-1]


if [ $stage -le 3 ]; then
  # here, we are just using the original features in $sdata/JOB/feats.scp for
  # their number of rows; we use the select-feats command to remove those
  # features and retain only the iVector features.
  $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \
    append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \
    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
    subsample-feats --n=$ivector_period ark:- ark:- \| \
    copy-feats --compress=$compress ark:- \
      ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi

if [ $stage -le 4 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1;
fi

echo "$0: done extracting (pseudo-online) iVectors"

egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
3232
# used when training the iVector extractor, but more important
3333
# that this match the value used when you do real online decoding
3434
# with the neural nets trained with these iVectors.
35-
#utts_per_spk_max=-1 # This option is no longer supported, you should use
36-
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
37-
# option to make a copy of the data dir.
3835
compress=true # If true, compress the iVectors stored on disk (it's lossy
3936
# compression, as used for feature matrices).
4037

@@ -58,10 +55,6 @@ if [ $# != 3 ]; then
5855
echo " # diagonal model."
5956
echo " --min-post <float;default=0.025> # Pruning threshold for posteriors"
6057
echo " --ivector-period <int;default=10> # How often to extract an iVector (frames)"
61-
echo " --utts-per-spk-max <int;default=-1> # Controls splitting into 'fake speakers'."
62-
echo " # Set to 1 if compatibility with utterance-by-utterance"
63-
echo " # decoding is the only factor, and to larger if you care "
64-
echo " # also about adaptation over several utterances."
6558
exit 1;
6659
fi
6760

@@ -71,7 +64,7 @@ dir=$3
7164

7265
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
7366
$srcdir/online_cmvn.conf $srcdir/final.mat; do
74-
[ ! -f $f ] && echo "No such file $f" && exit 1;
67+
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
7568
done
7669

7770
# Set various variables.
@@ -86,7 +79,7 @@ splice_opts=$(cat $srcdir/splice_opts)
8679
# the program ivector-extract-online2 does a bunch of stuff in memory and is
8780
# config-driven... this was easier in this case because the same code is
8881
# involved in online decoding. We need to create a config file for iVector
89-
# extration.
82+
# extraction.
9083

9184
ieconf=$dir/conf/ivector_extractor.conf
9285
echo -n >$ieconf
@@ -104,15 +97,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
10497
echo "--max-remembered-frames=1000" >>$ieconf # the default
10598

10699

107-
ns=$(wc -l <$data/spk2utt)
108-
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
109-
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
110-
echo " see http://kaldi.sourceforge.net/data_prep.html (search for 'bold') for why"
111-
echo " Setting --utts-per-spk-max to 1."
112-
utts_per_spk_max=1
113-
fi
114-
115-
116100

117101
for n in $(seq $nj); do
118102
# This will do nothing unless the directory $dir/storage exists;

src/bin/ali-to-post.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
3434
try {
3535
const char *usage =
3636
"Convert alignments to posteriors\n"
37-
"Usage: ali-to-post [options] alignments-rspecifier posteriors-wspecifier\n"
37+
"Usage: ali-to-post [options] <alignments-rspecifier> <posteriors-wspecifier>\n"
3838
"e.g.:\n"
3939
" ali-to-post ark:1.ali ark:1.post\n";
4040

src/bin/copy-matrix.cc

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,14 @@ int main(int argc, char *argv[]) {
3838
"See also: copy-feats\n";
3939

4040
bool binary = true;
41+
BaseFloat scale = 1.0;
4142
ParseOptions po(usage);
4243

43-
po.Register("binary", &binary, "Write in binary mode (only relevant if output is a wxfilename)");
44-
44+
po.Register("binary", &binary,
45+
"Write in binary mode (only relevant if output is a wxfilename)");
46+
po.Register("scale", &scale,
47+
"This option can be used to scale the matrices being copied.");
48+
4549
po.Read(argc, argv);
4650

4751
if (po.NumArgs() != 2) {
@@ -68,6 +72,7 @@ int main(int argc, char *argv[]) {
6872
if (!in_is_rspecifier) {
6973
Matrix<BaseFloat> mat;
7074
ReadKaldiObject(matrix_in_fn, &mat);
75+
if (scale != 1.0) mat.Scale(scale);
7176
Output ko(matrix_out_fn, binary);
7277
mat.Write(ko.Stream(), binary);
7378
KALDI_LOG << "Copied matrix to " << matrix_out_fn;
@@ -76,8 +81,15 @@ int main(int argc, char *argv[]) {
7681
int num_done = 0;
7782
BaseFloatMatrixWriter writer(matrix_out_fn);
7883
SequentialBaseFloatMatrixReader reader(matrix_in_fn);
79-
for (; !reader.Done(); reader.Next(), num_done++)
80-
writer.Write(reader.Key(), reader.Value());
84+
for (; !reader.Done(); reader.Next(), num_done++) {
85+
if (scale != 1.0) {
86+
Matrix<BaseFloat> mat(reader.Value());
87+
mat.Scale(scale);
88+
writer.Write(reader.Key(), mat);
89+
} else {
90+
writer.Write(reader.Key(), reader.Value());
91+
}
92+
}
8193
KALDI_LOG << "Copied " << num_done << " matrices.";
8294
return (num_done != 0 ? 0 : 1);
8395
}

src/featbin/append-feats.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ int main(int argc, char *argv[]) {
5050
exit(1);
5151
}
5252

53-
std::string rspecifier1 = po.GetArg(1);
54-
std::string rspecifier2 = po.GetArg(2);
55-
std::string wspecifier = po.GetArg(3);
53+
std::string rspecifier1 = po.GetArg(1),
54+
rspecifier2 = po.GetArg(2),
55+
wspecifier = po.GetArg(3);
5656

5757
BaseFloatMatrixWriter feats_writer(wspecifier);
5858
SequentialBaseFloatMatrixReader feats_reader1(rspecifier1);

src/featbin/paste-feats.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ int main(int argc, char *argv[]) {
7878
"Usage: paste-feats <in-rspecifier1> <in-rspecifier2> [<in-rspecifier3> ...] <out-wspecifier>\n"
7979
" or: paste-feats <in-rxfilename1> <in-rxfilename2> [<in-rxfilename3> ...] <out-wxfilename>\n"
8080
" e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n"
81-
" or: paste-feats foo.mat bar.mat baz.mat\n";
82-
81+
" or: paste-feats foo.mat bar.mat baz.mat\n"
82+
"See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n";
8383

8484
ParseOptions po(usage);
8585

0 commit comments

Comments
 (0)