Skip to content

Commit 295877c

Browse files
teinhonglo authored and danpovey committed
[scripts] Add decoding script that combines nnet3 outputs (kaldi-asr#2534)
1 parent eacf34a commit 295877c

File tree

2 files changed

+342
-4
lines changed

2 files changed

+342
-4
lines changed
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
#!/bin/bash
2+
3+
# Copyright 2018 Tien-Hong Lo
4+
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12+
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13+
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14+
# MERCHANTABLITY OR NON-INFRINGEMENT.
15+
# See the Apache 2 License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
19+
# Script for system combination using output of the neural networks.
20+
# This calls nnet3-compute, matrix-sum and latgen-faster-mapped to create a system combination.
21+
set -euo pipefail
22+
# begin configuration section.
23+
cmd=run.pl
24+
25+
# Neural Network
26+
stage=0
27+
iter=final
28+
nj=30
29+
output_name="output"
30+
ivector_scale=1.0
31+
apply_exp=false # Apply exp i.e. write likelihoods instead of log-likelihoods
32+
compress=false # Specifies whether the output should be compressed before
33+
# dumping to disk
34+
use_gpu=false
35+
skip_diagnostics=false
36+
extra_left_context=0
37+
extra_right_context=0
38+
extra_left_context_initial=-1
39+
extra_right_context_final=-1
40+
online_ivector_dir=
41+
frame_subsampling_factor=1
42+
frames_per_chunk=150
43+
average=true
44+
45+
# Decode
46+
beam=15.0 # prune the lattices prior to MBR decoding, for speed.
47+
max_active=7000
48+
min_active=200
49+
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
50+
post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the
51+
# regular scoring script works.
52+
lattice_beam=8.0 # Beam we use in lattice generation.
53+
num_threads=1 # if >1, will use latgen-faster--map-parallel
54+
min_lmwt=5
55+
max_lmwt=15
56+
parallel_opts="--num-threads 3"
57+
scoring_opts=
58+
minimize=false
59+
skip_scoring=false
60+
61+
word_determinize=false # If set to true, then output lattice does not retain
62+
# alternate paths a sequence of words (with alternate pronunciations).
63+
# Setting to true is the default in steps/nnet3/decode.sh.
64+
# However, setting this to false
65+
# is useful for generation w of semi-supervised training
66+
# supervision and frame-level confidences.
67+
write_compact=true # If set to false, then writes the lattice in non-compact format,
68+
# retaining the acoustic scores on each arc. This is
69+
# required to be false for LM rescoring undeterminized
70+
# lattices (when --word-determinize is false)
71+
#end configuration section.
72+
73+
[ -f ./path.sh ] && . ./path.sh
74+
. parse_options.sh || exit 1;
75+
76+
77+
if [ $# -lt 5 ]; then
78+
echo "Usage: $0 [options] <data-dir> <graph-dir> <nnet3-dir> <nnet3-dir2> [<nnet3-dir3> ... ] <output-dir>"
79+
echo "e.g.: local/socal/score_fusion.sh --nj 8 \\"
80+
echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\"
81+
echo " data/test_eval92_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\"
82+
echo " exp/nnet3/tdnn_comb/decode_dev"
83+
echo "main options (for others, see top of script file)"
84+
echo " --config <config-file> # config containing options"
85+
echo " --nj <nj> # number of parallel jobs"
86+
echo " --cmd <cmd> # Command to run in parallel with"
87+
echo " --iter <iter> # Iteration of model to decode; default is final."
88+
exit 1;
89+
fi
90+
91+
echo "$0 $@"
92+
93+
data=$1
94+
graphdir=$2
95+
dir=${@: -1} # last argument to the script
96+
shift 2;
97+
model_dirs=( $@ ) # read the remaining arguments into an array
98+
unset model_dirs[${#model_dirs[@]}-1] # 'pop' the last argument which is odir
99+
num_sys=${#model_dirs[@]} # number of systems to combine
100+
101+
for f in $graphdir/words.txt $graphdir/phones/word_boundary.int ; do
102+
[ ! -f $f ] && echo "$0: file $f does not exist" && exit 1;
103+
done
104+
105+
[ ! -z "$online_ivector_dir" ] && \
106+
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
107+
108+
if [ ! -z "$online_ivector_dir" ]; then
109+
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
110+
ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
111+
fi
112+
113+
frame_subsampling_opt=
114+
if [ $frame_subsampling_factor -ne 1 ]; then
115+
# e.g. for 'chain' systems
116+
frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
117+
fi
118+
119+
# convert $dir to absolute pathname
120+
fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`
121+
122+
# Possibly use multi-threaded decoder
123+
thread_string=
124+
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
125+
126+
mkdir -p $dir/temp
127+
128+
for i in `seq 0 $[num_sys-1]`; do
129+
srcdir=${model_dirs[$i]}
130+
131+
model=$srcdir/$iter.mdl
132+
if [ ! -f $srcdir/$iter.mdl ]; then
133+
echo "$0: Error: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl exit" && exit 1;
134+
fi
135+
136+
# check that they have the same tree
137+
show-transitions $graphdir/phones.txt $model > $dir/temp/transition.${i}.txt
138+
cmp_tree=`diff -q $dir/temp/transition.0.txt $dir/temp/transition.${i}.txt | awk '{print $5}'`
139+
if [ ! -z $cmp_tree ]; then
140+
echo "$0 tree must be the same."
141+
exit 0;
142+
fi
143+
144+
# check that they have the same frame-subsampling-factor
145+
if [ $frame_subsampling_factor -ne `cat $srcdir/frame_subsampling_factor` ]; then
146+
echo "$0 frame_subsampling_factor must be the same."
147+
exit 0;
148+
fi
149+
150+
for f in $data/feats.scp $model $extra_files; do
151+
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
152+
done
153+
154+
if [ ! -z "$output_name" ] && [ "$output_name" != "output" ]; then
155+
echo "$0: Using output-name $output_name"
156+
model="nnet3-copy --edits='remove-output-nodes name=output;rename-node old-name=$output_name new-name=output' $model - |"
157+
fi
158+
159+
## Set up features.
160+
if [ -f $srcdir/final.mat ]; then
161+
echo "$0: Error: lda feature type is no longer supported." && exit 1
162+
fi
163+
164+
sdata=$data/split$nj;
165+
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
166+
167+
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
168+
169+
if $apply_exp; then
170+
output_wspecifier="ark:| copy-matrix --apply-exp ark:- ark:-"
171+
else
172+
output_wspecifier="ark:| copy-feats --compress=$compress ark:- ark:-"
173+
fi
174+
175+
gpu_opt="--use-gpu=no"
176+
gpu_queue_opt=
177+
178+
if $use_gpu; then
179+
gpu_queue_opt="--gpu 1"
180+
gpu_opt="--use-gpu=yes"
181+
fi
182+
183+
echo "$i $model";
184+
models[$i]="ark,s,cs:nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \
185+
--frames-per-chunk=$frames_per_chunk \
186+
--extra-left-context=$extra_left_context \
187+
--extra-right-context=$extra_right_context \
188+
--extra-left-context-initial=$extra_left_context_initial \
189+
--extra-right-context-final=$extra_right_context_final \
190+
'$model' '$feats' '$output_wspecifier' |"
191+
done
192+
193+
# remove tempdir
194+
rm -rf $dir/temp
195+
196+
# split data to nj
197+
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
198+
echo $nj > $dir/num_jobs
199+
200+
201+
# Assume the nnet trained by
202+
# the same tree and frame subsampling factor.
203+
mkdir -p $dir/log
204+
205+
if [ -f $model ]; then
206+
echo "$0: $model exists, copy model to $dir/../"
207+
cp $model $dir/../
208+
fi
209+
210+
if [ -f $srcdir/frame_shift ]; then
211+
cp $srcdir/frame_shift $dir/../
212+
echo "$0: $srcdir/frame_shift exists, copy $srcdir/frame_shift to $dir/../"
213+
elif [ -f $srcdir/frame_subsampling_factor ]; then
214+
cp $srcdir/frame_subsampling_factor $dir/../
215+
echo "$0: $srcdir/frame_subsampling_factor exists, copy $srcdir/frame_subsampling_factor to $dir/../"
216+
fi
217+
218+
lat_wspecifier="ark:|"
219+
extra_opts=
220+
if ! $write_compact; then
221+
extra_opts="--determinize-lattice=false"
222+
lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |"
223+
fi
224+
225+
if [ "$post_decode_acwt" == 1.0 ]; then
226+
lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz"
227+
else
228+
lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
229+
fi
230+
231+
232+
if [ $stage -le 0 ]; then
233+
$cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
234+
matrix-sum --average=$average "${models[@]}" ark:- \| \
235+
latgen-faster-mapped$thread_string --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
236+
--minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
237+
--word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \
238+
$graphdir/HCLG.fst ark:- "$lat_wspecifier"
239+
fi
240+
241+
if [ $stage -le 1 ]; then
242+
if ! $skip_diagnostics ; then
243+
[ ! -z $iter ] && iter_opt="--iter $iter"
244+
steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
245+
fi
246+
fi
247+
248+
if ! $skip_scoring ; then
249+
if [ $stage -le 2 ]; then
250+
[ ! -x local/score.sh ] && \
251+
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
252+
echo "score best paths"
253+
[ "$iter" != "final" ] && iter_opt="--iter $iter"
254+
scoring_opts="--min_lmwt $min_lmwt"
255+
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
256+
echo "score confidence and timing with sclite"
257+
fi
258+
fi
259+
260+
261+
exit 0
262+

src/bin/matrix-sum.cc

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,79 @@ int32 TypeOneUsage(const ParseOptions &po,
103103
return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1;
104104
}
105105

106+
int32 TypeOneUsageAverage(const ParseOptions &po) {
107+
int32 num_args = po.NumArgs();
108+
std::string matrix_in_fn1 = po.GetArg(1),
109+
matrix_out_fn = po.GetArg(num_args);
110+
BaseFloat scale = 1.0 / (num_args - 1);
111+
112+
// Output matrix
113+
BaseFloatMatrixWriter matrix_writer(matrix_out_fn);
114+
115+
// Input matrices
116+
SequentialBaseFloatMatrixReader matrix_reader1(matrix_in_fn1);
117+
std::vector<RandomAccessBaseFloatMatrixReader*>
118+
matrix_readers(num_args-2,
119+
static_cast<RandomAccessBaseFloatMatrixReader*>(NULL));
120+
std::vector<std::string> matrix_in_fns(num_args-2);
121+
for (int32 i = 2; i < num_args; ++i) {
122+
matrix_readers[i-2] = new RandomAccessBaseFloatMatrixReader(po.GetArg(i));
123+
matrix_in_fns[i-2] = po.GetArg(i);
124+
}
125+
126+
int32 n_utts = 0, n_total_matrices = 0,
127+
n_success = 0, n_missing = 0, n_other_errors = 0;
128+
129+
for (; !matrix_reader1.Done(); matrix_reader1.Next()) {
130+
std::string key = matrix_reader1.Key();
131+
Matrix<BaseFloat> matrix1 = matrix_reader1.Value();
132+
matrix_reader1.FreeCurrent();
133+
n_utts++;
134+
n_total_matrices++;
135+
136+
matrix1.Scale(scale);
137+
138+
Matrix<BaseFloat> matrix_out(matrix1);
139+
140+
for (int32 i = 0; i < num_args-2; ++i) {
141+
if (matrix_readers[i]->HasKey(key)) {
142+
Matrix<BaseFloat> matrix2 = matrix_readers[i]->Value(key);
143+
n_total_matrices++;
144+
if (SameDim(matrix2, matrix_out)) {
145+
matrix_out.AddMat(scale, matrix2, kNoTrans);
146+
} else {
147+
KALDI_WARN << "Dimension mismatch for utterance " << key
148+
<< " : " << matrix2.NumRows() << " by "
149+
<< matrix2.NumCols() << " for "
150+
<< "system " << (i + 2) << ", rspecifier: "
151+
<< matrix_in_fns[i] << " vs " << matrix_out.NumRows()
152+
<< " by " << matrix_out.NumCols()
153+
<< " primary matrix, rspecifier:" << matrix_in_fn1;
154+
n_other_errors++;
155+
}
156+
} else {
157+
KALDI_WARN << "No matrix found for utterance " << key << " for "
158+
<< "system " << (i + 2) << ", rspecifier: "
159+
<< matrix_in_fns[i];
160+
n_missing++;
161+
}
162+
}
163+
164+
matrix_writer.Write(key, matrix_out);
165+
n_success++;
166+
}
167+
168+
KALDI_LOG << "Processed " << n_utts << " utterances: with a total of "
169+
<< n_total_matrices << " matrices across " << (num_args-1)
170+
<< " different systems";
171+
KALDI_LOG << "Produced output for " << n_success << " utterances; "
172+
<< n_missing << " total missing matrices";
173+
174+
DeletePointers(&matrix_readers);
175+
176+
return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1;
177+
}
178+
106179
int32 TypeTwoUsage(const ParseOptions &po,
107180
bool binary) {
108181
KALDI_ASSERT(po.NumArgs() == 2);
@@ -223,7 +296,7 @@ int main(int argc, char *argv[]) {
223296
po.Register("binary", &binary, "If true, write output as binary (only "
224297
"relevant for usage types two or three");
225298
po.Register("average", &average, "If true, compute average instead of "
226-
"sum; only currently compatible with type 3 usage.");
299+
"sum; currently compatible with type 3 or type 1 usage.");
227300

228301
po.Read(argc, argv);
229302

@@ -232,9 +305,11 @@ int main(int argc, char *argv[]) {
232305
if (po.NumArgs() >= 2 &&
233306
ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) {
234307
if (average)
235-
KALDI_ERR << "--average option not compatible with type one usage.";
236-
// output to table.
237-
exit_status = TypeOneUsage(po, scale1, scale2);
308+
// average option with type one usage.";
309+
exit_status = TypeOneUsageAverage(po);
310+
else
311+
// output to table.
312+
exit_status = TypeOneUsage(po, scale1, scale2);
238313
} else if (po.NumArgs() == 2 &&
239314
ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier &&
240315
ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) ==
@@ -260,3 +335,4 @@ int main(int argc, char *argv[]) {
260335
return -1;
261336
}
262337
}
338+

0 commit comments

Comments
 (0)