Skip to content

Commit 8df46b8

Browse files
committed
master: merging most code changes from chain branch (but not the actual chain code)-- want to keep shared parts of the code the same.
1 parent b5fb3f9 commit 8df46b8

File tree

126 files changed

+5799
-2047
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

126 files changed

+5799
-2047
lines changed

egs/swbd/s5c/local/nnet3/run_ivector_common.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,20 @@ if [ "$speed_perturb" == "true" ]; then
2424
utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
2525
utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
2626
rm -r data/temp1 data/temp2
27-
27+
2828
mfccdir=mfcc_perturbed
2929
steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
3030
data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
3131
steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
3232
utils/fix_data_dir.sh data/${datadir}_tmp
33-
33+
3434
utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
3535
utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
3636
utils/fix_data_dir.sh data/${datadir}_sp
3737
rm -r data/temp0 data/${datadir}_tmp
3838
done
3939
fi
40-
40+
4141
if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
4242
#obtain the alignment of the perturbed data
4343
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \

egs/swbd/s5c/local/online/run_nnet2_common.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
. ./cmd.sh
4-
set -e
4+
set -e
55
stage=1
66
train_stage=-10
77

@@ -18,7 +18,7 @@ if [ $stage -le 1 ]; then
1818
fi
1919
utils/copy_data_dir.sh data/train data/train_scaled_hires
2020
utils/copy_data_dir.sh data/train data/train_hires
21-
21+
2222
data_dir=data/train_scaled_hires
2323
cat $data_dir/wav.scp | python -c "
2424
import sys, os, subprocess, re, random
@@ -34,12 +34,12 @@ for line in sys.stdin.readlines():
3434
--cmd "$train_cmd" data/train_scaled_hires exp/make_hires/train_scaled $mfccdir;
3535
steps/compute_cmvn_stats.sh data/train_scaled_hires exp/make_hires/train_scaled $mfccdir;
3636

37-
# we need these features for the run_nnet2_ms.sh
37+
# we need these features for the run_nnet2_ms.sh
3838
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
3939
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir;
4040
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir;
4141

42-
# Remove the small number of utterances that couldn't be extracted for some
42+
# Remove the small number of utterances that couldn't be extracted for some
4343
# reason (e.g. too short; no such file).
4444
utils/fix_data_dir.sh data/train_scaled_hires;
4545
utils/fix_data_dir.sh data/train_hires;
@@ -50,7 +50,7 @@ for line in sys.stdin.readlines():
5050
data/eval2000_hires exp/make_hires/eval2000 $mfccdir;
5151
steps/compute_cmvn_stats.sh data/eval2000_hires exp/make_hires/eval2000 $mfccdir;
5252
utils/fix_data_dir.sh data/eval2000_hires # remove segments with problems
53-
53+
5454
# Use the first 4k sentences as dev set. Note: when we trained the LM, we used
5555
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
5656
# LM training data. However, they will be in the lexicon, plus speakers
@@ -84,7 +84,7 @@ if [ $stage -le 2 ]; then
8484
# We need to build a small system just because we need the LDA+MLLT transform
8585
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
8686
# the transform (12th iter is the last), any further training is pointless.
87-
# this decision is based on fisher_english
87+
# this decision is based on fisher_english
8888
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
8989
--splice-opts "--left-context=3 --right-context=3" \
9090
5500 90000 data/train_scaled_hires_100k_nodup \

egs/swbd/s5c/local/score_basic.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ dir=$3
2828

2929
model=$dir/../final.mdl # assume model one level up from decoding dir.
3030

31-
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
31+
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
3232
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
3333
hubdir=`dirname $hubscr`
3434

@@ -42,7 +42,7 @@ mkdir -p $dir/scoring/log
4242

4343

4444
function filter_text {
45-
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
45+
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
4646
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
4747
foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
4848
'[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'

egs/swbd/s5c/run.sh

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# 1. added more training data for early stages
88
# 2. removed SAT system (and later stages) on the 100k utterance training data
99
# 3. reduced number of LM rescoring, only sw1_tg and sw1_fsh_fg remain
10-
# 4. mapped swbd transcription to fisher style, instead of the other way around
10+
# 4. mapped swbd transcription to fisher style, instead of the other way around
1111

1212
set -e # exit on error
1313
has_fisher=true
@@ -22,7 +22,7 @@ local/swbd1_prepare_dict.sh
2222
# which specifies the directory to Switchboard documentations. Specifically, if
2323
# this argument is given, the script will look for the conv.tab file and correct
2424
# speaker IDs to the actual speaker personal identification numbers released in
25-
# the documentations. The documentations can be found here:
25+
# the documentations. The documentations can be found here:
2626
# https://catalog.ldc.upenn.edu/docs/LDC97S62/
2727
# Note: if you are using this link, make sure you rename conv_tab.csv to conv.tab
2828
# after downloading.
@@ -37,7 +37,7 @@ utils/prepare_lang.sh data/local/dict_nosp \
3737
"<unk>" data/local/lang_nosp data/lang_nosp
3838

3939
# Now train the language models. We are using SRILM and interpolating with an
40-
# LM trained on the Fisher transcripts (part 2 disk is currently missing; so
40+
# LM trained on the Fisher transcripts (part 2 disk is currently missing; so
4141
# only part 1 transcripts ~700hr are used)
4242

4343
# If you have the Fisher data, you can set this "fisher_dir" variable.
@@ -79,7 +79,7 @@ mfccdir=mfcc
7979
for x in train eval2000; do
8080
steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \
8181
data/$x exp/make_mfcc/$x $mfccdir
82-
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
82+
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
8383
utils/fix_data_dir.sh data/$x
8484
done
8585

@@ -91,10 +91,10 @@ utils/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min
9191
n=$[`cat data/train/segments | wc -l` - 4000]
9292
utils/subset_data_dir.sh --last data/train $n data/train_nodev
9393

94-
# Now-- there are 260k utterances (313hr 23min), and we want to start the
95-
# monophone training on relatively short utterances (easier to align), but not
94+
# Now-- there are 260k utterances (313hr 23min), and we want to start the
95+
# monophone training on relatively short utterances (easier to align), but not
9696
# only the shortest ones (mostly uh-huh). So take the 100k shortest ones;
97-
# remove most of the repeated utterances (these are the uh-huh type ones), and
97+
# remove most of the repeated utterances (these are the uh-huh type ones), and
9898
# then take 10k random utterances from those (about 4hr 40mins)
9999
utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort
100100
utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort
@@ -108,13 +108,13 @@ local/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup # 110hr
108108
local/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 286hr
109109
## Starting basic training on MFCC features
110110
steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
111-
data/train_30kshort data/lang_nosp exp/mono
111+
data/train_30kshort data/lang_nosp exp/mono
112112

113113
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
114-
data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali
114+
data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali
115115

116116
steps/train_deltas.sh --cmd "$train_cmd" \
117-
3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1
117+
3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1
118118

119119
(
120120
graph_dir=exp/tri1/graph_nosp_sw1_tg
@@ -125,7 +125,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \
125125
) &
126126

127127
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
128-
data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali
128+
data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali
129129

130130
steps/train_deltas.sh --cmd "$train_cmd" \
131131
4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2
@@ -149,11 +149,11 @@ steps/align_si.sh --nj 30 --cmd "$train_cmd" \
149149
# From now, we start using all of the data (except some duplicates of common
150150
# utterances, which don't really contribute much).
151151
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
152-
data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup
152+
data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup
153153

154154
# Do another iteration of LDA+MLLT training, on all the data.
155155
steps/train_lda_mllt.sh --cmd "$train_cmd" \
156-
6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3
156+
6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3
157157

158158
(
159159
graph_dir=exp/tri3/graph_nosp_sw1_tg
@@ -190,7 +190,7 @@ fi
190190

191191
# Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data.
192192
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
193-
data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup
193+
data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup
194194

195195

196196
steps/train_sat.sh --cmd "$train_cmd" \
@@ -215,21 +215,21 @@ if $has_fisher; then
215215
exp/tri4/decode_eval2000_sw1_{tg,fsh_fg}
216216
fi
217217

218-
# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data.
218+
# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data.
219219
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
220220
data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup
221221

222222
steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" \
223223
--config conf/decode.config --transform-dir exp/tri4_ali_nodup \
224-
data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup
224+
data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup
225225

226226
# 4 iterations of MMI seems to work well overall. The number of iterations is
227227
# used as an explicit argument even though train_mmi.sh will use 4 iterations by
228228
# default.
229229
num_mmi_iters=4
230230
steps/train_mmi.sh --cmd "$decode_cmd" \
231231
--boost 0.1 --num-iters $num_mmi_iters \
232-
data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1
232+
data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1
233233

234234
for iter in 1 2 3 4; do
235235
(
@@ -260,7 +260,7 @@ steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \
260260
steps/train_mmi_fmmi.sh --learning-rate 0.005 \
261261
--boost 0.1 --cmd "$train_cmd" \
262262
data/train_nodup data/lang exp/tri4_ali_nodup exp/tri4_dubm \
263-
exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1
263+
exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1
264264

265265
for iter in 4 5 6 7 8; do
266266
(

egs/wsj/s5/local/nnet3/run_tdnn_baseline.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ if [ $stage -le 8 ]; then
3535
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
3636
fi
3737

38-
3938
steps/nnet3/train_tdnn.sh --stage $train_stage \
4039
--num-epochs 8 --num-jobs-initial 2 --num-jobs-final 14 \
4140
--splice-indexes "-1,0,1 -2,1 -4,2 0" \
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
#!/bin/bash
#
# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Version of align_fmllr.sh that generates lattices (lat.*.gz) with
# alignments of alternative pronunciations in them.  Mainly intended
# as a precursor to CTC training for now.
#
# Pipeline (gated by --stage):
#   0: compile training graphs (with transition-probs baked in);
#   1: first-pass speaker-independent alignment;
#   2: estimate per-speaker fMLLR transforms from that alignment;
#   3: generate undeterminized lattices over the fMLLR features.
#
# Usage: steps/align_fmllr_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>

# Begin configuration section.
stage=0
nj=4                 # number of parallel jobs (data is split this many ways)
cmd=run.pl           # job-dispatch command (run.pl or queue.pl)
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20  # For the lattice-generation phase there is no retry-beam.  This
               # is a limitation of gmm-latgen-faster.  We just use an
               # intermediate beam.  We'll lose a little data and it will be
               # slightly slower.  (however, the min-active of 200 that
               # gmm-latgen-faster defaults to may help.)
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "usage: steps/align_fmllr_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Re-split the data only if the existing split is missing or stale
# (feats.scp newer than the split directory).
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null  # optional: only SAT systems have an alignment model.
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# Presence of final.mat implies an LDA(+MLLT) front-end; otherwise use deltas.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# sifeats = speaker-independent feature pipeline (before any fMLLR).
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
    cp $srcdir/full.mat $dir 2>/dev/null
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  # Use the speaker-independent alignment model for pass-1 alignment if available.
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both model "commands" boost silence likelihoods on the fly before use.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## because gmm-latgen-faster doesn't support adding the transition-probs to the
## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
## because the other scripts write them without transition probs.
if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  # Map each transcript word to an integer, sending OOVs to the <unk> symbol.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
    compile-train-graphs $scale_opts $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


if [ $stage -le 1 ]; then
  # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because,
  # as explained above, we compiled the transition probs into the training
  # graphs.
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \
    --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Two-model case (SAT system): estimate fMLLR via Gaussian-level posteriors
    # from the alignment model, applied to the speaker-adapted model.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:-  \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    # Single-model case: estimate fMLLR directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:-  \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Speaker-adapted features: apply the per-speaker fMLLR transforms on top of sifeats.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

if [ $stage -le 3 ]; then
  # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow_partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  # --lattice-beam=$final_beam equals the decoding beam, which means we do no
  #   pruning of the lattice (lattices from a training transcription will be
  #   small anyway).
  echo "$0: generating lattices containing alternate pronunciations."
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \
    --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
    "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
    "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

# The pass-1 alignments were only needed to estimate the fMLLR transforms.
rm $dir/pre_ali.*.gz

echo "$0: done generating lattices from training transcripts."

utils/summarize_warnings.pl $dir/log

exit 0;

0 commit comments

Comments
 (0)