Skip to content

Commit 2f29a0c

Browse files
committed
sandbox/dan2: merge changes from trunk, fix additional include guard names.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@2998 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2 parents 2c04c0f + 41c2a6e commit 2f29a0c

56 files changed

Lines changed: 1526 additions & 146 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
beam=18.0 # beam for decoding.
2-
lat_beam=10.0 # lattice beam for decoding
2+
3+
lattice_beam=10.0 # lattice beam for decoding
4+
lat_beam=10.0 # lattice beam for decoding (## This is the variable in older kaldi scripts, and has been replaced by "lattice_beam")
5+
36
first_beam=10.0 # beam for 1st-pass decoding in SAT.

egs/swbd/s5b/RESULTS

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,27 @@ exp/tri4b/decode_eval2000_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys: | Sum/
3434
exp/tri4b/decode_eval2000_sw1_tg.si/score_12/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 73.6 19.3 7.1 3.6 30.0 66.2 |
3535

3636

37+
# some more recent results (Sep 25 2013), from tri4b and tri4c_reseg, to
38+
# see the effect of resegmentation. Note: we're only looking at the "swbd" results here,
39+
# the callhome results or total results are terrible because of huge insertions, because
40+
# it seems that only some segments of the audio files are in the stm. I'm not sure
41+
# where to get the start and end points in the files, that they intended us to
42+
# decode.
43+
%WER 22.2 | 1831 21395 | 80.3 13.8 5.9 2.5 22.2 60.1 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
44+
%WER 29.3 | 1831 21395 | 73.5 18.7 7.8 2.9 29.3 65.0 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr.si/score_17/eval2000.ctm.swbd.filt.sys
45+
%WER 22.5 | 1831 21395 | 79.8 13.8 6.4 2.3 22.5 60.3 | exp/tri4b/decode_eval2000_sw1_tg/score_17/eval2000.ctm.swbd.filt.sys
46+
%WER 30.5 | 1831 21395 | 73.1 19.8 7.1 3.6 30.5 65.8 | exp/tri4b/decode_eval2000_sw1_tg.si/score_14/eval2000.ctm.swbd.filt.sys
47+
48+
%WER 22.9 | 1831 21395 | 79.7 13.4 6.9 2.6 22.9 62.8 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
49+
%WER 29.6 | 1831 21395 | 73.8 18.2 8.1 3.4 29.6 66.8 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr.si/score_13/eval2000.ctm.swbd.filt.sys
50+
%WER 23.5 | 1831 21395 | 79.1 13.8 7.1 2.6 23.5 63.6 | exp/tri4c_reseg/decode_eval2000_sw1_tg/score_15/eval2000.ctm.swbd.filt.sys
51+
%WER 30.9 | 1831 21395 | 73.1 19.0 7.9 4.0 30.9 67.6 | exp/tri4c_reseg/decode_eval2000_sw1_tg.si/score_12/eval2000.ctm.swbd.filt.sys
52+
# so the resegmented one is about 0.3 to 1.0 worse, but the #sub is actually down, it's due to more deletions
53+
# and insertions. This is kind of what we'd expect, since the reference segmentation is a kind of "oracle".
54+
55+
# below are some partial results (only the SI decode finished so far), of a version where I
56+
# kept the segments that the segmentation regarded as noise (e.g. cough, etc.) Strangely, it
57+
# has (slightly) more deletions and fewer insertions than the baseline, which is the opposite
58+
# of what we would expect.
59+
%WER 29.7 | 1831 21395 | 73.6 18.3 8.0 3.4 29.7 67.0 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_fsh_tgpr.si/score_13/eval2000_with_noise.ctm.swbd.filt.sys
60+
%WER 30.8 | 1831 21395 | 72.9 19.0 8.1 3.7 30.8 67.5 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_tg.si/score_13/eval2000_with_noise.ctm.swbd.filt.sys
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/bin/bash
# First stage of the resegmentation experiment: align the 30k "nodup" subset
# with exp/tri3b, train an LDA+MLLT system into exp/tri4b_seg from those
# alignments, and build a phone-level decoding graph inside exp/tri4b_seg.
# Expects cmd.sh (sourced below) to define $train_cmd.
2+
3+
. cmd.sh
4+
5+
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
6+
data/train_30k_nodup data/lang exp/tri3b exp/tri3b_ali_30k_nodup || exit 1;
7+
8+
9+
steps/train_lda_mllt.sh --cmd "$train_cmd" --realign-iters "" \
10+
1000 10000 data/train_30k_nodup data/lang exp/tri3b_ali_30k_nodup exp/tri4b_seg || exit 1;
11+
12+
# Make the phone decoding-graph.
# NOTE(review): run.sh now writes the full train_nodup alignments to
# exp/tri3b_ali_nodup rather than exp/tri3b_ali_all -- confirm that
# exp/tri3b_ali_all still exists (or update this path) before running.
13+
steps/make_phone_graph.sh data/lang exp/tri3b_ali_all exp/tri4b_seg || exit 1;
14+
15+
mkdir -p data_reseg
16+
17+
# Build whole-recording ("_orig") copies of the train and eval2000 data dirs:
# copy data/<set> to data_reseg/<set>_orig, drop the per-segment files, make
# every recording its own "speaker", then extract MFCCs and CMVN stats.
# The loop runs over "train" (not "train_orig") so that the output is
# data_reseg/train_orig, which is the directory the decoding and
# resegmentation steps further down read from.
for data in train eval2000; do
18+
cp -rT data/${data} data_reseg/${data}_orig; rm -r data_reseg/${data}_orig/split*
19+
for f in text utt2spk spk2utt feats.scp cmvn.scp segments; do rm data_reseg/${data}_orig/$f; done
20+
cat data_reseg/${data}_orig/wav.scp | awk '{print $1, $1;}' | \
21+
tee data_reseg/${data}_orig/spk2utt > data_reseg/${data}_orig/utt2spk
22+
mfccdir=mfcc_reseg # don't use mfcc because of the way names are assigned within that
23+
# dir, we'll overwrite the old data.
24+
mkdir -p mfcc_reseg
25+
steps/make_mfcc.sh --compress true --nj 20 --cmd "$train_cmd" data_reseg/${data}_orig exp/make_mfcc/${data}_orig $mfccdir || exit 1;
26+
# caution: the new speakers don't correspond to the old ones, since they now have "sw0" at the start..
27+
steps/compute_cmvn_stats.sh --two-channel data_reseg/${data}_orig exp/make_mfcc/${data}_orig $mfccdir || exit 1;
28+
done
29+
30+
31+
32+
# First-pass phone decoding of the un-segmented recordings with the phone graph.
# The output directories must match the ones passed to steps/resegment_data.sh
# below (exp/tri4b_seg/decode_train_orig and exp/tri4b_seg/decode_eval2000_orig).
steps/decode_si_ali.sh --cmd "$decode_cmd" --nj 60 --beam 7.0 --max-active 1000 \
33+
exp/tri4b_seg/phone_graph data_reseg/train_orig exp/tri4b_seg/decode_train_orig || exit 1;
34+
35+
steps/decode_si_ali.sh --cmd "$decode_cmd" --nj 10 --beam 7.0 --max-active 1000 \
36+
exp/tri4b_seg/phone_graph data_reseg/eval2000_orig exp/tri4b_seg/decode_eval2000_orig || exit 1;
37+
38+
39+
# Here: resegment.
40+
# Note: it would be perfectly possible to use exp/tri3b_ali_train here instead
41+
# of exp/tri4b_seg/decode_train_orig. In this case we'd be relying on the transcripts.
42+
# I chose not to do this for more consistency with what happens in test time.
43+
44+
steps/resegment_data.sh --cmd "$train_cmd" data_reseg/train_orig data/lang \
45+
exp/tri4b_seg/decode_train_orig data_reseg/train exp/tri4b_resegment_train
46+
47+
steps/resegment_data.sh --cmd "$train_cmd" data_reseg/eval2000_orig data/lang \
48+
exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000 exp/tri4b_resegment_eval2000
49+
50+
# We need all the training data to be aligned (not just "train_nodup"), in order
51+
# to get the resegmented "text".
52+
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
53+
data/train data/lang exp/tri3b exp/tri3b_ali_train || exit 1;
54+
55+
# Get the file data_reseg/train/text
56+
steps/resegment_text.sh --cmd "$train_cmd" data/train data/lang \
57+
exp/tri3b_ali_train data_reseg/train exp/tri4b_resegment_train
58+
59+
60+
for data in train eval2000; do
61+
utils/fix_data_dir.sh data_reseg/${data}
62+
utils/validate_data_dir.sh --no-feats --no-text data_reseg/${data}
63+
mfccdir=mfcc_reseg # don't use mfcc because of the way names are assigned within that
64+
# dir, we'll overwrite the old data.
65+
steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" data_reseg/${data} \
66+
exp/make_mfcc/${data}_reseg $mfccdir || exit 1;
67+
steps/compute_cmvn_stats.sh data_reseg/${data} exp/make_mfcc/${data}_reseg $mfccdir || exit 1;
68+
utils/fix_data_dir.sh data_reseg/${data} || exit 1;
69+
done
70+
71+
72+
# Note: we'll be comparing tri4b, which was trained on train_nodup, with tri4c_reseg, which
73+
# was trained on *all* the resegmented data. However, it's comparable because the actual hours
74+
# of data is less in tri4c_reseg: 265h, versus 284 in the nodup data.
75+
# cat data/train_nodup/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
76+
# 284.433
77+
# cat data_reseg/train/segments | awk '{nf += $4 - $3; } END{print nf /3600;}'
78+
# 265.154
79+
80+
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
81+
data_reseg/train data/lang exp/tri3b exp/tri3b_ali_reseg || exit 1;
82+
83+
steps/train_sat.sh --cmd "$train_cmd" \
84+
11500 200000 data_reseg/train data/lang exp/tri3b_ali_reseg exp/tri4c_reseg || exit 1;
85+
86+
87+
for lm_suffix in tg fsh_tgpr; do
88+
(
89+
graph_dir=exp/tri4c_reseg/graph_sw1_${lm_suffix}
90+
$train_cmd $graph_dir/mkgraph.log \
91+
utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri4c_reseg $graph_dir
92+
steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
93+
$graph_dir data_reseg/eval2000 exp/tri4c_reseg/decode_eval2000_sw1_${lm_suffix}
94+
) &
95+
done
96+
97+
98+
exit 0;
99+
100+
# Below is experimental.
101+
# I'm figuring out whether we should keep the segments that the 1st pass designated as noise.
102+
steps/resegment_data.sh --cmd "$train_cmd" \
103+
--segmentation-opts "--remove-noise-only-segments false" \
104+
data_reseg/eval2000_orig data/lang \
105+
exp/tri4b_seg/decode_eval2000_orig data_reseg/eval2000_with_noise exp/tri4b_resegment_eval2000_with_noise
106+
107+
for data in eval2000_with_noise; do
108+
utils/fix_data_dir.sh data_reseg/${data}
109+
utils/validate_data_dir.sh --no-feats --no-text data_reseg/${data}
110+
mfccdir=mfcc_reseg # don't use mfcc because of the way names are assigned within that
111+
# dir, we'll overwrite the old data.
112+
steps/make_mfcc.sh --compress true --nj 40 --cmd "$train_cmd" data_reseg/${data} \
113+
exp/make_mfcc/${data}_reseg $mfccdir || exit 1;
114+
steps/compute_cmvn_stats.sh data_reseg/${data} exp/make_mfcc/${data}_reseg $mfccdir || exit 1;
115+
utils/fix_data_dir.sh data_reseg/${data} || exit 1;
116+
done
117+
118+
for lm_suffix in tg fsh_tgpr; do
119+
(
120+
graph_dir=exp/tri4c_reseg/graph_sw1_${lm_suffix}
121+
$train_cmd $graph_dir/mkgraph.log \
122+
utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri4c_reseg $graph_dir
123+
steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
124+
$graph_dir data_reseg/eval2000_with_noise exp/tri4c_reseg/decode_eval2000_with_noise_sw1_${lm_suffix}
125+
) &
126+
done

egs/swbd/s5b/local/score_sclite.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ if [ $stage -le 2 ]; then
8383
fi
8484

8585
# For eval2000 score the subsets
86-
if [ "$name" == "eval2000" ]; then
86+
case "$name" in eval2000* )
8787
# Score only the swbd part...
8888
if [ $stage -le 3 ]; then
8989
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.log \
@@ -98,6 +98,7 @@ if [ "$name" == "eval2000" ]; then
9898
grep -v '^sw_' $dir/score_LMWT/${name}.ctm '>' $dir/score_LMWT/${name}.ctm.callhm '&&' \
9999
$hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT/stm.callhm $dir/score_LMWT/${name}.ctm.callhm || exit 1;
100100
fi
101-
fi
101+
;;
102+
esac
102103

103104
exit 0

egs/swbd/s5b/local/swbd1_train_lms.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,4 +122,3 @@ fi
122122
## The following takes about 11 minutes to download on Eddie:
123123
# wget --no-check-certificate http://ssli.ee.washington.edu/data/191M_conversational_web-filt+periods.gz
124124

125-

egs/swbd/s5b/run.sh

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# This is supposed to be the "new" version of the switchboard recipe,
66
# after the s5/ one became a bit messy. It is not 100% checked-through yet.
77

8-
exit 1;
8+
#exit 1;
99
# This is a shell script, but it's recommended that you run the commands one by
1010
# one by copying and pasting into the shell.
1111
# Caution: some of the graph creation steps use quite a bit of memory, so you
@@ -176,6 +176,7 @@ done
176176
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
177177
data/train_100k_nodup data/lang exp/tri3b exp/tri3b_ali_100k_nodup || exit 1;
178178

179+
179180
steps/train_sat.sh --cmd "$train_cmd" \
180181
5500 90000 data/train_100k_nodup data/lang exp/tri3b_ali_100k_nodup \
181182
exp/tri4a || exit 1;
@@ -190,14 +191,18 @@ for lm_suffix in tg fsh_tgpr; do
190191
) &
191192
done
192193

194+
195+
#local/run_resegment.sh
196+
193197
# Now train a LDA+MLLT+SAT model on the entire training data (train_nodup;
194198
# 286 hours)
195199
# Train tri4b, which is LDA+MLLT+SAT, on train_nodup data.
196200
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
197-
data/train_nodup data/lang exp/tri3b exp/tri3b_ali_all || exit 1;
201+
data/train_nodup data/lang exp/tri3b exp/tri3b_ali_nodup || exit 1;
202+
198203

199204
steps/train_sat.sh --cmd "$train_cmd" \
200-
11500 200000 data/train_nodup data/lang exp/tri3b_ali_all exp/tri4b || exit 1;
205+
11500 200000 data/train_nodup data/lang exp/tri3b_ali_nodup exp/tri4b || exit 1;
201206

202207
for lm_suffix in tg fsh_tgpr; do
203208
(
@@ -219,15 +224,15 @@ steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
219224
data/train_100k_nodup data/lang exp/tri4a exp/tri4a_ali_100k_nodup || exit 1
220225

221226
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
222-
data/train_nodup data/lang exp/tri4b exp/tri4b_ali_all || exit 1
227+
data/train_nodup data/lang exp/tri4b exp/tri4b_ali_nodup || exit 1
223228

224229
steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" --config conf/decode.config \
225230
--transform-dir exp/tri4a_ali_100k_nodup \
226231
data/train_100k_nodup data/lang exp/tri4a exp/tri4a_denlats_100k_nodup \
227232
|| exit 1;
228233

229234
steps/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode.config \
230-
--transform-dir exp/tri4b_ali_all \
235+
--transform-dir exp/tri4b_ali_nodup \
231236
data/train_nodup data/lang exp/tri4b exp/tri4b_denlats_all || exit 1;
232237

233238
# 4 iterations of MMI seems to work well overall. The number of iterations is
@@ -274,14 +279,14 @@ steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \
274279
700 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup exp/tri4a_dubm
275280

276281
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 100 --cmd "$train_cmd" \
277-
700 data/train_nodup data/lang exp/tri4b_ali_all exp/tri4b_dubm
282+
700 data/train_nodup data/lang exp/tri4b_ali_nodup exp/tri4b_dubm
278283

279284
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
280285
data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup exp/tri4a_dubm \
281286
exp/tri4a_denlats_100k_nodup exp/tri4a_fmmi_b0.1 || exit 1;
282287

283288
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
284-
data/train_nodup data/lang exp/tri4b_ali_all exp/tri4b_dubm \
289+
data/train_nodup data/lang exp/tri4b_ali_nodup exp/tri4b_dubm \
285290
exp/tri4b_denlats_all exp/tri4b_fmmi_b0.1 || exit 1;
286291

287292
for iter in 4 5 6 7 8; do

egs/wsj/s5/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ local/run_mmi_tri4b.sh
322322
local/run_sgmm2.sh
323323

324324
# You probably want to run the hybrid recipe as it is complementary:
325-
local/run_hybrid.sh
325+
local/run_dnn.sh
326326

327327

328328
# Getting results [see RESULTS file]

egs/wsj/s5/steps/compute_cmvn_stats.sh

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,24 @@
2121
echo "$0 $@" # Print the command line for logging
2222

2323
# Option defaults; each flag below is optional and, when given, must appear
# in this order (--fake first, then --two-channel) before the positional args.
fake=false
24+
two_channel=false
25+
2426
# "$1" is quoted so the test is well-formed even when no arguments are left.
if [ "$1" == "--fake" ]; then
2527
fake=true
2628
shift
2729
fi
30+
if [ "$1" == "--two-channel" ]; then
31+
two_channel=true
32+
shift
33+
fi
2834

2935
if [ $# != 3 ]; then
30-
echo "usage: compute_cmvn_stats.sh [--fake] <data-dir> <log-dir> <path-to-cmvn-dir>";
31-
echo "(note: --fake gives you fake cmvn stats that do no normalization.)"
36+
echo "usage: compute_cmvn_stats.sh [options] <data-dir> <log-dir> <path-to-cmvn-dir>";
37+
echo "Options:"
38+
echo " --fake gives you fake cmvn stats that do no normalization."
39+
echo " --two-channel is for two-channel telephone data, there must be no segments "
40+
echo " file and reco2file_and_channel must be present. It will take"
41+
echo " only frames that are louder than the other channel."
3242
exit 1;
3343
fi
3444

@@ -63,7 +73,11 @@ if $fake; then
6373
for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \
6474
copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \
6575
echo "Error creating fake CMVN stats" && exit 1;
66-
else
76+
elif $two_channel; then
77+
! compute-cmvn-stats-two-channel $data/reco2file_and_channel scp:$data/feats.scp \
78+
ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
79+
2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats (using two-channel method)" && exit 1;
80+
else
6781
! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
6882
2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1;
6983
fi

egs/wsj/s5/steps/decode_nnet_cpu.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ cmd=run.pl
1616
beam=15.0
1717
max_active=7000
1818

19-
#WARNING: This option is renamed lat_beam (it was renamed to follow the naming
19+
#WARNING: This option is renamed from lat_beam (it was renamed to follow the naming
2020
# in the other scripts).
2121
lattice_beam=8.0 # Beam we use in lattice generation.
2222
iter=final

egs/wsj/s5/steps/decode_si_ali.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
decode.sh

0 commit comments

Comments
 (0)