Skip to content

Commit 7a47a11

Browse files
committed
misc commits for tedlium2 start-up
1 parent bb992e6 commit 7a47a11

File tree

12 files changed

+77
-186
lines changed

12 files changed

+77
-186
lines changed

egs/tedlium/s5/db/extra.dic

Lines changed: 0 additions & 151 deletions
This file was deleted.

egs/tedlium/s5_r2/local/chain/run_tdnn.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ set -uo pipefail
2626

2727
# configs for 'chain'
2828
affix=
29-
stage=0 # After running the entire script once, you can set stage=12 to tune the neural net only.
29+
stage=12 # After running the entire script once, you can set stage=12 to tune the neural net only.
3030
train_stage=-10
3131
get_egs_stage=-10
3232
dir=exp/chain/tdnn
@@ -42,8 +42,8 @@ final_effective_lrate=0.0001
4242
leftmost_questions_truncate=-1
4343
max_param_change=2.0
4444
final_layer_normalize_target=0.5
45-
num_jobs_initial=3
46-
num_jobs_final=8
45+
num_jobs_initial=2
46+
num_jobs_final=2
4747
minibatch_size=128
4848
relu_dim=425
4949
frames_per_eg=150
@@ -73,16 +73,17 @@ fi
7373
# run those things.
7474

7575
gmm_dir=exp/tri3
76-
ali_dir=exp/tri3_ali
76+
ali_dir=exp/tri3_ali_sp
7777
lats_dir=${ali_dir/ali/lats}
7878
treedir=exp/chain/tri3_tree
7979
lang=data/lang_chain
8080

8181
mkdir -p $dir
8282

8383
local/nnet3/run_ivector_common.sh --stage $stage \
84-
--generate-alignments false \
85-
--speed-perturb true || exit 1;
84+
--generate-alignments false || exit 1;
85+
# \
86+
# --speed-perturb true || exit 1;
8687

8788
if [ $stage -le 9 ]; then
8889
# Get the alignments as lattices (gives the chain training more freedom).
@@ -188,7 +189,7 @@ if [ $stage -le 15 ]; then
188189
for decode_set in dev test; do
189190
(
190191
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
191-
--nj $(wc -l < data/$decode_set/spk2utt) --cmd "$decode_cmd" $iter_opts \
192+
--nj 2 --cmd "$decode_cmd" $iter_opts \
192193
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
193194
--scoring-opts "--min_lmwt 5 --max_lmwt 15" \
194195
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;

egs/tedlium/s5_r2/local/join_suffix.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
#!/usr/bin/env python
22
#
3-
# Copyright 2014 Nickolay V. Shmyrev
3+
# Copyright 2014 Nickolay V. Shmyrev
4+
# 2016 Johns Hopkins University (author: Daniel Povey)
45
# Apache 2.0
56

67

78
import sys
89
from codecs import open
910

10-
words = set()
11-
for line in open(sys.argv[1], encoding='utf8'):
12-
items = line.split()
13-
words.add(items[0])
11+
# This script joins together pairs of split-up words like "you 're" -> "you're".
12+
# The TEDLIUM transcripts are normalized in a way that's not traditional for
13+
# speech recognition.
1414

1515
for line in sys.stdin:
1616
items = line.split()
1717
new_items = []
1818
i = 1
1919
while i < len(items):
20-
if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words:
20+
if i < len(items) - 1 and items[i+1][0] == '\'':
2121
new_items.append(items[i] + items[i+1])
2222
i = i + 1
2323
else:

egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ if [ $stage -le 11 ]; then
100100
for decode_set in dev test; do
101101
(
102102
steps/nnet3/decode.sh \
103-
--nj 1 --cmd "$decode_cmd" $iter_opts \
103+
--nj 4 --cmd "$decode_cmd" $iter_opts \
104104
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
105105
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;
106106

egs/tedlium/s5_r2/local/prepare_data.sh

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/bin/bash
22
#
3-
# Copyright 2014 Nickolay V. Shmyrev
4-
# 2014 Brno University of Technology (Author: Karel Vesely)
3+
# Copyright 2014 Nickolay V. Shmyrev
4+
# 2014 Brno University of Technology (Author: Karel Vesely)
5+
# 2016 Johns Hopkins University (Author: Daniel Povey)
56
# Apache 2.0
67

78
# To be run from one directory above this script.
@@ -38,14 +39,14 @@ for set in dev test train; do
3839
-e 's:<sil>::g' \
3940
-e 's:([^ ]*)$::' | \
4041
awk '{ $2 = "A"; print $0; }'
41-
} | local/join_suffix.py db/TEDLIUM_release2/TEDLIUM.152k.dic > data/$set/stm
42+
} | local/join_suffix.py > data/$set/stm
4243

4344
# Prepare 'text' file
4445
# - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
4546
cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \
46-
awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
47-
for (i=7;i<=NF;i++) { printf(" %s", $i); }
48-
printf("\n");
47+
awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
48+
for (i=7;i<=NF;i++) { printf(" %s", $i); }
49+
printf("\n");
4950
}' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1
5051

5152
# Prepare 'segments', 'utt2spk', 'spk2utt'
@@ -62,6 +63,15 @@ for set in dev test train; do
6263
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
6364
' > data/$set/glm
6465

66+
# The training set seems to not have enough silence padding in the segmentations,
67+
# especially at the beginning of segments. Extend the times.
68+
if [ $set == "train" ]; then
69+
mv data/$set/segments data/$set/segments.temp
70+
utils/data/extend_segment_times.py --start-padding=0.15 \
71+
--end-padding=0.1 <data/$set/segments.temp >data/$set/segments || exit 1
72+
rm data/$set/segments.temp
73+
fi
74+
6575
# Check that data dirs are okay!
6676
utils/validate_data_dir.sh --no-feats $dir || exit 1
6777
done

egs/tedlium/s5_r2/local/prepare_lm.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Copyright 2014 Nickolay V. Shmyrev
44
# Apache 2.0
55

6+
# TODO: change the LMs here to the ones from PocoLM, including the acoustic training data
67

78
if [ -f path.sh ]; then . path.sh; fi
89

@@ -16,7 +17,7 @@ gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \
1617
--read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst
1718

1819

19-
echo "Checking how stochastic G is (the first of these numbers should be small):"
20+
echo "$0: Checking how stochastic G is (the first of these numbers should be small):"
2021
fstisstochastic data/lang_nosp_test/G.fst
2122

2223
utils/validate_lang.pl data/lang_nosp_test || exit 1;

egs/tedlium/s5_r2/local/score_sclite.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ beam=7 # speed-up, but may affect MBR confidences.
1313
word_ins_penalty=0.0,0.5,1.0
1414
min_lmwt=8
1515
max_lmwt=17
16+
iter=final
1617
#end configuration section.
1718

1819
[ -f ./path.sh ] && . ./path.sh
@@ -83,3 +84,5 @@ if [ $stage -le 2 ]; then
8384
fi
8485

8586
exit 0
87+
88+

egs/tedlium/s5_r2/local/ted_train_lm.sh

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,8 @@ export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
2727
if [ -d pocolm ]; then
2828
echo Not installing the pocolm toolkit since it is already there.
2929
else
30-
echo Downloading and installing the pocolm tools
31-
git clone https://github.com/danpovey/pocolm.git || exit 1;
32-
cd pocolm/src
33-
make || exit 1;
34-
echo Done making the pocolm tools
30+
echo "Please install the PocoLM toolkit with kaldi/tools/install_pocolm.sh"
31+
exit 1;
3532
fi
3633
) || exit 1;
3734

@@ -84,12 +81,12 @@ for order in 3 4; do
8481

8582
done
8683

87-
# pruning the LM for order 3 only, and using threshold 0.10 to get a 30MB LM size
84+
# pruning the LM for order 3 only, and using 3400000 n-grams to get a 30MB LM size
8885
order=3
89-
for threshold in 0.10; do
90-
prune_lm_dir.py ${dir}/data/lm_${order} $threshold ${dir}/data/lm_${order}_prune${threshold} 2>&1 | tail -n 5 | head -n 3
91-
get_data_prob.py ${dir}/data/text/dev.txt ${dir}/data/lm_${order}_prune${threshold} 2>&1 | grep -F '[perplexity'
86+
size=3400000
87+
prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order} ${dir}/data/lm_${order}_prune 2>&1 | tail -n 5 | head -n 3
88+
get_data_prob.py ${dir}/data/text/dev.txt ${dir}/data/lm_${order}_prune 2>&1 | grep -F '[perplexity'
89+
90+
format_arpa_lm.py ${dir}/data/lm_${order}_prune | gzip -c > ${dir}/data/arpa/${vocab_size}_${order}gram_prune.arpa.gz
9291

93-
format_arpa_lm.py ${dir}/data/lm_${order}_prune${threshold} | gzip -c > ${dir}/data/arpa/${vocab_size}_${order}gram_prune${threshold}.arpa.gz
94-
done
9592

egs/tedlium/s5_r2/results.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ filter_regexp=.
44
[ $# -ge 1 ] && filter_regexp=$1
55

66
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
7-
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
8-
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
7+
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
8+
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
99
exit 0
1010

egs/tedlium/s5_r2/run.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ if [ $stage -le 0 ]; then
3636
utils/prepare_lang.sh data/local/dict_nosp \
3737
"<unk>" data/local/lang_nosp data/lang_nosp || exit 1
3838

39+
# ted_train_lm.sh needs to be inserted here
40+
3941
local/prepare_lm.sh || exit 1
4042

4143
fi
4244

4345
# Feature extraction
44-
feat_dir=$pwd/data/mfcc_features
46+
4547
if [ $stage -le 1 ]; then
4648
for set in test dev train; do
4749
dir=data/$set
@@ -179,6 +181,8 @@ fi
179181
# local/nnet3/run_tdnn.sh
180182
# local/nnet3/run_tdnn_discriminative.sh
181183

184+
local/chain/run_tdnn.sh
185+
182186

183187
echo success...
184188
exit 0

0 commit comments

Comments
 (0)