Skip to content

Commit 7a47a11

Browse files
committed
misc commits for tedlium2 start-up
1 parent bb992e6 commit 7a47a11

File tree

12 files changed

+77
-186
lines changed

12 files changed

+77
-186
lines changed

egs/tedlium/s5/db/extra.dic

Lines changed: 0 additions & 151 deletions
This file was deleted.

egs/tedlium/s5_r2/local/chain/run_tdnn.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ set -uo pipefail
2626

2727
# configs for 'chain'
2828
affix=
29-
stage=0 # After running the entire script once, you can set stage=12 to tune the neural net only.
29+
stage=12 # After running the entire script once, you can set stage=12 to tune the neural net only.
3030
train_stage=-10
3131
get_egs_stage=-10
3232
dir=exp/chain/tdnn
@@ -42,8 +42,8 @@ final_effective_lrate=0.0001
4242
leftmost_questions_truncate=-1
4343
max_param_change=2.0
4444
final_layer_normalize_target=0.5
45-
num_jobs_initial=3
46-
num_jobs_final=8
45+
num_jobs_initial=2
46+
num_jobs_final=2
4747
minibatch_size=128
4848
relu_dim=425
4949
frames_per_eg=150
@@ -73,16 +73,17 @@ fi
7373
# run those things.
7474

7575
gmm_dir=exp/tri3
76-
ali_dir=exp/tri3_ali
76+
ali_dir=exp/tri3_ali_sp
7777
lats_dir=${ali_dir/ali/lats}
7878
treedir=exp/chain/tri3_tree
7979
lang=data/lang_chain
8080

8181
mkdir -p $dir
8282

8383
local/nnet3/run_ivector_common.sh --stage $stage \
84-
--generate-alignments false \
85-
--speed-perturb true || exit 1;
84+
--generate-alignments false || exit 1;
85+
# \
86+
# --speed-perturb true || exit 1;
8687

8788
if [ $stage -le 9 ]; then
8889
# Get the alignments as lattices (gives the chain training more freedom).
@@ -188,7 +189,7 @@ if [ $stage -le 15 ]; then
188189
for decode_set in dev test; do
189190
(
190191
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
191-
--nj $(wc -l < data/$decode_set/spk2utt) --cmd "$decode_cmd" $iter_opts \
192+
--nj 2 --cmd "$decode_cmd" $iter_opts \
192193
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
193194
--scoring-opts "--min_lmwt 5 --max_lmwt 15" \
194195
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;

egs/tedlium/s5_r2/local/join_suffix.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
#!/usr/bin/env python
22
#
3-
# Copyright 2014 Nickolay V. Shmyrev
3+
# Copyright 2014 Nickolay V. Shmyrev
4+
# 2016 Johns Hopkins University (author: Daniel Povey)
45
# Apache 2.0
56

67

78
import sys
89
from codecs import open
910

10-
words = set()
11-
for line in open(sys.argv[1], encoding='utf8'):
12-
items = line.split()
13-
words.add(items[0])
11+
# This script joins together pairs of split-up words like "you 're" -> "you're".
12+
# The TEDLIUM transcripts are normalized in a way that's not traditional for
13+
# speech recognition.
1414

1515
for line in sys.stdin:
1616
items = line.split()
1717
new_items = []
1818
i = 1
1919
while i < len(items):
20-
if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words:
20+
if i < len(items) - 1 and items[i+1][0] == '\'':
2121
new_items.append(items[i] + items[i+1])
2222
i = i + 1
2323
else:

egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ if [ $stage -le 11 ]; then
100100
for decode_set in dev test; do
101101
(
102102
steps/nnet3/decode.sh \
103-
--nj 1 --cmd "$decode_cmd" $iter_opts \
103+
--nj 4 --cmd "$decode_cmd" $iter_opts \
104104
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
105105
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;
106106

egs/tedlium/s5_r2/local/prepare_data.sh

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/bin/bash
22
#
3-
# Copyright 2014 Nickolay V. Shmyrev
4-
# 2014 Brno University of Technology (Author: Karel Vesely)
3+
# Copyright 2014 Nickolay V. Shmyrev
4+
# 2014 Brno University of Technology (Author: Karel Vesely)
5+
# 2016 Johns Hopkins University (Author: Daniel Povey)
56
# Apache 2.0
67

78
# To be run from one directory above this script.
@@ -38,14 +39,14 @@ for set in dev test train; do
3839
-e 's:<sil>::g' \
3940
-e 's:([^ ]*)$::' | \
4041
awk '{ $2 = "A"; print $0; }'
41-
} | local/join_suffix.py db/TEDLIUM_release2/TEDLIUM.152k.dic > data/$set/stm
42+
} | local/join_suffix.py > data/$set/stm
4243

4344
# Prepare 'text' file
4445
# - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
4546
cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \
46-
awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
47-
for (i=7;i<=NF;i++) { printf(" %s", $i); }
48-
printf("\n");
47+
awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
48+
for (i=7;i<=NF;i++) { printf(" %s", $i); }
49+
printf("\n");
4950
}' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1
5051

5152
# Prepare 'segments', 'utt2spk', 'spk2utt'
@@ -62,6 +63,15 @@ for set in dev test train; do
6263
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
6364
' > data/$set/glm
6465

66+
# The training set seems to not have enough silence padding in the segmentations,
67+
# especially at the beginning of segments. Extend the times.
68+
if [ $set == "train" ]; then
69+
mv data/$set/segments data/$set/segments.temp
70+
utils/data/extend_segment_times.py --start-padding=0.15 \
71+
--end-padding=0.1 <data/$set/segments.temp >data/$set/segments || exit 1
72+
rm data/$set/segments.temp
73+
fi
74+
6575
# Check that data dirs are okay!
6676
utils/validate_data_dir.sh --no-feats $dir || exit 1
6777
done

egs/tedlium/s5_r2/local/prepare_lm.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Copyright 2014 Nickolay V. Shmyrev
44
# Apache 2.0
55

6+
# TODO: change the LMs here to the ones from PocoLM, including the acoustic training data
67

78
if [ -f path.sh ]; then . path.sh; fi
89

@@ -16,7 +17,7 @@ gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \
1617
--read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst
1718

1819

19-
echo "Checking how stochastic G is (the first of these numbers should be small):"
20+
echo "$0: Checking how stochastic G is (the first of these numbers should be small):"
2021
fstisstochastic data/lang_nosp_test/G.fst
2122

2223
utils/validate_lang.pl data/lang_nosp_test || exit 1;

egs/tedlium/s5_r2/local/score_sclite.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ beam=7 # speed-up, but may affect MBR confidences.
1313
word_ins_penalty=0.0,0.5,1.0
1414
min_lmwt=8
1515
max_lmwt=17
16+
iter=final
1617
#end configuration section.
1718

1819
[ -f ./path.sh ] && . ./path.sh
@@ -83,3 +84,5 @@ if [ $stage -le 2 ]; then
8384
fi
8485

8586
exit 0
87+
88+

egs/tedlium/s5_r2/local/ted_train_lm.sh

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,8 @@ export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
2727
if [ -d pocolm ]; then
2828
echo Not installing the pocolm toolkit since it is already there.
2929
else
30-
echo Downloading and installing the pocolm tools
31-
git clone https://github.com/danpovey/pocolm.git || exit 1;
32-
cd pocolm/src
33-
make || exit 1;
34-
echo Done making the pocolm tools
30+
echo "Please install the PocoLM toolkit with kaldi/tools/install_pocolm.sh"
31+
exit 1;
3532
fi
3633
) || exit 1;
3734

@@ -84,12 +81,12 @@ for order in 3 4; do
8481

8582
done
8683

87-
# pruning the LM for order 3 only, and using threshold 0.10 to get a 30MB LM size
84+
# pruning the LM for order 3 only, and using 3400000 n-grams to get a 30MB LM size
8885
order=3
89-
for threshold in 0.10; do
90-
prune_lm_dir.py ${dir}/data/lm_${order} $threshold ${dir}/data/lm_${order}_prune${threshold} 2>&1 | tail -n 5 | head -n 3
91-
get_data_prob.py ${dir}/data/text/dev.txt ${dir}/data/lm_${order}_prune${threshold} 2>&1 | grep -F '[perplexity'
86+
size=3400000
87+
prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order} ${dir}/data/lm_${order}_prune 2>&1 | tail -n 5 | head -n 3
88+
get_data_prob.py ${dir}/data/text/dev.txt ${dir}/data/lm_${order}_prune 2>&1 | grep -F '[perplexity'
89+
90+
format_arpa_lm.py ${dir}/data/lm_${order}_prune | gzip -c > ${dir}/data/arpa/${vocab_size}_${order}gram_prune.arpa.gz
9291

93-
format_arpa_lm.py ${dir}/data/lm_${order}_prune${threshold} | gzip -c > ${dir}/data/arpa/${vocab_size}_${order}gram_prune${threshold}.arpa.gz
94-
done
9592

egs/tedlium/s5_r2/results.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ filter_regexp=.
44
[ $# -ge 1 ] && filter_regexp=$1
55

66
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
7-
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
8-
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
7+
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
8+
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
99
exit 0
1010

egs/tedlium/s5_r2/run.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ if [ $stage -le 0 ]; then
3636
utils/prepare_lang.sh data/local/dict_nosp \
3737
"<unk>" data/local/lang_nosp data/lang_nosp || exit 1
3838

39+
# ted_train_lm.sh needs to be inserted here
40+
3941
local/prepare_lm.sh || exit 1
4042

4143
fi
4244

4345
# Feature extraction
44-
feat_dir=$pwd/data/mfcc_features
46+
4547
if [ $stage -le 1 ]; then
4648
for set in test dev train; do
4749
dir=data/$set
@@ -179,6 +181,8 @@ fi
179181
# local/nnet3/run_tdnn.sh
180182
# local/nnet3/run_tdnn_discriminative.sh
181183

184+
local/chain/run_tdnn.sh
185+
182186

183187
echo success...
184188
exit 0

0 commit comments

Comments
 (0)