11#! /bin/bash
22#
3- # Copyright 2014 Nickolay V. Shmyrev
4- # 2014 Brno University of Technology (Author: Karel Vesely)
3+ # Copyright 2014 Nickolay V. Shmyrev
4+ # 2014 Brno University of Technology (Author: Karel Vesely)
5+ # 2016 Johns Hopkins University (Author: Daniel Povey)
56# Apache 2.0
67
78# To be run from one directory above this script.
@@ -38,14 +39,14 @@ for set in dev test train; do
3839 -e ' s:<sil>::g' \
3940 -e ' s:([^ ]*)$::' | \
4041 awk ' { $2 = "A"; print $0; }'
41- } | local/join_suffix.py db/TEDLIUM_release2/TEDLIUM.152k.dic > data/$set /stm
42+ } | local/join_suffix.py > data/$set /stm
4243
4344 # Prepare 'text' file
4445 # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
4546 cat $dir /stm | grep -v -e ' ignore_time_segment_in_scoring' -e ' ;;' | \
46- awk ' { printf ("%s-%07d-%07d", $1, $4*100, $5*100);
47- for (i=7;i<=NF;i++) { printf(" %s", $i); }
48- printf("\n");
47+ awk ' { printf ("%s-%07d-%07d", $1, $4*100, $5*100);
48+ for (i=7;i<=NF;i++) { printf(" %s", $i); }
49+ printf("\n");
4950 }' | tr ' {}' ' []' | sort -k1,1 > $dir /text || exit 1
5051
5152 # Prepare 'segments', 'utt2spk', 'spk2utt'
@@ -62,6 +63,15 @@ for set in dev test train; do
6263 [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
6364 ' > data/$set /glm
6465
66+ # The training set does not seem to have enough silence padding in its
67+ # segmentation, especially at the beginning of segments, so pad the start and end of each segment.
68+ if [ $set == " train" ]; then
69+ mv data/$set /segments data/$set /segments.temp
70+ utils/data/extend_segment_times.py --start-padding=0.15 \
71+ --end-padding=0.1 < data/$set /segments.temp > data/$set /segments || exit 1
72+ rm data/$set /segments.temp
73+ fi
74+
6575 # Check that data dirs are okay!
6676 utils/validate_data_dir.sh --no-feats $dir || exit 1
6777done
0 commit comments