Skip to content

Commit b68c428

Browse files
committed
Merge remote-tracking branch 'upstream/master' into kaldi_52
2 parents 21d11ff + 7267281 commit b68c428

File tree

155 files changed

+1665
-1951
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

155 files changed

+1665
-1951
lines changed

egs/babel/s5d/local/extend_lexicon.sh

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,10 @@ if [ $stage -le -5 ]; then
182182
awk '{for(n=2;n<=NF;n++) seen[$n]=1;} END{for (key in seen) print key;}' >$dir/phonelist
183183

184184
cat $dir/phonelist | perl -e ' @ids = ("a".."z", "A".."Z", "0".."9", ":", "=", "?", "@", "[", "]", "^", "+", "\$", "%", "&", "#", "*", "!", "(", ")", "{", "}" );
185+
use open ":std", ":encoding(UTF-8)";
186+
foreach $elem (150..250) {
187+
push @ids, chr($elem);
188+
}
185189
@map = (); while(<>) {
186190
chomp; $output = "$_ ";
187191
@col = split("_");
@@ -198,7 +202,7 @@ if [ $stage -le -5 ]; then
198202
}
199203
$output .= "$map[$p]->{$col[$p]}";
200204
}
201-
print "$output\n"; }' > $dir/phone_map
205+
print "$output\n"; }' > $dir/phone_map || exit 1
202206
cat $dir/phone_map | awk '{print $2, $1}' > $dir/phone_map.reverse
203207

204208
cat $toplevel_dir/input_lexicon.txt | \
@@ -245,6 +249,7 @@ if [ $stage -le -1 ]; then
245249
rm $dir/probs.* 2>/dev/null
246250

247251
echo '#!/usr/bin/perl
252+
use open ":std", ":encoding(UTF-8)";
248253
while(1) {
249254
$sent = <>; $line=<>; if ($line !~ m/sentences/) { $sent =~ m/^file/ || die "Bad sent $sent"; exit(0); }
250255
$line = <>; if ($line !~ m/logprob= (\S+)/) { die "Bad line $line"; } print "$1 $sent";
@@ -344,14 +349,20 @@ if [ $stage -le $g2p_iters ]; then
344349
\> $dir/p2g_output.JOB || exit 1;
345350
perl -wlne 'use strict;
346351
our %P;
352+
my $l = $_;
347353
my ($prn,$num,$prb,$spl)=m/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/;
354+
355+
print STDERR "Warning: error parsing line \"$l\"\n" unless (defined $prb);
356+
next unless defined($prb);
357+
348358
my $tok=$prn."=".$spl;
349359
$P{$tok} = [ $num, $prb ] unless (defined($P{$tok}) && $P{$tok}[1] < $prb);
350360
END {
351-
map{ my ($prn,$spl)=m/^(.*)=(.*)$/;
361+
map{ $tok = $_;
362+
my ($prn,$spl)=m/^(.*)=(.*)$/;
352363
my ($num, $prb) = @{$P{$tok}};
353364
print join("\t",$prn,$num,$prb,$spl)
354-
} sort keys %P
365+
} sort keys %P;
355366
}' $dir/p2g_output.* > $dir/p2g_output
356367
rm $dir/p2g_output.*
357368
fi
@@ -495,7 +506,7 @@ if [ $stage -le $[$g2p_iters+2] ]; then
495506
cp $dir/oov2prob $toplevel_dir/oov2prob
496507
fi
497508

498-
# Finally, if $dev_text is not empty, print out OOV rate. We assame $dev_text is
509+
# Finally, if $dev_text is not empty, print out OOV rate. We assume $dev_text is
499510
# in the following format:
500511
# 14350_A_20121123_042710_001717 yebo yini
501512
# where "14350_A_20121123_042710_001717" is the utterance id and "yebo yini" is

egs/babel_multilang/s5/local/nnet3/run_shared_ivector_extractor.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ multi_data_dir=$2
3434
global_extractor_dir=$3
3535

3636
langconf=conf/$lda_mllt_lang/lang.conf
37-
[ ! -f $langconf ] && echo "Language configuration lang.conf does not exist! Use the configurations in conf/${lda_mllt_lang}/* as a startup" && exit 1
37+
[ ! -f $langconf ] && \
38+
echo "Language configuration lang.conf does not exist. Start with configurations in conf/${lda_mllt_lang}/*." && exit 1
3839
. $langconf || exit 1;
3940

4041
if [ $stage -le 4 ]; then

egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,6 @@ EOF
207207
--config-dir $dir/configs/ \
208208
--nnet-edits="rename-node old-name=output-0 new-name=output"
209209

210-
cat <<EOF >> $dir/configs/vars
211-
include_log_softmax=false
212-
EOF
213-
fi
214210

215211
if [ $stage -le 9 ]; then
216212
echo "$0: Generates separate egs dir per language for multilingual training."

egs/fisher_callhome_spanish/s5/local/callhome_create_splits.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,5 @@ do
2727

2828
utils/fix_data_dir.sh $data_dir/$dirName
2929
utils/validate_data_dir.sh $data_dir/$dirName
30-
rm $data_dir/$dirName/*.tmp
3130
done
3231

egs/fisher_callhome_spanish/s5/local/create_splits.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,5 @@ do
2626

2727
utils/fix_data_dir.sh $data_dir/$split
2828
utils/validate_data_dir.sh $data_dir/$split
29-
rm $data_dir/$split/*.tmp
3029
done
3130

egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,34 +55,38 @@ vely"
5555
exit 1;
5656
fi
5757

58-
if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ];
58+
#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ];
59+
if [ ! -d links/LDC2010S01/data/speech ];
5960
then
60-
echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir"
61-
echo "Typical format is LDC2010S01/DISC?/data/speech"
61+
echo "Speech directories missing or not properly organised within the speech data dir"
62+
echo "Typical format is LDC2010S01/data/speech"
6263
exit 1;
6364
fi
6465

6566
#Check the transcripts directories as well to see if they exist
66-
if [ ! -d links/LDC2010T04/data/transcripts ];
67+
if [ ! -d links/LDC2010T04/fisher_spa_tr/data/transcripts ];
6768
then
6869
echo "Transcript directories missing or not properly organised"
69-
echo "Typical format is LDC2010T04/data/transcripts"
70+
echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts"
7071
exit 1;
7172
fi
7273

73-
speech_d1=$dir/links/LDC2010S01/DISC1/data/speech
74-
speech_d2=$dir/links/LDC2010S01/DISC2/data/speech
75-
transcripts=$dir/links/LDC2010T04/data/transcripts
74+
#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech
75+
#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech
76+
speech=$dir/links/LDC2010S01/data/speech
77+
transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts
7678

77-
fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l`
78-
fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l`
79+
#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l`
80+
#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l`
81+
fcount_s=`find ${speech} -iname '*.sph' | wc -l`
7982
fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l`
8083
#TODO: It seems that not all speech files have transcripts
8184
#Now check if we got all the files that we needed
82-
if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ];
85+
#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ];
86+
if [ $fcount_s != 819 -o $fcount_t != 819 ];
8387
then
8488
echo "Incorrect number of files in the data directories"
85-
echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively"
89+
echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)"
8690
echo "The transcripts should contain 819 files"
8791
exit 1;
8892
fi
@@ -91,8 +95,9 @@ if [ $stage -le 0 ]; then
9195
#Gather all the speech files together to create a file list
9296
#TODO: Train and test split might be required
9397
(
94-
find $speech_d1 -iname '*.sph';
95-
find $speech_d2 -iname '*.sph';
98+
#find $speech_d1 -iname '*.sph';
99+
#find $speech_d2 -iname '*.sph';
100+
find $speech -iname '*.sph';
96101
) > $tmpdir/train_sph.flist
97102

98103
#Get all the transcripts in one place

egs/fisher_callhome_spanish/s5/local/fsp_train_lms.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
5858
else
5959
echo Downloading and installing the kaldi_lm tools
6060
if [ ! -f kaldi_lm.tar.gz ]; then
61-
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
61+
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
6262
fi
6363
tar -xvzf kaldi_lm.tar.gz || exit 1;
6464
cd kaldi_lm

0 commit comments

Comments
 (0)