#!/usr/bin/env bash

# Copyright 2013  Daniel Povey
# Apache 2.0.


# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are not really computed online: they are first computed
# per speaker and then just duplicated many times.
#
# This setup also makes it possible to use a previous decoding or alignment to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.
#
# Outputs (all written to <ivector-dir>):
#   ivector_period         the value of --ivector-period
#   weights.gz             per-frame weights (only if an alignment/decode dir is given)
#   ivectors_spk.JOB.ark   per-speaker iVectors, text format
#   ivectors_utt.JOB.ark   the same iVectors, keyed by utterance
#   ivector_online.{JOB.ark,JOB.scp,scp}  the iVectors written as feature matrices


# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.  Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and will
                    # tend to produce smaller iVectors where data-counts are
                    # small.  It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
max_count=100       # Interpret this as a number of frames times posterior scale...
                    # this config ensures that once the count exceeds this (i.e.
                    # 1000 frames, or 10 seconds, by default), we start to scale
                    # down the stats, accentuating the prior term.  This seems quite
                    # important for some reason.
compress=true       # If true, compress the iVectors stored on disk (it's lossy
                    # compression, as used for feature matrices).
silence_weight=0.0  # Weight given to silence frames; only has an effect when an
                    # alignment or decode directory is supplied.
acwt=0.1  # used if input is a decode dir, to get best path from lattices.
mdl=final # basename of the model: <alignment-dir>/${mdl}.mdl for an alignment
          # directory, <decode-dir>/../${mdl}.mdl for a decode directory.

# End configuration section.

echo "$0" "$@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# -ne 4 ] && [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs for iVector extraction."
  echo "                                                   # (Silence weights are computed with the job count"
  echo "                                                   # of <alignment-dir> or <decode-dir>.)"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  echo "  --silence-weight <float;default=0.0>             # Weight on silence frames; needs"
  echo "                                                   # <alignment-dir> or <decode-dir>."
  echo "  --acwt <float;default=0.1>                       # Acoustic scale for the lattice best path"
  echo "                                                   # (only with <decode-dir>)."
  echo "  --mdl <name;default=final>                       # Model basename (without .mdl) to use with"
  echo "                                                   # <alignment-dir> or <decode-dir>/.."
  exit 1;
fi

# Always start from an empty value so that a variable of the same name inherited
# from the environment cannot switch on the silence-weighting code path.
ali_or_decode_dir=
data=$1
lang=$2
srcdir=$3
if [ $# -eq 4 ]; then
  dir=$4
else # 5 arguments
  ali_or_decode_dir=$4
  dir=$5
fi

# Concatenates $dir/weights.1.gz .. $dir/weights.<n>.gz (n = $1) into
# $dir/weights.gz and then deletes the per-job files.  The body is a subshell
# so that pipefail (needed to notice a failing gunzip on the left-hand side of
# the pipe) does not leak into the rest of the script.
# Returns non-zero if any step failed.
merge_weights() (
  set -o pipefail
  for j in $(seq "$1"); do
    gunzip -c "$dir/weights.$j.gz" || exit 1
  done | gzip -c >"$dir/weights.gz" || exit 1
  rm "$dir"/weights.*.gz
)

for f in "$data/feats.scp" "$srcdir/final.ie" "$srcdir/final.dubm" \
    "$srcdir/global_cmvn.stats" "$srcdir/splice_opts" "$lang/phones.txt" \
    "$lang/phones/silence.csl" "$srcdir/online_cmvn.conf" "$srcdir/final.mat"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done

mkdir -p "$dir/log"
silphonelist=$(cat "$lang/phones/silence.csl") || exit 1;

# Stage 0: turn the alignments (or lattice best paths) into per-frame weights,
# with silence phones weighted by $silence_weight.
if [ -n "$ali_or_decode_dir" ]; then

  nj_orig=$(cat "$ali_or_decode_dir/num_jobs") || exit 1;

  # An alignment directory has the model next to ali.*.gz; for a decode
  # directory the model is expected one level up from lat.*.gz.
  if [ -f "$ali_or_decode_dir/ali.1.gz" ]; then
    input_type=ali
    model=$ali_or_decode_dir/${mdl}.mdl
  elif [ -f "$ali_or_decode_dir/lat.1.gz" ]; then
    input_type=lat
    model=$ali_or_decode_dir/../${mdl}.mdl
  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
    exit 1;
  fi

  if [ ! -f "$model" ]; then
    echo "$0: expected $model to exist."
    exit 1;
  fi

  if [ "$stage" -le 0 ]; then
    # Clear per-job leftovers of an earlier run.
    rm -f "$dir"/weights.*.gz 2>/dev/null

    # $cmd is deliberately unquoted: it may carry extra words (queue options).
    if [ "$input_type" = ali ]; then
      $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $model ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
    else
      $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $model ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
    fi

    # put all the weights in one archive.
    merge_weights "$nj_orig" || exit 1;
  fi

fi

# Now work out the per-speaker iVectors.

sdata=$data/split$nj;
utils/split_data.sh "$data" "$nj" || exit 1;

echo "$ivector_period" > "$dir/ivector_period" || exit 1;
splice_opts=$(cat "$srcdir/splice_opts") || exit 1;


# gmm_feats (with online CMVN applied) feeds the diagonal UBM that produces the
# posteriors; feats (no CMVN) is what ivector-extract accumulates stats from.
gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"


if [ "$stage" -le 1 ]; then
  if [ -n "$ali_or_decode_dir" ]; then
    # Same pipeline as below, with the posteriors additionally scaled by the
    # weights computed in stage 0.
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
        $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$sdata/JOB/spk2utt \
        $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  fi
fi

# get an utterance-level set of iVectors (just duplicate the speaker-level ones).
if [ "$stage" -le 2 ]; then
  for j in $(seq "$nj"); do
    utils/apply_map.pl -f 2 "$dir/ivectors_spk.$j.ark" <"$sdata/$j/utt2spk" >"$dir/ivectors_utt.$j.ark" || exit 1;
  done
fi

# Work out the iVector dimension from the first line of the first text archive.
# The word count is reduced by 3 for the fields that are not vector elements
# (presumably the key and the enclosing "[" and "]").
if [ ! -s "$dir/ivectors_spk.1.ark" ]; then
  echo "$0: expected $dir/ivectors_spk.1.ark to exist and be non-empty."
  exit 1;
fi
num_fields=$(head -n 1 "$dir/ivectors_spk.1.ark" | wc -w) || exit 1;
ivector_dim=$((num_fields - 3))
if [ "$ivector_dim" -le 0 ]; then
  echo "$0: could not work out the iVector dim from $dir/ivectors_spk.1.ark"
  exit 1;
fi
echo "$0: iVector dim is $ivector_dim"

base_feat_dim=$(feat-to-dim "scp:$data/feats.scp" -) || exit 1;

# Column range (0-based, inclusive) that the appended iVector occupies once it
# has been appended after the $base_feat_dim original feature columns.
start_dim=$base_feat_dim
end_dim=$((base_feat_dim + ivector_dim - 1))


if [ "$stage" -le 3 ]; then
  # here, we are just using the original features in $sdata/JOB/feats.scp for
  # their number of rows; we use the select-feats command to remove those
  # features and retain only the iVector features.
  $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \
    append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \
    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
    subsample-feats --n=$ivector_period ark:- ark:- \| \
    copy-feats --compress=$compress ark:- \
      ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi

if [ "$stage" -le 4 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq "$nj"); do cat "$dir/ivector_online.$j.scp" || exit 1; done >"$dir/ivector_online.scp" || exit 1;
fi

echo "$0: done extracting (pseudo-online) iVectors"