Skip to content

Commit 295877c

Browse files
teinhonglo authored and danpovey committed
[scripts] Add decoding script that combines nnet3 outputs (kaldi-asr#2534)
1 parent eacf34a commit 295877c

File tree

2 files changed

+342
-4
lines changed

2 files changed

+342
-4
lines changed
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
#!/bin/bash
2+
3+
# Copyright 2018 Tien-Hong Lo
4+
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12+
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13+
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14+
# MERCHANTABLITY OR NON-INFRINGEMENT.
15+
# See the Apache 2 License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
19+
# Script for system combination using output of the neural networks.
20+
# This calls nnet3-compute, matrix-sum and latgen-faster-mapped to create a system combination.
21+
set -euo pipefail
22+
# begin configuration section.
23+
cmd=run.pl
24+
25+
# Neural Network
26+
stage=0
27+
iter=final
28+
nj=30
29+
output_name="output"
30+
ivector_scale=1.0
31+
apply_exp=false # Apply exp i.e. write likelihoods instead of log-likelihoods
32+
compress=false # Specifies whether the output should be compressed before
33+
# dumping to disk
34+
use_gpu=false
35+
skip_diagnostics=false
36+
extra_left_context=0
37+
extra_right_context=0
38+
extra_left_context_initial=-1
39+
extra_right_context_final=-1
40+
online_ivector_dir=
41+
frame_subsampling_factor=1
42+
frames_per_chunk=150
43+
average=true
44+
45+
# Decode
46+
beam=15.0 # prune the lattices prior to MBR decoding, for speed.
47+
max_active=7000
48+
min_active=200
49+
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
50+
post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the
51+
# regular scoring script works.
52+
lattice_beam=8.0 # Beam we use in lattice generation.
53+
num_threads=1 # if >1, will use latgen-faster--map-parallel
54+
min_lmwt=5
55+
max_lmwt=15
56+
parallel_opts="--num-threads 3"
57+
scoring_opts=
58+
minimize=false
59+
skip_scoring=false
60+
61+
word_determinize=false # If set to true, then output lattice does not retain
62+
# alternate paths a sequence of words (with alternate pronunciations).
63+
# Setting to true is the default in steps/nnet3/decode.sh.
64+
# However, setting this to false
65+
# is useful for generation w of semi-supervised training
66+
# supervision and frame-level confidences.
67+
write_compact=true # If set to false, then writes the lattice in non-compact format,
68+
# retaining the acoustic scores on each arc. This is
69+
# required to be false for LM rescoring undeterminized
70+
# lattices (when --word-determinize is false)
71+
#end configuration section.
72+
73+
[ -f ./path.sh ] && . ./path.sh
74+
. parse_options.sh || exit 1;
75+
76+
77+
if [ $# -lt 5 ]; then
78+
echo "Usage: $0 [options] <data-dir> <graph-dir> <nnet3-dir> <nnet3-dir2> [<nnet3-dir3> ... ] <output-dir>"
79+
echo "e.g.: local/socal/score_fusion.sh --nj 8 \\"
80+
echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\"
81+
echo " data/test_eval92_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\"
82+
echo " exp/nnet3/tdnn_comb/decode_dev"
83+
echo "main options (for others, see top of script file)"
84+
echo " --config <config-file> # config containing options"
85+
echo " --nj <nj> # number of parallel jobs"
86+
echo " --cmd <cmd> # Command to run in parallel with"
87+
echo " --iter <iter> # Iteration of model to decode; default is final."
88+
exit 1;
89+
fi
90+
91+
echo "$0 $@"
92+
93+
data=$1
94+
graphdir=$2
95+
dir=${@: -1} # last argument to the script
96+
shift 2;
97+
model_dirs=( $@ ) # read the remaining arguments into an array
98+
unset model_dirs[${#model_dirs[@]}-1] # 'pop' the last argument which is odir
99+
num_sys=${#model_dirs[@]} # number of systems to combine
100+
101+
for f in $graphdir/words.txt $graphdir/phones/word_boundary.int ; do
102+
[ ! -f $f ] && echo "$0: file $f does not exist" && exit 1;
103+
done
104+
105+
[ ! -z "$online_ivector_dir" ] && \
106+
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
107+
108+
if [ ! -z "$online_ivector_dir" ]; then
109+
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
110+
ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
111+
fi
112+
113+
frame_subsampling_opt=
114+
if [ $frame_subsampling_factor -ne 1 ]; then
115+
# e.g. for 'chain' systems
116+
frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
117+
fi
118+
119+
# convert $dir to absolute pathname
120+
fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`
121+
122+
# Possibly use multi-threaded decoder
123+
thread_string=
124+
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
125+
126+
mkdir -p $dir/temp
127+
128+
for i in `seq 0 $[num_sys-1]`; do
129+
srcdir=${model_dirs[$i]}
130+
131+
model=$srcdir/$iter.mdl
132+
if [ ! -f $srcdir/$iter.mdl ]; then
133+
echo "$0: Error: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl exit" && exit 1;
134+
fi
135+
136+
# check that they have the same tree
137+
show-transitions $graphdir/phones.txt $model > $dir/temp/transition.${i}.txt
138+
cmp_tree=`diff -q $dir/temp/transition.0.txt $dir/temp/transition.${i}.txt | awk '{print $5}'`
139+
if [ ! -z $cmp_tree ]; then
140+
echo "$0 tree must be the same."
141+
exit 0;
142+
fi
143+
144+
# check that they have the same frame-subsampling-factor
145+
if [ $frame_subsampling_factor -ne `cat $srcdir/frame_subsampling_factor` ]; then
146+
echo "$0 frame_subsampling_factor must be the same."
147+
exit 0;
148+
fi
149+
150+
for f in $data/feats.scp $model $extra_files; do
151+
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
152+
done
153+
154+
if [ ! -z "$output_name" ] && [ "$output_name" != "output" ]; then
155+
echo "$0: Using output-name $output_name"
156+
model="nnet3-copy --edits='remove-output-nodes name=output;rename-node old-name=$output_name new-name=output' $model - |"
157+
fi
158+
159+
## Set up features.
160+
if [ -f $srcdir/final.mat ]; then
161+
echo "$0: Error: lda feature type is no longer supported." && exit 1
162+
fi
163+
164+
sdata=$data/split$nj;
165+
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
166+
167+
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
168+
169+
if $apply_exp; then
170+
output_wspecifier="ark:| copy-matrix --apply-exp ark:- ark:-"
171+
else
172+
output_wspecifier="ark:| copy-feats --compress=$compress ark:- ark:-"
173+
fi
174+
175+
gpu_opt="--use-gpu=no"
176+
gpu_queue_opt=
177+
178+
if $use_gpu; then
179+
gpu_queue_opt="--gpu 1"
180+
gpu_opt="--use-gpu=yes"
181+
fi
182+
183+
echo "$i $model";
184+
models[$i]="ark,s,cs:nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \
185+
--frames-per-chunk=$frames_per_chunk \
186+
--extra-left-context=$extra_left_context \
187+
--extra-right-context=$extra_right_context \
188+
--extra-left-context-initial=$extra_left_context_initial \
189+
--extra-right-context-final=$extra_right_context_final \
190+
'$model' '$feats' '$output_wspecifier' |"
191+
done
192+
193+
# remove tempdir
194+
rm -rf $dir/temp
195+
196+
# split data to nj
197+
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
198+
echo $nj > $dir/num_jobs
199+
200+
201+
# Assume the nnet trained by
202+
# the same tree and frame subsampling factor.
203+
mkdir -p $dir/log
204+
205+
if [ -f $model ]; then
206+
echo "$0: $model exists, copy model to $dir/../"
207+
cp $model $dir/../
208+
fi
209+
210+
if [ -f $srcdir/frame_shift ]; then
211+
cp $srcdir/frame_shift $dir/../
212+
echo "$0: $srcdir/frame_shift exists, copy $srcdir/frame_shift to $dir/../"
213+
elif [ -f $srcdir/frame_subsampling_factor ]; then
214+
cp $srcdir/frame_subsampling_factor $dir/../
215+
echo "$0: $srcdir/frame_subsampling_factor exists, copy $srcdir/frame_subsampling_factor to $dir/../"
216+
fi
217+
218+
lat_wspecifier="ark:|"
219+
extra_opts=
220+
if ! $write_compact; then
221+
extra_opts="--determinize-lattice=false"
222+
lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |"
223+
fi
224+
225+
if [ "$post_decode_acwt" == 1.0 ]; then
226+
lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz"
227+
else
228+
lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
229+
fi
230+
231+
232+
if [ $stage -le 0 ]; then
233+
$cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
234+
matrix-sum --average=$average "${models[@]}" ark:- \| \
235+
latgen-faster-mapped$thread_string --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
236+
--minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
237+
--word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \
238+
$graphdir/HCLG.fst ark:- "$lat_wspecifier"
239+
fi
240+
241+
if [ $stage -le 1 ]; then
242+
if ! $skip_diagnostics ; then
243+
[ ! -z $iter ] && iter_opt="--iter $iter"
244+
steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
245+
fi
246+
fi
247+
248+
if ! $skip_scoring ; then
249+
if [ $stage -le 2 ]; then
250+
[ ! -x local/score.sh ] && \
251+
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
252+
echo "score best paths"
253+
[ "$iter" != "final" ] && iter_opt="--iter $iter"
254+
scoring_opts="--min_lmwt $min_lmwt"
255+
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
256+
echo "score confidence and timing with sclite"
257+
fi
258+
fi
259+
260+
261+
exit 0
262+

src/bin/matrix-sum.cc

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,79 @@ int32 TypeOneUsage(const ParseOptions &po,
103103
return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1;
104104
}
105105

106+
int32 TypeOneUsageAverage(const ParseOptions &po) {
107+
int32 num_args = po.NumArgs();
108+
std::string matrix_in_fn1 = po.GetArg(1),
109+
matrix_out_fn = po.GetArg(num_args);
110+
BaseFloat scale = 1.0 / (num_args - 1);
111+
112+
// Output matrix
113+
BaseFloatMatrixWriter matrix_writer(matrix_out_fn);
114+
115+
// Input matrices
116+
SequentialBaseFloatMatrixReader matrix_reader1(matrix_in_fn1);
117+
std::vector<RandomAccessBaseFloatMatrixReader*>
118+
matrix_readers(num_args-2,
119+
static_cast<RandomAccessBaseFloatMatrixReader*>(NULL));
120+
std::vector<std::string> matrix_in_fns(num_args-2);
121+
for (int32 i = 2; i < num_args; ++i) {
122+
matrix_readers[i-2] = new RandomAccessBaseFloatMatrixReader(po.GetArg(i));
123+
matrix_in_fns[i-2] = po.GetArg(i);
124+
}
125+
126+
int32 n_utts = 0, n_total_matrices = 0,
127+
n_success = 0, n_missing = 0, n_other_errors = 0;
128+
129+
for (; !matrix_reader1.Done(); matrix_reader1.Next()) {
130+
std::string key = matrix_reader1.Key();
131+
Matrix<BaseFloat> matrix1 = matrix_reader1.Value();
132+
matrix_reader1.FreeCurrent();
133+
n_utts++;
134+
n_total_matrices++;
135+
136+
matrix1.Scale(scale);
137+
138+
Matrix<BaseFloat> matrix_out(matrix1);
139+
140+
for (int32 i = 0; i < num_args-2; ++i) {
141+
if (matrix_readers[i]->HasKey(key)) {
142+
Matrix<BaseFloat> matrix2 = matrix_readers[i]->Value(key);
143+
n_total_matrices++;
144+
if (SameDim(matrix2, matrix_out)) {
145+
matrix_out.AddMat(scale, matrix2, kNoTrans);
146+
} else {
147+
KALDI_WARN << "Dimension mismatch for utterance " << key
148+
<< " : " << matrix2.NumRows() << " by "
149+
<< matrix2.NumCols() << " for "
150+
<< "system " << (i + 2) << ", rspecifier: "
151+
<< matrix_in_fns[i] << " vs " << matrix_out.NumRows()
152+
<< " by " << matrix_out.NumCols()
153+
<< " primary matrix, rspecifier:" << matrix_in_fn1;
154+
n_other_errors++;
155+
}
156+
} else {
157+
KALDI_WARN << "No matrix found for utterance " << key << " for "
158+
<< "system " << (i + 2) << ", rspecifier: "
159+
<< matrix_in_fns[i];
160+
n_missing++;
161+
}
162+
}
163+
164+
matrix_writer.Write(key, matrix_out);
165+
n_success++;
166+
}
167+
168+
KALDI_LOG << "Processed " << n_utts << " utterances: with a total of "
169+
<< n_total_matrices << " matrices across " << (num_args-1)
170+
<< " different systems";
171+
KALDI_LOG << "Produced output for " << n_success << " utterances; "
172+
<< n_missing << " total missing matrices";
173+
174+
DeletePointers(&matrix_readers);
175+
176+
return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1;
177+
}
178+
106179
int32 TypeTwoUsage(const ParseOptions &po,
107180
bool binary) {
108181
KALDI_ASSERT(po.NumArgs() == 2);
@@ -223,7 +296,7 @@ int main(int argc, char *argv[]) {
223296
po.Register("binary", &binary, "If true, write output as binary (only "
224297
"relevant for usage types two or three");
225298
po.Register("average", &average, "If true, compute average instead of "
226-
"sum; only currently compatible with type 3 usage.");
299+
"sum; currently compatible with type 3 or type 1 usage.");
227300

228301
po.Read(argc, argv);
229302

@@ -232,9 +305,11 @@ int main(int argc, char *argv[]) {
232305
if (po.NumArgs() >= 2 &&
233306
ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) {
234307
if (average)
235-
KALDI_ERR << "--average option not compatible with type one usage.";
236-
// output to table.
237-
exit_status = TypeOneUsage(po, scale1, scale2);
308+
// average option with type one usage.";
309+
exit_status = TypeOneUsageAverage(po);
310+
else
311+
// output to table.
312+
exit_status = TypeOneUsage(po, scale1, scale2);
238313
} else if (po.NumArgs() == 2 &&
239314
ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier &&
240315
ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) ==
@@ -260,3 +335,4 @@ int main(int argc, char *argv[]) {
260335
return -1;
261336
}
262337
}
338+

0 commit comments

Comments
 (0)