Skip to content

Commit a25d92c

Browse files
author
Ilya Platonov
committed
Adding "dirty" nnet3 online decoder based on nnet2 code.
No iVector support yet (but easy to add). Tested on a chain model only.
1 parent b5665cc commit a25d92c

File tree

9 files changed

+991
-4
lines changed

9 files changed

+991
-4
lines changed
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
#!/bin/bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Decodes test data with an online-nnet3 setup (no iVectors yet), simulating
# online decoding.  Expects the model directory to be one level above the
# decode directory, as prepared by steps/online/nnet2/prepare_online_decoding.sh.

# Begin configuration section.
stage=0
nj=4
cmd=run.pl
max_active=7000
threaded=false
modify_ivector_config=false # only relevant to threaded decoder.
beam=15.0
lattice_beam=6.0
acwt=0.1  # note: only really affects adaptation and pruning (scoring is on
          # lattices).
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
per_utt=false
online=true  # only relevant to non-threaded decoder.
do_endpointing=false
do_speex_compressing=false
scoring_opts=
skip_scoring=false
silence_weight=1.0  # set this to a value less than 1 (e.g. 0) to enable silence weighting.
max_state_duration=40  # This only has an effect if you are doing silence
   # weighting.  This default is probably reasonable.  transition-ids repeated
   # more than this many times in an alignment are treated as silence.
iter=final
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
  echo " where the models are, as prepared by steps/online/nnet2/prepare_online_decoding.sh"
  echo "e.g.: $0 exp/tri3b/graph data/test exp/tri3b_online/decode/"
  echo ""
  echo ""
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --acwt <float>                           # acoustic scale used for lattice generation "
  echo "  --per-utt <true|false>                   # If true, decode per utterance without"
  echo "                                           # carrying forward adaptation info from previous"
  echo "                                           # utterances of each speaker.  Default: false"
  echo "  --online <true|false>                    # Set this to false if you don't really care about"
  echo "                                           # simulating online decoding and just want the best"
  echo "                                           # results.  This will use all the data within each"
  echo "                                           # utterance (plus any previous utterance, if not in"
  echo "                                           # per-utterance mode) to estimate the iVectors."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Check that all the files we depend on exist before launching jobs.
for f in $srcdir/conf/online_nnet2_decoding.conf $srcdir/${iter}.mdl \
    $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

if ! $per_utt; then
  spk2utt_rspecifier="ark:$sdata/JOB/spk2utt"
else
  # Fake speaker-to-utterance maps that treat each utterance as its own
  # speaker, so no adaptation state is carried across utterances.
  mkdir -p $dir/per_utt
  for j in $(seq $nj); do
    awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1;
  done
  spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB"
fi

if [ -f $data/segments ]; then
  wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |"
fi
if $do_speex_compressing; then
  wav_rspecifier="$wav_rspecifier compress-uncompress-speex ark:- ark:- |"
fi
if $do_endpointing; then
  wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |"
fi

if [ "$silence_weight" != "1.0" ]; then
  silphones=$(cat $graphdir/phones/silence.csl) || exit 1
  # Note: Kaldi option names use hyphens; "silence-phones" (not
  # "silence_phones") is the correct spelling of this option.
  silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence-phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight"
else
  silence_weighting_opts=
fi


if $threaded; then
  # WARNING: this branch still invokes the *nnet2* threaded decoder; no
  # threaded nnet3 online decoder exists yet.  Only the non-threaded branch
  # below uses the nnet3 decoder this script is intended for.
  decoder=online2-wav-nnet2-latgen-threaded
  # note: the decoder actually uses 4 threads, but the average usage will normally
  # be more like 2.
  parallel_opts="--num-threads 2"
  opts="--modify-ivector-config=$modify_ivector_config --verbose=1"
else
  decoder=online2-wav-nnet3-latgen-faster
  parallel_opts=
  opts="--online=$online"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=  # empty unless the model dir specifies a factor.
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

if [ $stage -le 0 ]; then
  $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
    $decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing $frame_subsampling_opt \
     --config=$srcdir/conf/online_nnet2_decoding.conf \
     --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
     --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \
     $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \
     $lat_wspecifier || exit 1;
fi

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir
fi

exit 0;

src/nnet3/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
2424
nnet-example-utils.o nnet-training.o \
2525
nnet-diagnostics.o nnet-combine.o nnet-am-decodable-simple.o \
2626
nnet-optimize-utils.o nnet-chain-example.o \
27-
nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o
27+
nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o \
28+
online-nnet3-decodable.o
2829

2930
LIBNAME = kaldi-nnet3
3031

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// nnet3/online-nnet3-decodable.cc
2+
3+
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
4+
5+
// See ../../COPYING for clarification regarding multiple authors
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15+
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16+
// MERCHANTABLITY OR NON-INFRINGEMENT.
17+
// See the Apache 2 License for the specific language governing permissions and
18+
// limitations under the License.
19+
20+
#include "nnet3/online-nnet3-decodable.h"
21+
#include "nnet3/nnet-utils.h"
22+
23+
namespace kaldi {
24+
namespace nnet3 {
25+
26+
// Constructor: wraps an nnet3 acoustic model (and its transition model) as a
// Decodable object fed by an online feature pipeline.  No iVector input is
// supported.  Caches the network's left/right context and the (log) pdf
// priors so LogLikelihood() can return scaled pseudo-log-likelihoods.
DecodableNnet3Online::DecodableNnet3Online(
    const AmNnetSimple &nnet,
    const TransitionModel &trans_model,
    const DecodableNnet3OnlineOptions &opts,
    OnlineFeatureInterface *input_feats):
    // Note: we must use the 'opts' argument here, not the 'opts_' member:
    // members are initialized in class-declaration order, so 'opts_' is not
    // guaranteed to be initialized yet when 'compiler_' is constructed.
    compiler_(nnet.GetNnet(), opts.optimize_config),
    features_(input_feats),
    nnet_(nnet),
    trans_model_(trans_model),
    opts_(opts),
    feat_dim_(input_feats->Dim()),
    num_pdfs_(nnet.GetNnet().OutputDim("output")),
    begin_frame_(-1) {  // -1 means no frames computed yet.
  KALDI_ASSERT(opts_.max_nnet_batch_size > 0);
  log_priors_ = nnet_.Priors();
  // Empty priors are allowed (e.g. 'chain' models); otherwise they must
  // match the number of pdfs in the transition model.
  KALDI_ASSERT((log_priors_.Dim() == 0 ||
                log_priors_.Dim() == trans_model_.NumPdfs()) &&
               "Priors in neural network must match with transition model (if exist).");

  ComputeSimpleNnetContext(nnet_.GetNnet(), &left_context_, &right_context_);
  log_priors_.ApplyLog();
}
47+
48+
49+
50+
// Returns the scaled pseudo-log-likelihood for transition-id 'index' at
// 'frame' (a subsampled/output frame index), computing a new batch of
// frames on demand.
BaseFloat DecodableNnet3Online::LogLikelihood(int32 frame, int32 index) {
  ComputeForFrame(frame);  // ensure 'frame' is inside the cached batch.
  const int32 pdf_id = trans_model_.TransitionIdToPdf(index);
  const int32 row = frame - begin_frame_;
  KALDI_ASSERT(row >= 0 && row < scaled_loglikes_.NumRows());
  return scaled_loglikes_(row, pdf_id);
}
57+
58+
59+
// Deliberately unsupported: callers should use NumFramesReady() instead.
// (Fixes the typo "imlemented" in the assertion message.)
bool DecodableNnet3Online::IsLastFrame(int32 frame) const {
  KALDI_ASSERT(false && "Method is not implemented");
  return false;
}
63+
64+
// Returns the number of output (subsampled) frames that can currently be
// computed, given how many input feature frames are available and how much
// left/right acoustic context the network needs.
int32 DecodableNnet3Online::NumFramesReady() const {
  const int32 features_ready = features_->NumFramesReady();
  if (features_ready == 0)
    return 0;
  const bool input_finished = features_->IsLastFrame(features_ready - 1);
  if (!opts_.pad_input) {
    // No padding: a frame is only ready once a full window of left plus
    // right context is available around it.
    return std::max<int32>(
        0, subsampling(features_ready - right_context_ - left_context_));
  }
  // Normal case: we'll pad with duplicates of first + last frame to get the
  // required left and right context.
  if (input_finished)
    return subsampling(features_ready);
  return std::max<int32>(0, subsampling(features_ready - right_context_));
}
78+
79+
// Converts a count of input feature frames into a count of output frames,
// dividing by the frame-subsampling factor (rounding down).
int32 DecodableNnet3Online::subsampling(int32 num_frames) const {
  const int32 factor = opts_.frame_subsampling_factor;
  return num_frames / factor;
}
82+
83+
// Ensures that scaled_loglikes_ contains the output for 'subsampled_frame'.
// If the frame is already inside the cached batch this is a no-op; otherwise
// it assembles a batch of input features (padded at the edges if
// opts_.pad_input) and runs the network via DoNnetComputation().
void DecodableNnet3Online::ComputeForFrame(int32 subsampled_frame) {
  int32 features_ready = features_->NumFramesReady();
  bool input_finished = features_->IsLastFrame(features_ready - 1);
  KALDI_ASSERT(subsampled_frame >= 0);
  // Already computed in the current cached batch: nothing to do.
  if (subsampled_frame >= begin_frame_ &&
      subsampled_frame < begin_frame_ + scaled_loglikes_.NumRows())
    return;
  KALDI_ASSERT(subsampled_frame < NumFramesReady());

  int32 subsample = opts_.frame_subsampling_factor;

  // First input frame we need; with padding we reach left_context_ frames
  // before the requested output frame (possibly negative -> duplicated below).
  int32 input_frame_begin;
  if (opts_.pad_input)
    input_frame_begin = subsampled_frame * subsample - left_context_;
  else
    input_frame_begin = subsampled_frame * subsample;
  int32 max_possible_input_frame_end = features_ready /* - ( features_ready - right_context_) % subsample */;
  // Once input has finished we may read past the end; those frames are
  // duplicates of the last real frame (see the loop below).
  if (input_finished && opts_.pad_input)
    max_possible_input_frame_end += right_context_;
  // Cap the batch at max_nnet_batch_size output-producing frames plus the
  // context needed to compute them.
  int32 input_frame_end = std::min<int32>(max_possible_input_frame_end,
                                          input_frame_begin +
                                          left_context_ + right_context_ +
                                          opts_.max_nnet_batch_size);
  KALDI_ASSERT(input_frame_end > input_frame_begin);
  Matrix<BaseFloat> features(input_frame_end - input_frame_begin,
                             feat_dim_);
  // Copy the input frames, clamping out-of-range indexes to the first/last
  // available frame (this implements the edge padding).
  for (int32 t = input_frame_begin; t < input_frame_end; t++) {
    SubVector<BaseFloat> row(features, t - input_frame_begin);
    int32 t_modified = t;
    // The next two if-statements take care of "pad_input"
    if (t_modified < 0)
      t_modified = 0;
    if (t_modified >= features_ready)
      t_modified = features_ready - 1;
    features_->GetFrame(t_modified, &row);
  }

  // Number of output frames this batch yields after removing the context and
  // applying subsampling.
  int32 num_subsampled_frames = subsampling(input_frame_end - input_frame_begin -
                                            left_context_ - right_context_);
  DoNnetComputation(input_frame_begin,
                    features, subsampled_frame * subsample, num_subsampled_frames);

  // Record where the cached batch starts (in subsampled-frame units).
  begin_frame_ = subsampled_frame;
}
127+
128+
// Runs the nnet3 forward computation on one batch of input frames and leaves
// the acoustic-scaled, prior-divided log-likelihoods in scaled_loglikes_.
//
//  input_t_start:         't' index of the first row of 'input_feats'.
//  input_feats:           consecutive input feature frames for this batch.
//  output_t_start:        't' index of the first output frame we want.
//  num_subsampled_frames: number of output frames to request, spaced
//                         frame_subsampling_factor apart in input time.
//
// Change from the original: removed the unused local
// 'CuMatrix<BaseFloat> ivector_feats_cu' (iVectors are not supported by this
// decodable object) and a dead commented-out assignment.
void DecodableNnet3Online::DoNnetComputation(
    int32 input_t_start,
    const MatrixBase<BaseFloat> &input_feats,
    int32 output_t_start,
    int32 num_subsampled_frames) {
  ComputationRequest request;
  request.need_model_derivative = false;  // decoding only; no training.
  request.store_component_stats = false;

  bool shift_time = true; // shift the 'input' and 'output' to a consistent
                          // time, to take advantage of caching in the compiler.
                          // An optimization.
  int32 time_offset = (shift_time ? -output_t_start : 0);

  // First add the regular features-- named "input".
  request.inputs.reserve(2);
  request.inputs.push_back(
      IoSpecification("input", time_offset + input_t_start,
                      time_offset + input_t_start + input_feats.NumRows()));
  IoSpecification output_spec;
  output_spec.name = "output";
  output_spec.has_deriv = false;
  int32 subsample = opts_.frame_subsampling_factor;
  output_spec.indexes.resize(num_subsampled_frames);
  // leave n and x values at 0 (the constructor sets these).
  for (int32 i = 0; i < num_subsampled_frames; i++)
    output_spec.indexes[i].t = time_offset + output_t_start + i * subsample;
  request.outputs.resize(1);
  request.outputs[0].Swap(&output_spec);

  // Compile (with caching inside 'compiler_') and execute the computation.
  const NnetComputation *computation = compiler_.Compile(request);
  Nnet *nnet_to_update = NULL;  // we're not doing any update.
  NnetComputer computer(opts_.compute_config, *computation,
                        nnet_.GetNnet(), nnet_to_update);

  CuMatrix<BaseFloat> input_feats_cu(input_feats);
  computer.AcceptInput("input", &input_feats_cu);
  computer.Forward();
  CuMatrix<BaseFloat> cu_output;
  computer.GetOutputDestructive("output", &cu_output);
  // subtract log-prior (divide by prior)
  if (log_priors_.Dim() != 0)
    cu_output.AddVecToRows(-1.0, log_priors_);
  // apply the acoustic scale
  cu_output.Scale(opts_.acoustic_scale);
  scaled_loglikes_.Resize(0, 0);
  // the following statement just swaps the pointers if we're not using a GPU.
  cu_output.Swap(&scaled_loglikes_);
}
179+
180+
} // namespace nnet3
181+
} // namespace kaldi

0 commit comments

Comments
 (0)