Skip to content

Commit ad6491a

Browse files
naxingyu authored and
danpovey committed
[src] Add --write-per-frame-acoustic-loglikes options to alignment programs (kaldi-asr#1601)
1 parent ba13848 commit ad6491a

File tree

8 files changed

+117
-34
lines changed

8 files changed

+117
-34
lines changed

src/decoder/decoder-wrappers.cc

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "decoder/decoder-wrappers.h"
2121
#include "decoder/faster-decoder.h"
22+
#include "lat/lattice-functions.h"
2223

2324
namespace kaldi {
2425

@@ -419,13 +420,13 @@ void ModifyGraphForCarefulAlignment(
419420
fst::Concat(fst, fst_rhs);
420421
}
421422

422-
423+
423424
void AlignUtteranceWrapper(
424425
const AlignConfig &config,
425426
const std::string &utt,
426427
BaseFloat acoustic_scale, // affects scores written to scores_writer, if
427428
// present
428-
fst::VectorFst<fst::StdArc> *fst, // non-const in case config.careful ==
429+
fst::VectorFst<fst::StdArc> *fst, // non-const in case config.careful ==
429430
// true.
430431
DecodableInterface *decodable, // not const but is really an input.
431432
Int32VectorWriter *alignment_writer,
@@ -434,7 +435,8 @@ void AlignUtteranceWrapper(
434435
int32 *num_error,
435436
int32 *num_retried,
436437
double *tot_like,
437-
int64 *frame_count) {
438+
int64 *frame_count,
439+
BaseFloatVectorWriter *per_frame_acwt_writer) {
438440

439441
if ((config.retry_beam != 0 && config.retry_beam <= config.beam) ||
440442
config.beam <= 0.0) {
@@ -460,7 +462,7 @@ void AlignUtteranceWrapper(
460462
decoder.Decode(decodable);
461463

462464
bool ans = decoder.ReachedFinal(); // consider only final states.
463-
465+
464466
if (!ans && config.retry_beam != 0.0) {
465467
if (num_retried != NULL) (*num_retried)++;
466468
KALDI_WARN << "Retrying utterance " << utt << " with beam "
@@ -477,15 +479,15 @@ void AlignUtteranceWrapper(
477479
if (num_error != NULL) (*num_error)++;
478480
return;
479481
}
480-
482+
481483
fst::VectorFst<LatticeArc> decoded; // linear FST.
482484
decoder.GetBestPath(&decoded);
483485
if (decoded.NumStates() == 0) {
484486
KALDI_WARN << "Error getting best path from decoder (likely a bug)";
485487
if (num_error != NULL) (*num_error)++;
486488
return;
487489
}
488-
490+
489491
std::vector<int32> alignment;
490492
std::vector<int32> words;
491493
LatticeWeight weight;
@@ -499,10 +501,16 @@ void AlignUtteranceWrapper(
499501

500502
if (alignment_writer != NULL && alignment_writer->IsOpen())
501503
alignment_writer->Write(utt, alignment);
502-
504+
503505
if (scores_writer != NULL && scores_writer->IsOpen())
504506
scores_writer->Write(utt, -(weight.Value1()+weight.Value2()));
505-
}
506507

508+
Vector<BaseFloat> per_frame_loglikes;
509+
if (per_frame_acwt_writer != NULL && per_frame_acwt_writer->IsOpen()) {
510+
GetPerFrameAcousticCosts(decoded, &per_frame_loglikes);
511+
per_frame_loglikes.Scale(-1 / acoustic_scale);
512+
per_frame_acwt_writer->Write(utt, per_frame_loglikes);
513+
}
514+
}
507515

508516
} // end namespace kaldi.

src/decoder/decoder-wrappers.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ void AlignUtteranceWrapper(
6969
int32 *num_error,
7070
int32 *num_retried,
7171
double *tot_like,
72-
int64 *frame_count);
72+
int64 *frame_count,
73+
BaseFloatVectorWriter *per_frame_acwt_writer = NULL);
7374

7475

7576

src/gmmbin/gmm-align-compiled.cc

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ int main(int argc, char *argv[]) {
3939

4040
const char *usage =
4141
"Align features given [GMM-based] models.\n"
42-
"Usage: gmm-align-compiled [options] model-in graphs-rspecifier "
43-
"feature-rspecifier alignments-wspecifier [scores-wspecifier]\n"
42+
"Usage: gmm-align-compiled [options] <model-in> <graphs-rspecifier> "
43+
"<feature-rspecifier> <alignments-wspecifier> [scores-wspecifier]\n"
4444
"e.g.: \n"
4545
" gmm-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n"
4646
"or:\n"
@@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
5252
BaseFloat acoustic_scale = 1.0;
5353
BaseFloat transition_scale = 1.0;
5454
BaseFloat self_loop_scale = 1.0;
55+
std::string per_frame_acwt_wspecifier;
5556

5657
align_config.Register(&po);
5758
po.Register("transition-scale", &transition_scale,
@@ -60,13 +61,16 @@ int main(int argc, char *argv[]) {
6061
"Scaling factor for acoustic likelihoods");
6162
po.Register("self-loop-scale", &self_loop_scale,
6263
"Scale of self-loop versus non-self-loop log probs [relative to acoustics]");
64+
po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
65+
"Wspecifier for table of vectors containing the acoustic log-likelihoods "
66+
"per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
6367
po.Read(argc, argv);
6468

6569
if (po.NumArgs() < 4 || po.NumArgs() > 5) {
6670
po.PrintUsage();
6771
exit(1);
6872
}
69-
73+
7074
std::string model_in_filename = po.GetArg(1),
7175
fst_rspecifier = po.GetArg(2),
7276
feature_rspecifier = po.GetArg(3),
@@ -86,6 +90,7 @@ int main(int argc, char *argv[]) {
8690
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
8791
Int32VectorWriter alignment_writer(alignment_wspecifier);
8892
BaseFloatWriter scores_writer(scores_wspecifier);
93+
BaseFloatVectorWriter per_frame_acwt_writer(per_frame_acwt_wspecifier);
8994

9095
int num_done = 0, num_err = 0, num_retry = 0;
9196
double tot_like = 0.0;
@@ -118,12 +123,13 @@ int main(int argc, char *argv[]) {
118123

119124
DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, features,
120125
acoustic_scale);
121-
126+
127+
KALDI_LOG << utt;
122128
AlignUtteranceWrapper(align_config, utt,
123129
acoustic_scale, &decode_fst, &gmm_decodable,
124130
&alignment_writer, &scores_writer,
125131
&num_done, &num_err, &num_retry,
126-
&tot_like, &frame_count);
132+
&tot_like, &frame_count, &per_frame_acwt_writer);
127133
}
128134
}
129135
KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count)
@@ -137,5 +143,3 @@ int main(int argc, char *argv[]) {
137143
return -1;
138144
}
139145
}
140-
141-

src/lat/lattice-functions.cc

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,48 @@ namespace kaldi {
3333
using std::map;
3434
using std::vector;
3535

36+
void GetPerFrameAcousticCosts(const Lattice &nbest, Vector<BaseFloat> *per_frame_loglikes) {
37+
using namespace fst;
38+
typedef Lattice::Arc::Weight Weight;
39+
vector<BaseFloat> loglikes;
40+
41+
int32 cur_state = nbest.Start();
42+
int32 prev_frame = -1;
43+
BaseFloat eps_acwt = 0.0;
44+
while(1) {
45+
Weight w = nbest.Final(cur_state);
46+
if (w != Weight::Zero()) {
47+
KALDI_ASSERT(nbest.NumArcs(cur_state) == 0);
48+
if (per_frame_loglikes != NULL) {
49+
SubVector<BaseFloat> subvec(&(loglikes[0]), loglikes.size());
50+
Vector<BaseFloat> vec(subvec);
51+
*per_frame_loglikes = vec;
52+
}
53+
break;
54+
} else {
55+
KALDI_ASSERT(nbest.NumArcs(cur_state) == 1);
56+
fst::ArcIterator<Lattice> iter(nbest, cur_state);
57+
const Lattice::Arc &arc = iter.Value();
58+
BaseFloat acwt = arc.weight.Value2();
59+
if (arc.ilabel != 0) {
60+
if (eps_acwt > 0) {
61+
acwt += eps_acwt;
62+
eps_acwt = 0.0;
63+
}
64+
loglikes.push_back(acwt);
65+
prev_frame++;
66+
} else if (acwt == acwt){
67+
if (prev_frame > -1) {
68+
loglikes[prev_frame] += acwt;
69+
} else {
70+
eps_acwt += acwt;
71+
}
72+
}
73+
cur_state = arc.nextstate;
74+
}
75+
}
76+
}
77+
3678
int32 LatticeStateTimes(const Lattice &lat, vector<int32> *times) {
3779
if (!lat.Properties(fst::kTopSorted, true))
3880
KALDI_ERR << "Input lattice must be topologically sorted.";

src/lat/lattice-functions.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,24 @@
3636

3737
namespace kaldi {
3838

39+
/**
40+
This function extracts the per-frame log likelihoods from a linear
41+
lattice (which we refer to as an 'nbest' lattice elsewhere in Kaldi code).
42+
The dimension of *per_frame_loglikes will be set to the
43+
number of input symbols in 'nbest'. The elements of
44+
'*per_frame_loglikes' will be set to the .Value2() elements of the lattice
45+
weights, which represent the acoustic costs; you may want to scale this
46+
vector afterward by -1/acoustic_scale to get the original loglikes.
47+
If there are acoustic costs on input-epsilon arcs or the final-prob in 'nbest'
48+
(and this should not normally be the case in situations where it makes
49+
sense to call this function), they will be added to the cost of the
50+
preceding input symbol, or the following input symbol for input-epsilons
51+
encountered prior to any input symbol. If 'nbest' has no input symbols,
52+
'per_frame_loglikes' will be set to the empty vector.
53+
**/
54+
void GetPerFrameAcousticCosts(const Lattice &nbest,
55+
Vector<BaseFloat> *per_frame_loglikes);
56+
3957
/// This function iterates over the states of a topologically sorted lattice and
4058
/// counts the time instance corresponding to each state. The times are returned
4159
/// in a vector of integers 'times' which is resized to have a size equal to the

src/nnet2bin/nnet-align-compiled.cc

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ int main(int argc, char *argv[]) {
4040

4141
const char *usage =
4242
"Align features given neural-net-based model\n"
43-
"Usage: nnet-align-compiled [options] model-in graphs-rspecifier "
44-
"feature-rspecifier alignments-wspecifier\n"
43+
"Usage: nnet-align-compiled [options] <model-in> <graphs-rspecifier> "
44+
"<feature-rspecifier> <alignments-wspecifier>\n"
4545
"e.g.: \n"
4646
" nnet-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n"
4747
"or:\n"
@@ -54,6 +54,7 @@ int main(int argc, char *argv[]) {
5454
BaseFloat acoustic_scale = 1.0;
5555
BaseFloat transition_scale = 1.0;
5656
BaseFloat self_loop_scale = 1.0;
57+
std::string per_frame_acwt_wspecifier;
5758

5859
align_config.Register(&po);
5960
po.Register("transition-scale", &transition_scale,
@@ -63,6 +64,9 @@ int main(int argc, char *argv[]) {
6364
po.Register("self-loop-scale", &self_loop_scale,
6465
"Scale of self-loop versus non-self-loop "
6566
"log probs [relative to acoustics]");
67+
po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
68+
"Wspecifier for table of vectors containing the acoustic log-likelihoods "
69+
"per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
6670
po.Register("use-gpu", &use_gpu,
6771
"yes|no|optional|wait, only has effect if compiled with CUDA");
6872
po.Read(argc, argv);
@@ -75,7 +79,7 @@ int main(int argc, char *argv[]) {
7579
#if HAVE_CUDA==1
7680
CuDevice::Instantiate().SelectGpuId(use_gpu);
7781
#endif
78-
82+
7983
std::string model_in_filename = po.GetArg(1),
8084
fst_rspecifier = po.GetArg(2),
8185
feature_rspecifier = po.GetArg(3),
@@ -100,6 +104,7 @@ int main(int argc, char *argv[]) {
100104
RandomAccessBaseFloatCuMatrixReader feature_reader(feature_rspecifier);
101105
Int32VectorWriter alignment_writer(alignment_wspecifier);
102106
BaseFloatWriter scores_writer(scores_wspecifier);
107+
BaseFloatVectorWriter per_frame_acwt_writer(per_frame_acwt_wspecifier);
103108

104109
for (; !fst_reader.Done(); fst_reader.Next()) {
105110
std::string utt = fst_reader.Key();
@@ -135,7 +140,7 @@ int main(int argc, char *argv[]) {
135140
acoustic_scale, &decode_fst, &nnet_decodable,
136141
&alignment_writer, &scores_writer,
137142
&num_done, &num_err, &num_retry,
138-
&tot_like, &frame_count);
143+
&tot_like, &frame_count, &per_frame_acwt_writer);
139144
}
140145
KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count)
141146
<< " over " << frame_count<< " frames.";
@@ -152,5 +157,3 @@ int main(int argc, char *argv[]) {
152157
return -1;
153158
}
154159
}
155-
156-

src/nnet3bin/nnet3-align-compiled.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ int main(int argc, char *argv[]) {
5656
std::string use_gpu = "yes";
5757
BaseFloat transition_scale = 1.0;
5858
BaseFloat self_loop_scale = 1.0;
59+
std::string per_frame_acwt_wspecifier;
5960

6061
std::string ivector_rspecifier,
6162
online_ivector_rspecifier,
@@ -71,6 +72,9 @@ int main(int argc, char *argv[]) {
7172
po.Register("self-loop-scale", &self_loop_scale,
7273
"Scale of self-loop versus non-self-loop "
7374
"log probs [relative to acoustics]");
75+
po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
76+
"Wspecifier for table of vectors containing the acoustic log-likelihoods "
77+
"per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
7478
po.Register("ivectors", &ivector_rspecifier, "Rspecifier for "
7579
"iVectors as vectors (i.e. not estimated online); per utterance "
7680
"by default, or per speaker if you provide the --utt2spk option.");
@@ -126,7 +130,7 @@ int main(int argc, char *argv[]) {
126130
RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
127131
Int32VectorWriter alignment_writer(alignment_wspecifier);
128132
BaseFloatWriter scores_writer(scores_wspecifier);
129-
133+
BaseFloatVectorWriter per_frame_acwt_writer(per_frame_acwt_wspecifier);
130134

131135
for (; !fst_reader.Done(); fst_reader.Next()) {
132136
std::string utt = fst_reader.Key();
@@ -185,7 +189,7 @@ int main(int argc, char *argv[]) {
185189
&decode_fst, &nnet_decodable,
186190
&alignment_writer, &scores_writer,
187191
&num_done, &num_err, &num_retry,
188-
&tot_like, &frame_count);
192+
&tot_like, &frame_count, &per_frame_acwt_writer);
189193
}
190194
KALDI_LOG << "Overall log-likelihood per frame is "
191195
<< (tot_like/frame_count)

0 commit comments

Comments
 (0)