Skip to content

Commit 70a3aa4

Browse files
committed
Merging ^/sandbox/tanel back to trunk: added interface to GStreamer for online decoding
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2659 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent 91401e7 commit 70a3aa4

File tree

82 files changed

+1957
-124
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+1957
-124
lines changed

src/decoder/biglm-faster-decoder.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#define KALDI_DECODER_BIGLM_FASTER_DECODER_H_
2020

2121
#include "util/stl-utils.h"
22-
#include "util/parse-options.h"
2322
#include "util/hash-list.h"
2423
#include "fst/fstlib.h"
2524
#include "itf/decodable-itf.h"

src/decoder/faster-decoder.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#define KALDI_DECODER_FASTER_DECODER_H_
2020

2121
#include "util/stl-utils.h"
22-
#include "util/parse-options.h"
22+
#include "itf/options-itf.h"
2323
#include "util/hash-list.h"
2424
#include "fst/fstlib.h"
2525
#include "itf/decodable-itf.h"
@@ -46,7 +46,7 @@ struct FasterDecoderOptions {
4646
// alignment, use small default.
4747
beam_delta(0.5),
4848
hash_ratio(2.0) { }
49-
void Register(ParseOptions *po, bool full) { /// if "full", use obscure
49+
void Register(OptionsItf *po, bool full) { /// if "full", use obscure
5050
/// options too.
5151
/// Depends on program.
5252
po->Register("beam", &beam, "Decoder beam");

src/decoder/lattice-faster-decoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct LatticeFasterDecoderConfig {
5555
max_arcs(-1),
5656
beam_delta(0.5),
5757
hash_ratio(2.0) { }
58-
void Register(ParseOptions *po) {
58+
void Register(OptionsItf *po) {
5959
po->Register("beam", &beam, "Decoding beam.");
6060
po->Register("max-active", &max_active, "Decoder max active states.");
6161
po->Register("min-active", &min_active, "Decoder minimum #active states.");

src/decoder/lattice-simple-decoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct LatticeSimpleDecoderConfig {
5555
max_loop(500000),
5656
max_arcs(-1),
5757
beam_ratio(0.9) { }
58-
void Register(ParseOptions *po) {
58+
void Register(OptionsItf *po) {
5959
po->Register("beam", &beam, "Decoding beam.");
6060
po->Register("lattice-beam", &lattice_beam, "Lattice generation beam");
6161
po->Register("prune-interval", &prune_interval, "Interval (in frames) at which to prune tokens");

src/decoder/lattice-tracking-decoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ struct LatticeTrackingDecoderConfig {
5757
hash_ratio(2.0),
5858
extra_beam(4.0),
5959
max_beam(40.0) { }
60-
void Register(ParseOptions *po) {
60+
void Register(OptionsItf *po) {
6161
po->Register("beam", &beam, "Decoding beam.");
6262
po->Register("max-active", &max_active, "Decoder max active states.");
6363
po->Register("lattice-beam", &lattice_beam, "Lattice generation beam");

src/decoder/nbest-decoder.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#include <tr1/unordered_map>
2626
#endif
2727
#include "util/stl-utils.h"
28-
#include "util/parse-options.h"
28+
#include "itf/options-itf.h"
2929
#include "util/hash-list.h"
3030
#include "fst/fstlib.h"
3131
#include "itf/decodable-itf.h"
@@ -44,7 +44,7 @@ struct NBestDecoderOptions {
4444
max_active(std::numeric_limits<int32>::max()),
4545
n_best(1),
4646
beam_delta(0.5), hash_ratio(2.0) { }
47-
void Register(ParseOptions *po, bool full) { /// if "full", use obscure
47+
void Register(OptionsItf *po, bool full) { /// if "full", use obscure
4848
/// options too.
4949
/// Depends on program.
5050
po->Register("beam", &beam, "Decoder beam");

src/decoder/training-graph-compiler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ struct TrainingGraphCompilerOptions {
4040
rm_eps(false),
4141
reorder(b) { }
4242

43-
void Register(ParseOptions *po) {
43+
void Register(OptionsItf *po) {
4444
po->Register("transition-scale", &transition_scale, "Scale of transition "
4545
"probabilities (excluding self-loops)");
4646
po->Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. "

src/doc/online_programs.dox

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ script found there. The programs are as follows:
3737

3838
There is also a Java equivalent of the online-audio-client which contains slightly more features and has a GUI.
3939

40+
In addition, there is a GStreamer 1.0 compatible plugin that acts as a filter, taking raw audio as input and producing
41+
recognized words as output. Like the other online recognition programs, the plugin is based on \ref OnlineFasterDecoder.
42+
4043
\section audio_server Online Audio Server
4144

4245
The main difference between the online-server-gmm-decode-faster and online-audio-server-decode-faster programs is the input: the former accepts feature vectors, while the latter accepts RAW audio.
@@ -116,6 +119,126 @@ java -jar online-audio-client.jar
116119

117120
Or simply double-click the JAR file in the graphical interface.
118121

122+
\section gst_plugin GStreamer plugin
123+
124+
The Kaldi toolkit comes with a plugin for the <a href="http://gstreamer.freedesktop.org/">GStreamer</a> media streaming framework (version 1.0 or compatible).
125+
The plugin acts as a filter that accepts raw audio as input and produces recognized words as output.
126+
127+
The main benefit of the plugin is the fact that it makes Kaldi's online speech recognition functionality available to all
128+
programming languages that support GStreamer 1.0 (that includes Python, Ruby, Java, Vala and many more). It also simplifies the integration
129+
of the Kaldi online decoder in applications since communicating with the decoder follows GStreamer standards.
130+
131+
\subsection gst_plugin_installation Installation
132+
133+
The source of the GStreamer plugin is located in the `src/gst-plugin` directory. To compile the plugin, the rest of the Kaldi
134+
toolkit has to be compiled with the '-fPIC' compilation option. To do this, just add `-fPIC` to the `CXXFLAGS` in
135+
the `src/kaldi.mk` file. Then recompile Kaldi as usual. Also compile the online extensions (`make ext`).
136+
137+
Make sure the package that provides GStreamer 1.0 development headers is installed on your system (on Debian, the needed package is called
138+
`libgstreamer1.0-dev`).
139+
140+
Finally, run `make depend` and `make` in the `src/gst-plugin` directory. This should result in a file `src/gst-plugin/libgstkaldi.so`
141+
which contains the GStreamer plugin.
142+
143+
To make GStreamer able to find the Kaldi plugin, you have to add the `src/gst-plugin` directory to its plugin search path. To do this,
144+
add the directory to the GST_PLUGIN_PATH environment variable:
145+
\verbatim
146+
export GST_PLUGIN_PATH=$KALDI_ROOT/src/gst-plugin
147+
\endverbatim
148+
Of course, replace `$KALDI_ROOT` with the actual location of the Kaldi root folder on your file system.
149+
150+
Now, running `gst-inspect-1.0 onlinegmmdecodefaster` should provide info about the plugin:
151+
\verbatim
152+
# gst-inspect-1.0 onlinegmmdecodefaster
153+
Factory Details:
154+
Rank: none (0)
155+
Long-name: OnlineGmmDecodeFaster
156+
Klass: Speech/Audio
157+
Description: Convert speech to text
158+
Author: Tanel Alumae <tanel.alumae@phon.ioc.ee>
159+
[..]
160+
Element Properties:
161+
name : The name of the object
162+
flags: readable, writable
163+
String. Default: "onlinegmmdecodefaster0"
164+
parent : The parent of the object
165+
flags: readable, writable
166+
Object of type "GstObject"
167+
silent : Determines whether incoming audio is sent to the decoder or not
168+
flags: readable, writable
169+
Boolean. Default: false
170+
model : Filename of the acoustic model
171+
flags: readable, writable
172+
String. Default: "final.mdl"
173+
fst : Filename of the HCLG FST
174+
flags: readable, writable
175+
String. Default: "HCLG.fst"
176+
[..]
177+
min-cmn-window : Minumum CMN window used at start of decoding (adds latency only at start)
178+
flags: readable, writable
179+
Integer. Range: -2147483648 - 2147483647 Default: 100
180+
181+
Element Signals:
182+
"hyp-word" : void user_function (GstElement* object,
183+
gchararray arg0,
184+
gpointer user_data);
185+
\endverbatim
186+
187+
\subsection usage_cli Usage through the command-line
188+
189+
The simplest way to use the GStreamer plugin is via the command line. You have to specify the model files used for decoding
190+
when launching the plugin. To do this, set the `model`, `fst`, `word-syms`, `silence-phones` and optionally the `lda-mat`
191+
plugin properties (similarly to Kaldi's command-line online decoders). The decoder accepts only 16 kHz, 16-bit mono audio. Any audio stream can be automatically converted to the
192+
required format using GStreamer's `audioresample` and `audioconvert` plugins.
193+
194+
For example, to decode the file `test1.wav` using the model files in `tri2b_mmi`, and have the recognized stream of words printed to stdout, execute:
195+
\verbatim
196+
gst-launch-1.0 -q filesrc location=test1.wav \
197+
! decodebin ! audioconvert ! audioresample \
198+
! onlinegmmdecodefaster model=tri2b_mmi/model fst=tri2b_mmi/HCLG.fst \
199+
word-syms=tri2b_mmi/words.txt silence-phones="1:2:3:4:5" lda-mat=tri2b_mmi/matrix \
200+
! filesink location=/dev/stdout buffer-mode=2
201+
\endverbatim
202+
Note that the audio stream is segmented on the fly, with "<#s>" denoting silence.
203+
204+
You can easily try live decoding of microphone input by replacing `filesrc location=test1.wav` with `pulsesrc` (given that
205+
your OS uses the PulseAudio framework).
206+
207+
An example script that uses the plugin via the command line to process a bunch of audio files is located in `egs/voxforge/gst_demo/run-simulated.sh`.
208+
209+
\subsection usage_gst Usage through GStreamer bindings
210+
211+
An example of a Python GUI program that uses the plugin via the GStreamer bindings is located in `egs/voxforge/gst_demo/run-live.py`.
212+
213+
In its `init_gst(self)` method, the program constructs a pipeline of GStreamer elements similar to the one in the command-line example.
214+
The model files and some decoding parameters are communicated to the `onlinegmmdecodefaster` element through the standard `set_property()`
215+
method. More interesting is this part of the code:
216+
\verbatim
217+
self.asr.connect('hyp-word', self._on_word)
218+
\endverbatim
219+
This expression orders our decoding plugin to call the GUI's `_on_word` method whenever it produces a new recognized word.
220+
The `_on_word()` method looks like this:
221+
\verbatim
222+
def _on_word(self, asr, word):
223+
Gdk.threads_enter()
224+
if word == "<#s>":
225+
self.textbuf.insert_at_cursor("\n")
226+
else:
227+
self.textbuf.insert_at_cursor(word)
228+
self.textbuf.insert_at_cursor(" ")
229+
Gdk.threads_leave()
230+
\endverbatim
231+
Apart from the GUI thread locking (`Gdk.threads_enter()` / `Gdk.threads_leave()`), it inserts the recognized word into the text buffer that is connected
232+
to the GUI's main text box. If a segmentation symbol is recognized, it inserts a line break instead.
233+
234+
Recognition start and stop are controlled by setting the `silent` property of the decoder plugin to `False` or `True`. Setting the
235+
property to `True` orders the plugin not to process any incoming audio (although the audio that is already being processed might
236+
produce some new recognized words).
237+
238+
239+
240+
241+
119242
*/
120243

121244

src/feat/feature-fbank.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ struct FbankOptions {
5151
htk_compat(false),
5252
use_log_fbank(true) {}
5353

54-
void Register(ParseOptions *po) {
54+
void Register(OptionsItf *po) {
5555
frame_opts.Register(po);
5656
mel_opts.Register(po);
5757
po->Register("use-energy", &use_energy,

src/feat/feature-functions.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ struct MelBanksOptions {
4747
: num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(400),
4848
vtln_high(-400), debug_mel(false) {}
4949

50-
void Register(ParseOptions *po) {
50+
void Register(OptionsItf *po) {
5151
po->Register("num-mel-bins", &num_bins,
5252
"Number of triangular mel-frequency bins");
5353
po->Register("low-freq", &low_freq,
@@ -88,7 +88,7 @@ struct FrameExtractionOptions {
8888
window_type("povey"),
8989
round_to_power_of_two(true) { }
9090

91-
void Register(ParseOptions *po) {
91+
void Register(OptionsItf *po) {
9292
po->Register("sample-frequency", &samp_freq,
9393
"Waveform data sample frequency (must match the waveform file, "
9494
"if specified there)");
@@ -185,7 +185,7 @@ struct DeltaFeaturesOptions {
185185

186186
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
187187
order(order), window(window) { }
188-
void Register(ParseOptions *po) {
188+
void Register(OptionsItf *po) {
189189
po->Register("delta-order", &order, "Order of delta computation");
190190
po->Register("delta-window", &window,
191191
"Parameter controlling window for delta computation (actual window"

0 commit comments

Comments
 (0)