From 7d897644b770d39b14b7c0d851e7f7bf97f1aa5e Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Tue, 12 May 2026 14:13:08 +0300
Subject: [PATCH 1/3] Audio: MFCC: Add Voice Activity Detection based on Mel
 spectrum

Add mfcc_vad module with A-weighted energy-based voice activity
detection that operates on the Mel log spectrum produced by the MFCC
component. The algorithm tracks a per-bin noise floor with instant-down
and slow-rise behavior, then computes a weighted energy delta above
the floor. Speech is declared when the delta exceeds a threshold
(0.35 in Q9.23) with a 20-frame hangover to prevent rapid toggling.
The VAD is gated on the new enable_vad flag in sof_mfcc_config.

Add struct mfcc_data_header with six int32 fields (magic,
frame_number, reserved, energy, noise_energy, vad_flag) prepended to
every output frame in all format paths (S16, S24, S32). This replaces
the previous magic-word-only header. The header carries the VAD
decision and energy values from the DSP for downstream consumers.

Extend sof_mfcc_config in user/mfcc.h with reserved16[3] padding for
32-bit alignment, and new boolean fields enable_vad, enable_dtx,
update_controls, and reserved_bool[5]. The config blob size increases
from 104 to 116 bytes.

Update Matlab/Octave decode scripts (decode_mel.m, decode_ceps.m,
decode_all.m) and setup_mfcc.m for the expanded header and config
struct. Regenerate topology2 configuration blobs (default.conf,
mel80.conf) with the new blob size.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/mfcc/CMakeLists.txt                 |   2 +-
 src/audio/mfcc/mfcc_common.c                  |  69 +++--
 src/audio/mfcc/mfcc_setup.c                   |  17 +-
 src/audio/mfcc/mfcc_vad.c                     | 245 ++++++++++++++++++
 src/audio/mfcc/tune/decode_all.m              |   4 +-
 src/audio/mfcc/tune/decode_ceps.m             |  42 ++-
 src/audio/mfcc/tune/decode_mel.m              |  86 ++++--
 src/audio/mfcc/tune/setup_mfcc.m              |  19 +-
 src/include/sof/audio/mfcc/mfcc_comp.h        |  20 +-
 src/include/sof/audio/mfcc/mfcc_vad.h         |  99 +++++++
 src/include/user/mfcc.h                       |   7 +-
 .../include/components/mfcc/default.conf      |  12 +-
 .../include/components/mfcc/mel80.conf        |  10 +-
 13 files changed, 559 insertions(+), 73 deletions(-)
 create mode 100644 src/audio/mfcc/mfcc_vad.c
 create mode 100644 src/include/sof/audio/mfcc/mfcc_vad.h

diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..10daf78aa2a6 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -4,5 +4,5 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
   add_dependencies(app mfcc)
 else()
-  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
 endif()
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..97f2cc547f53 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -21,6 +21,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sof/audio/mfcc/mfcc_vad.h>
+
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
 /*
@@ -169,6 +171,18 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 
 			cc_count += state->dct.num_out;
 		}
+
+		/* Run VAD on the mel log spectrum (available in both modes) */
+		if (config->enable_vad)
+			mfcc_vad_update(&cd->vad, state->mel_log_32);
+
+		/* Populate data header for this output frame */
+		state->header.magic = MFCC_MAGIC;
+		state->header.frame_number = cd->vad.frame_count;
+		state->header.reserved = 0;
+		state->header.energy = cd->vad.energy;
+		state->header.noise_energy = cd->vad.noise_energy;
+		state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
 	}
 
 	return cc_count;
@@ -267,9 +281,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int16_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 2;
+	const int num_header_s16 = sizeof(state->header) / sizeof(int16_t);
 	int num_ceps;
 	int sink_samples;
 	int to_copy;
@@ -280,25 +293,27 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
 	num_ceps = mfcc_stft_process(mod->dev, cd);
 
-	/* If new output produced, set up pointer into scratch data and mark magic pending */
+	/* If new output produced, set up pointer into scratch data and mark header pending */
 	if (num_ceps > 0) {
-		if (state->mel_only)
+		if (state->mel_only) {
 			state->out_data_ptr = state->mel_spectra->data;
-		else
+		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
+		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s16) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16,
+						(int16_t *)&state->header);
+		sink_samples -= num_header_s16;
+		state->header_pending = false;
 	}
 
 	/* Write cepstral/mel data from scratch buffer */
@@ -363,9 +378,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
+	const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
 	int num_ceps;
 	int sink_samples;
 	int remain_s32;
@@ -391,17 +405,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s32) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
+						(int32_t *)&state->header);
+		sink_samples -= num_header_s32;
+		state->header_pending = false;
 	}
 
 	if (state->mel_only) {
@@ -443,9 +458,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
+	const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
 	int num_ceps;
 	int sink_samples;
 	int remain_s32;
@@ -466,17 +480,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s32) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
+						(int32_t *)&state->header);
+		sink_samples -= num_header_s32;
+		state->header_pending = false;
 	}
 
 	if (state->mel_only) {
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..23f07e6aaf68 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sof/audio/mfcc/mfcc_vad.h>
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -332,7 +334,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	 * least fft_hop_size * channels int16_t samples per hop (worst case s16).
 	 * If output exceeds this, data accumulates and will eventually overflow.
 	 */
-	int out_per_hop = max_out_per_hop + 2;
+	int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t);
 	int sink_per_hop = fft->fft_hop_size * channels;
 
 	if (out_per_hop > sink_per_hop) {
@@ -345,11 +347,20 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	/* Set initial state for STFT */
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
-	state->magic_pending = false;
+	state->header_pending = false;
+	memset(&state->header, 0, sizeof(state->header));
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+	if (config->enable_vad) {
+		ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+		if (ret < 0) {
+			comp_err(dev, "Failed VAD init");
+			goto free_lifter;
+		}
+	}
+
 	comp_dbg(dev, "done");
 	return 0;
 
@@ -389,4 +400,6 @@ void mfcc_free_buffers(struct processing_module *mod)
 	mod_free(mod, cd->state.melfb.data);
 	mod_free(mod, cd->state.dct.matrix);
 	mod_free(mod, cd->state.lifter.matrix);
+	mod_free(mod, cd->vad.noise_floor);
+	mod_free(mod, cd->vad.weights);
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..1ac13cf53b88
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/module_adapter/module/module_interface.h>
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stddef.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief Entry count shared by the a_weight_hz[] / a_weight_lin[] lookup tables.
+ *
+ * a_weight_hz[] lists 1/3 octave band center frequencies as integer Hz, ascending.
+ * Data from IEC 61672-1:2013, via https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE	36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	    6,     8,    10,    13,    16,    20,    25,    32,
+	   40,    50,    63,    80,   100,   125,   160,   200,
+	  250,   315,   400,   500,   630,   800,  1000,  1250,
+	 1600,  2000,  2500,  3150,  4000,  5000,  6300,  8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear gain for each a_weight_hz[] entry, scaled so the
+ *        peak (at 2500 Hz) maps to INT16_MAX (32767).  Original dB values
+ *        converted via 10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	    2,     4,     9,    19,    43,    85,   162,   299,
+	  531,   862,  1382,  2140,  3129,  4370,  6172,  8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045,  9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency.  Output weights are in Q1.15, sum to
+ * approximately 2^15, and saturate at INT16_MAX (a lone bin would be 1.0).
+ *
+ * \param[out] weights Output weight array, num_mel entries are written.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz; sample_rate / 2 is cast to int16_t.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2));
+	int16_t mel_step = mel_end / (num_mel + 1);
+	int i, j;
+
+	if (!num_mel)
+		return;
+
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to 1.0, 32768 would wrap negative so saturate */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15)); /* Round to Q1.15 */
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad || num_mel_bins <= 0)
+		return -EINVAL;
+
+	/* Runtime state: no speech yet, noise floor not yet seeded from a frame */
+	vad->initialized = false;
+	vad->is_speech = false;
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+
+	/* Detector tuning comes from the compile-time defaults in mfcc_vad.h */
+	vad->num_mel_bins = num_mel_bins;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+
+	/* One int32_t noise floor entry per Mel bin */
+	vad->noise_floor = mod_zalloc(mod, sizeof(int32_t) * num_mel_bins);
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* One int16_t weight per Mel bin; on failure release the noise floor again */
+	vad->weights = mod_zalloc(mod, sizeof(int16_t) * num_mel_bins);
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/* Update the noise floor and speech decision from one frame of Mel log spectrum values.
+ * Returns 1 when the frame is classified as speech (hangover included), otherwise 0.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t signal_energy = 0;
+	int64_t noise_energy = 0;
+	int64_t energy_delta = 0;
+	int32_t delta, p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted signal energy and noise floor energy.
+	 * weights are Q1.15, mel values are Q9.23
+	 * Products are Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		signal_energy += (int64_t)vad->weights[i] * mel_log[i];
+		noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i];
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23));
+	vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23));
+
+	/* Widen first: the difference of two saturated int32 values may not fit int32 */
+	energy_delta = (int64_t)vad->energy - vad->noise_energy;
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else if (vad->hangover_counter > 0) {
+		vad->hangover_counter--;
+		vad->is_speech = true;
+	} else {
+		vad->is_speech = false;
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->energy = 0;
+	vad->noise_energy = 0;
+	vad->is_speech = false;
+	vad->initialized = false; /* next mfcc_vad_update() re-seeds the noise floor */
+
+	for (i = 0; vad->noise_floor && i < vad->num_mel_bins; i++) /* NULL if init failed */
+		vad->noise_floor[i] = 0;
+}
diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m
index d5b60289b4cf..23ca07522aae 100644
--- a/src/audio/mfcc/tune/decode_all.m
+++ b/src/audio/mfcc/tune/decode_all.m
@@ -25,7 +25,7 @@
 	fn = all_ceps_files{i};
 	if exist(fn, 'file')
 		fprintf('Decoding MFCC ceps: %s\n', fn);
-		[ceps, t, n] = decode_ceps(fn, num_ceps);
+		[ceps, t, n, vad, energy, noise_energy] = decode_ceps(fn, num_ceps); % decode_ceps output order
 	end
 end
 
@@ -34,6 +34,6 @@
 	fmt = all_mel_fmts{i};
 	if exist(fn, 'file')
 		fprintf('Decoding Mel: %s\n', fn);
-		[mel, t, n] = decode_mel(fn, num_mel, fmt);
+		[mel, t, n, vad, energy, noise_energy] = decode_mel(fn, num_mel, fmt); % decode_mel output order
 	end
 end
diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m
index a63677fa3731..32a04e8d8df7 100644
--- a/src/audio/mfcc/tune/decode_ceps.m
+++ b/src/audio/mfcc/tune/decode_ceps.m
@@ -1,4 +1,4 @@
-% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels)
 %
 % Input
 %   fn - File with MFCC data in .raw or .wav format
@@ -9,11 +9,16 @@
 %   ceps - cepstral coefficients
 %   t - time vector for plotting
 %   n - ceps 1..num_ceps vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
-% Copyright(c) 2022 Intel Corporation. All rights reserved.
+% Copyright(c) 2022-2026 Intel Corporation. All rights reserved.
 
-function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+function [ceps, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_ceps(fn, num_ceps, num_channels)
 
 if nargin < 3
 	num_channels = 1;
@@ -23,6 +28,7 @@
 fs = 16e3;
 qformat = 7;
 magic = [25443 28006]; % ASCII 'mfcc' as int16
+num_magic = 2; % magic word is 2 x int16
 
 % Load output data
 [data, num_channels] = get_file(fn, num_channels);
@@ -41,17 +47,37 @@
 
 period_ceps = idx(2)-idx(1);
 num_frames = length(idx);
+
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32 (10 int16 slots), followed by num_ceps coefficients.
+payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data
+
+% Last frame can be incomplete due to span over multiple periods
+last = idx(end) + num_magic + payload_len - 1;
+if (last > length(data))
+    num_frames = num_frames - 1;
+end
+
 t_ceps = period_ceps / num_channels / fs;
 t = (0:num_frames -1) * t_ceps;
 n = 1:num_ceps;
 
-ceps = zeros(num_ceps, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
-	i1 = idx(i) + 2;
-	i2 = i1 + num_ceps - 1;
-	ceps(:,i) = data(i1:i2) / 2^qformat;
+	i1 = idx(i) + num_magic;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
+% Reassemble int32 from pairs of int16 (little-endian).
+% Low half must be treated as unsigned with mod() to handle negative int16.
+frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
+% payload(3:4,:) is reserved, skip. Energies are Q9.23 in the DSP header, scale by 2^23.
+energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23;
+noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23;
+vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
+ceps = payload(11:payload_len, :) / 2^qformat;
+
 figure;
 surf(t, n, ceps, 'EdgeColor', 'none');
 colormap(jet);
@@ -75,7 +101,7 @@
 	case '.wav'
 		tmp = audioread(fn, 'native');
 		t = whos('tmp');
-		if ~strcmp(t.class, 'int16');
+		if ~strcmp(t.class, 'int16')
 			error('Only 16-bit wav file format is supported');
 		end
 		s = size(tmp);
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..24296b529cbc 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,26 +1,28 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, fmt, num_channels)
 %
 % Input
 %   fn - File with Mel data in .raw or .wav format
 %   num_mel - number of Mel coefficients per frame
 %   fmt - format of the Mel data ('s16', 's24', 's32')
-%   num_channels - needed for .raw format, omit for .wav
+%   num_channels - needed for .raw format, omit for .wav, default 2
 %
 % Outputs
 %   mel - Mel coefficients
 %   t - time vector for plotting
 %   n - mel 1..num_mel vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_mel(fn, num_mel, fmt, num_channels)
 
-if nargin < 3
-	fmt = 's16';
-end
 if nargin < 4
-	num_channels = 1;
+	num_channels = 2;
 end
 
 % MFCC stream
@@ -30,15 +32,15 @@
   case 's16'
     qformat = 7;
     magic = [25443 28006]; % ASCII 'mfcc' as two int16
-    num_magic = 2;
+    num_magic = 2; % magic word is 2 x int16
   case 's24'
     qformat = 15;
     magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
+    num_magic = 1; % magic word is 1 x int32
   case 's32'
     qformat = 23;
     magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
+    num_magic = 1; % magic word is 1 x int32
     otherwise
     error("Use 's16', 's24', or 's32' as format.");
 end
@@ -68,33 +70,77 @@
 period_mel = idx(2)-idx(1);
 num_frames = length(idx);
 
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32, followed by num_mel coefficients.
+% For s16 each int32 occupies 2 int16 slots.
+if strcmp(fmt, 's16')
+	payload_len = 10 + num_mel; % 5 int32 = 10 int16, then mel data
+else
+	payload_len = 5 + num_mel; % frame_number + reserved + energy + noise_energy + vad_flag + mel
+end
+
 % Last frame can be incomplete due to span over multiple periods
-last = idx(end) + num_mel - 1;
+last = idx(end) + num_magic + payload_len - 1;
 if (last > length(data))
     num_frames = num_frames - 1;
 end
 
-t_mel = period_mel / num_channels / fs;
-t = (0:num_frames -1) * t_mel;
-n = 1:num_mel;
-
-mel = zeros(num_mel, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
 	i1 = idx(i) + num_magic;
-	i2 = i1 + num_mel - 1;
-	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
-figure;
+if strcmp(fmt, 's16')
+	% Reassemble int32 from pairs of int16 (little-endian).
+	% Low half must be treated as unsigned with mod() to handle negative int16.
+	frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
+	% payload(3:4,:) is reserved, skip. Header energies are Q9.23 in every format.
+	energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23;
+	noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23;
+	vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
+	mel = payload(11:payload_len, :) / 2^qformat;
+else
+	frame_number = payload(1, :);
+	% payload(2,:) is reserved, skip. Header energies are Q9.23, not the Mel data qformat.
+	energy = payload(3, :) / 2^23;
+	noise_energy = payload(4, :) / 2^23;
+	vad = payload(5, :);
+	mel = payload(6:payload_len, :) / 2^qformat;
+end
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+figure
 imagesc(t, n, mel);
 axis xy;
 colormap(jet);
 colorbar;
 tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
 title(tstr, 'Interpreter', 'None');
-xlabel('Time (s)');
 ylabel('Mel coef #');
 
+figure
+subplot(2,1,1);
+level = sum(mel(:,:));
+plot(t, vad)
+ax = axis();
+axis([ax(1:2) -0.1 1.1]);
+grid on;
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('VAD flag');
+
+subplot(2,1,2);
+plot(t, energy, t, noise_energy);
+grid on;
+legend('Energy', 'Noise Energy');
+xlabel('Time (s)');
+ylabel('Energy');
+
 end
 
 function [data, num_channels] = get_file(fn, num_channels, fmt)
diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m
index bd2b3f11e60b..3cda3221a4fc 100644
--- a/src/audio/mfcc/tune/setup_mfcc.m
+++ b/src/audio/mfcc/tune/setup_mfcc.m
@@ -62,6 +62,9 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % same
 	cfg.mmax_coef = 0; % same
 	cfg.dynamic_mmax = false; % same
+	cfg.enable_vad = false;
+	cfg.enable_dtx = false;
+	cfg.update_controls = false;
 end
 
 function cfg = get_mel_spectrogram_config()
@@ -99,6 +102,9 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db
 	cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max)
 	cfg.dynamic_mmax = true;
+	cfg.enable_vad = true;
+	cfg.enable_dtx = false;
+	cfg.update_controls = true;
 end
 
 function export_mfcc_setup(gen_cfg, cfg)
@@ -107,7 +113,7 @@ function export_mfcc_setup(gen_cfg, cfg)
 addpath([gen_cfg.tools_path 'tune/common']);
 
 %% Blob size, size plus reserved(8) + current parameters
-nbytes_data = 104;
+nbytes_data = 116;
 
 %% Little endian
 sh32 = [0 -8 -16 -24];
@@ -160,6 +166,10 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD
+% reserved16[3]
+for i = 1:3
+	[b8, j] = add_w16b(0, b8, j);
+end
 v = cfg.htk_compat;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.raw_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.remove_dc_offset;                        [b8, j] = add_w8b(v, b8, j); % bool
@@ -168,6 +178,13 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = cfg.subtract_mean;                           [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.use_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.dynamic_mmax;                            [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_vad;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_dtx;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.update_controls;                         [b8, j] = add_w8b(v, b8, j); % bool
+% reserved_bool[5]
+for i = 1:5
+	[b8, j] = add_w8b(0, b8, j);
+end
 
 %% Export
 tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn];
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..80ab3e376487 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -12,6 +12,7 @@
 #include <sof/math/auditory.h>
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
+#include <sof/audio/mfcc/mfcc_vad.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -32,6 +33,21 @@
 #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */
 #define MFCC_FFT_BITS	32
 
+/**
+ * \brief Data header prepended to every MFCC output frame.
+ *
+ * Written before the Mel spectrum or cepstral coefficient data in each
+ * output frame.  All six fields are 32 bits wide so the header is 24 bytes.
+ */
+struct mfcc_data_header {
+	uint32_t magic;        /**< Magic word MFCC_MAGIC (0x6d666363) */
+	uint32_t frame_number; /**< VAD frame count: 1 for first frame, not advanced if VAD is off */
+	int32_t reserved;	 /**< Reserved for future use, set to 0 */
+	int32_t energy;       /**< Weighted signal energy in Q9.23 */
+	int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */
+	int32_t vad_flag;     /**< VAD decision: 1 = speech, 0 = silence */
+};
+
 /** \brief Type definition for processing function select return value. */
 typedef void (*mfcc_func)(struct processing_module *mod,
 			  struct input_stream_buffer *bsource,
@@ -105,7 +121,8 @@ struct mfcc_state {
 	bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
-	bool magic_pending; /**< True when magic word not yet written for current output */
+	bool header_pending; /**< True when data header not yet written for current output */
+	struct mfcc_data_header header; /**< Data header for current output frame */
 	size_t sample_buffers_size; /**< bytes */
 	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
 	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
@@ -115,6 +132,7 @@ struct mfcc_state {
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+	struct mfcc_vad_state vad;
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
 	int max_frames;
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..e12dd7e31e80
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct processing_module;
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES	100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0020 * 2^15 = 66).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA	66
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 2^15 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST	1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD	2936013
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES	20
+
+/**
+ * \brief VAD state structure.
+ */
+struct mfcc_vad_state {
+	int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */
+	int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Number of initial frames for fast convergence */
+	int32_t frame_count; /**< Total frames processed */
+	int32_t energy; /**< Weighted signal energy in Q9.23 */
+	int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \param[in] mod Processing module for memory allocation.
+ * \return 0 on success, negative error code on failure.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected, 0 if silence.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */
diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h
index 8a0defcd9883..a2f3717daa52 100644
--- a/src/include/user/mfcc.h
+++ b/src/include/user/mfcc.h
@@ -77,6 +77,7 @@ struct sof_mfcc_config {
 	int16_t vtln_high; /**< Reserved, no support */
 	int16_t vtln_low; /**< Reserved, no support */
 	int16_t vtln_warp; /**< Reserved, no support */
+	int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */
 	bool htk_compat; /**< Must be false */
 	bool raw_energy; /**< Reserved, no support */
 	bool remove_dc_offset; /**< Reserved, no support */
@@ -85,8 +86,10 @@ struct sof_mfcc_config {
 	bool subtract_mean; /**< Must be false (0) */
 	bool use_energy; /**< Must be false (0) */
 	bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */
-	bool reserved_bool2;
-	bool reserved_bool3;
+	bool enable_vad; /**< Run VAD algorithm */
+	bool enable_dtx; /**< Reserved (stream once per second non-speech frames) */
+	bool update_controls; /**< Update controls with VAD decision */
+	bool reserved_bool[5]; /**< Reserved for future boolean flags, set to false (0) */
 } __attribute__((packed));
 
 #endif /* __USER_MFCC_H__ */
diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf
index 42a6d6608b8b..3bbd72696806 100644
--- a/tools/topology/topology2/include/components/mfcc/default.conf
+++ b/tools/topology/topology2/include/components/mfcc/default.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 19-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" {
 		0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00,
 		0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00,
 		0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
-		0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00"
 }
diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf
index 04aa2a15c660..480725c2d24f 100644
--- a/tools/topology/topology2/include/components/mfcc/mel80.conf
+++ b/tools/topology/topology2/include/components/mfcc/mel80.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 19-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" {
 		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
 		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+		0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00,
+		0x00,0x00,0x00,0x00"
 }

From 86388a7279554c7b7c553300180ce2a1d7dcefe4 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Tue, 19 May 2026 18:46:55 +0300
Subject: [PATCH 2/3] Audio: MFCC: Add Python script for speech to text with
 Whisper model

Add sof_mel_to_text_live_dsp_vad.py that captures mel spectrogram frames
from ALSA with embedded DSP VAD flag and performs live speech-to-text
transcription using OpenVINO Whisper. The script buffers mel frames during
speech and triggers Whisper inference when silence is detected after
speech. Capture runs continuously in a separate thread during inference
to avoid frame drops.

Replace the old README.txt with a comprehensive README.md that documents
the MFCC tuning tools, testbench usage with run_mfcc.sh, output file
formats, Matlab/Octave decode and plotting scripts, and the new live
transcription workflow.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/mfcc/tune/README.md                 |  98 ++++
 src/audio/mfcc/tune/README.txt                |  52 --
 .../mfcc/tune/sof_mel_to_text_live_dsp_vad.py | 454 ++++++++++++++++++
 3 files changed, 552 insertions(+), 52 deletions(-)
 create mode 100644 src/audio/mfcc/tune/README.md
 delete mode 100644 src/audio/mfcc/tune/README.txt
 create mode 100644 src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py

diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create a configuration blob for the
+SOF MFCC component. It is run in Matlab or Octave with the command
+`setup_mfcc`. The MFCC configuration parameters can be edited in the
+script.
+
+## Testbench
+
+The configuration can be test-run with the testbench. First, the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next, the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
+
+```
+./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
+```
+
+Output files from host testbench:
+
+| File | Content |
+|------|---------|
+| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients |
+| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram |
+
+If the `XTENSA_PATH` environment variable is set, the script also runs
+the Xtensa build of the testbench (via `xt-run`) and produces additional
+output files prefixed with `xt_`:
+
+| File | Content |
+|------|---------|
+| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients |
+| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram |
+
+## Decoding and Plotting
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the `decode_all.m` script:
+
+```matlab
+decode_all
+```
+
+This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and
+`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+```matlab
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
+```
+
+In the example above, the value 13 comes from the configuration script: MFCC
+was set up to output 13 cepstral coefficients from each FFT → Mel → DCT →
+cepstral coefficients computation run.
+
+The 80-band Mel output can be visualized with the command:
+
+```matlab
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+```
+
+## Live Whisper Transcription with DSP VAD
+
+The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`.
+It can be used with the development topologies
+`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and
+`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. It captures Mel audio features
+and VAD flags from the default audio device `hw:0,47` (headset microphone).
+The captured frames with detected speech are sent to the Whisper speech
+recognition model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+<https://docs.openvino.ai/2025/get-started/install-openvino.html>.
+
+Install the following Python packages with pip into the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+By default the script runs the Whisper encoder model on the NPU. To
+use the NPU, install the driver from
+<https://github.com/intel/linux-npu-driver/releases>. If the NPU is not
+available, switch the encoder to the CPU with the option `--encoder-device CPU`.
+With a GPU, both `--encoder-device GPU` and `--decoder-device GPU` can be set.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
-
-./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
-
-Output files from host testbench:
-  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
-  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
-
-If the XTENSA_PATH environment variable is set, the script also runs
-the Xtensa build of the testbench (via xt-run) and produces additional
-output files prefixed with "xt_":
-  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
-  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
-
-All output files can be decoded and plotted at once in Matlab or Octave
-with the decode_all.m script:
-
-decode_all
-
-This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
-decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
-files that exist including the Xtensa variants.
-
-Individual files can also be decoded manually:
-
-[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
-
-In the above it's known from configuration script that MFCC was set up to
-output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
-coefficients computation run.
-
-The 80 bands Mel output can be visualized with command:
-
-[mel, t, n] = decode_mel('mel_s16.raw', 80);
-
-Other kind of signals have quite big visual difference in audio features. Try
-e.g. other sound files found in computer.
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
new file mode 100644
index 000000000000..33862da283e4
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
@@ -0,0 +1,454 @@
+"""Live SOF mel capture with DSP VAD-triggered Whisper transcription.
+
+Captures mel frames from ALSA with embedded VAD flag from the DSP.
+Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)]
+When silence of 100ms is detected after speech, sends the buffered mel
+features to Whisper (OpenVINO encoder+decoder) for transcription.
+Capture continues running during Whisper inference.
+
+Usage:
+    python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov]
+    python sof_mel_to_text_live_dsp_vad.py --plot  # with live spectrogram
+"""
+
+import argparse
+import os
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# Graphics imports deferred until --plot is used
+matplotlib = None
+plt = None
+
+# SOF mel_s32.raw format constants (with DSP data header)
+SOF_MAGIC_S32 = np.int32(0x6D666363)  # ASCII 'mfcc' as int32
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)
+SOF_NUM_HEADER = 6            # magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point
+SOF_NUM_MEL = 80
+SOF_FRAME_INTS = SOF_NUM_HEADER + SOF_NUM_MEL  # 86 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 344 bytes per frame
+
+# Speech buffering
+SILENCE_TRIGGER_MS = 100     # ms of silence after speech to trigger transcription
+SILENCE_TRIGGER_FRAMES = SILENCE_TRIGGER_MS // 10  # 10 frames at 10ms/frame
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert int32 Q9.23 mel values to float64 mel coefficients."""
+    return raw_ints.astype(np.float64) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Optional scrolling plot ----------
+
+SPECTROGRAM_WIDTH = 100
+
+
+class MelPlotter:
+    """Real-time scrolling mel spectrogram + VAD strip."""
+
+    def __init__(self, num_mel=SOF_NUM_MEL, width=SPECTROGRAM_WIDTH):
+        global matplotlib, plt
+        import matplotlib as _mpl
+        _mpl.use('TkAgg')
+        import matplotlib.pyplot as _plt
+        matplotlib = _mpl
+        plt = _plt
+
+        self.num_mel = num_mel
+        self.width = width
+
+        self.mel_buf = np.zeros((num_mel, width), dtype=np.float64)
+        self.vad_buf = np.zeros(width, dtype=np.float64)
+        self.x = np.arange(width)
+
+        self.fig, (self.ax_mel, self.ax_vad) = plt.subplots(
+            2, 1, figsize=(10, 5),
+            gridspec_kw={'height_ratios': [5, 1]},
+            sharex=True
+        )
+        self.fig.tight_layout(pad=2.0)
+
+        self.im_mel = self.ax_mel.imshow(
+            self.mel_buf, aspect='auto', origin='lower',
+            interpolation='nearest', cmap='turbo',
+            vmin=-2.0, vmax=2.0
+        )
+        self.ax_mel.set_ylabel('Mel bin')
+        self.ax_mel.set_title('Mel Spectrogram (scrolling) — DSP VAD')
+
+        self.line_vad, = self.ax_vad.plot(
+            self.x, self.vad_buf, color='green', linewidth=1.5,
+            drawstyle='steps-post')
+        self.ax_vad.set_ylabel('VAD')
+        self.ax_vad.set_xlabel('Frame')
+        self.ax_vad.set_ylim(-0.1, 1.1)
+        self.ax_vad.set_yticks([0, 1])
+        self.ax_vad.set_yticklabels(['Silent', 'Speech'])
+
+        plt.ion()
+        plt.show(block=False)
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+
+    def update(self, mel_frame, is_speech):
+        self.mel_buf[:, :-1] = self.mel_buf[:, 1:]
+        self.mel_buf[:, -1] = mel_frame
+        self.vad_buf[:-1] = self.vad_buf[1:]
+        self.vad_buf[-1] = 1.0 if is_speech else 0.0
+
+        self.im_mel.set_data(self.mel_buf)
+        self.line_vad.set_ydata(self.vad_buf)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder using OpenVINO, runs in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000]
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False
+        self._lock = threading.Lock()
+
+    def _load_tokenizer(self):
+        """Load Whisper tokenizer."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Run transcription in a background thread.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)
+        except Exception as e:
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode mel frames and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text
+        text_tokens = [t for t in token_ids if t < 50257]
+        if self._tokenizer_type == "hf":
+            text = self.tokenizer.decode(text_tokens)
+        else:
+            text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy decoding loop."""
+        sot_tokens = [50258, 50259, 50359, 50363]
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))
+
+        generated = [next_token]
+        position = len(sot_tokens)
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def find_frame_in_buffer(buf):
+    """Find the first complete mel frame with data header in a byte buffer.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), mel[0..79](320B)] = 344 bytes
+    Returns: (vad_flag, mel_ints, remaining_buf) or (None, None, buf)
+    """
+    while True:
+        idx = buf.find(SOF_MAGIC_BYTES)
+        if idx < 0:
+            if len(buf) > 3:
+                buf = buf[-3:]
+            return None, None, buf
+        end = idx + SOF_FRAME_BYTES
+        if end > len(buf):
+            buf = buf[idx:]
+            return None, None, buf
+        # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy)
+        vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+        # Parse 80 mel coefficients (after 24-byte header)
+        mel_bytes = buf[idx + SOF_NUM_HEADER * 4 : end]
+        mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+        buf = buf[end:]
+        return vad_flag, mel_ints, buf
+
+
+# ---------- Main capture + transcription loop ----------
+
+def run_capture(device, rate, model_path, encoder_device, decoder_device,
+                enable_plot=False):
+    """Main capture loop: ALSA → DSP VAD → buffer speech → Whisper."""
+
+    plotter = MelPlotter() if enable_plot else None
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    cmd = [
+        'arecord', '-D', device, '-f', 'S32_LE', '-c', '2',
+        '-r', str(rate), '-t', 'raw', '--buffer-size', '8192',
+    ]
+
+    print(f"Starting capture: {' '.join(cmd)}")
+    print(f"VAD source: DSP (embedded in stream)")
+    print(f"Silence trigger: {SILENCE_TRIGGER_MS}ms ({SILENCE_TRIGGER_FRAMES} frames)")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    buf = b''
+    read_chunk = SOF_FRAME_BYTES * 4
+    frame_num = 0
+    prev_speech = None
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_counter = 0        # consecutive silence frames after speech
+    was_speaking = False       # True if we have buffered speech frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    try:
+        while True:
+            data = proc.stdout.read(read_chunk)
+            if not data:
+                rc = proc.poll()
+                if rc is not None:
+                    stderr_out = proc.stderr.read().decode(errors='replace')
+                    print(f"\narecord exited with code {rc}")
+                    if stderr_out:
+                        print(f"stderr: {stderr_out}")
+                    break
+                continue
+
+            buf += data
+
+            while True:
+                vad_flag, frame_ints, buf = find_frame_in_buffer(buf)
+                if frame_ints is None:
+                    break
+
+                frame_num += 1
+                mel = decode_mel_frame(frame_ints)
+                speech = vad_flag != 0
+
+                # Print VAD transitions when not plotting
+                if plotter is None and speech != prev_speech:
+                    t = frame_num * 0.01
+                    tag = "SPEECH" if speech else "SILENCE"
+                    print(f"  [{t:7.2f}s] {tag}", flush=True)
+                prev_speech = speech
+
+                # Update plot
+                if plotter is not None:
+                    plotter.update(mel, speech)
+
+                # --- Speech buffering logic ---
+                if speech:
+                    speech_buffer.append(mel.copy())
+                    silence_counter = 0
+                    was_speaking = True
+                else:
+                    if was_speaking:
+                        silence_counter += 1
+                        if silence_counter >= SILENCE_TRIGGER_FRAMES:
+                            n = len(speech_buffer)
+                            duration = n * 0.01
+                            t = frame_num * 0.01
+
+                            if n < MIN_SPEECH_FRAMES:
+                                # Too short — discard
+                                speech_buffer.clear()
+                                silence_counter = 0
+                                was_speaking = False
+                                continue
+
+                            # Silence threshold reached — send to Whisper
+                            print(f"  [{t:7.2f}s] Transcribing {n} frames "
+                                  f"({duration:.1f}s)...", flush=True)
+
+                            if not transcriber.is_busy():
+                                frames_copy = list(speech_buffer)
+                                transcriber.transcribe_async(
+                                    frames_copy, on_transcription)
+                            else:
+                                print(f"  [{t:7.2f}s] (Whisper busy, "
+                                      f"dropping {n} frames)", flush=True)
+
+                            speech_buffer.clear()
+                            silence_counter = 0
+                            was_speaking = False
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        if plotter is not None:
+            try:
+                plt.close(plotter.fig)
+            except Exception:
+                pass
+        print("\n\nCapture stopped.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture with DSP VAD-triggered Whisper transcription")
+    parser.add_argument('--device', '-D', default='hw:0,47',
+                        help='ALSA capture device (default: hw:0,47)')
+    parser.add_argument('--rate', '-r', type=int, default=16000,
+                        help='Sample rate for arecord (default: 16000)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    parser.add_argument('--plot', action='store_true',
+                        help='Show live scrolling mel spectrogram and VAD plot')
+    args = parser.parse_args()
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n")
+    run_capture(args.device, args.rate, args.model, args.encoder_device,
+                args.decoder_device, enable_plot=args.plot)
+
+
+if __name__ == '__main__':
+    main()

From ca7aa07f8fcd9b474bb9f4bd08276382bfd3475f Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Tue, 19 May 2026 19:42:36 +0300
Subject: [PATCH 3/3] Audio: MFCC: Add VAD switch control notification to user
 space

Add IPC4 notification that sends the VAD state to user space via a
switch control whenever the VAD decision changes between speech and
silence. The notification is initialized during prepare and sent from
the audio processing path on VAD state transitions.

The implementation follows the TDFB/sound_dose notification pattern:
mfcc_ipc4.c contains the IPC4-specific notification init and send
functions, while mfcc.c provides weak stubs so IPC3 builds link
without the IPC4 dependencies.

Add handling for SOF_IPC4_SWITCH_CONTROL_PARAM_ID in mfcc_get_config
and mfcc_set_config so the kernel driver can read back the current VAD
state after receiving a notification. The switch control is read-only
for the host: set requests are accepted but ignored by the DSP.

Both the notification init and the VAD state change detection are
gated on the update_controls flag in the configuration blob struct.

Add a switch control (mixer) to the MFCC topology2 widget definition
for the VAD notification.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/mfcc/CMakeLists.txt                 |  3 +
 src/audio/mfcc/mfcc.c                         | 44 ++++++++-
 src/audio/mfcc/mfcc_common.c                  | 19 +++-
 src/audio/mfcc/mfcc_ipc4.c                    | 97 +++++++++++++++++++
 src/include/sof/audio/mfcc/mfcc_comp.h        | 10 ++
 .../intel/sdw-dmic-audio-feature.conf         | 15 +++
 .../intel/sdw-jack-audio-feature.conf         | 15 +++
 7 files changed, 196 insertions(+), 7 deletions(-)
 create mode 100644 src/audio/mfcc/mfcc_ipc4.c

diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index 10daf78aa2a6..274c7aa05eb8 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_dependencies(app mfcc)
 else()
   add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
+  if(CONFIG_IPC_MAJOR_4)
+    add_local_sources(sof mfcc_ipc4.c)
+  endif()
 endif()
diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c
index ea09d919009b..4976ad39a723 100644
--- a/src/audio/mfcc/mfcc.c
+++ b/src/audio/mfcc/mfcc.c
@@ -23,6 +23,7 @@
 #include <ipc/control.h>
 #include <ipc/stream.h>
 #include <ipc/topology.h>
+#include <ipc4/header.h>
 #include <user/mfcc.h>
 #include <user/trace.h>
 #include <rtos/init.h>
@@ -68,6 +69,16 @@ static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format,
  * End of MFCC setup code. Next the standard component methods.
  */
 
+/* Weak no-op stubs for the VAD notification, overridden by mfcc_ipc4.c in IPC4 builds */
+__weak int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	return 0;
+}
+
+__weak void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+}
+
 static int mfcc_init(struct processing_module *mod)
 {
 	struct module_data *md = &mod->priv;
@@ -97,6 +108,7 @@ static int mfcc_free(struct processing_module *mod)
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 
 	comp_info(mod->dev, "entry");
+	ipc_msg_free(cd->msg);
 	mod_data_blob_handler_free(mod, cd->model_handler);
 	mfcc_free_buffers(mod);
 	mod_free(mod, cd);
@@ -109,10 +121,21 @@ static int mfcc_get_config(struct processing_module *mod,
 {
 	struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment;
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_control_msg_payload *ctl;
 
 	comp_info(mod->dev, "entry");
 
-	return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		ctl = (struct sof_ipc4_control_msg_payload *)fragment;
+		if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) {
+			ctl->chanv[0].value = cd->vad_prev ? 1 : 0;
+			*data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]);
+		}
+		return 0;
+	default:
+		return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
+	}
 }
 
 static int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
@@ -124,8 +147,14 @@ static int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
 
 	comp_info(mod->dev, "entry");
 
-	return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
-				  fragment, fragment_size);
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		/* VAD switch is read-only, ignore set requests */
+		return 0;
+	default:
+		return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
+					  fragment, fragment_size);
+	}
 }
 
 static int mfcc_process(struct processing_module *mod,
@@ -198,6 +227,15 @@ static int mfcc_prepare(struct processing_module *mod,
 		goto err;
 	}
 
+	/* Initialize VAD switch control notification if enabled */
+	if (cd->config && cd->config->update_controls) {
+		ret = mfcc_ipc_notification_init(mod);
+		if (ret < 0)
+			goto err;
+
+		cd->vad_prev = false;
+	}
+
 	return 0;
 
 err:
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 97f2cc547f53..2c5b70e94c08 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -29,8 +29,9 @@ LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
  * The main processing function for MFCC
  */
 
-static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd)
+static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd)
 {
+	const struct comp_dev *dev = mod->dev;
 	struct sof_mfcc_config *config = cd->config;
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &state->buf;
@@ -183,6 +184,16 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 		state->header.energy = cd->vad.energy;
 		state->header.noise_energy = cd->vad.noise_energy;
 		state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
+
+		/* Send notification when VAD state changes */
+		if (config->update_controls) {
+			bool vad_now = cd->vad.is_speech;
+
+			if (vad_now != cd->vad_prev) {
+				mfcc_send_vad_notification(mod, vad_now ? 1 : 0);
+				cd->vad_prev = vad_now;
+			}
+		}
 	}
 
 	return cc_count;
@@ -291,7 +302,7 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
 	/* If new output produced, set up pointer into scratch data and mark header pending */
 	if (num_ceps > 0) {
@@ -390,7 +401,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
 	/* If new output produced, set up pointer into scratch data */
 	if (num_ceps > 0) {
@@ -469,7 +480,7 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
 	/* If new output produced, set up pointer into scratch data */
 	if (num_ceps > 0) {
diff --git a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c
new file mode 100644
index 000000000000..c763d2e765bd
--- /dev/null
+++ b/src/audio/mfcc/mfcc_ipc4.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_ipc4.c
+ * \brief IPC4-specific functions for MFCC component.
+ *
+ * Provides VAD switch control notification to user space via the
+ * IPC4 module notification mechanism.
+ */
+
+#include <sof/audio/mfcc/mfcc_comp.h>
+#include <sof/audio/module_adapter/module/generic.h>
+#include <sof/audio/component.h>
+#include <sof/ipc/msg.h>
+#include <sof/trace/trace.h>
+#include <ipc4/base-config.h>
+#include <ipc4/header.h>
+#include <ipc4/module.h>
+#include <ipc4/notification.h>
+#include <rtos/string.h>
+#include <errno.h>
+#include <stdint.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief Initialize IPC notification message for VAD switch control.
+ * \param mod Processing module.
+ * \return 0 on success or if already initialized, -ENOMEM if ipc_msg_w_ext_init() fails.
+ */
+int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct ipc_msg msg_proto;
+	struct comp_dev *dev = mod->dev;
+	union ipc4_notification_header *primary =
+		(union ipc4_notification_header *)&msg_proto.header;
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+	const uint32_t payload_size = sizeof(*msg_payload) + sizeof(msg_payload->chanv[0]);
+
+	/* Called from mfcc_prepare(); reuse an existing message instead of leaking it */
+	if (cd->msg)
+		return 0;
+
+	memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto));
+	primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION;
+	primary->r.type = SOF_IPC4_GLB_NOTIFICATION;
+	primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST;
+	primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG;
+	cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension,
+				     sizeof(*msg_module_data) + payload_size);
+	if (!cd->msg) {
+		comp_err(dev, "Failed to initialize VAD notification");
+		return -ENOMEM;
+	}
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_module_data->instance_id = IPC4_INST_ID(dev->ipc_config.id);
+	msg_module_data->module_id = IPC4_MOD_ID(dev->ipc_config.id);
+	msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL |
+		SOF_IPC4_SWITCH_CONTROL_PARAM_ID;
+	msg_module_data->event_data_size = payload_size;
+
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->id = MFCC_CTRL_INDEX_VAD;
+	msg_payload->num_elems = 1;
+	msg_payload->chanv[0].channel = 0;
+
+	comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x",
+		 msg_module_data->instance_id, msg_module_data->module_id);
+	return 0;
+}
+
+/**
+ * \brief Notify user space of a new VAD state through the switch control.
+ * \param mod Processing module.
+ * \param val VAD value: 1 = speech, 0 = silence.
+ */
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_notify_module_data *notify;
+	struct sof_ipc4_control_msg_payload *ctl;
+
+	/* Send only when mfcc_ipc_notification_init() has set up the message */
+	if (cd->msg) {
+		notify = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+		ctl = (struct sof_ipc4_control_msg_payload *)notify->event_data;
+		ctl->chanv[0].value = val;
+		ipc_msg_send(cd->msg, NULL, false);
+	}
+}
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 80ab3e376487..72177df2dd99 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -13,6 +13,7 @@
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
 #include <sof/audio/mfcc/mfcc_vad.h>
+#include <sof/ipc/msg.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -33,6 +34,9 @@
 #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */
 #define MFCC_FFT_BITS	32
 
+/** \brief Switch control index for VAD notification to user space */
+#define MFCC_CTRL_INDEX_VAD	0
+
 /**
  * \brief Data header prepended to every MFCC output frame.
  *
@@ -135,7 +139,9 @@ struct mfcc_comp_data {
 	struct mfcc_vad_state vad;
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
+	struct ipc_msg *msg;		/**< IPC notification for VAD switch control */
 	int max_frames;
+	bool vad_prev;			/**< Previous VAD state for edge detection */
 	mfcc_func mfcc_func;		/**< processing function */
 };
 
@@ -156,6 +162,10 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int rate, int chan
 
 void mfcc_free_buffers(struct processing_module *mod);
 
+int mfcc_ipc_notification_init(struct processing_module *mod);
+
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val);
+
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length);
 
diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
index 87039b261597..4b7ec2478076 100644
--- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
@@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+					Object.Base.channel.1 {
+						name	"fc"
+						shift	0
+					}
+					Object.Base.ops.1 {
+						name	"ctl"
+						info	"volsw"
+						#259 binds the mixer control to switch get/put handlers
+						get	259
+						put	259
+					}
+					max 1
+				}
 			}
 		}
 	}
diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
index 9645199d6907..019b09911197 100644
--- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
@@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+					Object.Base.channel.1 {
+						name	"fc"
+						shift	0
+					}
+					Object.Base.ops.1 {
+						name	"ctl"
+						info	"volsw"
+						#259 binds the mixer control to switch get/put handlers
+						get	259
+						put	259
+					}
+					max 1
+				}
 			}
 		}
 	}