diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..274c7aa05eb8 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -4,5 +4,8 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
   add_dependencies(app mfcc)
 else()
-  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
+  if(CONFIG_IPC_MAJOR_4)
+    add_local_sources(sof mfcc_ipc4.c)
+  endif()
 endif()
diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c
index ea09d919009b..4976ad39a723 100644
--- a/src/audio/mfcc/mfcc.c
+++ b/src/audio/mfcc/mfcc.c
@@ -23,6 +23,7 @@
 #include <ipc/control.h>
 #include <ipc/stream.h>
 #include <ipc/topology.h>
+#include <ipc4/header.h>
 #include <user/mfcc.h>
 #include <user/trace.h>
 #include <rtos/init.h>
@@ -68,6 +69,16 @@ static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format,
  * End of MFCC setup code. Next the standard component methods.
  */
 
+/* No-op weak defaults; mfcc_ipc4.c (built for CONFIG_IPC_MAJOR_4) overrides both */
+__weak int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	return 0;
+}
+
+__weak void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+}
+
 static int mfcc_init(struct processing_module *mod)
 {
 	struct module_data *md = &mod->priv;
@@ -97,6 +108,7 @@ static int mfcc_free(struct processing_module *mod)
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 
 	comp_info(mod->dev, "entry");
+	ipc_msg_free(cd->msg);
 	mod_data_blob_handler_free(mod, cd->model_handler);
 	mfcc_free_buffers(mod);
 	mod_free(mod, cd);
@@ -109,10 +121,21 @@ static int mfcc_get_config(struct processing_module *mod,
 {
 	struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment;
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_control_msg_payload *ctl;
 
 	comp_info(mod->dev, "entry");
 
-	return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		ctl = (struct sof_ipc4_control_msg_payload *)fragment;
+		if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) {
+			ctl->chanv[0].value = cd->vad_prev ? 1 : 0;
+			*data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]);
+		}
+		return 0;
+	default:
+		return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
+	}
 }
 
 static int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
@@ -124,8 +147,14 @@ static int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
 
 	comp_info(mod->dev, "entry");
 
-	return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
-				  fragment, fragment_size);
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		/* VAD switch is read-only, ignore set requests */
+		return 0;
+	default:
+		return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
+					  fragment, fragment_size);
+	}
 }
 
 static int mfcc_process(struct processing_module *mod,
@@ -198,6 +227,15 @@ static int mfcc_prepare(struct processing_module *mod,
 		goto err;
 	}
 
+	/* Initialize VAD switch control notification if enabled */
+	if (cd->config && cd->config->update_controls) {
+		ret = mfcc_ipc_notification_init(mod);
+		if (ret < 0)
+			goto err;
+
+		cd->vad_prev = false;
+	}
+
 	return 0;
 
 err:
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..2c5b70e94c08 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -21,14 +21,17 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sof/audio/mfcc/mfcc_vad.h>
+
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
 /*
  * The main processing function for MFCC
  */
 
-static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd)
+static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd)
 {
+	const struct comp_dev *dev = mod->dev;
 	struct sof_mfcc_config *config = cd->config;
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &state->buf;
@@ -169,6 +172,28 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 
 			cc_count += state->dct.num_out;
 		}
+
+		/* Run VAD on the mel log spectrum (available in both modes) */
+		if (config->enable_vad)
+			mfcc_vad_update(&cd->vad, state->mel_log_32);
+
+		/* Populate data header for this output frame */
+		state->header.magic = MFCC_MAGIC;
+		state->header.frame_number = cd->vad.frame_count;
+		state->header.reserved = 0;
+		state->header.energy = cd->vad.energy;
+		state->header.noise_energy = cd->vad.noise_energy;
+		state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
+
+		/* Send notification when VAD state changes */
+		if (config->update_controls) {
+			bool vad_now = cd->vad.is_speech;
+
+			if (vad_now != cd->vad_prev) {
+				mfcc_send_vad_notification(mod, vad_now ? 1 : 0);
+				cd->vad_prev = vad_now;
+			}
+		}
 	}
 
 	return cc_count;
@@ -267,9 +292,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int16_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 2;
+	const int num_header_s16 = sizeof(state->header) / sizeof(int16_t);
 	int num_ceps;
 	int sink_samples;
 	int to_copy;
@@ -278,27 +302,29 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
-	/* If new output produced, set up pointer into scratch data and mark magic pending */
+	/* If new output produced, set up pointer into scratch data and mark header pending */
 	if (num_ceps > 0) {
-		if (state->mel_only)
+		if (state->mel_only) {
 			state->out_data_ptr = state->mel_spectra->data;
-		else
+		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
+		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s16) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16,
+						(int16_t *)&state->header);
+		sink_samples -= num_header_s16;
+		state->header_pending = false;
 	}
 
 	/* Write cepstral/mel data from scratch buffer */
@@ -363,9 +389,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
+	const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
 	int num_ceps;
 	int sink_samples;
 	int remain_s32;
@@ -376,7 +401,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
 	/* If new output produced, set up pointer into scratch data */
 	if (num_ceps > 0) {
@@ -391,17 +416,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s32) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
+						(int32_t *)&state->header);
+		sink_samples -= num_header_s32;
+		state->header_pending = false;
 	}
 
 	if (state->mel_only) {
@@ -443,9 +469,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
 	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
+	const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
 	int num_ceps;
 	int sink_samples;
 	int remain_s32;
@@ -455,7 +480,7 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel);
 
 	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
+	num_ceps = mfcc_stft_process(mod, cd);
 
 	/* If new output produced, set up pointer into scratch data */
 	if (num_ceps > 0) {
@@ -466,17 +491,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 		}
 
 		state->out_remain = num_ceps;
-		state->magic_pending = true;
+		state->header_pending = true;
 	}
 
 	/* Write to sink, limited by period size */
 	sink_samples = frames * audio_stream_get_channels(sink);
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
+	/* Write data header first if pending */
+	if (state->header_pending && sink_samples >= num_header_s32) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
+						(int32_t *)&state->header);
+		sink_samples -= num_header_s32;
+		state->header_pending = false;
 	}
 
 	if (state->mel_only) {
diff --git a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c
new file mode 100644
index 000000000000..c763d2e765bd
--- /dev/null
+++ b/src/audio/mfcc/mfcc_ipc4.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_ipc4.c
+ * \brief IPC4-specific functions for MFCC component.
+ *
+ * Provides VAD switch control notification to user space via the
+ * IPC4 module notification mechanism.
+ */
+
+#include <sof/audio/mfcc/mfcc_comp.h>
+#include <sof/audio/module_adapter/module/generic.h>
+#include <sof/audio/component.h>
+#include <sof/ipc/msg.h>
+#include <sof/trace/trace.h>
+#include <ipc4/base-config.h>
+#include <ipc4/header.h>
+#include <ipc4/module.h>
+#include <ipc4/notification.h>
+#include <rtos/string.h>
+#include <errno.h>
+#include <stdint.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief Initialize IPC notification message for VAD switch control.
+ *
+ * Allocates and fills the message once; repeated calls keep the existing one.
+ * \return 0 on success or when already initialized, -ENOMEM if allocation fails.
+ */
+int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct ipc_msg msg_proto;
+	struct comp_dev *dev = mod->dev;
+	union ipc4_notification_header *primary =
+		(union ipc4_notification_header *)&msg_proto.header;
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+
+	if (cd->msg)
+		return 0; /* already set up by an earlier prepare, do not leak it */
+
+	memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto));
+	primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION;
+	primary->r.type = SOF_IPC4_GLB_NOTIFICATION;
+	primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST;
+	primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG;
+	cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension,
+				     sizeof(*msg_module_data) + sizeof(*msg_payload) +
+				     sizeof(msg_payload->chanv[0]));
+	if (!cd->msg) {
+		comp_err(dev, "Failed to initialize VAD notification");
+		return -ENOMEM;
+	}
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_module_data->instance_id = IPC4_INST_ID(dev->ipc_config.id);
+	msg_module_data->module_id = IPC4_MOD_ID(dev->ipc_config.id);
+	msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL |
+		SOF_IPC4_SWITCH_CONTROL_PARAM_ID;
+	msg_module_data->event_data_size = sizeof(*msg_payload) + sizeof(msg_payload->chanv[0]);
+
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->id = MFCC_CTRL_INDEX_VAD;
+	msg_payload->num_elems = 1;
+	msg_payload->chanv[0].channel = 0;
+
+	comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x",
+		 msg_module_data->instance_id, msg_module_data->module_id);
+	return 0;
+}
+
+/**
+ * \brief Send the VAD switch control notification with an updated value.
+ * \param mod Processing module; nothing is sent while its cd->msg is NULL.
+ * \param val Stored in chanv[0].value; callers pass 1 = speech, 0 = silence.
+ */
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+
+	if (!cd->msg)
+		return;
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->chanv[0].value = val;
+	ipc_msg_send(cd->msg, NULL, false);
+}
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..23f07e6aaf68 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sof/audio/mfcc/mfcc_vad.h>
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -332,7 +334,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	 * least fft_hop_size * channels int16_t samples per hop (worst case s16).
 	 * If output exceeds this, data accumulates and will eventually overflow.
 	 */
-	int out_per_hop = max_out_per_hop + 2;
+	int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t);
 	int sink_per_hop = fft->fft_hop_size * channels;
 
 	if (out_per_hop > sink_per_hop) {
@@ -345,11 +347,20 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	/* Set initial state for STFT */
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
-	state->magic_pending = false;
+	state->header_pending = false;
+	memset(&state->header, 0, sizeof(state->header));
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+	if (config->enable_vad) {
+		ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+		if (ret < 0) {
+			comp_err(dev, "Failed VAD init");
+			goto free_lifter;
+		}
+	}
+
 	comp_dbg(dev, "done");
 	return 0;
 
@@ -389,4 +400,6 @@ void mfcc_free_buffers(struct processing_module *mod)
 	mod_free(mod, cd->state.melfb.data);
 	mod_free(mod, cd->state.dct.matrix);
 	mod_free(mod, cd->state.lifter.matrix);
+	mod_free(mod, cd->vad.noise_floor);
+	mod_free(mod, cd->vad.weights);
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..1ac13cf53b88
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/module_adapter/module/module_interface.h>
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stddef.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0).
+ *
+ * Ascending order. From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE	36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	    6,     8,    10,    13,    16,    20,    25,    32,
+	   40,    50,    63,    80,   100,   125,   160,   200,
+	  250,   315,   400,   500,   630,   800,  1000,  1250,
+	 1600,  2000,  2500,  3150,  4000,  5000,  6300,  8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude for a_weight_hz[] at the same index,
+ *        scaled so the peak (at 2500 Hz) maps to INT16_MAX (32767). Original
+ *        dB values converted via 10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	    2,     4,     9,    19,    43,    85,   162,   299,
+	  531,   862,  1382,  2140,  3129,  4370,  6172,  8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045,  9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are linearly interpolated from the A-weighting table at each Mel
+ * bin center frequency, then normalized to Q1.15 so they sum to about 2^15.
+ * A weight that normalizes to 1.0 (single bin) saturates to INT16_MAX.
+ *
+ * \param[out] weights Output weight array.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2));
+	int16_t mel_step = mel_end / (num_mel + 1);
+	int i, j;
+
+	if (!num_mel)
+		return;
+
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to 1.0 */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15)); /* Q1.15, saturated */
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad)
+		return -EINVAL;
+
+	if (num_mel_bins <= 0)
+		return -EINVAL;
+
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Per-bin noise floor; mfcc_vad_update() seeds it from the first frame */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Per-bin Q1.15 weights; on failure release the noise floor allocated above */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/* Track noise floor and speech state from one Mel log frame; returns 1 if speech, else 0 */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t signal_energy = 0;
+	int64_t noise_energy = 0;
+	int64_t energy_delta = 0;
+	int32_t delta, p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted signal energy and noise floor energy.
+	 * weights are Q1.15, mel values are Q9.23
+	 * Products are Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		signal_energy += (int64_t)vad->weights[i] * mel_log[i];
+		noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i];
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23));
+	vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23));
+
+	/* Subtract in 64 bits: the two saturated values can differ by more than INT32_MAX */
+	energy_delta = (int64_t)vad->energy - vad->noise_energy;
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->energy = 0;
+	vad->noise_energy = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+	/* noise_floor is NULL if mfcc_vad_init() failed, yet num_mel_bins is already set */
+	for (i = 0; vad->noise_floor && i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create a configuration blob for the SOF
+MFCC component. It is run in Matlab or Octave with the command
+`setup_mfcc`. The MFCC configuration parameters can be edited in the
+script.
+
+## Testbench
+
+The configuration can be test-run with the testbench. First, the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next, the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
+
+```
+./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
+```
+
+Output files from host testbench:
+
+| File | Content |
+|------|---------|
+| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients |
+| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram |
+
+If the `XTENSA_PATH` environment variable is set, the script also runs
+the Xtensa build of the testbench (via `xt-run`) and produces additional
+output files prefixed with `xt_`:
+
+| File | Content |
+|------|---------|
+| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients |
+| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram |
+
+## Decoding and Plotting
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the `decode_all.m` script:
+
+```matlab
+decode_all
+```
+
+This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and
+`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+```matlab
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
+```
+
+In the example above, the value 13 comes from the configuration script: MFCC
+was set up to output 13 cepstral coefficients from each FFT → Mel → DCT →
+cepstral coefficients computation run.
+
+The 80-band Mel output can be visualized with the command:
+
+```matlab
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+```
+
+## Live Whisper Transcription with DSP VAD
+
+The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`.
+It can be used with the development topologies
+`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and
+`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. By default it captures Mel audio
+features and VAD flags from audio device `hw:0,47` (headset microphone).
+Captured frames with detected speech are sent to the Whisper speech
+recognition model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+<https://docs.openvino.ai/2025/get-started/install-openvino.html>.
+
+The following Python pip installs are needed into the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+By default the script runs the Whisper encoder model on the NPU. To
+use the NPU, install the driver from
+<https://github.com/intel/linux-npu-driver/releases>. If the NPU is not
+available, switch the encoder to the CPU with the run option `--encoder-device CPU`.
+With a GPU, both `--encoder-device GPU` and `--decoder-device GPU` can be set.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
-
-./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
-
-Output files from host testbench:
-  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
-  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
-
-If the XTENSA_PATH environment variable is set, the script also runs
-the Xtensa build of the testbench (via xt-run) and produces additional
-output files prefixed with "xt_":
-  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
-  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
-
-All output files can be decoded and plotted at once in Matlab or Octave
-with the decode_all.m script:
-
-decode_all
-
-This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
-decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
-files that exist including the Xtensa variants.
-
-Individual files can also be decoded manually:
-
-[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
-
-In the above it's known from configuration script that MFCC was set up to
-output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
-coefficients computation run.
-
-The 80 bands Mel output can be visualized with command:
-
-[mel, t, n] = decode_mel('mel_s16.raw', 80);
-
-Other kind of signals have quite big visual difference in audio features. Try
-e.g. other sound files found in computer.
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m
index d5b60289b4cf..23ca07522aae 100644
--- a/src/audio/mfcc/tune/decode_all.m
+++ b/src/audio/mfcc/tune/decode_all.m
@@ -25,7 +25,7 @@
 	fn = all_ceps_files{i};
 	if exist(fn, 'file')
 		fprintf('Decoding MFCC ceps: %s\n', fn);
-		[ceps, t, n] = decode_ceps(fn, num_ceps);
+		[ceps, t, n, vad, energy, noise_energy] = decode_ceps(fn, num_ceps);
 	end
 end
 
@@ -34,6 +34,6 @@
 	fmt = all_mel_fmts{i};
 	if exist(fn, 'file')
 		fprintf('Decoding Mel: %s\n', fn);
-		[mel, t, n] = decode_mel(fn, num_mel, fmt);
+		[mel, t, n, vad, energy, noise_energy] = decode_mel(fn, num_mel, fmt);
 	end
 end
diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m
index a63677fa3731..32a04e8d8df7 100644
--- a/src/audio/mfcc/tune/decode_ceps.m
+++ b/src/audio/mfcc/tune/decode_ceps.m
@@ -1,4 +1,4 @@
-% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels)
 %
 % Input
 %   fn - File with MFCC data in .raw or .wav format
@@ -9,11 +9,16 @@
 %   ceps - cepstral coefficients
 %   t - time vector for plotting
 %   n - ceps 1..num_ceps vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
-% Copyright(c) 2022 Intel Corporation. All rights reserved.
+% Copyright(c) 2022-2026 Intel Corporation. All rights reserved.
 
-function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+function [ceps, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_ceps(fn, num_ceps, num_channels)
 
 if nargin < 3
 	num_channels = 1;
@@ -23,6 +28,7 @@
 fs = 16e3;
 qformat = 7;
 magic = [25443 28006]; % ASCII 'mfcc' as int16
+num_magic = 2; % magic word is 2 x int16
 
 % Load output data
 [data, num_channels] = get_file(fn, num_channels);
@@ -41,17 +47,37 @@
 
 period_ceps = idx(2)-idx(1);
 num_frames = length(idx);
+
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32 (10 int16 slots), followed by num_ceps coefficients.
+payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data
+
+% Last frame can be incomplete due to span over multiple periods
+last = idx(end) + num_magic + payload_len - 1;
+if (last > length(data))
+    num_frames = num_frames - 1;
+end
+
 t_ceps = period_ceps / num_channels / fs;
 t = (0:num_frames -1) * t_ceps;
 n = 1:num_ceps;
 
-ceps = zeros(num_ceps, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
-	i1 = idx(i) + 2;
-	i2 = i1 + num_ceps - 1;
-	ceps(:,i) = data(i1:i2) / 2^qformat;
+	i1 = idx(i) + num_magic;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
+% Reassemble int32 from pairs of int16 (little-endian).
+% Low half must be treated as unsigned with mod() to handle negative int16.
+frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
+% payload(3:4,:) is reserved, skip
+energy = mod(payload(5,:), 65536) + payload(6,:) * 65536;
+noise_energy = mod(payload(7,:), 65536) + payload(8,:) * 65536;
+vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
+ceps = payload(11:payload_len, :) / 2^qformat;
+
 figure;
 surf(t, n, ceps, 'EdgeColor', 'none');
 colormap(jet);
@@ -75,7 +101,7 @@
 	case '.wav'
 		tmp = audioread(fn, 'native');
 		t = whos('tmp');
-		if ~strcmp(t.class, 'int16');
+		if ~strcmp(t.class, 'int16')
 			error('Only 16-bit wav file format is supported');
 		end
 		s = size(tmp);
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..24296b529cbc 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,26 +1,28 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, fmt, num_channels)
 %
 % Input
 %   fn - File with Mel data in .raw or .wav format
 %   num_mel - number of Mel coefficients per frame
 %   fmt - format of the Mel data ('s16', 's24', 's32')
-%   num_channels - needed for .raw format, omit for .wav
+%   num_channels - needed for .raw format, omit for .wav, default 2
 %
 % Outputs
 %   mel - Mel coefficients
 %   t - time vector for plotting
 %   n - mel 1..num_mel vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_mel(fn, num_mel, fmt, num_channels)
 
-if nargin < 3
-	fmt = 's16';
-end
 if nargin < 4
-	num_channels = 1;
+	num_channels = 2;
 end
 
 % MFCC stream
@@ -30,15 +32,15 @@
   case 's16'
     qformat = 7;
     magic = [25443 28006]; % ASCII 'mfcc' as two int16
-    num_magic = 2;
+    num_magic = 2; % magic word is 2 x int16
   case 's24'
     qformat = 15;
     magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
+    num_magic = 1; % magic word is 1 x int32
   case 's32'
     qformat = 23;
     magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
+    num_magic = 1; % magic word is 1 x int32
     otherwise
     error("Use 's16', 's24', or 's32' as format.");
 end
@@ -68,33 +70,77 @@
 period_mel = idx(2)-idx(1);
 num_frames = length(idx);
 
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32, followed by num_mel coefficients.
+% For s16 each int32 occupies 2 int16 slots.
+if strcmp(fmt, 's16')
+	payload_len = 10 + num_mel; % 5 int32 = 10 int16, then mel data
+else
+	payload_len = 5 + num_mel; % frame_number + reserved + energy + noise_energy + vad_flag + mel
+end
+
 % Last frame can be incomplete due to span over multiple periods
-last = idx(end) + num_mel - 1;
+last = idx(end) + num_magic + payload_len - 1;
 if (last > length(data))
     num_frames = num_frames - 1;
 end
 
-t_mel = period_mel / num_channels / fs;
-t = (0:num_frames -1) * t_mel;
-n = 1:num_mel;
-
-mel = zeros(num_mel, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
 	i1 = idx(i) + num_magic;
-	i2 = i1 + num_mel - 1;
-	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
-figure;
+if strcmp(fmt, 's16')
+	% Reassemble int32 from pairs of int16 (little-endian).
+	% Low half must be treated as unsigned with mod() to handle negative int16.
+	frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
+	% payload(3:4,:) is reserved, skip
+	energy = mod(payload(5,:), 65536) + payload(6,:) * 65536;
+	noise_energy = mod(payload(7,:), 65536) + payload(8,:) * 65536;
+	vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
+	mel = payload(11:payload_len, :) / 2^qformat;
+else
+	frame_number = payload(1, :);
+	% payload(2,:) is reserved, skip
+	energy = payload(3, :) / 2^qformat;
+	noise_energy = payload(4, :) / 2^qformat;
+	vad = payload(5, :);
+	mel = payload(6:payload_len, :) / 2^qformat;
+end
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+figure
 imagesc(t, n, mel);
 axis xy;
 colormap(jet);
 colorbar;
 tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
 title(tstr, 'Interpreter', 'None');
-xlabel('Time (s)');
 ylabel('Mel coef #');
 
+figure
+subplot(2,1,1);
+level = sum(mel(:,:));
+plot(t, vad)
+ax = axis();
+axis([ax(1:2) -0.1 1.1]);
+grid on;
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('VAD flag');
+
+subplot(2,1,2);
+plot(t, energy, t, noise_energy);
+grid on;
+legend('Energy', 'Noise Energy');
+xlabel('Time (s)');
+ylabel('Energy');
+
 end
 
 function [data, num_channels] = get_file(fn, num_channels, fmt)
diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m
index bd2b3f11e60b..3cda3221a4fc 100644
--- a/src/audio/mfcc/tune/setup_mfcc.m
+++ b/src/audio/mfcc/tune/setup_mfcc.m
@@ -62,6 +62,9 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % same
 	cfg.mmax_coef = 0; % same
 	cfg.dynamic_mmax = false; % same
+	cfg.enable_vad = false;
+	cfg.enable_dtx = false;
+	cfg.update_controls = false;
 end
 
 function cfg = get_mel_spectrogram_config()
@@ -99,6 +102,9 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db
 	cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max)
 	cfg.dynamic_mmax = true;
+	cfg.enable_vad = true;
+	cfg.enable_dtx = false;
+	cfg.update_controls = true;
 end
 
 function export_mfcc_setup(gen_cfg, cfg)
@@ -107,7 +113,7 @@ function export_mfcc_setup(gen_cfg, cfg)
 addpath([gen_cfg.tools_path 'tune/common']);
 
 %% Blob size, size plus reserved(8) + current parameters
-nbytes_data = 104;
+nbytes_data = 116;
 
 %% Little endian
 sh32 = [0 -8 -16 -24];
@@ -160,6 +166,10 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD
+% reserved16[3]
+for i = 1:3
+	[b8, j] = add_w16b(0, b8, j);
+end
 v = cfg.htk_compat;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.raw_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.remove_dc_offset;                        [b8, j] = add_w8b(v, b8, j); % bool
@@ -168,6 +178,13 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = cfg.subtract_mean;                           [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.use_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.dynamic_mmax;                            [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_vad;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_dtx;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.update_controls;                         [b8, j] = add_w8b(v, b8, j); % bool
+% reserved_bool[5]
+for i = 1:5
+	[b8, j] = add_w8b(0, b8, j);
+end
 
 %% Export
 tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn];
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
new file mode 100644
index 000000000000..33862da283e4
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
@@ -0,0 +1,454 @@
+"""Live SOF mel capture with DSP VAD-triggered Whisper transcription.
+
+Captures mel frames from ALSA with embedded VAD flag from the DSP.
+Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)]
+When silence of 100ms is detected after speech, sends the buffered mel
+features to Whisper (OpenVINO encoder+decoder) for transcription.
+Capture continues running during Whisper inference.
+
+Usage:
+    python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov]
+    python sof_mel_to_text_live_dsp_vad.py --plot  # with live spectrogram
+"""
+
+import argparse
+import os
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# Graphics imports deferred until --plot is used
+matplotlib = None
+plt = None
+
+# SOF mel_s32.raw format constants (with DSP data header)
+SOF_MAGIC_S32 = np.int32(0x6D666363)  # ASCII 'mfcc' as int32
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)
+SOF_NUM_HEADER = 6            # magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point
+SOF_NUM_MEL = 80
+SOF_FRAME_INTS = SOF_NUM_HEADER + SOF_NUM_MEL  # 86 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 344 bytes per frame
+
+# Speech buffering
+SILENCE_TRIGGER_MS = 100     # ms of silence after speech to trigger transcription
+SILENCE_TRIGGER_FRAMES = SILENCE_TRIGGER_MS // 10  # 10 frames at 10ms/frame
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert int32 Q9.23 mel values to float64 mel coefficients."""
+    return raw_ints.astype(np.float64) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Optional scrolling plot ----------
+
+SPECTROGRAM_WIDTH = 100
+
+
+class MelPlotter:
+    """Real-time scrolling mel spectrogram + VAD strip."""
+
+    def __init__(self, num_mel=SOF_NUM_MEL, width=SPECTROGRAM_WIDTH):
+        global matplotlib, plt
+        import matplotlib as _mpl
+        _mpl.use('TkAgg')
+        import matplotlib.pyplot as _plt
+        matplotlib = _mpl
+        plt = _plt
+
+        self.num_mel = num_mel
+        self.width = width
+
+        self.mel_buf = np.zeros((num_mel, width), dtype=np.float64)
+        self.vad_buf = np.zeros(width, dtype=np.float64)
+        self.x = np.arange(width)
+
+        self.fig, (self.ax_mel, self.ax_vad) = plt.subplots(
+            2, 1, figsize=(10, 5),
+            gridspec_kw={'height_ratios': [5, 1]},
+            sharex=True
+        )
+        self.fig.tight_layout(pad=2.0)
+
+        self.im_mel = self.ax_mel.imshow(
+            self.mel_buf, aspect='auto', origin='lower',
+            interpolation='nearest', cmap='turbo',
+            vmin=-2.0, vmax=2.0
+        )
+        self.ax_mel.set_ylabel('Mel bin')
+        self.ax_mel.set_title('Mel Spectrogram (scrolling) — DSP VAD')
+
+        self.line_vad, = self.ax_vad.plot(
+            self.x, self.vad_buf, color='green', linewidth=1.5,
+            drawstyle='steps-post')
+        self.ax_vad.set_ylabel('VAD')
+        self.ax_vad.set_xlabel('Frame')
+        self.ax_vad.set_ylim(-0.1, 1.1)
+        self.ax_vad.set_yticks([0, 1])
+        self.ax_vad.set_yticklabels(['Silent', 'Speech'])
+
+        plt.ion()
+        plt.show(block=False)
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+
+    def update(self, mel_frame, is_speech):
+        self.mel_buf[:, :-1] = self.mel_buf[:, 1:]
+        self.mel_buf[:, -1] = mel_frame
+        self.vad_buf[:-1] = self.vad_buf[1:]
+        self.vad_buf[-1] = 1.0 if is_speech else 0.0
+
+        self.im_mel.set_data(self.mel_buf)
+        self.line_vad.set_ydata(self.vad_buf)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder using OpenVINO, runs in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000]
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False
+        self._lock = threading.Lock()
+
+    def _load_tokenizer(self):
+        """Load Whisper tokenizer."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Run transcription in a background thread.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)
+        except Exception as e:
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode mel frames and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text
+        text_tokens = [t for t in token_ids if t < 50257]
+        if self._tokenizer_type == "hf":
+            text = self.tokenizer.decode(text_tokens)
+        else:
+            text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy decoding loop."""
+        sot_tokens = [50258, 50259, 50359, 50363]
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))
+
+        generated = [next_token]
+        position = len(sot_tokens)
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def find_frame_in_buffer(buf):
+    """Find the first complete mel frame with data header in a byte buffer.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), mel[0..79](320B)] = 344 bytes
+    Returns: (vad_flag, mel_ints, remaining_buf) or (None, None, buf)
+    """
+    while True:
+        idx = buf.find(SOF_MAGIC_BYTES)
+        if idx < 0:
+            if len(buf) > 3:
+                buf = buf[-3:]
+            return None, None, buf
+        end = idx + SOF_FRAME_BYTES
+        if end > len(buf):
+            buf = buf[idx:]
+            return None, None, buf
+        # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy)
+        vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+        # Parse 80 mel coefficients (after 24-byte header)
+        mel_bytes = buf[idx + SOF_NUM_HEADER * 4 : end]
+        mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+        buf = buf[end:]
+        return vad_flag, mel_ints, buf
+
+
+# ---------- Main capture + transcription loop ----------
+
+def run_capture(device, rate, model_path, encoder_device, decoder_device,
+                enable_plot=False):
+    """Main capture loop: ALSA → DSP VAD → buffer speech → Whisper."""
+
+    plotter = MelPlotter() if enable_plot else None
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    cmd = [
+        'arecord', '-D', device, '-f', 'S32_LE', '-c', '2',
+        '-r', str(rate), '-t', 'raw', '--buffer-size', '8192',
+    ]
+
+    print(f"Starting capture: {' '.join(cmd)}")
+    print(f"VAD source: DSP (embedded in stream)")
+    print(f"Silence trigger: {SILENCE_TRIGGER_MS}ms ({SILENCE_TRIGGER_FRAMES} frames)")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    buf = b''
+    read_chunk = SOF_FRAME_BYTES * 4
+    frame_num = 0
+    prev_speech = None
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_counter = 0        # consecutive silence frames after speech
+    was_speaking = False       # True if we have buffered speech frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    try:
+        while True:
+            data = proc.stdout.read(read_chunk)
+            if not data:
+                rc = proc.poll()
+                if rc is not None:
+                    stderr_out = proc.stderr.read().decode(errors='replace')
+                    print(f"\narecord exited with code {rc}")
+                    if stderr_out:
+                        print(f"stderr: {stderr_out}")
+                    break
+                continue
+
+            buf += data
+
+            while True:
+                vad_flag, frame_ints, buf = find_frame_in_buffer(buf)
+                if frame_ints is None:
+                    break
+
+                frame_num += 1
+                mel = decode_mel_frame(frame_ints)
+                speech = vad_flag != 0
+
+                # Print VAD transitions when not plotting
+                if plotter is None and speech != prev_speech:
+                    t = frame_num * 0.01
+                    tag = "SPEECH" if speech else "SILENCE"
+                    print(f"  [{t:7.2f}s] {tag}", flush=True)
+                prev_speech = speech
+
+                # Update plot
+                if plotter is not None:
+                    plotter.update(mel, speech)
+
+                # --- Speech buffering logic ---
+                if speech:
+                    speech_buffer.append(mel.copy())
+                    silence_counter = 0
+                    was_speaking = True
+                else:
+                    if was_speaking:
+                        silence_counter += 1
+                        if silence_counter >= SILENCE_TRIGGER_FRAMES:
+                            n = len(speech_buffer)
+                            duration = n * 0.01
+                            t = frame_num * 0.01
+
+                            if n < MIN_SPEECH_FRAMES:
+                                # Too short — discard
+                                speech_buffer.clear()
+                                silence_counter = 0
+                                was_speaking = False
+                                continue
+
+                            # Silence threshold reached — send to Whisper
+                            print(f"  [{t:7.2f}s] Transcribing {n} frames "
+                                  f"({duration:.1f}s)...", flush=True)
+
+                            if not transcriber.is_busy():
+                                frames_copy = list(speech_buffer)
+                                transcriber.transcribe_async(
+                                    frames_copy, on_transcription)
+                            else:
+                                print(f"  [{t:7.2f}s] (Whisper busy, "
+                                      f"dropping {n} frames)", flush=True)
+
+                            speech_buffer.clear()
+                            silence_counter = 0
+                            was_speaking = False
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        if plotter is not None:
+            try:
+                plt.close(plotter.fig)
+            except Exception:
+                pass
+        print("\n\nCapture stopped.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture with DSP VAD-triggered Whisper transcription")
+    parser.add_argument('--device', '-D', default='hw:0,47',
+                        help='ALSA capture device (default: hw:0,47)')
+    parser.add_argument('--rate', '-r', type=int, default=16000,
+                        help='Sample rate for arecord (default: 16000)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    parser.add_argument('--plot', action='store_true',
+                        help='Show live scrolling mel spectrogram and VAD plot')
+    args = parser.parse_args()
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n")
+    run_capture(args.device, args.rate, args.model, args.encoder_device,
+                args.decoder_device, enable_plot=args.plot)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..72177df2dd99 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -12,6 +12,8 @@
 #include <sof/math/auditory.h>
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
+#include <sof/audio/mfcc/mfcc_vad.h>
+#include <sof/ipc/msg.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -32,6 +34,24 @@
 #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */
 #define MFCC_FFT_BITS	32
 
+/** \brief Switch control index for VAD notification to user space */
+#define MFCC_CTRL_INDEX_VAD	0
+
+/**
+ * \brief Data header prepended to every MFCC output frame.
+ *
+ * Written before the Mel spectrum or cepstral coefficient data in each
+ * output frame.  All six fields are 32 bits wide so the header is 24 bytes.
+ */
+struct mfcc_data_header {
+	uint32_t magic;        /**< Magic word MFCC_MAGIC (0x6d666363) */
+	uint32_t frame_number; /**< Frame number, counting calculated frames starting from 0 */
+	int32_t reserved;	 /**< Reserved for future use, set to 0 */
+	int32_t energy;       /**< Weighted signal energy in Q9.23 */
+	int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */
+	int32_t vad_flag;     /**< VAD decision: 1 = speech, 0 = silence */
+};
+
 /** \brief Type definition for processing function select return value. */
 typedef void (*mfcc_func)(struct processing_module *mod,
 			  struct input_stream_buffer *bsource,
@@ -105,7 +125,8 @@ struct mfcc_state {
 	bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
-	bool magic_pending; /**< True when magic word not yet written for current output */
+	bool header_pending; /**< True when data header not yet written for current output */
+	struct mfcc_data_header header; /**< Data header for current output frame */
 	size_t sample_buffers_size; /**< bytes */
 	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
 	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
@@ -115,9 +136,12 @@ struct mfcc_state {
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+	struct mfcc_vad_state vad;
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
+	struct ipc_msg *msg;		/**< IPC notification for VAD switch control */
 	int max_frames;
+	bool vad_prev;			/**< Previous VAD state for edge detection */
 	mfcc_func mfcc_func;		/**< processing function */
 };
 
@@ -138,6 +162,10 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int rate, int chan
 
 void mfcc_free_buffers(struct processing_module *mod);
 
+int mfcc_ipc_notification_init(struct processing_module *mod);
+
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val);
+
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length);
 
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..e12dd7e31e80
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct processing_module;
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES	100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0020 * 2^15 = 66).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA	66
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 2^15 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST	1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD	2936013
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES	20
+
+/**
+ * \brief VAD state structure.
+ */
+struct mfcc_vad_state {
+	int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */
+	int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Number of initial frames for fast convergence */
+	int32_t frame_count; /**< Total frames processed */
+	int32_t energy; /**< Weighted signal energy in Q9.23 */
+	int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \param[in] mod Processing module for memory allocation.
+ * \return 0 on success, negative error code on failure.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected, 0 if silence.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */
diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h
index 8a0defcd9883..a2f3717daa52 100644
--- a/src/include/user/mfcc.h
+++ b/src/include/user/mfcc.h
@@ -77,6 +77,7 @@ struct sof_mfcc_config {
 	int16_t vtln_high; /**< Reserved, no support */
 	int16_t vtln_low; /**< Reserved, no support */
 	int16_t vtln_warp; /**< Reserved, no support */
+	int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */
 	bool htk_compat; /**< Must be false */
 	bool raw_energy; /**< Reserved, no support */
 	bool remove_dc_offset; /**< Reserved, no support */
@@ -85,8 +86,10 @@ struct sof_mfcc_config {
 	bool subtract_mean; /**< Must be false (0) */
 	bool use_energy; /**< Must be false (0) */
 	bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */
-	bool reserved_bool2;
-	bool reserved_bool3;
+	bool enable_vad; /**< Run VAD algorithm */
+	bool enable_dtx; /**< Reserved (stream non-speech frames once per second) */
+	bool update_controls; /**< Update controls with VAD decision */
+	bool reserved_bool[5]; /**< Reserved for future boolean flags, set to false (0) */
 } __attribute__((packed));
 
 #endif /* __USER_MFCC_H__ */
diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf
index 42a6d6608b8b..3bbd72696806 100644
--- a/tools/topology/topology2/include/components/mfcc/default.conf
+++ b/tools/topology/topology2/include/components/mfcc/default.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 19-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" {
 		0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00,
 		0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00,
 		0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
-		0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00"
 }
diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf
index 04aa2a15c660..480725c2d24f 100644
--- a/tools/topology/topology2/include/components/mfcc/mel80.conf
+++ b/tools/topology/topology2/include/components/mfcc/mel80.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 19-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" {
 		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
 		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+		0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00,
+		0x00,0x00,0x00,0x00"
 }
diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
index 87039b261597..4b7ec2478076 100644
--- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
@@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+					Object.Base.channel.1 {
+						name	"fc"
+						shift	0
+					}
+					Object.Base.ops.1 {
+						name	"ctl"
+						info	"volsw"
+						#259 binds the mixer control to switch get/put handlers
+						get	259
+						put	259
+					}
+					max 1
+				}
 			}
 		}
 	}
diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
index 9645199d6907..019b09911197 100644
--- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
@@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+					Object.Base.channel.1 {
+						name	"fc"
+						shift	0
+					}
+					Object.Base.ops.1 {
+						name	"ctl"
+						info	"volsw"
+						#259 binds the mixer control to switch get/put handlers
+						get	259
+						put	259
+					}
+					max 1
+				}
 			}
 		}
 	}