diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index f8af79d1ca8a..274c7aa05eb8 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -4,5 +4,8 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext) add_dependencies(app mfcc) else() - add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c) + add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c) + if(CONFIG_IPC_MAJOR_4) + add_local_sources(sof mfcc_ipc4.c) + endif() endif() diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index ea09d919009b..4976ad39a723 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,16 @@ static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format, * End of MFCC setup code. Next the standard component methods. 
*/ +/* Weak stubs for IPC notification, overridden by mfcc_ipc4.c */ +__weak int mfcc_ipc_notification_init(struct processing_module *mod) +{ + return 0; +} + +__weak void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ +} + static int mfcc_init(struct processing_module *mod) { struct module_data *md = &mod->priv; @@ -97,6 +108,7 @@ static int mfcc_free(struct processing_module *mod) struct mfcc_comp_data *cd = module_get_private_data(mod); comp_info(mod->dev, "entry"); + ipc_msg_free(cd->msg); mod_data_blob_handler_free(mod, cd->model_handler); mfcc_free_buffers(mod); mod_free(mod, cd); @@ -109,10 +121,21 @@ static int mfcc_get_config(struct processing_module *mod, { struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_control_msg_payload *ctl; comp_info(mod->dev, "entry"); - return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + ctl = (struct sof_ipc4_control_msg_payload *)fragment; + if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) { + ctl->chanv[0].value = cd->vad_prev ? 
1 : 0; + *data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]); + } + return 0; + default: + return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); + } } static int mfcc_set_config(struct processing_module *mod, uint32_t config_id, @@ -124,8 +147,14 @@ static int mfcc_set_config(struct processing_module *mod, uint32_t config_id, comp_info(mod->dev, "entry"); - return comp_data_blob_set(cd->model_handler, pos, data_offset_size, - fragment, fragment_size); + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + /* VAD switch is read-only, ignore set requests */ + return 0; + default: + return comp_data_blob_set(cd->model_handler, pos, data_offset_size, + fragment, fragment_size); + } } static int mfcc_process(struct processing_module *mod, @@ -198,6 +227,15 @@ static int mfcc_prepare(struct processing_module *mod, goto err; } + /* Initialize VAD switch control notification if enabled */ + if (cd->config && cd->config->update_controls) { + ret = mfcc_ipc_notification_init(mod); + if (ret < 0) + goto err; + + cd->vad_prev = false; + } + return 0; err: diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 1079864e9259..2c5b70e94c08 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -21,14 +21,17 @@ #include #include +#include + LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); /* * The main processing function for MFCC */ -static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd) +static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd) { + const struct comp_dev *dev = mod->dev; struct sof_mfcc_config *config = cd->config; struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &state->buf; @@ -169,6 +172,28 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * cc_count += state->dct.num_out; } + + /* Run VAD on the mel log spectrum (available in both modes) */ + if (config->enable_vad) + 
mfcc_vad_update(&cd->vad, state->mel_log_32); + + /* Populate data header for this output frame */ + state->header.magic = MFCC_MAGIC; + state->header.frame_number = cd->vad.frame_count; + state->header.reserved = 0; + state->header.energy = cd->vad.energy; + state->header.noise_energy = cd->vad.noise_energy; + state->header.vad_flag = cd->vad.is_speech ? 1 : 0; + + /* Send notification when VAD state changes */ + if (config->update_controls) { + bool vad_now = cd->vad.is_speech; + + if (vad_now != cd->vad_prev) { + mfcc_send_vad_notification(mod, vad_now ? 1 : 0); + cd->vad_prev = vad_now; + } + } } return cc_count; @@ -267,9 +292,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; int16_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 2; + const int num_header_s16 = sizeof(state->header) / sizeof(int16_t); int num_ceps; int sink_samples; int to_copy; @@ -278,27 +302,29 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT: Mel auditory filter and DCT. 
*/ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = mfcc_stft_process(mod, cd); - /* If new output produced, set up pointer into scratch data and mark magic pending */ + /* If new output produced, set up pointer into scratch data and mark header pending */ if (num_ceps > 0) { - if (state->mel_only) + if (state->mel_only) { state->out_data_ptr = state->mel_spectra->data; - else + } else { state->out_data_ptr = state->cepstral_coef->data; + } state->out_remain = num_ceps; - state->magic_pending = true; + state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending && sink_samples >= num_header_s16) { + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16, + (int16_t *)&state->header); + sink_samples -= num_header_s16; + state->header_pending = false; } /* Write cepstral/mel data from scratch buffer */ @@ -363,9 +389,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ + const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); int num_ceps; int sink_samples; int remain_s32; @@ -376,7 +401,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = 
mfcc_stft_process(mod, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { @@ -391,17 +416,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer } state->out_remain = num_ceps; - state->magic_pending = true; + state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending && sink_samples >= num_header_s32) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, + (int32_t *)&state->header); + sink_samples -= num_header_s32; + state->header_pending = false; } if (state->mel_only) { @@ -443,9 +469,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ + const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); int num_ceps; int sink_samples; int remain_s32; @@ -455,7 +480,7 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = mfcc_stft_process(mod, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { @@ -466,17 +491,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer } state->out_remain = num_ceps; - state->magic_pending = true; + 
state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending && sink_samples >= num_header_s32) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, + (int32_t *)&state->header); + sink_samples -= num_header_s32; + state->header_pending = false; } if (state->mel_only) { diff --git a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c new file mode 100644 index 000000000000..c763d2e765bd --- /dev/null +++ b/src/audio/mfcc/mfcc_ipc4.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_ipc4.c + * \brief IPC4-specific functions for MFCC component. + * + * Provides VAD switch control notification to user space via the + * IPC4 module notification mechanism. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief Initialize IPC notification message for VAD switch control. + * + * Allocates and configures the IPC message used to send VAD state + * change notifications to user space via a switch control. 
+ *
+ * The message is created only once. mfcc_prepare() calls this function each
+ * time the component is prepared while the message is released only in
+ * mfcc_free(), so a repeated call reuses the existing message instead of
+ * overwriting the pointer and leaking the earlier allocation.
+ *
+ * \param mod Processing module.
+ * \return 0 on success or when the message already exists, -ENOMEM if the
+ *	   message allocation fails.
+ */
+int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct ipc_msg msg_proto;
+	struct comp_dev *dev = mod->dev;
+	struct comp_ipc_config *ipc_config = &dev->ipc_config;
+	union ipc4_notification_header *primary =
+		(union ipc4_notification_header *)&msg_proto.header;
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+
+	/*
+	 * Already initialized by an earlier prepare. All fields written below
+	 * are constant for the life time of the component (the value field is
+	 * set per notification in mfcc_send_vad_notification()), so the
+	 * existing message can be reused as is.
+	 * NOTE(review): relies on cd->msg being NULL until the first call, the
+	 * same assumption the unconditional ipc_msg_free(cd->msg) in
+	 * mfcc_free() makes.
+	 */
+	if (cd->msg)
+		return 0;
+
+	/* Build the header prototype for a firmware generated module notification */
+	memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto));
+	primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION;
+	primary->r.type = SOF_IPC4_GLB_NOTIFICATION;
+	primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST;
+	primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG;
+
+	/* Payload: module notification header + control payload + one channel value */
+	cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension,
+				     sizeof(struct sof_ipc4_notify_module_data) +
+				     sizeof(struct sof_ipc4_control_msg_payload) +
+				     sizeof(struct sof_ipc4_ctrl_value_chan));
+	if (!cd->msg) {
+		comp_err(dev, "Failed to initialize VAD notification");
+		return -ENOMEM;
+	}
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_module_data->instance_id = IPC4_INST_ID(ipc_config->id);
+	msg_module_data->module_id = IPC4_MOD_ID(ipc_config->id);
+	msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL |
+		SOF_IPC4_SWITCH_CONTROL_PARAM_ID;
+	msg_module_data->event_data_size = sizeof(struct sof_ipc4_control_msg_payload) +
+		sizeof(struct sof_ipc4_ctrl_value_chan);
+
+	/* Single-element switch control, the value is filled in when sending */
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->id = MFCC_CTRL_INDEX_VAD;
+	msg_payload->num_elems = 1;
+	msg_payload->chanv[0].channel = 0;
+
+	comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x",
+		 msg_module_data->instance_id, msg_module_data->module_id);
+	return 0;
+}
+
+/**
+ * \brief Send VAD switch control notification to user space.
+ * \param mod Processing module.
+ * \param val VAD value: 1 = speech, 0 = silence.
+ */ +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_notify_module_data *msg_module_data; + struct sof_ipc4_control_msg_payload *msg_payload; + + if (!cd->msg) + return; + + msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data; + msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data; + msg_payload->chanv[0].value = val; + ipc_msg_send(cd->msg, NULL, false); +} diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 1cad4b2b984e..23f07e6aaf68 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -18,6 +18,8 @@ #include #include +#include + /* Definitions for cepstral lifter */ #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23) #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23) @@ -332,7 +334,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i * least fft_hop_size * channels int16_t samples per hop (worst case s16). * If output exceeds this, data accumulates and will eventually overflow. 
*/ - int out_per_hop = max_out_per_hop + 2; + int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t); int sink_per_hop = fft->fft_hop_size * channels; if (out_per_hop > sink_per_hop) { @@ -345,11 +347,20 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Set initial state for STFT */ state->waiting_fill = true; state->prev_samples_valid = false; - state->magic_pending = false; + state->header_pending = false; + memset(&state->header, 0, sizeof(state->header)); state->out_data_ptr = NULL; state->out_data_ptr_32 = NULL; state->out_remain = 0; + if (config->enable_vad) { + ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod); + if (ret < 0) { + comp_err(dev, "Failed VAD init"); + goto free_lifter; + } + } + comp_dbg(dev, "done"); return 0; @@ -389,4 +400,6 @@ void mfcc_free_buffers(struct processing_module *mod) mod_free(mod, cd->state.melfb.data); mod_free(mod, cd->state.dct.matrix); mod_free(mod, cd->state.lifter.matrix); + mod_free(mod, cd->vad.noise_floor); + mod_free(mod, cd->vad.weights); } diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c new file mode 100644 index 000000000000..1ac13cf53b88 --- /dev/null +++ b/src/audio/mfcc/mfcc_vad.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_vad.c + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * Implements a VAD that tracks per-bin noise floor and computes a + * speech-frequency weighted energy above the floor. Speech is declared + * when the weighted delta exceeds a threshold, with hangover to prevent + * rapid toggling. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0). 
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ *
+ * The entries are in ascending order and entry k belongs together with
+ * a_weight_lin[k]. mfcc_vad_build_weights() depends on both properties when
+ * it searches for the interval around a frequency and interpolates.
+ */
+#define A_WEIGHT_TABLE_SIZE 36 /* number of entries in both tables below */
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	6, 8, 10, 13, 16, 20, 25, 32,
+	40, 50, 63, 80, 100, 125, 160, 200,
+	250, 315, 400, 500, 630, 800, 1000, 1250,
+	1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ * to INT16_MAX (32767). Original dB values converted via
+ * 10^(dB/20) then scaled by 32767 / max.
+ *
+ * Entry k is the weight for the frequency a_weight_hz[k].
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	2, 4, 9, 19, 43, 85, 162, 299,
+	531, 862, 1382, 2140, 3129, 4370, 6172, 8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045, 9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency. Output weights are in Q1.15 and
+ * sum to approximately 2^15.
+ *
+ * \param[out] weights Output weight array.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_end, mel_step;
+	int i, j;
+
+	/* Check the bin count before it is used as a divisor below */
+	if (num_mel <= 0)
+		return;
+
+	/* NOTE(review): the int16_t cast limits sample_rate / 2 to 32767 Hz,
+	 * confirm that higher sample rates are rejected before this point.
+	 */
+	mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2));
+	mel_step = mel_end / (num_mel + 1);
+
+	for (i = 0; i < num_mel; i++) {
+		/* Bin centers are evenly spaced on the Mel scale below Nyquist */
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1].
+			 * The loop ends at the last interval at the latest, so
+			 * j + 1 is always a valid index.
+			 */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to 1.0. The smallest table weight is
+	 * positive, so sum is never zero here.
+	 */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+
+		/* Round to Q1.15. A lone Mel bin normalizes to exactly 1.0
+		 * (65536 in Q1.16), which rounds to 32768 and would wrap to
+		 * -32768 in int16_t, so saturate instead of casting.
+		 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15));
+	}
+}
+
+/**
+ * \brief Set the VAD defaults and allocate the per-bin noise floor and weights.
+ *
+ * \param vad VAD state to initialize.
+ * \param num_mel_bins Number of Mel bins per frame, must be positive.
+ * \param sample_rate Sample rate in Hz, used to place the Mel bin centers.
+ * \param mod Processing module that owns the allocations.
+ * \return 0 on success, -EINVAL for a NULL state or a non-positive bin count,
+ *	   -ENOMEM if an allocation fails.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad)
+		return -EINVAL;
+
+	if (num_mel_bins <= 0)
+		return -EINVAL;
+
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Allocate per-bin noise floor */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Allocate and compute per-bin weights */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/**
+ * \brief Update the noise floor and the speech decision with one Mel frame.
+ *
+ * \param vad VAD state set up by mfcc_vad_init().
+ * \param mel_log Log Mel spectrum with vad->num_mel_bins values. The
+ *		  fixed-point comments below treat the values as Q9.23.
+ * \return 1 when the frame is classified as speech (hangover included),
+ *	   0 otherwise or when the state or input is not usable.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t signal_energy = 0;
+	int64_t noise_energy = 0;
+	int64_t energy_delta = 0;
+	int32_t delta;
+	int32_t p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	/* mfcc_vad_init() sets num_mel_bins before it allocates, so a failed
+	 * init leaves a bin count with NULL buffers behind.
+	 */
+	if (!vad->noise_floor || !vad->weights)
+		return 0;
+
+	vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted signal energy and noise floor energy.
+	 * weights are Q1.15, mel values are Q9.23
+	 * Products are Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		signal_energy += (int64_t)vad->weights[i] * mel_log[i];
+		noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i];
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23));
+	vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23));
+
+	/* Widen before subtracting so the difference of two saturated 32-bit
+	 * values cannot overflow.
+	 */
+	energy_delta = (int64_t)vad->energy - vad->noise_energy;
+
+	/* Speech when the energy above the noise floor exceeds the threshold.
+	 * The hangover keeps the decision for hangover_max more frames to
+	 * prevent rapid toggling.
+	 */
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+/**
+ * \brief Return the VAD to its state before the first frame.
+ *
+ * Clears counters, energies and the decision, and zeroes the noise floor so
+ * that the next mfcc_vad_update() re-seeds it. Thresholds, weights and the
+ * allocations are kept.
+ *
+ * \param vad VAD state, NULL is ignored.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->energy = 0;
+	vad->noise_energy = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* The buffer is missing if mfcc_vad_init() failed after setting num_mel_bins */
+	if (!vad->noise_floor)
+		return;
+
+	for (i = 0; i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create configuration blob for SOF
+MFCC component. It's simply run in Matlab or Octave with command
+`setup_mfcc`. The MFCC configuration parameters can be edited from the
+script.
+
+## Testbench
+
+The configuration can be test run with testbench. First the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`.
The script converts the input to raw 16 kHz +stereo format and runs the testbench for S16, S24, and S32 bit depths, +producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. + +``` +./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav +``` + +Output files from host testbench: + +| File | Content | +|------|---------| +| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients | +| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram | + +If the `XTENSA_PATH` environment variable is set, the script also runs +the Xtensa build of the testbench (via `xt-run`) and produces additional +output files prefixed with `xt_`: + +| File | Content | +|------|---------| +| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients | +| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram | + +## Decoding and Plotting + +All output files can be decoded and plotted at once in Matlab or Octave +with the `decode_all.m` script: + +```matlab +decode_all +``` + +This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and +`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all +files that exist including the Xtensa variants. + +Individual files can also be decoded manually: + +```matlab +[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); +``` + +In the above it's known from configuration script that MFCC was set up to +output 13 cepstral coefficients from each FFT → Mel → DCT → Cepstral +coefficients computation run. + +The 80 bands Mel output can be visualized with command: + +```matlab +[mel, t, n] = decode_mel('mel_s16.raw', 80); +``` + +## Live Whisper Transcription with DSP VAD + +The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`. +It can be used with development topologies +`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and +`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. 
It captures Mel audio features
+and VAD flags from the default audio device `hw:0,47` (headset microphone).
+The captured frames with detected speech are sent to the Whisper speech
+recognition model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+.
+
+The following Python pip installs are needed in the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+The script by default runs the Whisper encoder model on the NPU. To
+use the NPU, install the driver from
+. If the NPU is not
+available, change the encoder to CPU with run option `--encoder-device CPU`.
+With a GPU both `--encoder-device GPU` and `--decoder-device GPU` can be set.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
- -./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav - -Output files from host testbench: - mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw - cepstral coefficients - mel_s16.raw, mel_s24.raw, mel_s32.raw - Mel spectrogram - -If the XTENSA_PATH environment variable is set, the script also runs -the Xtensa build of the testbench (via xt-run) and produces additional -output files prefixed with "xt_": - xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw - xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw - -All output files can be decoded and plotted at once in Matlab or Octave -with the decode_all.m script: - -decode_all - -This calls decode_ceps for each MFCC file (13 cepstral coefficients) and -decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all -files that exist including the Xtensa variants. - -Individual files can also be decoded manually: - -[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); - -In the above it's known from configuration script that MFCC was set up to -output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral -coefficients computation run. - -The 80 bands Mel output can be visualized with command: - -[mel, t, n] = decode_mel('mel_s16.raw', 80); - -Other kind of signals have quite big visual difference in audio features. Try -e.g. other sound files found in computer. 
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m
index d5b60289b4cf..23ca07522aae 100644
--- a/src/audio/mfcc/tune/decode_all.m
+++ b/src/audio/mfcc/tune/decode_all.m
@@ -25,7 +25,8 @@
 	fn = all_ceps_files{i};
 	if exist(fn, 'file')
 		fprintf('Decoding MFCC ceps: %s\n', fn);
-		[ceps, t, n] = decode_ceps(fn, num_ceps);
+		% Outputs are positional, keep the order of decode_ceps(): vad, energy, noise_energy
+		[ceps, t, n, vad, energy, noise_energy] = decode_ceps(fn, num_ceps);
 	end
 end
 
@@ -34,6 +35,7 @@
 	fmt = all_mel_fmts{i};
 	if exist(fn, 'file')
 		fprintf('Decoding Mel: %s\n', fn);
-		[mel, t, n] = decode_mel(fn, num_mel, fmt);
+		% Outputs are positional, keep the order of decode_mel(): vad, energy, noise_energy
+		[mel, t, n, vad, energy, noise_energy] = decode_mel(fn, num_mel, fmt);
 	end
 end
diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m
index a63677fa3731..32a04e8d8df7 100644
--- a/src/audio/mfcc/tune/decode_ceps.m
+++ b/src/audio/mfcc/tune/decode_ceps.m
@@ -1,4 +1,4 @@
-% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels)
 %
 % Input
 % fn - File with MFCC data in .raw or .wav format
@@ -9,11 +9,16 @@
 % ceps - cepstral coefficients
 % t - time vector for plotting
 % n - ceps 1..num_ceps vector for plotting
+% vad - VAD flag per frame from DSP
+% energy - weighted signal energy per frame from DSP
+% noise_energy - weighted noise floor energy per frame from DSP
+% frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
-% Copyright(c) 2022 Intel Corporation. All rights reserved.
+% Copyright(c) 2022-2026 Intel Corporation. All rights reserved.
 
-function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+function [ceps, t, n, vad, energy, noise_energy, frame_number] = ...
+ decode_ceps(fn, num_ceps, num_channels) if nargin < 3 num_channels = 1; @@ -23,6 +28,7 @@ fs = 16e3; qformat = 7; magic = [25443 28006]; % ASCII 'mfcc' as int16 +num_magic = 2; % magic word is 2 x int16 % Load output data [data, num_channels] = get_file(fn, num_channels); @@ -41,17 +47,37 @@ period_ceps = idx(2)-idx(1); num_frames = length(idx); + +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32 (10 int16 slots), followed by num_ceps coefficients. +payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data + +% Last frame can be incomplete due to span over multiple periods +last = idx(end) + num_magic + payload_len - 1; +if (last > length(data)) + num_frames = num_frames - 1; +end + t_ceps = period_ceps / num_channels / fs; t = (0:num_frames -1) * t_ceps; n = 1:num_ceps; -ceps = zeros(num_ceps, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames - i1 = idx(i) + 2; - i2 = i1 + num_ceps - 1; - ceps(:,i) = data(i1:i2) / 2^qformat; + i1 = idx(i) + num_magic; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end +% Reassemble int32 from pairs of int16 (little-endian). +% Low half must be treated as unsigned with mod() to handle negative int16. 
+frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
+% payload(3:4,:) is reserved, skip. The DSP VAD writes both energies as Q9.23.
+energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; % Q9.23
+noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; % Q9.23
+vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
+ceps = payload(11:payload_len, :) / 2^qformat;
+
 figure;
 surf(t, n, ceps, 'EdgeColor', 'none');
 colormap(jet);
@@ -75,7 +101,7 @@
 	case '.wav'
 		tmp = audioread(fn, 'native');
 		t = whos('tmp');
-		if ~strcmp(t.class, 'int16');
+		if ~strcmp(t.class, 'int16')
 			error('Only 16-bit wav file format is supported');
 		end
 		s = size(tmp);
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..24296b529cbc 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,26 +1,28 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, fmt, num_channels)
 %
 % Input
 % fn - File with Mel data in .raw or .wav format
 % num_mel - number of Mel coefficients per frame
 % fmt - format of the Mel data ('s16', 's24', 's32')
-% num_channels - needed for .raw format, omit for .wav
+% num_channels - needed for .raw format, omit for .wav, default 2
 %
 % Outputs
 % mel - Mel coefficients
 % t - time vector for plotting
 % n - mel 1..num_mel vector for plotting
+% vad - VAD flag per frame from DSP
+% energy - weighted signal energy per frame from DSP
+% noise_energy - weighted noise floor energy per frame from DSP
+% frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad, energy, noise_energy, frame_number] = ...
+ decode_mel(fn, num_mel, fmt, num_channels) -if nargin < 3 - fmt = 's16'; -end if nargin < 4 - num_channels = 1; + num_channels = 2; end % MFCC stream @@ -30,15 +32,15 @@ case 's16' qformat = 7; magic = [25443 28006]; % ASCII 'mfcc' as two int16 - num_magic = 2; + num_magic = 2; % magic word is 2 x int16 case 's24' qformat = 15; magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; + num_magic = 1; % magic word is 1 x int32 case 's32' qformat = 23; magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; + num_magic = 1; % magic word is 1 x int32 otherwise error("Use 's16', 's24', or 's32' as format."); end @@ -68,33 +70,77 @@ period_mel = idx(2)-idx(1); num_frames = length(idx); +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32, followed by num_mel coefficients. +% For s16 each int32 occupies 2 int16 slots. +if strcmp(fmt, 's16') + payload_len = 10 + num_mel; % 5 int32 = 10 int16, then mel data +else + payload_len = 5 + num_mel; % frame_number + reserved + energy + noise_energy + vad_flag + mel +end + % Last frame can be incomplete due to span over multiple periods -last = idx(end) + num_mel - 1; +last = idx(end) + num_magic + payload_len - 1; if (last > length(data)) num_frames = num_frames - 1; end -t_mel = period_mel / num_channels / fs; -t = (0:num_frames -1) * t_mel; -n = 1:num_mel; - -mel = zeros(num_mel, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames i1 = idx(i) + num_magic; - i2 = i1 + num_mel - 1; - mel(:,i) = double(data(i1:i2)) / 2^qformat; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end -figure; +if strcmp(fmt, 's16') + % Reassemble int32 from pairs of int16 (little-endian). + % Low half must be treated as unsigned with mod() to handle negative int16. 
+ frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; + % payload(3:4,:) is reserved, skip + energy = mod(payload(5,:), 65536) + payload(6,:) * 65536; + noise_energy = mod(payload(7,:), 65536) + payload(8,:) * 65536; + vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; + mel = payload(11:payload_len, :) / 2^qformat; +else + frame_number = payload(1, :); + % payload(2,:) is reserved, skip + energy = payload(3, :) / 2^qformat; + noise_energy = payload(4, :) / 2^qformat; + vad = payload(5, :); + mel = payload(6:payload_len, :) / 2^qformat; +end + +t_mel = period_mel / num_channels / fs; +t = (0:num_frames -1) * t_mel; +n = 1:num_mel; + +figure imagesc(t, n, mel); axis xy; colormap(jet); colorbar; tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn); title(tstr, 'Interpreter', 'None'); -xlabel('Time (s)'); ylabel('Mel coef #'); +figure +subplot(2,1,1); +level = sum(mel(:,:)); +plot(t, vad) +ax = axis(); +axis([ax(1:2) -0.1 1.1]); +grid on; +title(tstr, 'Interpreter', 'None'); +xlabel('Time (s)'); +ylabel('VAD flag'); + +subplot(2,1,2); +plot(t, energy, t, noise_energy); +grid on; +legend('Energy', 'Noise Energy'); +xlabel('Time (s)'); +ylabel('Energy'); + end function [data, num_channels] = get_file(fn, num_channels, fmt) diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m index bd2b3f11e60b..3cda3221a4fc 100644 --- a/src/audio/mfcc/tune/setup_mfcc.m +++ b/src/audio/mfcc/tune/setup_mfcc.m @@ -62,6 +62,9 @@ function setup_mfcc() cfg.mmax_init = 0; % same cfg.mmax_coef = 0; % same cfg.dynamic_mmax = false; % same + cfg.enable_vad = false; + cfg.enable_dtx = false; + cfg.update_controls = false; end function cfg = get_mel_spectrogram_config() @@ -99,6 +102,9 @@ function setup_mfcc() cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max) cfg.dynamic_mmax = true; + cfg.enable_vad = true; + cfg.enable_dtx = false; + 
cfg.update_controls = true; end function export_mfcc_setup(gen_cfg, cfg) @@ -107,7 +113,7 @@ function export_mfcc_setup(gen_cfg, cfg) addpath([gen_cfg.tools_path 'tune/common']); %% Blob size, size plus reserved(8) + current parameters -nbytes_data = 104; +nbytes_data = 116; %% Little endian sh32 = [0 -8 -16 -24]; @@ -160,6 +166,10 @@ function export_mfcc_setup(gen_cfg, cfg) v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD +% reserved16[3] +for i = 1:3 + [b8, j] = add_w16b(0, b8, j); +end v = cfg.htk_compat; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.raw_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.remove_dc_offset; [b8, j] = add_w8b(v, b8, j); % bool @@ -168,6 +178,13 @@ function export_mfcc_setup(gen_cfg, cfg) v = cfg.subtract_mean; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.use_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.dynamic_mmax; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.enable_vad; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.enable_dtx; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.update_controls; [b8, j] = add_w8b(v, b8, j); % bool +% reserved_bool[5] +for i = 1:5 + [b8, j] = add_w8b(0, b8, j); +end %% Export tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn]; diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py new file mode 100644 index 000000000000..33862da283e4 --- /dev/null +++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py @@ -0,0 +1,454 @@ +"""Live SOF mel capture with DSP VAD-triggered Whisper transcription. + +Captures mel frames from ALSA with embedded VAD flag from the DSP. 
+Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)] +When silence of 100ms is detected after speech, sends the buffered mel +features to Whisper (OpenVINO encoder+decoder) for transcription. +Capture continues running during Whisper inference. + +Usage: + python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov] + python sof_mel_to_text_live_dsp_vad.py --plot # with live spectrogram +""" + +import argparse +import os +import struct +import subprocess +import threading +import time +import numpy as np +import openvino as ov +import huggingface_hub as hf_hub +from pathlib import Path + +# Graphics imports deferred until --plot is used +matplotlib = None +plt = None + +# SOF mel_s32.raw format constants (with DSP data header) +SOF_MAGIC_S32 = np.int32(0x6D666363) # ASCII 'mfcc' as int32 +SOF_MAGIC_BYTES = struct.pack(' 3: + buf = buf[-3:] + return None, None, buf + end = idx + SOF_FRAME_BYTES + if end > len(buf): + buf = buf[idx:] + return None, None, buf + # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy) + vad_flag = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + try: + while True: + data = proc.stdout.read(read_chunk) + if not data: + rc = proc.poll() + if rc is not None: + stderr_out = proc.stderr.read().decode(errors='replace') + print(f"\narecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + continue + + buf += data + + while True: + vad_flag, frame_ints, buf = find_frame_in_buffer(buf) + if frame_ints is None: + break + + frame_num += 1 + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + + # Print VAD transitions when not plotting + if plotter is None and speech != prev_speech: + t = frame_num * 0.01 + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag}", flush=True) + 
prev_speech = speech + + # Update plot + if plotter is not None: + plotter.update(mel, speech) + + # --- Speech buffering logic --- + if speech: + speech_buffer.append(mel.copy()) + silence_counter = 0 + was_speaking = True + else: + if was_speaking: + silence_counter += 1 + if silence_counter >= SILENCE_TRIGGER_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + + if n < MIN_SPEECH_FRAMES: + # Too short — discard + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + continue + + # Silence threshold reached — send to Whisper + print(f" [{t:7.2f}s] Transcribing {n} frames " + f"({duration:.1f}s)...", flush=True) + + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + if plotter is not None: + try: + plt.close(plotter.fig) + except Exception: + pass + print("\n\nCapture stopped.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture with DSP VAD-triggered Whisper transcription") + parser.add_argument('--device', '-D', default='hw:0,47', + help='ALSA capture device (default: hw:0,47)') + parser.add_argument('--rate', '-r', type=int, default=16000, + help='Sample rate for arecord (default: 16000)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + 
parser.add_argument('--plot', action='store_true', + help='Show live scrolling mel spectrogram and VAD plot') + args = parser.parse_args() + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n") + run_capture(args.device, args.rate, args.model, args.encoder_device, + args.decoder_device, enable_plot=args.plot) + + +if __name__ == '__main__': + main() diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 025eef116752..72177df2dd99 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -32,6 +34,24 @@ #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */ #define MFCC_FFT_BITS 32 +/** \brief Switch control index for VAD notification to user space */ +#define MFCC_CTRL_INDEX_VAD 0 + +/** + * \brief Data header prepended to every MFCC output frame. + * + * Written before the Mel spectrum or cepstral coefficient data in each + * output frame. All six fields are 32 bits wide, so the header is 24 bytes. + */ +struct mfcc_data_header { + uint32_t magic; /**< Magic word MFCC_MAGIC (0x6d666363) */ + uint32_t frame_number; /**< Frame number, counting calculated frames starting from 0 */ + int32_t reserved; /**< Reserved for future use, set to 0 */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + int32_t vad_flag; /**< VAD decision: 1 = speech, 0 = silence */ +}; + /** \brief Type definition for processing function select return value. 
*/ typedef void (*mfcc_func)(struct processing_module *mod, struct input_stream_buffer *bsource, @@ -105,7 +125,8 @@ struct mfcc_state { bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */ bool waiting_fill; /**< booleans */ bool prev_samples_valid; - bool magic_pending; /**< True when magic word not yet written for current output */ + bool header_pending; /**< True when data header not yet written for current output */ + struct mfcc_data_header header; /**< Data header for current output frame */ size_t sample_buffers_size; /**< bytes */ int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ @@ -115,9 +136,12 @@ struct mfcc_state { /* MFCC component private data */ struct mfcc_comp_data { struct mfcc_state state; + struct mfcc_vad_state vad; struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; + struct ipc_msg *msg; /**< IPC notification for VAD switch control */ int max_frames; + bool vad_prev; /**< Previous VAD state for edge detection */ mfcc_func mfcc_func; /**< processing function */ }; @@ -138,6 +162,10 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int rate, int chan void mfcc_free_buffers(struct processing_module *mod); +int mfcc_ipc_notification_init(struct processing_module *mod); + +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val); + void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length); diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h new file mode 100644 index 000000000000..e12dd7e31e80 --- /dev/null +++ b/src/include/sof/audio/mfcc/mfcc_vad.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2026 Intel Corporation. 
+ * + * Author: Seppo Ingalsuo + */ + +/** + * \file mfcc_vad.h + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * This VAD operates on the Q9.23 Mel log spectrum values produced by + * the MFCC component. It tracks a per-bin noise floor that follows + * the signal downward instantly and rises slowly, then computes a + * speech-weighted energy delta above the floor. + */ + +#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__ +#define __SOF_AUDIO_MFCC_MFCC_VAD_H__ + +#include +#include + +struct processing_module; + +/** + * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame). + */ +#define MFCC_VAD_NOISE_INIT_FRAMES 100 + +/** + * \brief Slow noise floor rise coefficient in Q1.15 (0.0020 * 2^15 = 66). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA 66 + +/** + * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 2^15 = 1638). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 1638 + +/** + * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013). + */ +#define MFCC_VAD_ENERGY_THRESHOLD 2936013 + +/** + * \brief Hangover frame count to keep VAD active after last speech detection. + */ +#define MFCC_VAD_HANGOVER_FRAMES 20 + +/** + * \brief VAD state structure. 
+ */ +struct mfcc_vad_state { + int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */ + int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */ + int32_t energy_threshold; /**< Energy threshold Q9.23 */ + int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */ + int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */ + int16_t hangover_max; /**< Maximum hangover frames */ + int16_t hangover_counter; /**< Current hangover counter */ + int16_t num_mel_bins; /**< Number of Mel bins in use */ + int16_t init_frames; /**< Number of initial frames for fast convergence */ + int32_t frame_count; /**< Total frames processed */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + bool is_speech; /**< Current VAD decision */ + bool initialized; /**< True after first frame processed */ +}; + +/** + * \brief Initialize VAD state. + * + * \param[out] vad Pointer to VAD state to initialize. + * \param[in] num_mel_bins Number of Mel bins. + * \param[in] sample_rate Audio sample rate in Hz. + * \param[in] mod Processing module for memory allocation. + * \return 0 on success, negative error code on failure. + */ +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate, + struct processing_module *mod); + +/** + * \brief Process one Mel spectrum frame and update VAD decision. + * + * \param[in,out] vad Pointer to VAD state. + * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values. + * \return 1 if speech detected, 0 if silence. + */ +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log); + +/** + * \brief Reset VAD state without changing configuration. + * + * \param[in,out] vad Pointer to VAD state. 
+ */ +void mfcc_vad_reset(struct mfcc_vad_state *vad); + +#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */ diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index 8a0defcd9883..a2f3717daa52 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -77,6 +77,7 @@ struct sof_mfcc_config { int16_t vtln_high; /**< Reserved, no support */ int16_t vtln_low; /**< Reserved, no support */ int16_t vtln_warp; /**< Reserved, no support */ + int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */ bool htk_compat; /**< Must be false */ bool raw_energy; /**< Reserved, no support */ bool remove_dc_offset; /**< Reserved, no support */ @@ -85,8 +86,10 @@ struct sof_mfcc_config { bool subtract_mean; /**< Must be false (0) */ bool use_energy; /**< Must be false (0) */ bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */ - bool reserved_bool2; - bool reserved_bool3; + bool enable_vad; /**< Run VAD algorithm */ + bool enable_dtx; /**< Reserved (stream once per second non-speech frames) */ + bool update_controls; /**< Update controls with VAD decision */ + bool reserved_bool[5]; /**< Reserved for future boolean flags, set to false (0) */ } __attribute__((packed)); #endif /* __USER_MFCC_H__ */ diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf index 42a6d6608b8b..3bbd72696806 100644 --- a/tools/topology/topology2/include/components/mfcc/default.conf +++ b/tools/topology/topology2/include/components/mfcc/default.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 19-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 
0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" { 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, - 0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00" } diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf index 04aa2a15c660..480725c2d24f 100644 --- a/tools/topology/topology2/include/components/mfcc/mel80.conf +++ b/tools/topology/topology2/include/components/mfcc/mel80.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 19-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" { 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00, + 0x00,0x00,0x00,0x00" } diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf 
b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf index 87039b261597..4b7ec2478076 100644 --- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf @@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + Object.Base.channel.1 { + name "fc" + shift 0 + } + Object.Base.ops.1 { + name "ctl" + info "volsw" + #259 binds the mixer control to switch get/put handlers + get 259 + put 259 + } + max 1 + } } } } diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf index 9645199d6907..019b09911197 100644 --- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf @@ -21,6 +21,21 @@ Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + Object.Base.channel.1 { + name "fc" + shift 0 + } + Object.Base.ops.1 { + name "ctl" + info "volsw" + #259 binds the mixer control to switch get/put handlers + get 259 + put 259 + } + max 1 + } } } }