From 1d54447ba1df141181c66dab2bb2837b53a13ad0 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Wed, 17 Jun 2026 13:00:13 +0200 Subject: [PATCH 1/2] userspace: proxy: use a separate worker per core Previously a single userspace worker (work queue) served requests for all userspace modules, regardless of the core they ran on, and the worker thread was re-pinned to the requesting module's core on every invocation. This only worked in single-core tests; on multi-core systems the pinning failed, because a thread can only be pinned while it is not running. Replace the single worker with a per-core worker array. Each worker thread is pinned to its core once, at creation time. Signed-off-by: Serhiy Katsyuba --- .../module_adapter/library/userspace_proxy.c | 95 +++++++++++-------- 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/src/audio/module_adapter/library/userspace_proxy.c b/src/audio/module_adapter/library/userspace_proxy.c index 79e55bd4124e..6c52793fd1c3 100644 --- a/src/audio/module_adapter/library/userspace_proxy.c +++ b/src/audio/module_adapter/library/userspace_proxy.c @@ -54,12 +54,16 @@ static const struct module_interface userspace_proxy_interface; #include #include -static inline int user_worker_get(void) +static inline int user_worker_get(int cpu) { + ARG_UNUSED(cpu); return 0; } -static inline void user_worker_put(void) { } +static inline void user_worker_put(int cpu) +{ + ARG_UNUSED(cpu); +} struct k_work_user *userspace_proxy_register_ipc_handler(struct processing_module *mod, struct k_event *event) @@ -86,9 +90,10 @@ struct k_work_user *userspace_proxy_register_ipc_handler(struct processing_modul * It invokes the appropriate module function in userspace context and writes the operation * result back into the work item. * - * There is only a single work queue, which is shared by all userspace modules. It is created - * dynamically when needed. 
Because SOF uses a single dedicated thread for handling IPC, there - * is no need to perform any additional serialization when accessing the worker. + * There is a separate work queue per core. Each core's work queue is shared by all + * userspace modules running on that core and is created dynamically when needed. A given + * core's worker is only accessed from that same core's IPC handling context, so there is no + * need to perform any additional serialization when accessing it. */ struct user_worker { k_tid_t thread_id; /* ipc worker thread ID */ @@ -98,70 +103,84 @@ struct user_worker { struct k_event event; }; -static struct user_worker worker; +static struct user_worker worker[CONFIG_CORE_COUNT]; -static int user_worker_get(void) +static int user_worker_get(int cpu) { - if (worker.reference_count) { - worker.reference_count++; + assert(cpu >= 0 && cpu < (int)ARRAY_SIZE(worker)); + + if (worker[cpu].reference_count) { + worker[cpu].reference_count++; return 0; } - worker.stack_ptr = user_stack_allocate(CONFIG_SOF_USERSPACE_PROXY_WORKER_STACK_SIZE, - K_USER); - if (!worker.stack_ptr) { + worker[cpu].stack_ptr = user_stack_allocate(CONFIG_SOF_USERSPACE_PROXY_WORKER_STACK_SIZE, + K_USER); + if (!worker[cpu].stack_ptr) { tr_err(&userspace_proxy_tr, "Userspace worker stack allocation failed."); return -ENOMEM; } - k_event_init(&worker.event); - k_work_user_queue_start(&worker.work_queue, worker.stack_ptr, + k_event_init(&worker[cpu].event); + k_work_user_queue_start(&worker[cpu].work_queue, worker[cpu].stack_ptr, CONFIG_SOF_USERSPACE_PROXY_WORKER_STACK_SIZE, 0, NULL); - worker.thread_id = k_work_user_queue_thread_get(&worker.work_queue); + worker[cpu].thread_id = k_work_user_queue_thread_get(&worker[cpu].work_queue); + + /* + * k_work_user_queue_start() starts the worker thread immediately. + * We need to make sure it is not running when pinning it to a specific core. 
+ */ + k_thread_suspend(worker[cpu].thread_id); + + /* Pin worker thread to the same core as the module */ + k_thread_cpu_pin(worker[cpu].thread_id, cpu); + + k_thread_access_grant(worker[cpu].thread_id, &worker[cpu].event); + + k_thread_resume(worker[cpu].thread_id); - k_thread_access_grant(worker.thread_id, &worker.event); + worker[cpu].reference_count++; - worker.reference_count++; return 0; } -static void user_worker_put(void) +static void user_worker_put(int cpu) { + assert(cpu >= 0 && cpu < (int)ARRAY_SIZE(worker)); + /* Module removed so decrement counter */ - worker.reference_count--; + worker[cpu].reference_count--; /* Free worker resources if no more active user space modules */ - if (worker.reference_count == 0) { - k_thread_abort(worker.thread_id); - user_stack_free(worker.stack_ptr); + if (worker[cpu].reference_count == 0) { + k_thread_abort(worker[cpu].thread_id); + user_stack_free(worker[cpu].stack_ptr); } } #endif static int user_work_item_init(struct userspace_context *user_ctx, struct k_heap *user_heap) { + int cpu = cpu_get_id(); struct user_work_item *work_item = NULL; int ret; - ret = user_worker_get(); + ret = user_worker_get(cpu); if (ret) return ret; - /* We have only a single userspace IPC worker. It handles requests for all userspace - * modules, which may run on different cores. Because the worker processes work items - * coming from any core, the work item must be allocated in coherent memory. - */ + /* TODO: this can probably be allocated as cached? 
*/ work_item = sof_heap_alloc(user_heap, SOF_MEM_FLAG_COHERENT, sizeof(*work_item), 0); if (!work_item) { - user_worker_put(); + user_worker_put(cpu); return -ENOMEM; } k_work_user_init(&work_item->work_item, userspace_proxy_worker_handler); #if !IS_ENABLED(CONFIG_SOF_USERSPACE_MOD_IPC_BY_DP_THREAD) - work_item->event = &worker.event; + work_item->event = &worker[cpu].event; #endif work_item->params.context = user_ctx; work_item->params.mod = NULL; @@ -173,7 +192,7 @@ static int user_work_item_init(struct userspace_context *user_ctx, struct k_heap static void user_work_item_free(struct userspace_context *user_ctx, struct k_heap *user_heap) { sof_heap_free(user_heap, user_ctx->work_item); - user_worker_put(); + user_worker_put(cpu_get_id()); } static inline struct module_params *user_work_get_params(struct userspace_context *user_ctx) @@ -193,7 +212,8 @@ static int userspace_proxy_invoke(struct userspace_context *user_ctx, uint32_t c #if IS_ENABLED(CONFIG_SOF_USERSPACE_MOD_IPC_BY_DP_THREAD) struct k_event * const event = user_ctx->dp_event; #else - struct k_event * const event = &worker.event; + int cpu = cpu_get_id(); + struct k_event * const event = &worker[cpu].event; #endif struct module_params *params = user_work_get_params(user_ctx); const uintptr_t ipc_req_buf = (uintptr_t)MAILBOX_HOSTBOX_BASE; @@ -216,22 +236,13 @@ static int userspace_proxy_invoke(struct userspace_context *user_ctx, uint32_t c #if !IS_ENABLED(CONFIG_SOF_USERSPACE_MOD_IPC_BY_DP_THREAD) /* Switch worker thread to module memory domain */ - ret = k_mem_domain_add_thread(user_ctx->comp_dom, worker.thread_id); + ret = k_mem_domain_add_thread(user_ctx->comp_dom, worker[cpu].thread_id); if (ret < 0) { tr_err(&userspace_proxy_tr, "Failed to switch memory domain, error: %d", ret); goto done; } -#ifdef CONFIG_SCHED_CPU_MASK - /* Pin worker thread to the same core as the module */ - ret = k_thread_cpu_pin(worker.thread_id, cpu_get_id()); - if (ret < 0) { - tr_err(&userspace_proxy_tr, "Failed to pin 
cpu, error: %d", ret); - goto done; - } -#endif - - ret = k_work_user_submit_to_queue(&worker.work_queue, &user_ctx->work_item->work_item); + ret = k_work_user_submit_to_queue(&worker[cpu].work_queue, &user_ctx->work_item->work_item); if (ret < 0) { tr_err(&userspace_proxy_tr, "Submit to queue error: %d", ret); goto done; From 30948a39d8ca3e5d64206f30dc117e90c6ae8431 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Thu, 18 Jun 2026 17:18:59 +0200 Subject: [PATCH 2/2] userspace: proxy: allocate work item as cached The proxy now uses a separate worker per core. Each work item is allocated, submitted and processed on the same core, so cross-core coherency is no longer required. Signed-off-by: Serhiy Katsyuba --- src/audio/module_adapter/library/userspace_proxy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/audio/module_adapter/library/userspace_proxy.c b/src/audio/module_adapter/library/userspace_proxy.c index 6c52793fd1c3..b2b5014497fa 100644 --- a/src/audio/module_adapter/library/userspace_proxy.c +++ b/src/audio/module_adapter/library/userspace_proxy.c @@ -170,8 +170,7 @@ static int user_work_item_init(struct userspace_context *user_ctx, struct k_heap if (ret) return ret; - /* TODO: this can probably be allocated as cached? */ - work_item = sof_heap_alloc(user_heap, SOF_MEM_FLAG_COHERENT, sizeof(*work_item), 0); + work_item = sof_heap_alloc(user_heap, 0, sizeof(*work_item), 0); if (!work_item) { user_worker_put(cpu); return -ENOMEM;