// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)

#define BPF_MEM_ALLOC_SIZE_MAX 4096
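
/* Resulting object layout (illustrative sketch):
 *
 *	             returned pointer
 *	                    |
 *	                    v
 *	+------------+--------------------------+
 *	| llist_node | payload (unit_size - 8)  |   regular allocation
 *	+------------+--------------------------+
 *
 *	+------------+--------------------------+
 *	| llist_node | pointer to percpu area   |   percpu allocation
 *	+------------+--------------------------+
 *
 * While an object sits on a freelist, the first 8 bytes link it into the
 * llist; while it is allocated, the same 8 bytes hold a back-pointer to
 * the bpf_mem_cache that handed it out (see unit_alloc()/unit_free()).
 */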

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}
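
/* Worked examples (values follow from size_index[] above and sizes[] below):
 *   bpf_mem_cache_idx(8)    == 2   -> 16 byte bucket (the 8 byte bucket is gone)
 *   bpf_mem_cache_idx(96)   == 0   -> 96 byte bucket
 *   bpf_mem_cache_idx(192)  == 1   -> 192 byte bucket
 *   bpf_mem_cache_idx(200)  == fls(199) - 2 == 6   -> 256 byte bucket
 *   bpf_mem_cache_idx(4096) == fls(4095) - 2 == 10 -> 4096 byte bucket
 */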

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
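
/* Note: sizes[] is deliberately not sorted. Slots 0 and 1 hold the 96 and
 * 192 byte buckets because size_index[] maps those sizes to values 1 and 2
 * (same convention as kmalloc), i.e. cache indices 0 and 1 after the '- 1'
 * in bpf_mem_cache_idx(). Slots 2..10 hold the power-of-2 buckets 16..4096
 * so that fls(size - 1) - 2 lands on them directly.
 */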

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
		void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void __percpu **)obj)[1]);

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * Initialized, but unused bpf allocator (not bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potential unnecessary large
 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}
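
/* Worked example of the watermark heuristic above for unit_size == 512:
 *   low_watermark  == max(32 * 256 / 512, 1) == 16
 *   high_watermark == max(96 * 256 / 512, 3) == 48
 *   batch          == max((48 - 16) / 4 * 3, 1) == 24
 * and for unit_size == 4096 (order-0 page with 4k page_size):
 *   low_watermark == 2, high_watermark == 6, batch == 3.
 */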

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 allocate one bpf_mem_cache per cpu.
 * This is typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}
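
/* Illustrative fixed-size usage (a sketch, not taken from a real caller;
 * 'struct my_elem' is a placeholder and error handling is elided):
 *
 *	struct bpf_mem_alloc ma;
 *	void *elem;
 *
 *	bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false);
 *	elem = bpf_mem_cache_alloc(&ma);
 *	...
 *	bpf_mem_cache_free(&ma, elem);
 *	bpf_mem_alloc_destroy(&ma);
 *
 * bpf_mem_cache_alloc()/bpf_mem_cache_free() are usable from any context,
 * including NMI; init and destroy must run from sleepable context.
 */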

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_dfl_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free a few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when current
	 * task is preempted by a task which does unit_alloc(), unit_alloc()
	 * may return NULL unexpectedly because irq work is already pending
	 * but cannot run yet, so free_llist cannot be refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}
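
/* Illustrative variable-size usage (a sketch, not taken from a real caller;
 * error handling is elided). Initializing with size == 0 sets up all 11
 * buckets, so requests of 1..4088 bytes can be served:
 *
 *	struct bpf_mem_alloc ma;
 *	void *p;
 *
 *	bpf_mem_alloc_init(&ma, 0, false);
 *	p = bpf_mem_alloc(&ma, 64);
 *	...
 *	bpf_mem_free(&ma, p);
 *	bpf_mem_alloc_destroy(&ma);
 */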

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

int bpf_mem_alloc_check_size(bool percpu, size_t size)
{
	/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
	if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
	    (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
		return -E2BIG;

	return 0;
}
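
/* I.e. the largest request the allocator itself can serve is
 * BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ == 4088 bytes for regular
 * allocations, and BPF_MEM_ALLOC_SIZE_MAX == 4096 bytes for percpu ones.
 */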
| 1017 | |