Skip to content

Commit 15a07b3

Browse files
4ast authored and davem330 committed
bpf: add lookup/update support for per-cpu hash and array maps
The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set values from all-cpus for per-cpu hash and array maps, so that user space can aggregate/update them as necessary. Example of single counter aggregation in user space: unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); long values[nr_cpus]; long value = 0; bpf_lookup_elem(fd, key, values); for (i = 0; i < nr_cpus; i++) value += values[i]; The user space must provide round_up(value_size, 8) * nr_cpus array to get/set values, since kernel will use 'long' copy of per-cpu values to try to copy good counters atomically. It's a best-effort, since bpf programs and user space are racing to access the same memory. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent a10423b commit 15a07b3

File tree

4 files changed

+201
-26
lines changed

4 files changed

+201
-26
lines changed

include/linux/bpf.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
183183
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
184184
int bpf_obj_get_user(const char __user *pathname);
185185

186+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
187+
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
188+
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
189+
u64 flags);
190+
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
191+
u64 flags);
192+
193+
/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
194+
* forced to use 'long' read/writes to try to atomically copy long counters.
195+
* Best-effort only. No barriers here, since it _will_ race with concurrent
196+
* updates from BPF programs. Called from bpf syscall and mostly used with
197+
* size 8 or 16 bytes, so ask compiler to inline it.
198+
*/
199+
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
200+
{
201+
const long *lsrc = src;
202+
long *ldst = dst;
203+
204+
size /= sizeof(long);
205+
while (size--)
206+
*ldst++ = *lsrc++;
207+
}
208+
186209
/* verify correctness of eBPF program */
187210
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
188211
#else

kernel/bpf/arraymap.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
130130
return this_cpu_ptr(array->pptrs[index]);
131131
}
132132

133+
/* Called from the bpf syscall (map_lookup_elem) for BPF_MAP_TYPE_PERCPU_ARRAY:
 * copy the element at @key from every possible CPU into the user-provided
 * buffer @value, one round_up(value_size, 8)-sized slot per CPU, in
 * for_each_possible_cpu() order.
 *
 * Returns 0 on success, -ENOENT if @key is out of range.
 */
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		/* long-granular copy: best-effort atomicity for counters
		 * racing with running bpf programs
		 */
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
158+
133159
/* Called from syscall */
134160
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
135161
{
@@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
177203
return 0;
178204
}
179205

206+
/* Called from the bpf syscall (map_update_elem) for BPF_MAP_TYPE_PERCPU_ARRAY:
 * write the per-cpu values for the element at @key on every possible CPU
 * from the user-provided buffer @value (one round_up(value_size, 8)-sized
 * slot per CPU).
 *
 * Returns 0 on success, -EINVAL on unknown flags, -E2BIG if @key is out of
 * range, -EEXIST for BPF_NOEXIST (array elements always exist).
 */
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		/* long-granular copy: best-effort atomicity vs bpf progs */
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
243+
180244
/* Called from syscall or from eBPF program */
181245
static int array_map_delete_elem(struct bpf_map *map, void *key)
182246
{

kernel/bpf/hashtab.c

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
290290

291291
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
292292
void *value, u32 key_size, u32 hash,
293-
bool percpu)
293+
bool percpu, bool onallcpus)
294294
{
295295
u32 size = htab->map.value_size;
296296
struct htab_elem *l_new;
@@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
312312
return NULL;
313313
}
314314

315-
/* copy true value_size bytes */
316-
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
315+
if (!onallcpus) {
316+
/* copy true value_size bytes */
317+
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
318+
} else {
319+
int off = 0, cpu;
320+
321+
for_each_possible_cpu(cpu) {
322+
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
323+
value + off, size);
324+
off += size;
325+
}
326+
}
317327
htab_elem_set_ptr(l_new, key_size, pptr);
318328
} else {
319329
memcpy(l_new->key + round_up(key_size, 8), value, size);
@@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
368378
/* allocate new element outside of the lock, since
369379
* we're most likely going to insert it
370380
*/
371-
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
381+
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
372382
if (!l_new)
373383
return -ENOMEM;
374384

@@ -402,8 +412,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
402412
return ret;
403413
}
404414

405-
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
406-
void *value, u64 map_flags)
415+
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
416+
void *value, u64 map_flags,
417+
bool onallcpus)
407418
{
408419
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
409420
struct htab_elem *l_new = NULL, *l_old;
@@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
436447
goto err;
437448

438449
if (l_old) {
450+
void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
451+
u32 size = htab->map.value_size;
452+
439453
/* per-cpu hash map can update value in-place */
440-
memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
441-
value, htab->map.value_size);
454+
if (!onallcpus) {
455+
memcpy(this_cpu_ptr(pptr), value, size);
456+
} else {
457+
int off = 0, cpu;
458+
459+
size = round_up(size, 8);
460+
for_each_possible_cpu(cpu) {
461+
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
462+
value + off, size);
463+
off += size;
464+
}
465+
}
442466
} else {
443467
l_new = alloc_htab_elem(htab, key, value, key_size,
444-
hash, true);
468+
hash, true, onallcpus);
445469
if (!l_new) {
446470
ret = -ENOMEM;
447471
goto err;
@@ -455,6 +479,12 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
455479
return ret;
456480
}
457481

482+
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
483+
void *value, u64 map_flags)
484+
{
485+
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
486+
}
487+
458488
/* Called from syscall or from eBPF program */
459489
static int htab_map_delete_elem(struct bpf_map *map, void *key)
460490
{
@@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
557587
return NULL;
558588
}
559589

590+
/* Called from the bpf syscall (map_lookup_elem) for BPF_MAP_TYPE_PERCPU_HASH:
 * copy the value for @key from every possible CPU into the user-provided
 * buffer @value, one round_up(value_size, 8)-sized slot per CPU, in
 * for_each_possible_cpu() order.
 *
 * Returns 0 on success, -ENOENT if the key is not present.
 */
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		/* long-granular copy: best-effort atomicity for counters
		 * racing with running bpf programs
		 */
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
618+
619+
/* Called from the bpf syscall (map_update_elem) for BPF_MAP_TYPE_PERCPU_HASH:
 * passes onallcpus=true so the values for all possible CPUs are written
 * from the user-provided buffer.
 */
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
}
624+
560625
static const struct bpf_map_ops htab_percpu_ops = {
561626
.map_alloc = htab_map_alloc,
562627
.map_free = htab_map_free,

kernel/bpf/syscall.c

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
239239
int ufd = attr->map_fd;
240240
struct bpf_map *map;
241241
void *key, *value, *ptr;
242+
u32 value_size;
242243
struct fd f;
243244
int err;
244245

@@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
259260
if (copy_from_user(key, ukey, map->key_size) != 0)
260261
goto free_key;
261262

263+
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
264+
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
265+
value_size = round_up(map->value_size, 8) * num_possible_cpus();
266+
else
267+
value_size = map->value_size;
268+
262269
err = -ENOMEM;
263-
value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
270+
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
264271
if (!value)
265272
goto free_key;
266273

267-
rcu_read_lock();
268-
ptr = map->ops->map_lookup_elem(map, key);
269-
if (ptr)
270-
memcpy(value, ptr, map->value_size);
271-
rcu_read_unlock();
274+
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
275+
err = bpf_percpu_hash_copy(map, key, value);
276+
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
277+
err = bpf_percpu_array_copy(map, key, value);
278+
} else {
279+
rcu_read_lock();
280+
ptr = map->ops->map_lookup_elem(map, key);
281+
if (ptr)
282+
memcpy(value, ptr, value_size);
283+
rcu_read_unlock();
284+
err = ptr ? 0 : -ENOENT;
285+
}
272286

273-
err = -ENOENT;
274-
if (!ptr)
287+
if (err)
275288
goto free_value;
276289

277290
err = -EFAULT;
278-
if (copy_to_user(uvalue, value, map->value_size) != 0)
291+
if (copy_to_user(uvalue, value, value_size) != 0)
279292
goto free_value;
280293

281294
err = 0;
@@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
298311
int ufd = attr->map_fd;
299312
struct bpf_map *map;
300313
void *key, *value;
314+
u32 value_size;
301315
struct fd f;
302316
int err;
303317

@@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr)
318332
if (copy_from_user(key, ukey, map->key_size) != 0)
319333
goto free_key;
320334

335+
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
336+
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
337+
value_size = round_up(map->value_size, 8) * num_possible_cpus();
338+
else
339+
value_size = map->value_size;
340+
321341
err = -ENOMEM;
322-
value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
342+
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
323343
if (!value)
324344
goto free_key;
325345

326346
err = -EFAULT;
327-
if (copy_from_user(value, uvalue, map->value_size) != 0)
347+
if (copy_from_user(value, uvalue, value_size) != 0)
328348
goto free_value;
329349

330-
/* eBPF program that use maps are running under rcu_read_lock(),
331-
* therefore all map accessors rely on this fact, so do the same here
332-
*/
333-
rcu_read_lock();
334-
err = map->ops->map_update_elem(map, key, value, attr->flags);
335-
rcu_read_unlock();
350+
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
351+
err = bpf_percpu_hash_update(map, key, value, attr->flags);
352+
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
353+
err = bpf_percpu_array_update(map, key, value, attr->flags);
354+
} else {
355+
rcu_read_lock();
356+
err = map->ops->map_update_elem(map, key, value, attr->flags);
357+
rcu_read_unlock();
358+
}
336359

337360
free_value:
338361
kfree(value);

0 commit comments

Comments
 (0)