| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 3 | * Copyright (c) 2016 Facebook |
| 4 | */ |
| 5 | #include <linux/bpf.h> |
| 6 | #include <linux/btf.h> |
| 7 | #include <linux/jhash.h> |
| 8 | #include <linux/filter.h> |
| 9 | #include <linux/rculist_nulls.h> |
| 10 | #include <linux/rcupdate_wait.h> |
| 11 | #include <linux/random.h> |
| 12 | #include <uapi/linux/btf.h> |
| 13 | #include <linux/rcupdate_trace.h> |
| 14 | #include <linux/btf_ids.h> |
| 15 | #include "percpu_freelist.h" |
| 16 | #include "bpf_lru_list.h" |
| 17 | #include "map_in_map.h" |
| 18 | #include <linux/bpf_mem_alloc.h> |
| 19 | #include <asm/rqspinlock.h> |
| 20 | |
| 21 | #define HTAB_CREATE_FLAG_MASK \ |
| 22 | (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ |
| 23 | BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) |
| 24 | |
| 25 | #define BATCH_OPS(_name) \ |
| 26 | .map_lookup_batch = \ |
| 27 | _name##_map_lookup_batch, \ |
| 28 | .map_lookup_and_delete_batch = \ |
| 29 | _name##_map_lookup_and_delete_batch, \ |
| 30 | .map_update_batch = \ |
| 31 | generic_map_update_batch, \ |
| 32 | .map_delete_batch = \ |
| 33 | generic_map_delete_batch |
| 34 | |
| 35 | /* |
| 36 | * The bucket lock has two protection scopes: |
| 37 | * |
| 38 | * 1) Serializing concurrent operations from BPF programs on different |
| 39 | * CPUs |
| 40 | * |
| 41 | * 2) Serializing concurrent operations from BPF programs and sys_bpf() |
| 42 | * |
* BPF programs can execute in any context including perf, kprobes and
* tracing. As there are almost no limits on where perf, kprobes and
* tracing can be invoked from, the lock operations need to be protected
* against deadlocks. Deadlocks can be caused by recursion and by an
* invocation in the lock held section when functions which acquire this
* lock are invoked from sys_bpf(). BPF recursion is prevented by
* incrementing the per CPU variable bpf_prog_active, which prevents BPF
* programs attached to perf events, kprobes and tracing from being invoked
* before the prior invocation from one of these contexts has completed.
* sys_bpf() uses the same mechanism by pinning the task to the current CPU
* and incrementing the recursion protection across the map operation.
| 54 | * |
| 55 | * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain |
| 56 | * operations like memory allocations (even with GFP_ATOMIC) from atomic |
| 57 | * contexts. This is required because even with GFP_ATOMIC the memory |
| 58 | * allocator calls into code paths which acquire locks with long held lock |
| 59 | * sections. To ensure the deterministic behaviour these locks are regular |
| 60 | * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only |
| 61 | * true atomic contexts on an RT kernel are the low level hardware |
| 62 | * handling, scheduling, low level interrupt handling, NMIs etc. None of |
| 63 | * these contexts should ever do memory allocations. |
| 64 | * |
| 65 | * As regular device interrupt handlers and soft interrupts are forced into |
| 66 | * thread context, the existing code which does |
| 67 | * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); |
| 68 | * just works. |
| 69 | * |
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
* atomic contexts even on RT. Before the introduction of bpf_mem_alloc it
* was only safe to use a raw spinlock for a preallocated hash map on an RT
* kernel, because there was no memory allocation within the lock held
* sections. After the hash map was fully converted to bpf_mem_alloc, memory
* is no longer allocated synchronously under the lock for non-preallocated
* hash maps either, so it is safe to always use a raw spinlock for the
* bucket lock.
| 79 | */ |
| 80 | struct bucket { |
| 81 | struct hlist_nulls_head head; |
| 82 | rqspinlock_t raw_lock; |
| 83 | }; |
| 84 | |
| 85 | #define HASHTAB_MAP_LOCK_COUNT 8 |
| 86 | #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) |
| 87 | |
| 88 | struct bpf_htab { |
| 89 | struct bpf_map map; |
| 90 | struct bpf_mem_alloc ma; |
| 91 | struct bpf_mem_alloc pcpu_ma; |
| 92 | struct bucket *buckets; |
| 93 | void *elems; |
| 94 | union { |
| 95 | struct pcpu_freelist freelist; |
| 96 | struct bpf_lru lru; |
| 97 | }; |
struct htab_elem *__percpu *extra_elems;
/* number of elements in non-preallocated hashtable is kept
 * in either pcount or count
| 101 | */ |
| 102 | struct percpu_counter pcount; |
| 103 | atomic_t count; |
| 104 | bool use_percpu_counter; |
| 105 | u32 n_buckets; /* number of hash buckets */ |
| 106 | u32 elem_size; /* size of each element in bytes */ |
| 107 | u32 hashrnd; |
| 108 | }; |
| 109 | |
| 110 | /* each htab element is struct htab_elem + key + value */ |
| 111 | struct htab_elem { |
| 112 | union { |
| 113 | struct hlist_nulls_node hash_node; |
| 114 | struct { |
| 115 | void *padding; |
| 116 | union { |
| 117 | struct pcpu_freelist_node fnode; |
| 118 | struct htab_elem *batch_flink; |
| 119 | }; |
| 120 | }; |
| 121 | }; |
| 122 | union { |
| 123 | /* pointer to per-cpu pointer */ |
| 124 | void *ptr_to_pptr; |
| 125 | struct bpf_lru_node lru_node; |
| 126 | }; |
| 127 | u32 hash; |
| 128 | char key[] __aligned(8); |
| 129 | }; |
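/*
 * Layout sketch (illustrative only, not part of the kernel source): an
 * element is the struct above immediately followed by the key and then the
 * value (or, for per-cpu maps, a pointer to the per-cpu value area).  The
 * arithmetic used throughout this file is:
 *
 *	value     = (void *)l->key + round_up(key_size, 8);
 *	elem_size = sizeof(struct htab_elem) + round_up(key_size, 8)
 *		    + round_up(value_size, 8);	// or + sizeof(void *) if percpu
 *
 * e.g. key_size = 13 pads the key to 16 bytes so the value stays 8-byte
 * aligned.  htab_elem_value() below encodes exactly this rule.
 */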
| 130 | |
| 131 | static inline bool htab_is_prealloc(const struct bpf_htab *htab) |
| 132 | { |
| 133 | return !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
| 134 | } |
| 135 | |
| 136 | static void htab_init_buckets(struct bpf_htab *htab) |
| 137 | { |
| 138 | unsigned int i; |
| 139 | |
| 140 | for (i = 0; i < htab->n_buckets; i++) { |
| 141 | INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); |
| 142 | raw_res_spin_lock_init(&htab->buckets[i].raw_lock); |
| 143 | cond_resched(); |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) |
| 148 | { |
| 149 | unsigned long flags; |
| 150 | int ret; |
| 151 | |
| 152 | ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); |
| 153 | if (ret) |
| 154 | return ret; |
| 155 | *pflags = flags; |
| 156 | return 0; |
| 157 | } |
| 158 | |
| 159 | static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) |
| 160 | { |
| 161 | raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); |
| 162 | } |
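/*
 * Usage sketch (illustrative, not part of the kernel source): as the
 * locking comment above explains, the bucket lock is a resilient queued
 * spinlock whose acquisition can fail (e.g. when rqspinlock detects a
 * deadlock or times out), so every update/delete path checks the result
 * instead of spinning forever:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(b, &flags);
 *	if (ret)
 *		return ret;	// propagate the error to the caller
 *	... modify the bucket's hlist_nulls under the lock ...
 *	htab_unlock_bucket(b, flags);
 *
 * The exact errno comes from raw_res_spin_lock_irqsave(); the point is
 * that a failed lock attempt is reported, not retried here.
 */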
| 163 | |
| 164 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); |
| 165 | |
| 166 | static bool htab_is_lru(const struct bpf_htab *htab) |
| 167 | { |
| 168 | return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || |
| 169 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
| 170 | } |
| 171 | |
| 172 | static bool htab_is_percpu(const struct bpf_htab *htab) |
| 173 | { |
| 174 | return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 175 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
| 176 | } |
| 177 | |
| 178 | static inline bool is_fd_htab(const struct bpf_htab *htab) |
| 179 | { |
| 180 | return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; |
| 181 | } |
| 182 | |
| 183 | static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) |
| 184 | { |
| 185 | return l->key + round_up(key_size, 8); |
| 186 | } |
| 187 | |
| 188 | static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, |
| 189 | void __percpu *pptr) |
| 190 | { |
| 191 | *(void __percpu **)htab_elem_value(l, key_size) = pptr; |
| 192 | } |
| 193 | |
| 194 | static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) |
| 195 | { |
| 196 | return *(void __percpu **)htab_elem_value(l, key_size); |
| 197 | } |
| 198 | |
| 199 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) |
| 200 | { |
return *(void **)htab_elem_value(l, map->key_size);
| 202 | } |
| 203 | |
| 204 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
| 205 | { |
| 206 | return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); |
| 207 | } |
| 208 | |
| 209 | /* Both percpu and fd htab support in-place update, so no need for |
| 210 | * extra elem. LRU itself can remove the least used element, so |
| 211 | * there is no need for an extra elem during map_update. |
| 212 | */ |
static bool htab_has_extra_elems(struct bpf_htab *htab)
| 214 | { |
| 215 | return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); |
| 216 | } |
| 217 | |
| 218 | static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) |
| 219 | { |
| 220 | u32 num_entries = htab->map.max_entries; |
| 221 | int i; |
| 222 | |
| 223 | if (htab_has_extra_elems(htab)) |
| 224 | num_entries += num_possible_cpus(); |
| 225 | |
| 226 | for (i = 0; i < num_entries; i++) { |
| 227 | struct htab_elem *elem; |
| 228 | |
| 229 | elem = get_htab_elem(htab, i); |
bpf_map_free_internal_structs(&htab->map,
	htab_elem_value(elem, htab->map.key_size));
| 232 | cond_resched(); |
| 233 | } |
| 234 | } |
| 235 | |
| 236 | static void htab_free_prealloced_fields(struct bpf_htab *htab) |
| 237 | { |
| 238 | u32 num_entries = htab->map.max_entries; |
| 239 | int i; |
| 240 | |
| 241 | if (IS_ERR_OR_NULL(ptr: htab->map.record)) |
| 242 | return; |
| 243 | if (htab_has_extra_elems(htab)) |
| 244 | num_entries += num_possible_cpus(); |
| 245 | for (i = 0; i < num_entries; i++) { |
| 246 | struct htab_elem *elem; |
| 247 | |
| 248 | elem = get_htab_elem(htab, i); |
| 249 | if (htab_is_percpu(htab)) { |
| 250 | void __percpu *pptr = htab_elem_get_ptr(l: elem, key_size: htab->map.key_size); |
| 251 | int cpu; |
| 252 | |
| 253 | for_each_possible_cpu(cpu) { |
| 254 | bpf_obj_free_fields(rec: htab->map.record, per_cpu_ptr(pptr, cpu)); |
| 255 | cond_resched(); |
| 256 | } |
| 257 | } else { |
| 258 | bpf_obj_free_fields(rec: htab->map.record, |
| 259 | obj: htab_elem_value(l: elem, key_size: htab->map.key_size)); |
| 260 | cond_resched(); |
| 261 | } |
| 262 | cond_resched(); |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | static void htab_free_elems(struct bpf_htab *htab) |
| 267 | { |
| 268 | int i; |
| 269 | |
| 270 | if (!htab_is_percpu(htab)) |
| 271 | goto free_elems; |
| 272 | |
| 273 | for (i = 0; i < htab->map.max_entries; i++) { |
| 274 | void __percpu *pptr; |
| 275 | |
| 276 | pptr = htab_elem_get_ptr(l: get_htab_elem(htab, i), |
| 277 | key_size: htab->map.key_size); |
| 278 | free_percpu(pdata: pptr); |
| 279 | cond_resched(); |
| 280 | } |
| 281 | free_elems: |
| 282 | bpf_map_area_free(base: htab->elems); |
| 283 | } |
| 284 | |
| 285 | /* The LRU list has a lock (lru_lock). Each htab bucket has a lock |
| 286 | * (bucket_lock). If both locks need to be acquired together, the lock |
| 287 | * order is always lru_lock -> bucket_lock and this only happens in |
| 288 | * bpf_lru_list.c logic. For example, certain code path of |
| 289 | * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), |
| 290 | * will acquire lru_lock first followed by acquiring bucket_lock. |
| 291 | * |
| 292 | * In hashtab.c, to avoid deadlock, lock acquisition of |
| 293 | * bucket_lock followed by lru_lock is not allowed. In such cases, |
| 294 | * bucket_lock needs to be released first before acquiring lru_lock. |
| 295 | */ |
| 296 | static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, |
| 297 | u32 hash) |
| 298 | { |
| 299 | struct bpf_lru_node *node = bpf_lru_pop_free(lru: &htab->lru, hash); |
| 300 | struct htab_elem *l; |
| 301 | |
| 302 | if (node) { |
| 303 | bpf_map_inc_elem_count(map: &htab->map); |
| 304 | l = container_of(node, struct htab_elem, lru_node); |
| 305 | memcpy(l->key, key, htab->map.key_size); |
| 306 | return l; |
| 307 | } |
| 308 | |
| 309 | return NULL; |
| 310 | } |
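/*
 * Why the order matters (illustrative sketch): the LRU update path below
 * (htab_lru_map_update_elem()) always allocates before locking:
 *
 *	l_new = prealloc_lru_pop(htab, key, hash);	// may take lru_lock and
 *							// then a victim's bucket_lock
 *	ret = htab_lock_bucket(b, &flags);		// only now take our bucket_lock
 *	... insert l_new ...
 *	htab_unlock_bucket(b, flags);
 *
 * Allocating the node before taking our own bucket lock preserves the
 * global lru_lock -> bucket_lock order and avoids the forbidden
 * bucket_lock -> lru_lock nesting described above.
 */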
| 311 | |
| 312 | static int prealloc_init(struct bpf_htab *htab) |
| 313 | { |
| 314 | u32 num_entries = htab->map.max_entries; |
| 315 | int err = -ENOMEM, i; |
| 316 | |
| 317 | if (htab_has_extra_elems(htab)) |
| 318 | num_entries += num_possible_cpus(); |
| 319 | |
| 320 | htab->elems = bpf_map_area_alloc(size: (u64)htab->elem_size * num_entries, |
| 321 | numa_node: htab->map.numa_node); |
| 322 | if (!htab->elems) |
| 323 | return -ENOMEM; |
| 324 | |
| 325 | if (!htab_is_percpu(htab)) |
| 326 | goto skip_percpu_elems; |
| 327 | |
| 328 | for (i = 0; i < num_entries; i++) { |
| 329 | u32 size = round_up(htab->map.value_size, 8); |
| 330 | void __percpu *pptr; |
| 331 | |
| 332 | pptr = bpf_map_alloc_percpu(map: &htab->map, size, align: 8, |
| 333 | GFP_USER | __GFP_NOWARN); |
| 334 | if (!pptr) |
| 335 | goto free_elems; |
| 336 | htab_elem_set_ptr(l: get_htab_elem(htab, i), key_size: htab->map.key_size, |
| 337 | pptr); |
| 338 | cond_resched(); |
| 339 | } |
| 340 | |
| 341 | skip_percpu_elems: |
| 342 | if (htab_is_lru(htab)) |
| 343 | err = bpf_lru_init(lru: &htab->lru, |
| 344 | percpu: htab->map.map_flags & BPF_F_NO_COMMON_LRU, |
| 345 | offsetof(struct htab_elem, hash) - |
| 346 | offsetof(struct htab_elem, lru_node), |
| 347 | del_from_htab: htab_lru_map_delete_node, |
| 348 | delete_arg: htab); |
| 349 | else |
| 350 | err = pcpu_freelist_init(&htab->freelist); |
| 351 | |
| 352 | if (err) |
| 353 | goto free_elems; |
| 354 | |
| 355 | if (htab_is_lru(htab)) |
| 356 | bpf_lru_populate(lru: &htab->lru, buf: htab->elems, |
| 357 | offsetof(struct htab_elem, lru_node), |
| 358 | elem_size: htab->elem_size, nr_elems: num_entries); |
| 359 | else |
| 360 | pcpu_freelist_populate(s: &htab->freelist, |
| 361 | buf: htab->elems + offsetof(struct htab_elem, fnode), |
| 362 | elem_size: htab->elem_size, nr_elems: num_entries); |
| 363 | |
| 364 | return 0; |
| 365 | |
| 366 | free_elems: |
| 367 | htab_free_elems(htab); |
| 368 | return err; |
| 369 | } |
| 370 | |
| 371 | static void prealloc_destroy(struct bpf_htab *htab) |
| 372 | { |
| 373 | htab_free_elems(htab); |
| 374 | |
| 375 | if (htab_is_lru(htab)) |
| 376 | bpf_lru_destroy(lru: &htab->lru); |
| 377 | else |
| 378 | pcpu_freelist_destroy(s: &htab->freelist); |
| 379 | } |
| 380 | |
static int alloc_extra_elems(struct bpf_htab *htab)
| 382 | { |
| 383 | struct htab_elem *__percpu *pptr, *l_new; |
| 384 | struct pcpu_freelist_node *l; |
| 385 | int cpu; |
| 386 | |
| 387 | pptr = bpf_map_alloc_percpu(map: &htab->map, size: sizeof(struct htab_elem *), align: 8, |
| 388 | GFP_USER | __GFP_NOWARN); |
| 389 | if (!pptr) |
| 390 | return -ENOMEM; |
| 391 | |
| 392 | for_each_possible_cpu(cpu) { |
| 393 | l = pcpu_freelist_pop(&htab->freelist); |
| 394 | /* pop will succeed, since prealloc_init() |
| 395 | * preallocated extra num_possible_cpus elements |
| 396 | */ |
| 397 | l_new = container_of(l, struct htab_elem, fnode); |
| 398 | *per_cpu_ptr(pptr, cpu) = l_new; |
| 399 | } |
| 400 | htab->extra_elems = pptr; |
| 401 | return 0; |
| 402 | } |
| 403 | |
| 404 | /* Called from syscall */ |
| 405 | static int htab_map_alloc_check(union bpf_attr *attr) |
| 406 | { |
| 407 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 408 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
| 409 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || |
| 410 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
| 411 | /* percpu_lru means each cpu has its own LRU list. |
| 412 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
| 413 | * the map's value itself is percpu. percpu_lru has |
| 414 | * nothing to do with the map's value. |
| 415 | */ |
| 416 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
| 417 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
| 418 | bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); |
| 419 | int numa_node = bpf_map_attr_numa_node(attr); |
| 420 | |
| 421 | BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != |
| 422 | offsetof(struct htab_elem, hash_node.pprev)); |
| 423 | |
| 424 | if (zero_seed && !capable(CAP_SYS_ADMIN)) |
| 425 | /* Guard against local DoS, and discourage production use. */ |
| 426 | return -EPERM; |
| 427 | |
| 428 | if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || |
| 429 | !bpf_map_flags_access_ok(access_flags: attr->map_flags)) |
| 430 | return -EINVAL; |
| 431 | |
| 432 | if (!lru && percpu_lru) |
| 433 | return -EINVAL; |
| 434 | |
| 435 | if (lru && !prealloc) |
| 436 | return -ENOTSUPP; |
| 437 | |
| 438 | if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) |
| 439 | return -EINVAL; |
| 440 | |
| 441 | /* check sanity of attributes. |
| 442 | * value_size == 0 may be allowed in the future to use map as a set |
| 443 | */ |
| 444 | if (attr->max_entries == 0 || attr->key_size == 0 || |
| 445 | attr->value_size == 0) |
| 446 | return -EINVAL; |
| 447 | |
| 448 | if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - |
| 449 | sizeof(struct htab_elem)) |
| 450 | /* if key_size + value_size is bigger, the user space won't be |
| 451 | * able to access the elements via bpf syscall. This check |
| 452 | * also makes sure that the elem_size doesn't overflow and it's |
| 453 | * kmalloc-able later in htab_map_update_elem() |
| 454 | */ |
| 455 | return -E2BIG; |
| 456 | /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ |
| 457 | if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) |
| 458 | return -E2BIG; |
| 459 | |
| 460 | return 0; |
| 461 | } |
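/*
 * User-space sketch (assumes libbpf; illustrative only): a creation
 * request that passes the checks above - non-zero key/value/max_entries,
 * only flags from HTAB_CREATE_FLAG_MASK, and BPF_F_NO_PREALLOC on a plain
 * (non-LRU) hash:
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts,
 *		    .map_flags = BPF_F_NO_PREALLOC);
 *	int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_htab",
 *				sizeof(__u32), sizeof(__u64), 10000, &opts);
 *
 * By contrast, BPF_F_NO_COMMON_LRU on BPF_MAP_TYPE_HASH fails with
 * -EINVAL (percpu_lru without an LRU map type), and BPF_F_ZERO_SEED
 * without CAP_SYS_ADMIN fails with -EPERM.
 */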
| 462 | |
| 463 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) |
| 464 | { |
| 465 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 466 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
| 467 | /* percpu_lru means each cpu has its own LRU list. |
| 468 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
| 469 | * the map's value itself is percpu. percpu_lru has |
| 470 | * nothing to do with the map's value. |
| 471 | */ |
| 472 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
| 473 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
| 474 | struct bpf_htab *htab; |
| 475 | int err; |
| 476 | |
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
| 480 | |
| 481 | bpf_map_init_from_attr(map: &htab->map, attr); |
| 482 | |
| 483 | if (percpu_lru) { |
/* ensure each CPU's lru list has >= 1 element.
 * While at it, give each lru list the same
 * number of elements.
 */
| 488 | htab->map.max_entries = roundup(attr->max_entries, |
| 489 | num_possible_cpus()); |
| 490 | if (htab->map.max_entries < attr->max_entries) |
| 491 | htab->map.max_entries = rounddown(attr->max_entries, |
| 492 | num_possible_cpus()); |
| 493 | } |
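/*
 * Worked example of the rounding above: with attr->max_entries = 1000 and
 * 64 possible CPUs, roundup(1000, 64) = 1024, so every per-CPU LRU list
 * gets exactly 1024 / 64 = 16 elements.  The rounddown() fallback only
 * matters when roundup() would overflow the u32 max_entries.
 */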
| 494 | |
| 495 | /* hash table size must be power of 2; roundup_pow_of_two() can overflow |
| 496 | * into UB on 32-bit arches, so check that first |
| 497 | */ |
| 498 | err = -E2BIG; |
| 499 | if (htab->map.max_entries > 1UL << 31) |
| 500 | goto free_htab; |
| 501 | |
| 502 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); |
| 503 | |
| 504 | htab->elem_size = sizeof(struct htab_elem) + |
| 505 | round_up(htab->map.key_size, 8); |
| 506 | if (percpu) |
| 507 | htab->elem_size += sizeof(void *); |
| 508 | else |
| 509 | htab->elem_size += round_up(htab->map.value_size, 8); |
| 510 | |
| 511 | /* check for u32 overflow */ |
| 512 | if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) |
| 513 | goto free_htab; |
| 514 | |
| 515 | err = bpf_map_init_elem_count(map: &htab->map); |
| 516 | if (err) |
| 517 | goto free_htab; |
| 518 | |
| 519 | err = -ENOMEM; |
| 520 | htab->buckets = bpf_map_area_alloc(size: htab->n_buckets * |
| 521 | sizeof(struct bucket), |
| 522 | numa_node: htab->map.numa_node); |
| 523 | if (!htab->buckets) |
| 524 | goto free_elem_count; |
| 525 | |
| 526 | if (htab->map.map_flags & BPF_F_ZERO_SEED) |
| 527 | htab->hashrnd = 0; |
| 528 | else |
| 529 | htab->hashrnd = get_random_u32(); |
| 530 | |
| 531 | htab_init_buckets(htab); |
| 532 | |
| 533 | /* compute_batch_value() computes batch value as num_online_cpus() * 2 |
| 534 | * and __percpu_counter_compare() needs |
| 535 | * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() |
| 536 | * for percpu_counter to be faster than atomic_t. In practice the average bpf |
| 537 | * hash map size is 10k, which means that a system with 64 cpus will fill |
| 538 | * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore |
| 539 | * define our own batch count as 32 then 10k hash map can be filled up to 80%: |
| 540 | * 10k - 8k > 32 _batch_ * 64 _cpus_ |
| 541 | * and __percpu_counter_compare() will still be fast. At that point hash map |
| 542 | * collisions will dominate its performance anyway. Assume that hash map filled |
| 543 | * to 50+% isn't going to be O(1) and use the following formula to choose |
| 544 | * between percpu_counter and atomic_t. |
| 545 | */ |
| 546 | #define PERCPU_COUNTER_BATCH 32 |
| 547 | if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) |
| 548 | htab->use_percpu_counter = true; |
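/*
 * Worked example of the formula above: with max_entries = 10000 and 64
 * online CPUs, 10000 / 2 = 5000 > 64 * 32 = 2048, so the percpu counter
 * is chosen and __percpu_counter_compare() stays cheap until the map is
 * roughly 80% full.  With max_entries = 1000 on the same machine,
 * 500 <= 2048 and the plain atomic_t counter is used instead.
 */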
| 549 | |
| 550 | if (htab->use_percpu_counter) { |
| 551 | err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); |
| 552 | if (err) |
| 553 | goto free_map_locked; |
| 554 | } |
| 555 | |
| 556 | if (prealloc) { |
| 557 | err = prealloc_init(htab); |
| 558 | if (err) |
| 559 | goto free_map_locked; |
| 560 | |
| 561 | if (htab_has_extra_elems(htab)) { |
| 562 | err = alloc_extra_elems(htab); |
| 563 | if (err) |
| 564 | goto free_prealloc; |
| 565 | } |
| 566 | } else { |
| 567 | err = bpf_mem_alloc_init(ma: &htab->ma, size: htab->elem_size, percpu: false); |
| 568 | if (err) |
| 569 | goto free_map_locked; |
| 570 | if (percpu) { |
| 571 | err = bpf_mem_alloc_init(ma: &htab->pcpu_ma, |
| 572 | round_up(htab->map.value_size, 8), percpu: true); |
| 573 | if (err) |
| 574 | goto free_map_locked; |
| 575 | } |
| 576 | } |
| 577 | |
| 578 | return &htab->map; |
| 579 | |
| 580 | free_prealloc: |
| 581 | prealloc_destroy(htab); |
| 582 | free_map_locked: |
| 583 | if (htab->use_percpu_counter) |
| 584 | percpu_counter_destroy(fbc: &htab->pcount); |
| 585 | bpf_map_area_free(base: htab->buckets); |
| 586 | bpf_mem_alloc_destroy(ma: &htab->pcpu_ma); |
| 587 | bpf_mem_alloc_destroy(ma: &htab->ma); |
| 588 | free_elem_count: |
| 589 | bpf_map_free_elem_count(map: &htab->map); |
| 590 | free_htab: |
| 591 | bpf_map_area_free(base: htab); |
| 592 | return ERR_PTR(error: err); |
| 593 | } |
| 594 | |
| 595 | static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) |
| 596 | { |
| 597 | if (likely(key_len % 4 == 0)) |
return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
| 600 | } |
| 601 | |
| 602 | static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) |
| 603 | { |
| 604 | return &htab->buckets[hash & (htab->n_buckets - 1)]; |
| 605 | } |
| 606 | |
| 607 | static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) |
| 608 | { |
| 609 | return &__select_bucket(htab, hash)->head; |
| 610 | } |
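/*
 * Illustrative example: for a 4-byte key the hash is jhash2() over a
 * single u32 word, and the bucket is just the low bits of the hash since
 * n_buckets is always a power of two, e.g. with n_buckets = 1024:
 *
 *	hash   = htab_map_hash(&key, 4, htab->hashrnd);
 *	bucket = hash & (1024 - 1);	// same as hash % 1024, but cheaper
 *
 * This masking is also why lookup_nulls_elem_raw() below can compare the
 * nulls value at the end of a list against (hash & (n_buckets - 1)) to
 * detect that it drifted into another bucket during an RCU walk.
 */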
| 611 | |
| 612 | /* this lookup function can only be called with bucket lock taken */ |
| 613 | static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, |
| 614 | void *key, u32 key_size) |
| 615 | { |
| 616 | struct hlist_nulls_node *n; |
| 617 | struct htab_elem *l; |
| 618 | |
| 619 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
| 620 | if (l->hash == hash && !memcmp(p: &l->key, q: key, size: key_size)) |
| 621 | return l; |
| 622 | |
| 623 | return NULL; |
| 624 | } |
| 625 | |
/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list is being walked
| 629 | */ |
| 630 | static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, |
| 631 | u32 hash, void *key, |
| 632 | u32 key_size, u32 n_buckets) |
| 633 | { |
| 634 | struct hlist_nulls_node *n; |
| 635 | struct htab_elem *l; |
| 636 | |
| 637 | again: |
| 638 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
| 639 | if (l->hash == hash && !memcmp(p: &l->key, q: key, size: key_size)) |
| 640 | return l; |
| 641 | |
| 642 | if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) |
| 643 | goto again; |
| 644 | |
| 645 | return NULL; |
| 646 | } |
| 647 | |
| 648 | /* Called from syscall or from eBPF program directly, so |
| 649 | * arguments have to match bpf_map_lookup_elem() exactly. |
| 650 | * The return value is adjusted by BPF instructions |
| 651 | * in htab_map_gen_lookup(). |
| 652 | */ |
| 653 | static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) |
| 654 | { |
| 655 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 656 | struct hlist_nulls_head *head; |
| 657 | struct htab_elem *l; |
| 658 | u32 hash, key_size; |
| 659 | |
| 660 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 661 | |
| 662 | key_size = map->key_size; |
| 663 | |
hash = htab_map_hash(key, key_size, htab->hashrnd);

head = select_bucket(htab, hash);

l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
| 669 | |
| 670 | return l; |
| 671 | } |
| 672 | |
| 673 | static void *htab_map_lookup_elem(struct bpf_map *map, void *key) |
| 674 | { |
| 675 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
| 676 | |
| 677 | if (l) |
| 678 | return htab_elem_value(l, key_size: map->key_size); |
| 679 | |
| 680 | return NULL; |
| 681 | } |
| 682 | |
| 683 | /* inline bpf_map_lookup_elem() call. |
| 684 | * Instead of: |
| 685 | * bpf_prog |
| 686 | * bpf_map_lookup_elem |
| 687 | * map->ops->map_lookup_elem |
| 688 | * htab_map_lookup_elem |
| 689 | * __htab_map_lookup_elem |
| 690 | * do: |
| 691 | * bpf_prog |
| 692 | * __htab_map_lookup_elem |
| 693 | */ |
| 694 | static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) |
| 695 | { |
| 696 | struct bpf_insn *insn = insn_buf; |
| 697 | const int ret = BPF_REG_0; |
| 698 | |
| 699 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
| 700 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
| 701 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
| 702 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); |
| 703 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
| 704 | offsetof(struct htab_elem, key) + |
| 705 | round_up(map->key_size, 8)); |
| 706 | return insn - insn_buf; |
| 707 | } |
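/*
 * The three instructions emitted above are roughly equivalent to this C
 * (sketch only; the real thing is BPF assembly patched into the caller):
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	if (l == NULL)
 *		return NULL;
 *	return (void *)l + offsetof(struct htab_elem, key) +
 *	       round_up(map->key_size, 8);	// == htab_elem_value(l, key_size)
 *
 * i.e. the JEQ skips the pointer adjustment when no element was found and
 * the ALU64 ADD turns the element pointer into a value pointer.
 */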
| 708 | |
| 709 | static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, |
| 710 | void *key, const bool mark) |
| 711 | { |
| 712 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
| 713 | |
| 714 | if (l) { |
| 715 | if (mark) |
| 716 | bpf_lru_node_set_ref(node: &l->lru_node); |
| 717 | return htab_elem_value(l, key_size: map->key_size); |
| 718 | } |
| 719 | |
| 720 | return NULL; |
| 721 | } |
| 722 | |
| 723 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) |
| 724 | { |
| 725 | return __htab_lru_map_lookup_elem(map, key, mark: true); |
| 726 | } |
| 727 | |
| 728 | static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) |
| 729 | { |
| 730 | return __htab_lru_map_lookup_elem(map, key, mark: false); |
| 731 | } |
| 732 | |
| 733 | static int htab_lru_map_gen_lookup(struct bpf_map *map, |
| 734 | struct bpf_insn *insn_buf) |
| 735 | { |
| 736 | struct bpf_insn *insn = insn_buf; |
| 737 | const int ret = BPF_REG_0; |
| 738 | const int ref_reg = BPF_REG_1; |
| 739 | |
| 740 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
| 741 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
| 742 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
| 743 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); |
| 744 | *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, |
| 745 | offsetof(struct htab_elem, lru_node) + |
| 746 | offsetof(struct bpf_lru_node, ref)); |
| 747 | *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); |
| 748 | *insn++ = BPF_ST_MEM(BPF_B, ret, |
| 749 | offsetof(struct htab_elem, lru_node) + |
| 750 | offsetof(struct bpf_lru_node, ref), |
| 751 | 1); |
| 752 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
| 753 | offsetof(struct htab_elem, key) + |
| 754 | round_up(map->key_size, 8)); |
| 755 | return insn - insn_buf; |
| 756 | } |
| 757 | |
| 758 | static void check_and_free_fields(struct bpf_htab *htab, |
| 759 | struct htab_elem *elem) |
| 760 | { |
| 761 | if (IS_ERR_OR_NULL(ptr: htab->map.record)) |
| 762 | return; |
| 763 | |
| 764 | if (htab_is_percpu(htab)) { |
| 765 | void __percpu *pptr = htab_elem_get_ptr(l: elem, key_size: htab->map.key_size); |
| 766 | int cpu; |
| 767 | |
| 768 | for_each_possible_cpu(cpu) |
| 769 | bpf_obj_free_fields(rec: htab->map.record, per_cpu_ptr(pptr, cpu)); |
| 770 | } else { |
| 771 | void *map_value = htab_elem_value(l: elem, key_size: htab->map.key_size); |
| 772 | |
| 773 | bpf_obj_free_fields(rec: htab->map.record, obj: map_value); |
| 774 | } |
| 775 | } |
| 776 | |
| 777 | /* It is called from the bpf_lru_list when the LRU needs to delete |
| 778 | * older elements from the htab. |
| 779 | */ |
| 780 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) |
| 781 | { |
| 782 | struct bpf_htab *htab = arg; |
| 783 | struct htab_elem *l = NULL, *tgt_l; |
| 784 | struct hlist_nulls_head *head; |
| 785 | struct hlist_nulls_node *n; |
| 786 | unsigned long flags; |
| 787 | struct bucket *b; |
| 788 | int ret; |
| 789 | |
| 790 | tgt_l = container_of(node, struct htab_elem, lru_node); |
| 791 | b = __select_bucket(htab, hash: tgt_l->hash); |
| 792 | head = &b->head; |
| 793 | |
| 794 | ret = htab_lock_bucket(b, pflags: &flags); |
| 795 | if (ret) |
| 796 | return false; |
| 797 | |
| 798 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
| 799 | if (l == tgt_l) { |
| 800 | hlist_nulls_del_rcu(n: &l->hash_node); |
| 801 | bpf_map_dec_elem_count(map: &htab->map); |
| 802 | break; |
| 803 | } |
| 804 | |
| 805 | htab_unlock_bucket(b, flags); |
| 806 | |
| 807 | if (l == tgt_l) |
| 808 | check_and_free_fields(htab, elem: l); |
| 809 | return l == tgt_l; |
| 810 | } |
| 811 | |
| 812 | /* Called from syscall */ |
| 813 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
| 814 | { |
| 815 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 816 | struct hlist_nulls_head *head; |
| 817 | struct htab_elem *l, *next_l; |
| 818 | u32 hash, key_size; |
| 819 | int i = 0; |
| 820 | |
| 821 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 822 | |
| 823 | key_size = map->key_size; |
| 824 | |
| 825 | if (!key) |
| 826 | goto find_first_elem; |
| 827 | |
| 828 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 829 | |
| 830 | head = select_bucket(htab, hash); |
| 831 | |
| 832 | /* lookup the key */ |
| 833 | l = lookup_nulls_elem_raw(head, hash, key, key_size, n_buckets: htab->n_buckets); |
| 834 | |
| 835 | if (!l) |
| 836 | goto find_first_elem; |
| 837 | |
| 838 | /* key was found, get next key in the same bucket */ |
| 839 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), |
| 840 | struct htab_elem, hash_node); |
| 841 | |
| 842 | if (next_l) { |
| 843 | /* if next elem in this hash list is non-zero, just return it */ |
| 844 | memcpy(next_key, next_l->key, key_size); |
| 845 | return 0; |
| 846 | } |
| 847 | |
| 848 | /* no more elements in this hash list, go to the next bucket */ |
| 849 | i = hash & (htab->n_buckets - 1); |
| 850 | i++; |
| 851 | |
| 852 | find_first_elem: |
| 853 | /* iterate over buckets */ |
| 854 | for (; i < htab->n_buckets; i++) { |
| 855 | head = select_bucket(htab, hash: i); |
| 856 | |
| 857 | /* pick first element in the bucket */ |
| 858 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), |
| 859 | struct htab_elem, hash_node); |
| 860 | if (next_l) { |
| 861 | /* if it's not empty, just return it */ |
| 862 | memcpy(next_key, next_l->key, key_size); |
| 863 | return 0; |
| 864 | } |
| 865 | } |
| 866 | |
| 867 | /* iterated over all buckets and all elements */ |
| 868 | return -ENOENT; |
| 869 | } |
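/*
 * User-space sketch (illustrative, assumes libbpf and 4-byte keys/values):
 * the function above is what makes the standard iteration loop work,
 * including restarting from the first bucket when the previous key is
 * NULL or no longer present:
 *
 *	__u32 key, next_key, value;
 *	void *prev = NULL;
 *
 *	while (bpf_map_get_next_key(fd, prev, &next_key) == 0) {
 *		bpf_map_lookup_elem(fd, &next_key, &value);
 *		key = next_key;
 *		prev = &key;
 *	}
 *
 * Concurrent updates may cause the walk to miss or repeat elements; only
 * forward progress is guaranteed, not a consistent snapshot.
 */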
| 870 | |
| 871 | static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) |
| 872 | { |
| 873 | check_and_free_fields(htab, elem: l); |
| 874 | |
| 875 | if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) |
| 876 | bpf_mem_cache_free(ma: &htab->pcpu_ma, ptr: l->ptr_to_pptr); |
| 877 | bpf_mem_cache_free(ma: &htab->ma, ptr: l); |
| 878 | } |
| 879 | |
| 880 | static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) |
| 881 | { |
| 882 | struct bpf_map *map = &htab->map; |
| 883 | void *ptr; |
| 884 | |
| 885 | if (map->ops->map_fd_put_ptr) { |
| 886 | ptr = fd_htab_map_get_ptr(map, l); |
| 887 | map->ops->map_fd_put_ptr(map, ptr, true); |
| 888 | } |
| 889 | } |
| 890 | |
| 891 | static bool is_map_full(struct bpf_htab *htab) |
| 892 | { |
| 893 | if (htab->use_percpu_counter) |
| 894 | return __percpu_counter_compare(fbc: &htab->pcount, rhs: htab->map.max_entries, |
| 895 | PERCPU_COUNTER_BATCH) >= 0; |
| 896 | return atomic_read(v: &htab->count) >= htab->map.max_entries; |
| 897 | } |
| 898 | |
| 899 | static void inc_elem_count(struct bpf_htab *htab) |
| 900 | { |
| 901 | bpf_map_inc_elem_count(map: &htab->map); |
| 902 | |
| 903 | if (htab->use_percpu_counter) |
| 904 | percpu_counter_add_batch(fbc: &htab->pcount, amount: 1, PERCPU_COUNTER_BATCH); |
| 905 | else |
| 906 | atomic_inc(v: &htab->count); |
| 907 | } |
| 908 | |
| 909 | static void dec_elem_count(struct bpf_htab *htab) |
| 910 | { |
| 911 | bpf_map_dec_elem_count(map: &htab->map); |
| 912 | |
| 913 | if (htab->use_percpu_counter) |
| 914 | percpu_counter_add_batch(fbc: &htab->pcount, amount: -1, PERCPU_COUNTER_BATCH); |
| 915 | else |
| 916 | atomic_dec(v: &htab->count); |
| 917 | } |
| 918 | |
| 919 | |
| 920 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
| 921 | { |
| 922 | htab_put_fd_value(htab, l); |
| 923 | |
| 924 | if (htab_is_prealloc(htab)) { |
| 925 | bpf_map_dec_elem_count(map: &htab->map); |
| 926 | check_and_free_fields(htab, elem: l); |
| 927 | pcpu_freelist_push(&htab->freelist, &l->fnode); |
| 928 | } else { |
| 929 | dec_elem_count(htab); |
| 930 | htab_elem_free(htab, l); |
| 931 | } |
| 932 | } |
| 933 | |
| 934 | static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, |
| 935 | void *value, bool onallcpus) |
| 936 | { |
| 937 | void *ptr; |
| 938 | |
| 939 | if (!onallcpus) { |
| 940 | /* copy true value_size bytes */ |
| 941 | ptr = this_cpu_ptr(pptr); |
| 942 | copy_map_value(map: &htab->map, dst: ptr, src: value); |
| 943 | bpf_obj_free_fields(rec: htab->map.record, obj: ptr); |
| 944 | } else { |
| 945 | u32 size = round_up(htab->map.value_size, 8); |
| 946 | int off = 0, cpu; |
| 947 | |
| 948 | for_each_possible_cpu(cpu) { |
| 949 | ptr = per_cpu_ptr(pptr, cpu); |
| 950 | copy_map_value_long(map: &htab->map, dst: ptr, src: value + off); |
| 951 | bpf_obj_free_fields(rec: htab->map.record, obj: ptr); |
| 952 | off += size; |
| 953 | } |
| 954 | } |
| 955 | } |
| 956 | |
| 957 | static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, |
| 958 | void *value, bool onallcpus) |
| 959 | { |
| 960 | /* When not setting the initial value on all cpus, zero-fill element |
| 961 | * values for other cpus. Otherwise, bpf program has no way to ensure |
| 962 | * known initial values for cpus other than current one |
| 963 | * (onallcpus=false always when coming from bpf prog). |
| 964 | */ |
| 965 | if (!onallcpus) { |
| 966 | int current_cpu = raw_smp_processor_id(); |
| 967 | int cpu; |
| 968 | |
| 969 | for_each_possible_cpu(cpu) { |
| 970 | if (cpu == current_cpu) |
| 971 | copy_map_value_long(map: &htab->map, per_cpu_ptr(pptr, cpu), src: value); |
| 972 | else /* Since elem is preallocated, we cannot touch special fields */ |
| 973 | zero_map_value(map: &htab->map, per_cpu_ptr(pptr, cpu)); |
| 974 | } |
| 975 | } else { |
| 976 | pcpu_copy_value(htab, pptr, value, onallcpus); |
| 977 | } |
| 978 | } |
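/*
 * Illustrative layout note: when a per-cpu value travels through the
 * syscall (onallcpus == true), user space supplies one slot per possible
 * CPU, each padded to 8 bytes, which is what the "off += size" walk above
 * consumes.  A user-space update might look like (sketch, assumes libbpf):
 *
 *	int ncpu = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpu, sizeof(__u64));	// value_size 8 -> 8-byte slots
 *
 *	for (int i = 0; i < ncpu; i++)
 *		vals[i] = 42;
 *	bpf_map_update_elem(fd, &key, vals, BPF_ANY);
 *
 * From a BPF program (onallcpus == false) only the current CPU's slot is
 * written and the remaining CPUs are zero-filled by pcpu_init_value().
 */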
| 979 | |
| 980 | static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) |
| 981 | { |
| 982 | return is_fd_htab(htab) && BITS_PER_LONG == 64; |
| 983 | } |
| 984 | |
| 985 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
| 986 | void *value, u32 key_size, u32 hash, |
| 987 | bool percpu, bool onallcpus, |
| 988 | struct htab_elem *old_elem) |
| 989 | { |
| 990 | u32 size = htab->map.value_size; |
| 991 | bool prealloc = htab_is_prealloc(htab); |
| 992 | struct htab_elem *l_new, **pl_new; |
| 993 | void __percpu *pptr; |
| 994 | |
| 995 | if (prealloc) { |
| 996 | if (old_elem) { |
| 997 | /* if we're updating the existing element, |
| 998 | * use per-cpu extra elems to avoid freelist_pop/push |
| 999 | */ |
| 1000 | pl_new = this_cpu_ptr(htab->extra_elems); |
| 1001 | l_new = *pl_new; |
| 1002 | *pl_new = old_elem; |
| 1003 | } else { |
| 1004 | struct pcpu_freelist_node *l; |
| 1005 | |
| 1006 | l = __pcpu_freelist_pop(&htab->freelist); |
| 1007 | if (!l) |
| 1008 | return ERR_PTR(error: -E2BIG); |
| 1009 | l_new = container_of(l, struct htab_elem, fnode); |
| 1010 | bpf_map_inc_elem_count(map: &htab->map); |
| 1011 | } |
| 1012 | } else { |
| 1013 | if (is_map_full(htab)) |
| 1014 | if (!old_elem) |
| 1015 | /* when map is full and update() is replacing |
| 1016 | * old element, it's ok to allocate, since |
| 1017 | * old element will be freed immediately. |
| 1018 | * Otherwise return an error |
| 1019 | */ |
| 1020 | return ERR_PTR(error: -E2BIG); |
| 1021 | inc_elem_count(htab); |
| 1022 | l_new = bpf_mem_cache_alloc(ma: &htab->ma); |
| 1023 | if (!l_new) { |
| 1024 | l_new = ERR_PTR(error: -ENOMEM); |
| 1025 | goto dec_count; |
| 1026 | } |
| 1027 | } |
| 1028 | |
| 1029 | memcpy(l_new->key, key, key_size); |
| 1030 | if (percpu) { |
| 1031 | if (prealloc) { |
| 1032 | pptr = htab_elem_get_ptr(l: l_new, key_size); |
| 1033 | } else { |
| 1034 | /* alloc_percpu zero-fills */ |
| 1035 | void *ptr = bpf_mem_cache_alloc(ma: &htab->pcpu_ma); |
| 1036 | |
| 1037 | if (!ptr) { |
| 1038 | bpf_mem_cache_free(ma: &htab->ma, ptr: l_new); |
| 1039 | l_new = ERR_PTR(error: -ENOMEM); |
| 1040 | goto dec_count; |
| 1041 | } |
| 1042 | l_new->ptr_to_pptr = ptr; |
| 1043 | pptr = *(void __percpu **)ptr; |
| 1044 | } |
| 1045 | |
| 1046 | pcpu_init_value(htab, pptr, value, onallcpus); |
| 1047 | |
| 1048 | if (!prealloc) |
| 1049 | htab_elem_set_ptr(l: l_new, key_size, pptr); |
| 1050 | } else if (fd_htab_map_needs_adjust(htab)) { |
| 1051 | size = round_up(size, 8); |
| 1052 | memcpy(htab_elem_value(l_new, key_size), value, size); |
| 1053 | } else { |
| 1054 | copy_map_value(map: &htab->map, dst: htab_elem_value(l: l_new, key_size), src: value); |
| 1055 | } |
| 1056 | |
| 1057 | l_new->hash = hash; |
| 1058 | return l_new; |
| 1059 | dec_count: |
| 1060 | dec_elem_count(htab); |
| 1061 | return l_new; |
| 1062 | } |
| 1063 | |
| 1064 | static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, |
| 1065 | u64 map_flags) |
| 1066 | { |
| 1067 | if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) |
| 1068 | /* elem already exists */ |
| 1069 | return -EEXIST; |
| 1070 | |
| 1071 | if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) |
| 1072 | /* elem doesn't exist, cannot update it */ |
| 1073 | return -ENOENT; |
| 1074 | |
| 1075 | return 0; |
| 1076 | } |
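/*
 * Resulting update semantics, assuming the key starts out absent
 * (user-space sketch, illustrative only):
 *
 *	bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);	// ok, creates
 *	bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);	// -EEXIST
 *	bpf_map_update_elem(fd, &key, &val, BPF_EXIST);		// ok, replaces
 *	bpf_map_delete_elem(fd, &key);
 *	bpf_map_update_elem(fd, &key, &val, BPF_EXIST);		// -ENOENT
 *	bpf_map_update_elem(fd, &key, &val, BPF_ANY);		// ok either way
 *
 * BPF_F_LOCK is masked off above because it only selects locked value
 * copying and is orthogonal to the exist/not-exist policy.
 */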
| 1077 | |
| 1078 | /* Called from syscall or from eBPF program */ |
| 1079 | static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, |
| 1080 | u64 map_flags) |
| 1081 | { |
| 1082 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1083 | struct htab_elem *l_new, *l_old; |
| 1084 | struct hlist_nulls_head *head; |
| 1085 | unsigned long flags; |
| 1086 | struct bucket *b; |
| 1087 | u32 key_size, hash; |
| 1088 | int ret; |
| 1089 | |
| 1090 | if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) |
| 1091 | /* unknown flags */ |
| 1092 | return -EINVAL; |
| 1093 | |
| 1094 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1095 | |
| 1096 | key_size = map->key_size; |
| 1097 | |
| 1098 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1099 | |
| 1100 | b = __select_bucket(htab, hash); |
| 1101 | head = &b->head; |
| 1102 | |
| 1103 | if (unlikely(map_flags & BPF_F_LOCK)) { |
| 1104 | if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) |
| 1105 | return -EINVAL; |
| 1106 | /* find an element without taking the bucket lock */ |
| 1107 | l_old = lookup_nulls_elem_raw(head, hash, key, key_size, |
| 1108 | n_buckets: htab->n_buckets); |
| 1109 | ret = check_flags(htab, l_old, map_flags); |
| 1110 | if (ret) |
| 1111 | return ret; |
| 1112 | if (l_old) { |
| 1113 | /* grab the element lock and update value in place */ |
| 1114 | copy_map_value_locked(map, |
| 1115 | dst: htab_elem_value(l: l_old, key_size), |
| 1116 | src: value, lock_src: false); |
| 1117 | return 0; |
| 1118 | } |
| 1119 | /* fall through, grab the bucket lock and lookup again. |
| 1120 | * 99.9% chance that the element won't be found, |
| 1121 | * but second lookup under lock has to be done. |
| 1122 | */ |
| 1123 | } |
| 1124 | |
| 1125 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1126 | if (ret) |
| 1127 | return ret; |
| 1128 | |
| 1129 | l_old = lookup_elem_raw(head, hash, key, key_size); |
| 1130 | |
| 1131 | ret = check_flags(htab, l_old, map_flags); |
| 1132 | if (ret) |
| 1133 | goto err; |
| 1134 | |
| 1135 | if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { |
| 1136 | /* first lookup without the bucket lock didn't find the element, |
| 1137 | * but second lookup with the bucket lock found it. |
| 1138 | * This case is highly unlikely, but has to be dealt with: |
| 1139 | * grab the element lock in addition to the bucket lock |
| 1140 | * and update element in place |
| 1141 | */ |
| 1142 | copy_map_value_locked(map, |
| 1143 | dst: htab_elem_value(l: l_old, key_size), |
| 1144 | src: value, lock_src: false); |
| 1145 | ret = 0; |
| 1146 | goto err; |
| 1147 | } |
| 1148 | |
| 1149 | l_new = alloc_htab_elem(htab, key, value, key_size, hash, percpu: false, onallcpus: false, |
| 1150 | old_elem: l_old); |
| 1151 | if (IS_ERR(ptr: l_new)) { |
| 1152 | /* all pre-allocated elements are in use or memory exhausted */ |
| 1153 | ret = PTR_ERR(ptr: l_new); |
| 1154 | goto err; |
| 1155 | } |
| 1156 | |
| 1157 | /* add new element to the head of the list, so that |
| 1158 | * concurrent search will find it before old elem |
| 1159 | */ |
| 1160 | hlist_nulls_add_head_rcu(n: &l_new->hash_node, h: head); |
| 1161 | if (l_old) { |
| 1162 | hlist_nulls_del_rcu(n: &l_old->hash_node); |
| 1163 | |
| 1164 | /* l_old has already been stashed in htab->extra_elems, free |
| 1165 | * its special fields before it is available for reuse. |
| 1166 | */ |
| 1167 | if (htab_is_prealloc(htab)) |
| 1168 | check_and_free_fields(htab, elem: l_old); |
| 1169 | } |
| 1170 | htab_unlock_bucket(b, flags); |
| 1171 | if (l_old && !htab_is_prealloc(htab)) |
| 1172 | free_htab_elem(htab, l: l_old); |
| 1173 | return 0; |
| 1174 | err: |
| 1175 | htab_unlock_bucket(b, flags); |
| 1176 | return ret; |
| 1177 | } |
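/*
 * BPF_F_LOCK usage sketch (illustrative; the struct and variable names are
 * examples, not taken from this file): the locked in-place copy above only
 * works when the map value embeds a struct bpf_spin_lock, e.g.
 *
 *	struct val {
 *		struct bpf_spin_lock lock;
 *		__u64 counter;
 *	};
 *
 *	// user space, via libbpf:
 *	bpf_map_update_elem(fd, &key, &v, BPF_F_LOCK);		// write under the lock
 *	bpf_map_lookup_elem_flags(fd, &key, &v, BPF_F_LOCK);	// read under the lock
 *
 * Without a bpf_spin_lock field in the value, the check at the top of this
 * function rejects BPF_F_LOCK with -EINVAL.
 */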
| 1178 | |
| 1179 | static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) |
| 1180 | { |
| 1181 | check_and_free_fields(htab, elem); |
| 1182 | bpf_map_dec_elem_count(map: &htab->map); |
| 1183 | bpf_lru_push_free(lru: &htab->lru, node: &elem->lru_node); |
| 1184 | } |
| 1185 | |
| 1186 | static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, |
| 1187 | u64 map_flags) |
| 1188 | { |
| 1189 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1190 | struct htab_elem *l_new, *l_old = NULL; |
| 1191 | struct hlist_nulls_head *head; |
| 1192 | unsigned long flags; |
| 1193 | struct bucket *b; |
| 1194 | u32 key_size, hash; |
| 1195 | int ret; |
| 1196 | |
| 1197 | if (unlikely(map_flags > BPF_EXIST)) |
| 1198 | /* unknown flags */ |
| 1199 | return -EINVAL; |
| 1200 | |
| 1201 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1202 | |
| 1203 | key_size = map->key_size; |
| 1204 | |
| 1205 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1206 | |
| 1207 | b = __select_bucket(htab, hash); |
| 1208 | head = &b->head; |
| 1209 | |
| 1210 | /* For LRU, we need to alloc before taking bucket's |
| 1211 | * spinlock because getting free nodes from LRU may need |
| 1212 | * to remove older elements from htab and this removal |
| 1213 | * operation will need a bucket lock. |
| 1214 | */ |
| 1215 | l_new = prealloc_lru_pop(htab, key, hash); |
| 1216 | if (!l_new) |
| 1217 | return -ENOMEM; |
| 1218 | copy_map_value(map: &htab->map, dst: htab_elem_value(l: l_new, key_size: map->key_size), src: value); |
| 1219 | |
| 1220 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1221 | if (ret) |
| 1222 | goto err_lock_bucket; |
| 1223 | |
| 1224 | l_old = lookup_elem_raw(head, hash, key, key_size); |
| 1225 | |
| 1226 | ret = check_flags(htab, l_old, map_flags); |
| 1227 | if (ret) |
| 1228 | goto err; |
| 1229 | |
| 1230 | /* add new element to the head of the list, so that |
| 1231 | * concurrent search will find it before old elem |
| 1232 | */ |
| 1233 | hlist_nulls_add_head_rcu(n: &l_new->hash_node, h: head); |
| 1234 | if (l_old) { |
| 1235 | bpf_lru_node_set_ref(node: &l_new->lru_node); |
| 1236 | hlist_nulls_del_rcu(n: &l_old->hash_node); |
| 1237 | } |
| 1238 | ret = 0; |
| 1239 | |
| 1240 | err: |
| 1241 | htab_unlock_bucket(b, flags); |
| 1242 | |
| 1243 | err_lock_bucket: |
| 1244 | if (ret) |
| 1245 | htab_lru_push_free(htab, elem: l_new); |
| 1246 | else if (l_old) |
| 1247 | htab_lru_push_free(htab, elem: l_old); |
| 1248 | |
| 1249 | return ret; |
| 1250 | } |
| 1251 | |
| 1252 | static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, |
| 1253 | void *value, u64 map_flags, |
| 1254 | bool percpu, bool onallcpus) |
| 1255 | { |
| 1256 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1257 | struct htab_elem *l_new, *l_old; |
| 1258 | struct hlist_nulls_head *head; |
| 1259 | void *old_map_ptr = NULL; |
| 1260 | unsigned long flags; |
| 1261 | struct bucket *b; |
| 1262 | u32 key_size, hash; |
| 1263 | int ret; |
| 1264 | |
| 1265 | if (unlikely(map_flags > BPF_EXIST)) |
| 1266 | /* unknown flags */ |
| 1267 | return -EINVAL; |
| 1268 | |
| 1269 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1270 | |
| 1271 | key_size = map->key_size; |
| 1272 | |
| 1273 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1274 | |
| 1275 | b = __select_bucket(htab, hash); |
| 1276 | head = &b->head; |
| 1277 | |
| 1278 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1279 | if (ret) |
| 1280 | return ret; |
| 1281 | |
| 1282 | l_old = lookup_elem_raw(head, hash, key, key_size); |
| 1283 | |
| 1284 | ret = check_flags(htab, l_old, map_flags); |
| 1285 | if (ret) |
| 1286 | goto err; |
| 1287 | |
| 1288 | if (l_old) { |
| 1289 | /* Update value in-place */ |
| 1290 | if (percpu) { |
| 1291 | pcpu_copy_value(htab, pptr: htab_elem_get_ptr(l: l_old, key_size), |
| 1292 | value, onallcpus); |
| 1293 | } else { |
| 1294 | void **inner_map_pptr = htab_elem_value(l: l_old, key_size); |
| 1295 | |
| 1296 | old_map_ptr = *inner_map_pptr; |
| 1297 | WRITE_ONCE(*inner_map_pptr, *(void **)value); |
| 1298 | } |
| 1299 | } else { |
| 1300 | l_new = alloc_htab_elem(htab, key, value, key_size, |
| 1301 | hash, percpu, onallcpus, NULL); |
| 1302 | if (IS_ERR(ptr: l_new)) { |
| 1303 | ret = PTR_ERR(ptr: l_new); |
| 1304 | goto err; |
| 1305 | } |
| 1306 | hlist_nulls_add_head_rcu(n: &l_new->hash_node, h: head); |
| 1307 | } |
| 1308 | err: |
| 1309 | htab_unlock_bucket(b, flags); |
| 1310 | if (old_map_ptr) |
| 1311 | map->ops->map_fd_put_ptr(map, old_map_ptr, true); |
| 1312 | return ret; |
| 1313 | } |
| 1314 | |
| 1315 | static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
| 1316 | void *value, u64 map_flags, |
| 1317 | bool onallcpus) |
| 1318 | { |
| 1319 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1320 | struct htab_elem *l_new = NULL, *l_old; |
| 1321 | struct hlist_nulls_head *head; |
| 1322 | unsigned long flags; |
| 1323 | struct bucket *b; |
| 1324 | u32 key_size, hash; |
| 1325 | int ret; |
| 1326 | |
| 1327 | if (unlikely(map_flags > BPF_EXIST)) |
| 1328 | /* unknown flags */ |
| 1329 | return -EINVAL; |
| 1330 | |
| 1331 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1332 | |
| 1333 | key_size = map->key_size; |
| 1334 | |
| 1335 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1336 | |
| 1337 | b = __select_bucket(htab, hash); |
| 1338 | head = &b->head; |
| 1339 | |
| 1340 | /* For LRU, we need to alloc before taking bucket's |
| 1341 | * spinlock because LRU's elem alloc may need |
| 1342 | * to remove older elem from htab and this removal |
| 1343 | * operation will need a bucket lock. |
| 1344 | */ |
| 1345 | if (map_flags != BPF_EXIST) { |
| 1346 | l_new = prealloc_lru_pop(htab, key, hash); |
| 1347 | if (!l_new) |
| 1348 | return -ENOMEM; |
| 1349 | } |
| 1350 | |
| 1351 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1352 | if (ret) |
| 1353 | goto err_lock_bucket; |
| 1354 | |
| 1355 | l_old = lookup_elem_raw(head, hash, key, key_size); |
| 1356 | |
| 1357 | ret = check_flags(htab, l_old, map_flags); |
| 1358 | if (ret) |
| 1359 | goto err; |
| 1360 | |
| 1361 | if (l_old) { |
| 1362 | bpf_lru_node_set_ref(node: &l_old->lru_node); |
| 1363 | |
| 1364 | /* per-cpu hash map can update value in-place */ |
| 1365 | pcpu_copy_value(htab, pptr: htab_elem_get_ptr(l: l_old, key_size), |
| 1366 | value, onallcpus); |
| 1367 | } else { |
| 1368 | pcpu_init_value(htab, pptr: htab_elem_get_ptr(l: l_new, key_size), |
| 1369 | value, onallcpus); |
| 1370 | hlist_nulls_add_head_rcu(n: &l_new->hash_node, h: head); |
| 1371 | l_new = NULL; |
| 1372 | } |
| 1373 | ret = 0; |
| 1374 | err: |
| 1375 | htab_unlock_bucket(b, flags); |
| 1376 | err_lock_bucket: |
| 1377 | if (l_new) { |
| 1378 | bpf_map_dec_elem_count(map: &htab->map); |
| 1379 | bpf_lru_push_free(lru: &htab->lru, node: &l_new->lru_node); |
| 1380 | } |
| 1381 | return ret; |
| 1382 | } |
| 1383 | |
| 1384 | static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, |
| 1385 | void *value, u64 map_flags) |
| 1386 | { |
| 1387 | return htab_map_update_elem_in_place(map, key, value, map_flags, percpu: true, onallcpus: false); |
| 1388 | } |
| 1389 | |
| 1390 | static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
| 1391 | void *value, u64 map_flags) |
| 1392 | { |
| 1393 | return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, |
| 1394 | onallcpus: false); |
| 1395 | } |
| 1396 | |
| 1397 | /* Called from syscall or from eBPF program */ |
| 1398 | static long htab_map_delete_elem(struct bpf_map *map, void *key) |
| 1399 | { |
| 1400 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1401 | struct hlist_nulls_head *head; |
| 1402 | struct bucket *b; |
| 1403 | struct htab_elem *l; |
| 1404 | unsigned long flags; |
| 1405 | u32 hash, key_size; |
| 1406 | int ret; |
| 1407 | |
| 1408 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1409 | |
| 1410 | key_size = map->key_size; |
| 1411 | |
| 1412 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1413 | b = __select_bucket(htab, hash); |
| 1414 | head = &b->head; |
| 1415 | |
| 1416 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1417 | if (ret) |
| 1418 | return ret; |
| 1419 | |
| 1420 | l = lookup_elem_raw(head, hash, key, key_size); |
| 1421 | if (l) |
| 1422 | hlist_nulls_del_rcu(n: &l->hash_node); |
| 1423 | else |
| 1424 | ret = -ENOENT; |
| 1425 | |
| 1426 | htab_unlock_bucket(b, flags); |
| 1427 | |
| 1428 | if (l) |
| 1429 | free_htab_elem(htab, l); |
| 1430 | return ret; |
| 1431 | } |
| 1432 | |
| 1433 | static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) |
| 1434 | { |
| 1435 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1436 | struct hlist_nulls_head *head; |
| 1437 | struct bucket *b; |
| 1438 | struct htab_elem *l; |
| 1439 | unsigned long flags; |
| 1440 | u32 hash, key_size; |
| 1441 | int ret; |
| 1442 | |
| 1443 | WARN_ON_ONCE(!bpf_rcu_lock_held()); |
| 1444 | |
| 1445 | key_size = map->key_size; |
| 1446 | |
| 1447 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
| 1448 | b = __select_bucket(htab, hash); |
| 1449 | head = &b->head; |
| 1450 | |
| 1451 | ret = htab_lock_bucket(b, pflags: &flags); |
| 1452 | if (ret) |
| 1453 | return ret; |
| 1454 | |
| 1455 | l = lookup_elem_raw(head, hash, key, key_size); |
| 1456 | |
| 1457 | if (l) |
| 1458 | hlist_nulls_del_rcu(n: &l->hash_node); |
| 1459 | else |
| 1460 | ret = -ENOENT; |
| 1461 | |
| 1462 | htab_unlock_bucket(b, flags); |
| 1463 | if (l) |
| 1464 | htab_lru_push_free(htab, elem: l); |
| 1465 | return ret; |
| 1466 | } |
| 1467 | |
| 1468 | static void delete_all_elements(struct bpf_htab *htab) |
| 1469 | { |
| 1470 | int i; |
| 1471 | |
| 1472 | /* It's called from a worker thread and migration has been disabled, |
| 1473 | * therefore, it is OK to invoke bpf_mem_cache_free() directly. |
| 1474 | */ |
| 1475 | for (i = 0; i < htab->n_buckets; i++) { |
| 1476 | struct hlist_nulls_head *head = select_bucket(htab, hash: i); |
| 1477 | struct hlist_nulls_node *n; |
| 1478 | struct htab_elem *l; |
| 1479 | |
| 1480 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
| 1481 | hlist_nulls_del_rcu(n: &l->hash_node); |
| 1482 | htab_elem_free(htab, l); |
| 1483 | } |
| 1484 | cond_resched(); |
| 1485 | } |
| 1486 | } |
| 1487 | |
| 1488 | static void htab_free_malloced_internal_structs(struct bpf_htab *htab) |
| 1489 | { |
| 1490 | int i; |
| 1491 | |
| 1492 | rcu_read_lock(); |
| 1493 | for (i = 0; i < htab->n_buckets; i++) { |
| 1494 | struct hlist_nulls_head *head = select_bucket(htab, hash: i); |
| 1495 | struct hlist_nulls_node *n; |
| 1496 | struct htab_elem *l; |
| 1497 | |
| 1498 | hlist_nulls_for_each_entry(l, n, head, hash_node) { |
| 1499 | /* We only free internal structs on uref dropping to zero */ |
| 1500 | bpf_map_free_internal_structs(map: &htab->map, |
| 1501 | obj: htab_elem_value(l, key_size: htab->map.key_size)); |
| 1502 | } |
| 1503 | cond_resched_rcu(); |
| 1504 | } |
| 1505 | rcu_read_unlock(); |
| 1506 | } |
| 1507 | |
| 1508 | static void htab_map_free_internal_structs(struct bpf_map *map) |
| 1509 | { |
| 1510 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1511 | |
| 1512 | /* We only free internal structs on uref dropping to zero */ |
| 1513 | if (!bpf_map_has_internal_structs(map)) |
| 1514 | return; |
| 1515 | |
| 1516 | if (htab_is_prealloc(htab)) |
| 1517 | htab_free_prealloced_internal_structs(htab); |
| 1518 | else |
| 1519 | htab_free_malloced_internal_structs(htab); |
| 1520 | } |
| 1521 | |
| 1522 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
| 1523 | static void htab_map_free(struct bpf_map *map) |
| 1524 | { |
| 1525 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1526 | |
| 1527 | /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. |
| 1528 | * bpf_free_used_maps() is called after bpf prog is no longer executing. |
| 1529 | * There is no need to synchronize_rcu() here to protect map elements. |
| 1530 | */ |
| 1531 | |
| 1532 | /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it |
| 1533 | * underneath and is responsible for waiting for callbacks to finish |
| 1534 | * during bpf_mem_alloc_destroy(). |
| 1535 | */ |
| 1536 | if (!htab_is_prealloc(htab)) { |
| 1537 | delete_all_elements(htab); |
| 1538 | } else { |
| 1539 | htab_free_prealloced_fields(htab); |
| 1540 | prealloc_destroy(htab); |
| 1541 | } |
| 1542 | |
| 1543 | bpf_map_free_elem_count(map); |
| 1544 | free_percpu(pdata: htab->extra_elems); |
| 1545 | bpf_map_area_free(base: htab->buckets); |
| 1546 | bpf_mem_alloc_destroy(ma: &htab->pcpu_ma); |
| 1547 | bpf_mem_alloc_destroy(ma: &htab->ma); |
| 1548 | if (htab->use_percpu_counter) |
| 1549 | percpu_counter_destroy(fbc: &htab->pcount); |
| 1550 | bpf_map_area_free(base: htab); |
| 1551 | } |
| 1552 | |
| 1553 | static void htab_map_seq_show_elem(struct bpf_map *map, void *key, |
| 1554 | struct seq_file *m) |
| 1555 | { |
| 1556 | void *value; |
| 1557 | |
| 1558 | rcu_read_lock(); |
| 1559 | |
| 1560 | value = htab_map_lookup_elem(map, key); |
| 1561 | if (!value) { |
| 1562 | rcu_read_unlock(); |
| 1563 | return; |
| 1564 | } |
| 1565 | |
| 1566 | btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
| 1567 | seq_puts(m, ": ");
| 1568 | btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
| 1569 | seq_putc(m, '\n');
| 1570 | |
| 1571 | rcu_read_unlock(); |
| 1572 | } |
| 1573 | |
| 1574 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
| 1575 | void *value, bool is_lru_map, |
| 1576 | bool is_percpu, u64 flags) |
| 1577 | { |
| 1578 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1579 | struct hlist_nulls_head *head; |
| 1580 | unsigned long bflags; |
| 1581 | struct htab_elem *l; |
| 1582 | u32 hash, key_size; |
| 1583 | struct bucket *b; |
| 1584 | int ret; |
| 1585 | |
| 1586 | key_size = map->key_size; |
| 1587 | |
| 1588 | hash = htab_map_hash(key, key_size, htab->hashrnd);
| 1589 | b = __select_bucket(htab, hash); |
| 1590 | head = &b->head; |
| 1591 | |
| 1592 | ret = htab_lock_bucket(b, &bflags);
| 1593 | if (ret) |
| 1594 | return ret; |
| 1595 | |
| 1596 | l = lookup_elem_raw(head, hash, key, key_size); |
| 1597 | if (!l) { |
| 1598 | ret = -ENOENT; |
| 1599 | goto out_unlock; |
| 1600 | } |
| 1601 | |
| 1602 | if (is_percpu) { |
| 1603 | u32 roundup_value_size = round_up(map->value_size, 8); |
| 1604 | void __percpu *pptr; |
| 1605 | int off = 0, cpu; |
| 1606 | |
| 1607 | pptr = htab_elem_get_ptr(l, key_size); |
| 1608 | for_each_possible_cpu(cpu) { |
| 1609 | copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
| 1610 | check_and_init_map_value(&htab->map, value + off);
| 1611 | off += roundup_value_size; |
| 1612 | } |
| 1613 | } else { |
| 1614 | void *src = htab_elem_value(l, map->key_size);
| 1615 | |
| 1616 | if (flags & BPF_F_LOCK) |
| 1617 | copy_map_value_locked(map, value, src, true);
| 1618 | else
| 1619 | copy_map_value(map, value, src);
| 1620 | /* Zeroing special fields in the temp buffer */
| 1621 | check_and_init_map_value(map, value);
| 1622 | } |
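| | /* Unlink the element while the bucket lock is held; the actual free
| | * (LRU push or free_htab_elem()) is deferred until after the lock is
| | * dropped below.
| | */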
| 1623 | hlist_nulls_del_rcu(&l->hash_node);
| 1624 | |
| 1625 | out_unlock: |
| 1626 | htab_unlock_bucket(b, bflags);
| 1627 | |
| 1628 | if (l) { |
| 1629 | if (is_lru_map) |
| 1630 | htab_lru_push_free(htab, l);
| 1631 | else |
| 1632 | free_htab_elem(htab, l); |
| 1633 | } |
| 1634 | |
| 1635 | return ret; |
| 1636 | } |
| 1637 | |
| 1638 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
| 1639 | void *value, u64 flags) |
| 1640 | { |
| 1641 | return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
| 1642 | flags); |
| 1643 | } |
| 1644 | |
| 1645 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
| 1646 | void *key, void *value, |
| 1647 | u64 flags) |
| 1648 | { |
| 1649 | return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
| 1650 | flags); |
| 1651 | } |
| 1652 | |
| 1653 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
| 1654 | void *value, u64 flags) |
| 1655 | { |
| 1656 | return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
| 1657 | flags); |
| 1658 | } |
| 1659 | |
| 1660 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
| 1661 | void *key, void *value, |
| 1662 | u64 flags) |
| 1663 | { |
| 1664 | return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
| 1665 | flags); |
| 1666 | } |
| 1667 | |
| 1668 | static int |
| 1669 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, |
| 1670 | const union bpf_attr *attr, |
| 1671 | union bpf_attr __user *uattr, |
| 1672 | bool do_delete, bool is_lru_map, |
| 1673 | bool is_percpu) |
| 1674 | { |
| 1675 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 1676 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; |
| 1677 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); |
| 1678 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); |
| 1679 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
| 1680 | u32 batch, max_count, size, bucket_size, map_id; |
| 1681 | u32 bucket_cnt, total, key_size, value_size; |
| 1682 | struct htab_elem *node_to_free = NULL; |
| 1683 | u64 elem_map_flags, map_flags; |
| 1684 | struct hlist_nulls_head *head; |
| 1685 | struct hlist_nulls_node *n; |
| 1686 | unsigned long flags = 0; |
| 1687 | bool locked = false; |
| 1688 | struct htab_elem *l; |
| 1689 | struct bucket *b; |
| 1690 | int ret = 0; |
| 1691 | |
| 1692 | elem_map_flags = attr->batch.elem_flags; |
| 1693 | if ((elem_map_flags & ~BPF_F_LOCK) || |
| 1694 | ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
| 1695 | return -EINVAL; |
| 1696 | |
| 1697 | map_flags = attr->batch.flags; |
| 1698 | if (map_flags) |
| 1699 | return -EINVAL; |
| 1700 | |
| 1701 | max_count = attr->batch.count; |
| 1702 | if (!max_count) |
| 1703 | return 0; |
| 1704 | |
| 1705 | if (put_user(0, &uattr->batch.count)) |
| 1706 | return -EFAULT; |
| 1707 | |
| 1708 | batch = 0; |
| 1709 | if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
| 1710 | return -EFAULT; |
| 1711 | |
| 1712 | if (batch >= htab->n_buckets) |
| 1713 | return -ENOENT; |
| 1714 | |
| 1715 | key_size = htab->map.key_size; |
| 1716 | value_size = htab->map.value_size; |
| 1717 | size = round_up(value_size, 8); |
| 1718 | if (is_percpu) |
| 1719 | value_size = size * num_possible_cpus(); |
| 1720 | total = 0; |
| 1721 | /* while experimenting with hash tables with sizes ranging from 10 to |
| 1722 | * 1000, it was observed that a bucket can have up to 5 entries. |
| 1723 | */ |
| 1724 | bucket_size = 5; |
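| | /* bucket_size is only the initial allocation guess; if a bucket turns
| | * out to hold more entries, the buffers are reallocated (see the
| | * bucket_cnt > bucket_size check below) and the bucket is retried.
| | */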
| 1725 | |
| 1726 | alloc: |
| 1727 | /* We cannot do copy_from_user or copy_to_user inside |
| 1728 | * the rcu_read_lock. Allocate enough space here. |
| 1729 | */ |
| 1730 | keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); |
| 1731 | values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); |
| 1732 | if (!keys || !values) { |
| 1733 | ret = -ENOMEM; |
| 1734 | goto after_loop; |
| 1735 | } |
| 1736 | |
| 1737 | again: |
| 1738 | bpf_disable_instrumentation(); |
| 1739 | rcu_read_lock(); |
| 1740 | again_nocopy: |
| 1741 | dst_key = keys; |
| 1742 | dst_val = values; |
| 1743 | b = &htab->buckets[batch]; |
| 1744 | head = &b->head; |
| 1745 | /* do not grab the lock unless we need it (bucket_cnt > 0). */
| 1746 | if (locked) { |
| 1747 | ret = htab_lock_bucket(b, &flags);
| 1748 | if (ret) { |
| 1749 | rcu_read_unlock(); |
| 1750 | bpf_enable_instrumentation(); |
| 1751 | goto after_loop; |
| 1752 | } |
| 1753 | } |
| 1754 | |
| 1755 | bucket_cnt = 0; |
| 1756 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
| 1757 | bucket_cnt++; |
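| | /* The count above ran without the bucket lock; if the bucket is
| | * non-empty, take the lock and recount under it so the copy below sees
| | * a stable bucket.
| | */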
| 1758 | |
| 1759 | if (bucket_cnt && !locked) { |
| 1760 | locked = true; |
| 1761 | goto again_nocopy; |
| 1762 | } |
| 1763 | |
| 1764 | if (bucket_cnt > (max_count - total)) { |
| 1765 | if (total == 0) |
| 1766 | ret = -ENOSPC; |
| 1767 | /* Note that since bucket_cnt > 0 here, it is implicit
| 1768 | * that the lock was grabbed, so release it.
| 1769 | */
| 1770 | htab_unlock_bucket(b, flags); |
| 1771 | rcu_read_unlock(); |
| 1772 | bpf_enable_instrumentation(); |
| 1773 | goto after_loop; |
| 1774 | } |
| 1775 | |
| 1776 | if (bucket_cnt > bucket_size) { |
| 1777 | bucket_size = bucket_cnt; |
| 1778 | /* Note that since bucket_cnt > 0 here, it is implicit
| 1779 | * that the lock was grabbed, so release it.
| 1780 | */
| 1781 | htab_unlock_bucket(b, flags); |
| 1782 | rcu_read_unlock(); |
| 1783 | bpf_enable_instrumentation(); |
| 1784 | kvfree(keys);
| 1785 | kvfree(values);
| 1786 | goto alloc; |
| 1787 | } |
| 1788 | |
| 1789 | /* Next block is only safe to run if you have grabbed the lock */ |
| 1790 | if (!locked) |
| 1791 | goto next_batch; |
| 1792 | |
| 1793 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
| 1794 | memcpy(dst_key, l->key, key_size); |
| 1795 | |
| 1796 | if (is_percpu) { |
| 1797 | int off = 0, cpu; |
| 1798 | void __percpu *pptr; |
| 1799 | |
| 1800 | pptr = htab_elem_get_ptr(l, map->key_size);
| 1801 | for_each_possible_cpu(cpu) { |
| 1802 | copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
| 1803 | check_and_init_map_value(&htab->map, dst_val + off);
| 1804 | off += size; |
| 1805 | } |
| 1806 | } else { |
| 1807 | value = htab_elem_value(l, key_size); |
| 1808 | if (is_fd_htab(htab)) { |
| 1809 | struct bpf_map **inner_map = value; |
| 1810 | |
| 1811 | /* Actual value is the id of the inner map */ |
| 1812 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); |
| 1813 | value = &map_id; |
| 1814 | } |
| 1815 | |
| 1816 | if (elem_map_flags & BPF_F_LOCK) |
| 1817 | copy_map_value_locked(map, dst_val, value,
| 1818 | true);
| 1819 | else
| 1820 | copy_map_value(map, dst_val, value);
| 1821 | /* Zeroing special fields in the temp buffer */
| 1822 | check_and_init_map_value(map, dst_val);
| 1823 | } |
| 1824 | if (do_delete) { |
| 1825 | hlist_nulls_del_rcu(&l->hash_node);
| 1826 | |
| 1827 | /* bpf_lru_push_free() will acquire lru_lock, which |
| 1828 | * may cause deadlock. See comments in function |
| 1829 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() |
| 1830 | * after releasing the bucket lock. |
| 1831 | * |
| 1832 | * For htab of maps, htab_put_fd_value() in |
| 1833 | * free_htab_elem() may acquire a spinlock with bucket |
| 1834 | * lock being held and it violates the lock rule, so |
| 1835 | * invoke free_htab_elem() after unlock as well. |
| 1836 | */ |
| 1837 | l->batch_flink = node_to_free; |
| 1838 | node_to_free = l; |
| 1839 | } |
| 1840 | dst_key += key_size; |
| 1841 | dst_val += value_size; |
| 1842 | } |
| 1843 | |
| 1844 | htab_unlock_bucket(b, flags); |
| 1845 | locked = false; |
| 1846 | |
| 1847 | while (node_to_free) { |
| 1848 | l = node_to_free; |
| 1849 | node_to_free = node_to_free->batch_flink; |
| 1850 | if (is_lru_map) |
| 1851 | htab_lru_push_free(htab, l);
| 1852 | else |
| 1853 | free_htab_elem(htab, l); |
| 1854 | } |
| 1855 | |
| 1856 | next_batch: |
| 1857 | /* If we are not copying data, we can go to the next bucket and avoid
| 1858 | * dropping the RCU read lock.
| 1859 | */
| 1860 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { |
| 1861 | batch++; |
| 1862 | goto again_nocopy; |
| 1863 | } |
| 1864 | |
| 1865 | rcu_read_unlock(); |
| 1866 | bpf_enable_instrumentation(); |
| 1867 | if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
| 1868 | key_size * bucket_cnt) ||
| 1869 | copy_to_user(uvalues + total * value_size, values,
| 1870 | value_size * bucket_cnt))) {
| 1871 | ret = -EFAULT; |
| 1872 | goto after_loop; |
| 1873 | } |
| 1874 | |
| 1875 | total += bucket_cnt; |
| 1876 | batch++; |
| 1877 | if (batch >= htab->n_buckets) { |
| 1878 | ret = -ENOENT; |
| 1879 | goto after_loop; |
| 1880 | } |
| 1881 | goto again; |
| 1882 | |
| 1883 | after_loop: |
| 1884 | if (ret == -EFAULT) |
| 1885 | goto out; |
| 1886 | |
| 1887 | /* copy # of entries and next batch */ |
| 1888 | ubatch = u64_to_user_ptr(attr->batch.out_batch); |
| 1889 | if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
| 1890 | put_user(total, &uattr->batch.count)) |
| 1891 | ret = -EFAULT; |
| 1892 | |
| 1893 | out: |
| 1894 | kvfree(keys);
| 1895 | kvfree(values);
| 1896 | return ret; |
| 1897 | } |
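| | /*
| | * The batch walk above is driven from userspace via the
| | * BPF_MAP_LOOKUP_BATCH / BPF_MAP_LOOKUP_AND_DELETE_BATCH commands. A
| | * minimal sketch using libbpf's bpf_map_lookup_batch() (error handling
| | * elided; 'fd', 'keys', 'values' and BATCH_SZ are assumptions for
| | * illustration only):
| | *
| | *	__u32 in = 0, out = 0, count;
| | *	void *in_batch = NULL;
| | *	int err;
| | *
| | *	do {
| | *		count = BATCH_SZ;
| | *		err = bpf_map_lookup_batch(fd, in_batch, &out, keys, values,
| | *					   &count, NULL);
| | *		// 'count' entries were copied out; -ENOENT from the last
| | *		// call means the final bucket has been reached.
| | *		in = out;
| | *		in_batch = &in;
| | *	} while (!err);
| | */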
| 1898 | |
| 1899 | static int |
| 1900 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
| 1901 | union bpf_attr __user *uattr) |
| 1902 | { |
| 1903 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
| 1904 | false, true);
| 1905 | } |
| 1906 | |
| 1907 | static int |
| 1908 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
| 1909 | const union bpf_attr *attr, |
| 1910 | union bpf_attr __user *uattr) |
| 1911 | { |
| 1912 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
| 1913 | false, true);
| 1914 | } |
| 1915 | |
| 1916 | static int |
| 1917 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
| 1918 | union bpf_attr __user *uattr) |
| 1919 | { |
| 1920 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
| 1921 | false, false);
| 1922 | } |
| 1923 | |
| 1924 | static int |
| 1925 | htab_map_lookup_and_delete_batch(struct bpf_map *map, |
| 1926 | const union bpf_attr *attr, |
| 1927 | union bpf_attr __user *uattr) |
| 1928 | { |
| 1929 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
| 1930 | false, false);
| 1931 | } |
| 1932 | |
| 1933 | static int |
| 1934 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, |
| 1935 | const union bpf_attr *attr, |
| 1936 | union bpf_attr __user *uattr) |
| 1937 | { |
| 1938 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
| 1939 | true, true);
| 1940 | } |
| 1941 | |
| 1942 | static int |
| 1943 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
| 1944 | const union bpf_attr *attr, |
| 1945 | union bpf_attr __user *uattr) |
| 1946 | { |
| 1947 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
| 1948 | true, true);
| 1949 | } |
| 1950 | |
| 1951 | static int |
| 1952 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
| 1953 | union bpf_attr __user *uattr) |
| 1954 | { |
| 1955 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
| 1956 | true, false);
| 1957 | } |
| 1958 | |
| 1959 | static int |
| 1960 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, |
| 1961 | const union bpf_attr *attr, |
| 1962 | union bpf_attr __user *uattr) |
| 1963 | { |
| 1964 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
| 1965 | true, false);
| 1966 | } |
| 1967 | |
| 1968 | struct bpf_iter_seq_hash_map_info { |
| 1969 | struct bpf_map *map; |
| 1970 | struct bpf_htab *htab; |
| 1971 | void *percpu_value_buf; // non-zero means percpu hash |
| 1972 | u32 bucket_id; |
| 1973 | u32 skip_elems; |
| 1974 | }; |
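| | /* Find the next element for the map iterator. When an element is returned,
| | * rcu_read_lock() is held; it is released either below when moving past the
| | * end of a bucket or later in bpf_hash_map_seq_stop().
| | */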
| 1975 | |
| 1976 | static struct htab_elem * |
| 1977 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, |
| 1978 | struct htab_elem *prev_elem) |
| 1979 | { |
| 1980 | const struct bpf_htab *htab = info->htab; |
| 1981 | u32 skip_elems = info->skip_elems; |
| 1982 | u32 bucket_id = info->bucket_id; |
| 1983 | struct hlist_nulls_head *head; |
| 1984 | struct hlist_nulls_node *n; |
| 1985 | struct htab_elem *elem; |
| 1986 | struct bucket *b; |
| 1987 | u32 i, count; |
| 1988 | |
| 1989 | if (bucket_id >= htab->n_buckets) |
| 1990 | return NULL; |
| 1991 | |
| 1992 | /* try to find next elem in the same bucket */ |
| 1993 | if (prev_elem) { |
| 1994 | /* no update/deletion on this bucket, prev_elem should still be valid
| 1995 | * and we won't skip elements.
| 1996 | */
| 1997 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); |
| 1998 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); |
| 1999 | if (elem) |
| 2000 | return elem; |
| 2001 | |
| 2002 | /* not found, unlock and go to the next bucket */ |
| 2003 | b = &htab->buckets[bucket_id++]; |
| 2004 | rcu_read_unlock(); |
| 2005 | skip_elems = 0; |
| 2006 | } |
| 2007 | |
| 2008 | for (i = bucket_id; i < htab->n_buckets; i++) { |
| 2009 | b = &htab->buckets[i]; |
| 2010 | rcu_read_lock(); |
| 2011 | |
| 2012 | count = 0; |
| 2013 | head = &b->head; |
| 2014 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
| 2015 | if (count >= skip_elems) { |
| 2016 | info->bucket_id = i; |
| 2017 | info->skip_elems = count; |
| 2018 | return elem; |
| 2019 | } |
| 2020 | count++; |
| 2021 | } |
| 2022 | |
| 2023 | rcu_read_unlock(); |
| 2024 | skip_elems = 0; |
| 2025 | } |
| 2026 | |
| 2027 | info->bucket_id = i; |
| 2028 | info->skip_elems = 0; |
| 2029 | return NULL; |
| 2030 | } |
| 2031 | |
| 2032 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) |
| 2033 | { |
| 2034 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
| 2035 | struct htab_elem *elem; |
| 2036 | |
| 2037 | elem = bpf_hash_map_seq_find_next(info, NULL); |
| 2038 | if (!elem) |
| 2039 | return NULL; |
| 2040 | |
| 2041 | if (*pos == 0) |
| 2042 | ++*pos; |
| 2043 | return elem; |
| 2044 | } |
| 2045 | |
| 2046 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 2047 | { |
| 2048 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
| 2049 | |
| 2050 | ++*pos; |
| 2051 | ++info->skip_elems; |
| 2052 | return bpf_hash_map_seq_find_next(info, v);
| 2053 | } |
| 2054 | |
| 2055 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) |
| 2056 | { |
| 2057 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
| 2058 | struct bpf_iter__bpf_map_elem ctx = {}; |
| 2059 | struct bpf_map *map = info->map; |
| 2060 | struct bpf_iter_meta meta; |
| 2061 | int ret = 0, off = 0, cpu; |
| 2062 | u32 roundup_value_size; |
| 2063 | struct bpf_prog *prog; |
| 2064 | void __percpu *pptr; |
| 2065 | |
| 2066 | meta.seq = seq; |
| 2067 | prog = bpf_iter_get_info(&meta, elem == NULL);
| 2068 | if (prog) { |
| 2069 | ctx.meta = &meta; |
| 2070 | ctx.map = info->map; |
| 2071 | if (elem) { |
| 2072 | ctx.key = elem->key; |
| 2073 | if (!info->percpu_value_buf) { |
| 2074 | ctx.value = htab_elem_value(elem, map->key_size);
| 2075 | } else { |
| 2076 | roundup_value_size = round_up(map->value_size, 8); |
| 2077 | pptr = htab_elem_get_ptr(elem, map->key_size);
| 2078 | for_each_possible_cpu(cpu) { |
| 2079 | copy_map_value_long(map, info->percpu_value_buf + off,
| 2080 | per_cpu_ptr(pptr, cpu));
| 2081 | check_and_init_map_value(map, info->percpu_value_buf + off);
| 2082 | off += roundup_value_size; |
| 2083 | } |
| 2084 | ctx.value = info->percpu_value_buf; |
| 2085 | } |
| 2086 | } |
| 2087 | ret = bpf_iter_run_prog(prog, &ctx);
| 2088 | } |
| 2089 | |
| 2090 | return ret; |
| 2091 | } |
| 2092 | |
| 2093 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) |
| 2094 | { |
| 2095 | return __bpf_hash_map_seq_show(seq, v);
| 2096 | } |
| 2097 | |
| 2098 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) |
| 2099 | { |
| 2100 | if (!v) |
| 2101 | (void)__bpf_hash_map_seq_show(seq, NULL); |
| 2102 | else |
| 2103 | rcu_read_unlock(); |
| 2104 | } |
| 2105 | |
| 2106 | static int bpf_iter_init_hash_map(void *priv_data, |
| 2107 | struct bpf_iter_aux_info *aux) |
| 2108 | { |
| 2109 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
| 2110 | struct bpf_map *map = aux->map; |
| 2111 | void *value_buf; |
| 2112 | u32 buf_size; |
| 2113 | |
| 2114 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 2115 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { |
| 2116 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); |
| 2117 | value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); |
| 2118 | if (!value_buf) |
| 2119 | return -ENOMEM; |
| 2120 | |
| 2121 | seq_info->percpu_value_buf = value_buf; |
| 2122 | } |
| 2123 | |
| 2124 | bpf_map_inc_with_uref(map); |
| 2125 | seq_info->map = map; |
| 2126 | seq_info->htab = container_of(map, struct bpf_htab, map); |
| 2127 | return 0; |
| 2128 | } |
| 2129 | |
| 2130 | static void bpf_iter_fini_hash_map(void *priv_data) |
| 2131 | { |
| 2132 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
| 2133 | |
| 2134 | bpf_map_put_with_uref(seq_info->map);
| 2135 | kfree(seq_info->percpu_value_buf);
| 2136 | } |
| 2137 | |
| 2138 | static const struct seq_operations bpf_hash_map_seq_ops = { |
| 2139 | .start = bpf_hash_map_seq_start, |
| 2140 | .next = bpf_hash_map_seq_next, |
| 2141 | .stop = bpf_hash_map_seq_stop, |
| 2142 | .show = bpf_hash_map_seq_show, |
| 2143 | }; |
| 2144 | |
| 2145 | static const struct bpf_iter_seq_info iter_seq_info = { |
| 2146 | .seq_ops = &bpf_hash_map_seq_ops, |
| 2147 | .init_seq_private = bpf_iter_init_hash_map, |
| 2148 | .fini_seq_private = bpf_iter_fini_hash_map, |
| 2149 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), |
| 2150 | }; |
| 2151 | |
| 2152 | static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
| 2153 | void *callback_ctx, u64 flags) |
| 2154 | { |
| 2155 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 2156 | struct hlist_nulls_head *head; |
| 2157 | struct hlist_nulls_node *n; |
| 2158 | struct htab_elem *elem; |
| 2159 | int i, num_elems = 0; |
| 2160 | void __percpu *pptr; |
| 2161 | struct bucket *b; |
| 2162 | void *key, *val; |
| 2163 | bool is_percpu; |
| 2164 | u64 ret = 0; |
| 2165 | |
| 2166 | cant_migrate(); |
| 2167 | |
| 2168 | if (flags != 0) |
| 2169 | return -EINVAL; |
| 2170 | |
| 2171 | is_percpu = htab_is_percpu(htab); |
| 2172 | |
| 2173 | /* migration has been disabled, so the percpu value prepared here will
| 2174 | * be the same as the one seen by the bpf program with
| 2175 | * bpf_map_lookup_elem().
| 2176 | */
| 2177 | for (i = 0; i < htab->n_buckets; i++) { |
| 2178 | b = &htab->buckets[i]; |
| 2179 | rcu_read_lock(); |
| 2180 | head = &b->head; |
| 2181 | hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { |
| 2182 | key = elem->key; |
| 2183 | if (is_percpu) { |
| 2184 | /* current cpu value for percpu map */ |
| 2185 | pptr = htab_elem_get_ptr(elem, map->key_size);
| 2186 | val = this_cpu_ptr(pptr); |
| 2187 | } else { |
| 2188 | val = htab_elem_value(elem, map->key_size);
| 2189 | } |
| 2190 | num_elems++; |
| 2191 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
| 2192 | (u64)(long)val, (u64)(long)callback_ctx, 0); |
| 2193 | /* return value: 0 - continue, 1 - stop and return */ |
| 2194 | if (ret) { |
| 2195 | rcu_read_unlock(); |
| 2196 | goto out; |
| 2197 | } |
| 2198 | } |
| 2199 | rcu_read_unlock(); |
| 2200 | } |
| 2201 | out: |
| 2202 | return num_elems; |
| 2203 | } |
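| | /*
| | * bpf_for_each_hash_elem() backs the bpf_for_each_map_elem() helper for
| | * hash maps. A hedged sketch of a BPF-program-side caller (the map and
| | * callback names are illustrative only):
| | *
| | *	static long count_cb(struct bpf_map *map, const void *key,
| | *			     void *value, void *ctx)
| | *	{
| | *		(*(long *)ctx)++;
| | *		return 0;	// 0 - continue, 1 - stop iterating
| | *	}
| | *
| | *	long cnt = 0;
| | *	bpf_for_each_map_elem(&my_hash_map, count_cb, &cnt, 0);
| | */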
| 2204 | |
| 2205 | static u64 htab_map_mem_usage(const struct bpf_map *map) |
| 2206 | { |
| 2207 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 2208 | u32 value_size = round_up(htab->map.value_size, 8); |
| 2209 | bool prealloc = htab_is_prealloc(htab); |
| 2210 | bool percpu = htab_is_percpu(htab); |
| 2211 | bool lru = htab_is_lru(htab); |
| 2212 | u64 num_entries; |
| 2213 | u64 usage = sizeof(struct bpf_htab); |
| 2214 | |
| 2215 | usage += sizeof(struct bucket) * htab->n_buckets; |
| 2216 | usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; |
| 2217 | if (prealloc) { |
| 2218 | num_entries = map->max_entries; |
| 2219 | if (htab_has_extra_elems(htab)) |
| 2220 | num_entries += num_possible_cpus(); |
| 2221 | |
| 2222 | usage += htab->elem_size * num_entries; |
| 2223 | |
| 2224 | if (percpu) |
| 2225 | usage += value_size * num_possible_cpus() * num_entries; |
| 2226 | else if (!lru) |
| 2227 | usage += sizeof(struct htab_elem *) * num_possible_cpus(); |
| 2228 | } else { |
| 2229 | #define LLIST_NODE_SZ sizeof(struct llist_node) |
| 2230 | |
| 2231 | num_entries = htab->use_percpu_counter ? |
| 2232 | percpu_counter_sum(&htab->pcount) :
| 2233 | atomic_read(&htab->count);
| 2234 | usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; |
| 2235 | if (percpu) { |
| 2236 | usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; |
| 2237 | usage += value_size * num_possible_cpus() * num_entries; |
| 2238 | } |
| 2239 | } |
| 2240 | return usage; |
| 2241 | } |
| 2242 | |
| 2243 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
| 2244 | const struct bpf_map_ops htab_map_ops = { |
| 2245 | .map_meta_equal = bpf_map_meta_equal, |
| 2246 | .map_alloc_check = htab_map_alloc_check, |
| 2247 | .map_alloc = htab_map_alloc, |
| 2248 | .map_free = htab_map_free, |
| 2249 | .map_get_next_key = htab_map_get_next_key, |
| 2250 | .map_release_uref = htab_map_free_internal_structs, |
| 2251 | .map_lookup_elem = htab_map_lookup_elem, |
| 2252 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
| 2253 | .map_update_elem = htab_map_update_elem, |
| 2254 | .map_delete_elem = htab_map_delete_elem, |
| 2255 | .map_gen_lookup = htab_map_gen_lookup, |
| 2256 | .map_seq_show_elem = htab_map_seq_show_elem, |
| 2257 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
| 2258 | .map_for_each_callback = bpf_for_each_hash_elem, |
| 2259 | .map_mem_usage = htab_map_mem_usage, |
| 2260 | BATCH_OPS(htab), |
| 2261 | .map_btf_id = &htab_map_btf_ids[0], |
| 2262 | .iter_seq_info = &iter_seq_info, |
| 2263 | }; |
| 2264 | |
| 2265 | const struct bpf_map_ops htab_lru_map_ops = { |
| 2266 | .map_meta_equal = bpf_map_meta_equal, |
| 2267 | .map_alloc_check = htab_map_alloc_check, |
| 2268 | .map_alloc = htab_map_alloc, |
| 2269 | .map_free = htab_map_free, |
| 2270 | .map_get_next_key = htab_map_get_next_key, |
| 2271 | .map_release_uref = htab_map_free_internal_structs, |
| 2272 | .map_lookup_elem = htab_lru_map_lookup_elem, |
| 2273 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
| 2274 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
| 2275 | .map_update_elem = htab_lru_map_update_elem, |
| 2276 | .map_delete_elem = htab_lru_map_delete_elem, |
| 2277 | .map_gen_lookup = htab_lru_map_gen_lookup, |
| 2278 | .map_seq_show_elem = htab_map_seq_show_elem, |
| 2279 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
| 2280 | .map_for_each_callback = bpf_for_each_hash_elem, |
| 2281 | .map_mem_usage = htab_map_mem_usage, |
| 2282 | BATCH_OPS(htab_lru), |
| 2283 | .map_btf_id = &htab_map_btf_ids[0], |
| 2284 | .iter_seq_info = &iter_seq_info, |
| 2285 | }; |
| 2286 | |
| 2287 | /* Called from eBPF program */ |
| 2288 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
| 2289 | { |
| 2290 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
| 2291 | |
| 2292 | if (l) |
| 2293 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
| 2294 | else |
| 2295 | return NULL; |
| 2296 | } |
| 2297 | |
| 2298 | /* inline bpf_map_lookup_elem() call for per-CPU hashmap */ |
| 2299 | static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) |
| 2300 | { |
| 2301 | struct bpf_insn *insn = insn_buf; |
| 2302 | |
| 2303 | if (!bpf_jit_supports_percpu_insn()) |
| 2304 | return -EOPNOTSUPP; |
| 2305 | |
| 2306 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
| 2307 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
| 2308 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
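| | /* R0 is the htab_elem pointer or NULL; the branch below skips the rest on
| | * NULL. Otherwise advance R0 to the per-CPU pointer stored right after the
| | * (rounded-up) key, load it and convert it to a this-CPU address.
| | */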
| 2309 | *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); |
| 2310 | *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, |
| 2311 | offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); |
| 2312 | *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); |
| 2313 | *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); |
| 2314 | |
| 2315 | return insn - insn_buf; |
| 2316 | } |
| 2317 | |
| 2318 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
| 2319 | { |
| 2320 | struct htab_elem *l; |
| 2321 | |
| 2322 | if (cpu >= nr_cpu_ids) |
| 2323 | return NULL; |
| 2324 | |
| 2325 | l = __htab_map_lookup_elem(map, key); |
| 2326 | if (l) |
| 2327 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
| 2328 | else |
| 2329 | return NULL; |
| 2330 | } |
| 2331 | |
| 2332 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
| 2333 | { |
| 2334 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
| 2335 | |
| 2336 | if (l) { |
| 2337 | bpf_lru_node_set_ref(&l->lru_node);
| 2338 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
| 2339 | } |
| 2340 | |
| 2341 | return NULL; |
| 2342 | } |
| 2343 | |
| 2344 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
| 2345 | { |
| 2346 | struct htab_elem *l; |
| 2347 | |
| 2348 | if (cpu >= nr_cpu_ids) |
| 2349 | return NULL; |
| 2350 | |
| 2351 | l = __htab_map_lookup_elem(map, key); |
| 2352 | if (l) { |
| 2353 | bpf_lru_node_set_ref(&l->lru_node);
| 2354 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
| 2355 | } |
| 2356 | |
| 2357 | return NULL; |
| 2358 | } |
| 2359 | |
| 2360 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
| 2361 | { |
| 2362 | struct htab_elem *l; |
| 2363 | void __percpu *pptr; |
| 2364 | int ret = -ENOENT; |
| 2365 | int cpu, off = 0; |
| 2366 | u32 size; |
| 2367 | |
| 2368 | /* per_cpu areas are zero-filled and bpf programs can only |
| 2369 | * access 'value_size' of them, so copying rounded areas |
| 2370 | * will not leak any kernel data |
| 2371 | */ |
| 2372 | size = round_up(map->value_size, 8); |
| 2373 | rcu_read_lock(); |
| 2374 | l = __htab_map_lookup_elem(map, key); |
| 2375 | if (!l) |
| 2376 | goto out; |
| 2377 | /* We do not mark LRU map element here in order to not mess up |
| 2378 | * eviction heuristics when user space does a map walk. |
| 2379 | */ |
| 2380 | pptr = htab_elem_get_ptr(l, map->key_size);
| 2381 | for_each_possible_cpu(cpu) { |
| 2382 | copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
| 2383 | check_and_init_map_value(map, value + off);
| 2384 | off += size; |
| 2385 | } |
| 2386 | ret = 0; |
| 2387 | out: |
| 2388 | rcu_read_unlock(); |
| 2389 | return ret; |
| 2390 | } |
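| | /*
| | * bpf_percpu_hash_copy() above is the syscall-side lookup for per-CPU hash
| | * maps: the user buffer must hold round_up(value_size, 8) bytes per possible
| | * CPU. A hedged libbpf sketch, assuming an 8-byte value and illustrative
| | * 'fd'/'key' variables:
| | *
| | *	int ncpus = libbpf_num_possible_cpus();
| | *	__u64 *vals = calloc(ncpus, sizeof(*vals));
| | *
| | *	bpf_map_lookup_elem(fd, &key, vals);	// vals[i] = value on CPU i
| | */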
| 2391 | |
| 2392 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, |
| 2393 | u64 map_flags) |
| 2394 | { |
| 2395 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 2396 | int ret; |
| 2397 | |
| 2398 | rcu_read_lock(); |
| 2399 | if (htab_is_lru(htab)) |
| 2400 | ret = __htab_lru_percpu_map_update_elem(map, key, value, |
| 2401 | map_flags, true);
| 2402 | else |
| 2403 | ret = htab_map_update_elem_in_place(map, key, value, map_flags, |
| 2404 | true, true);
| 2405 | rcu_read_unlock(); |
| 2406 | |
| 2407 | return ret; |
| 2408 | } |
| 2409 | |
| 2410 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
| 2411 | struct seq_file *m) |
| 2412 | { |
| 2413 | struct htab_elem *l; |
| 2414 | void __percpu *pptr; |
| 2415 | int cpu; |
| 2416 | |
| 2417 | rcu_read_lock(); |
| 2418 | |
| 2419 | l = __htab_map_lookup_elem(map, key); |
| 2420 | if (!l) { |
| 2421 | rcu_read_unlock(); |
| 2422 | return; |
| 2423 | } |
| 2424 | |
| 2425 | btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
| 2426 | seq_puts(m, ": {\n");
| 2427 | pptr = htab_elem_get_ptr(l, map->key_size);
| 2428 | for_each_possible_cpu(cpu) {
| 2429 | seq_printf(m, "\tcpu%d: ", cpu);
| 2430 | btf_type_seq_show(map->btf, map->btf_value_type_id,
| 2431 | per_cpu_ptr(pptr, cpu), m);
| 2432 | seq_putc(m, '\n');
| 2433 | }
| 2434 | seq_puts(m, "}\n");
| 2435 | |
| 2436 | rcu_read_unlock(); |
| 2437 | } |
| 2438 | |
| 2439 | const struct bpf_map_ops htab_percpu_map_ops = { |
| 2440 | .map_meta_equal = bpf_map_meta_equal, |
| 2441 | .map_alloc_check = htab_map_alloc_check, |
| 2442 | .map_alloc = htab_map_alloc, |
| 2443 | .map_free = htab_map_free, |
| 2444 | .map_get_next_key = htab_map_get_next_key, |
| 2445 | .map_lookup_elem = htab_percpu_map_lookup_elem, |
| 2446 | .map_gen_lookup = htab_percpu_map_gen_lookup, |
| 2447 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
| 2448 | .map_update_elem = htab_percpu_map_update_elem, |
| 2449 | .map_delete_elem = htab_map_delete_elem, |
| 2450 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
| 2451 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
| 2452 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
| 2453 | .map_for_each_callback = bpf_for_each_hash_elem, |
| 2454 | .map_mem_usage = htab_map_mem_usage, |
| 2455 | BATCH_OPS(htab_percpu), |
| 2456 | .map_btf_id = &htab_map_btf_ids[0], |
| 2457 | .iter_seq_info = &iter_seq_info, |
| 2458 | }; |
| 2459 | |
| 2460 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
| 2461 | .map_meta_equal = bpf_map_meta_equal, |
| 2462 | .map_alloc_check = htab_map_alloc_check, |
| 2463 | .map_alloc = htab_map_alloc, |
| 2464 | .map_free = htab_map_free, |
| 2465 | .map_get_next_key = htab_map_get_next_key, |
| 2466 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, |
| 2467 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
| 2468 | .map_update_elem = htab_lru_percpu_map_update_elem, |
| 2469 | .map_delete_elem = htab_lru_map_delete_elem, |
| 2470 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
| 2471 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
| 2472 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
| 2473 | .map_for_each_callback = bpf_for_each_hash_elem, |
| 2474 | .map_mem_usage = htab_map_mem_usage, |
| 2475 | BATCH_OPS(htab_lru_percpu), |
| 2476 | .map_btf_id = &htab_map_btf_ids[0], |
| 2477 | .iter_seq_info = &iter_seq_info, |
| 2478 | }; |
| 2479 | |
| 2480 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
| 2481 | { |
| 2482 | if (attr->value_size != sizeof(u32)) |
| 2483 | return -EINVAL; |
| 2484 | return htab_map_alloc_check(attr); |
| 2485 | } |
| 2486 | |
| 2487 | static void fd_htab_map_free(struct bpf_map *map) |
| 2488 | { |
| 2489 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
| 2490 | struct hlist_nulls_node *n; |
| 2491 | struct hlist_nulls_head *head; |
| 2492 | struct htab_elem *l; |
| 2493 | int i; |
| 2494 | |
| 2495 | for (i = 0; i < htab->n_buckets; i++) { |
| 2496 | head = select_bucket(htab, i);
| 2497 | |
| 2498 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
| 2499 | void *ptr = fd_htab_map_get_ptr(map, l); |
| 2500 | |
| 2501 | map->ops->map_fd_put_ptr(map, ptr, false); |
| 2502 | } |
| 2503 | } |
| 2504 | |
| 2505 | htab_map_free(map); |
| 2506 | } |
| 2507 | |
| 2508 | /* only called from syscall */ |
| 2509 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) |
| 2510 | { |
| 2511 | void **ptr; |
| 2512 | int ret = 0; |
| 2513 | |
| 2514 | if (!map->ops->map_fd_sys_lookup_elem) |
| 2515 | return -ENOTSUPP; |
| 2516 | |
| 2517 | rcu_read_lock(); |
| 2518 | ptr = htab_map_lookup_elem(map, key); |
| 2519 | if (ptr) |
| 2520 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); |
| 2521 | else |
| 2522 | ret = -ENOENT; |
| 2523 | rcu_read_unlock(); |
| 2524 | |
| 2525 | return ret; |
| 2526 | } |
| 2527 | |
| 2528 | /* Only called from syscall */ |
| 2529 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 2530 | void *key, void *value, u64 map_flags) |
| 2531 | { |
| 2532 | void *ptr; |
| 2533 | int ret; |
| 2534 | |
| 2535 | ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); |
| 2536 | if (IS_ERR(ptr)) |
| 2537 | return PTR_ERR(ptr); |
| 2538 | |
| 2539 | /* The htab bucket lock is always held during update operations in fd |
| 2540 | * htab map, and the following rcu_read_lock() is only used to avoid |
| 2541 | * the WARN_ON_ONCE in htab_map_update_elem_in_place(). |
| 2542 | */ |
| 2543 | rcu_read_lock(); |
| 2544 | ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
| 2545 | rcu_read_unlock(); |
| 2546 | if (ret) |
| 2547 | map->ops->map_fd_put_ptr(map, ptr, false); |
| 2548 | |
| 2549 | return ret; |
| 2550 | } |
| 2551 | |
| 2552 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) |
| 2553 | { |
| 2554 | struct bpf_map *map, *inner_map_meta; |
| 2555 | |
| 2556 | inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
| 2557 | if (IS_ERR(inner_map_meta))
| 2558 | return inner_map_meta; |
| 2559 | |
| 2560 | map = htab_map_alloc(attr); |
| 2561 | if (IS_ERR(map)) {
| 2562 | bpf_map_meta_free(inner_map_meta);
| 2563 | return map; |
| 2564 | } |
| 2565 | |
| 2566 | map->inner_map_meta = inner_map_meta; |
| 2567 | |
| 2568 | return map; |
| 2569 | } |
| 2570 | |
| 2571 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) |
| 2572 | { |
| 2573 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); |
| 2574 | |
| 2575 | if (!inner_map) |
| 2576 | return NULL; |
| 2577 | |
| 2578 | return READ_ONCE(*inner_map); |
| 2579 | } |
| 2580 | |
| 2581 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
| 2582 | struct bpf_insn *insn_buf) |
| 2583 | { |
| 2584 | struct bpf_insn *insn = insn_buf; |
| 2585 | const int ret = BPF_REG_0; |
| 2586 | |
| 2587 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
| 2588 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
| 2589 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
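| | /* R0 is the htab_elem pointer or NULL; the branch below skips the load on
| | * NULL. The value stored after the key is a struct bpf_map pointer, so the
| | * inlined lookup returns the inner map itself.
| | */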
| 2590 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
| 2591 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
| 2592 | offsetof(struct htab_elem, key) + |
| 2593 | round_up(map->key_size, 8)); |
| 2594 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); |
| 2595 | |
| 2596 | return insn - insn_buf; |
| 2597 | } |
| 2598 | |
| 2599 | static void htab_of_map_free(struct bpf_map *map) |
| 2600 | { |
| 2601 | bpf_map_meta_free(map->inner_map_meta);
| 2602 | fd_htab_map_free(map); |
| 2603 | } |
| 2604 | |
| 2605 | const struct bpf_map_ops htab_of_maps_map_ops = { |
| 2606 | .map_alloc_check = fd_htab_map_alloc_check, |
| 2607 | .map_alloc = htab_of_map_alloc, |
| 2608 | .map_free = htab_of_map_free, |
| 2609 | .map_get_next_key = htab_map_get_next_key, |
| 2610 | .map_lookup_elem = htab_of_map_lookup_elem, |
| 2611 | .map_delete_elem = htab_map_delete_elem, |
| 2612 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
| 2613 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
| 2614 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
| 2615 | .map_gen_lookup = htab_of_map_gen_lookup, |
| 2616 | .map_check_btf = map_check_no_btf, |
| 2617 | .map_mem_usage = htab_map_mem_usage, |
| 2618 | BATCH_OPS(htab), |
| 2619 | .map_btf_id = &htab_map_btf_ids[0], |
| 2620 | }; |
| 2621 | |