cgroup.c source code [linux/kernel/bpf/cgroup.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Functions to manage eBPF programs attached to cgroups
4	*
5	* Copyright (c) 2016 Daniel Mack
6	*/
7
8	#include <linux/kernel.h>
9	#include <linux/atomic.h>
10	#include <linux/cgroup.h>
11	#include <linux/filter.h>
12	#include <linux/slab.h>
13	#include <linux/sysctl.h>
14	#include <linux/string.h>
15	#include <linux/bpf.h>
16	#include <linux/bpf-cgroup.h>
17	#include <linux/bpf_lsm.h>
18	#include <linux/bpf_verifier.h>
19	#include <net/sock.h>
20	#include <net/bpf_sk_storage.h>
21
22	#include "../cgroup/cgroup-internal.h"
23
/*
 * One static key per cgroup BPF attach type, all initially false.  In this
 * file the key for a type is incremented by __cgroup_bpf_attach() when a new
 * program-list entry is added and decremented by cgroup_bpf_release() for
 * every entry it tears down.
 */
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
26
27	/*
28	* cgroup bpf destruction makes heavy use of work items and there can be a lot
29	* of concurrent destructions. Use a separate workqueue so that cgroup bpf
30	* destruction work items don't end up filling up max_active of system_percpu_wq
31	* which may lead to deadlock.
32	*/
33	static struct workqueue_struct *cgroup_bpf_destroy_wq;
34
35	static int __init cgroup_bpf_wq_init(void)
36	{
37	cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
38	WQ_PERCPU, `1`);
39	if (!cgroup_bpf_destroy_wq)
40	panic(fmt: "Failed to alloc workqueue for cgroup bpf destroy.\n");
41	return `0`;
42	}
43	core_initcall(cgroup_bpf_wq_init);
44
45	static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
46	unsigned long action, void *data);
47
48	static struct notifier_block cgroup_bpf_lifetime_nb = {
49	.notifier_call = cgroup_bpf_lifetime_notify,
50	};
51
52	void __init cgroup_bpf_lifetime_notifier_init(void)
53	{
54	BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
55	&cgroup_bpf_lifetime_nb));
56	}
57
58	/ __always_inline is necessary to prevent indirect call through run_prog*
59	* function pointer.
60	*/
61	static __always_inline int
62	bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
63	enum cgroup_bpf_attach_type atype,
64	const void *ctx, bpf_prog_run_fn run_prog,
65	int retval, u32 *ret_flags)
66	{
67	const struct bpf_prog_array_item *item;
68	const struct bpf_prog *prog;
69	const struct bpf_prog_array *array;
70	struct bpf_run_ctx *old_run_ctx;
71	struct bpf_cg_run_ctx run_ctx;
72	u32 func_ret;
73
74	run_ctx.retval = retval;
75	rcu_read_lock_dont_migrate();
76	array = rcu_dereference(cgrp->effective[atype]);
77	item = &array->items[`0`];
78	old_run_ctx = bpf_set_run_ctx(new_ctx: &run_ctx.run_ctx);
79	while ((prog = READ_ONCE(item->prog))) {
80	run_ctx.prog_item = item;
81	func_ret = run_prog(prog, ctx);
82	if (ret_flags) {
83	*(ret_flags) \|= (func_ret >> `1`);
84	func_ret &= `1`;
85	}
86	if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
87	run_ctx.retval = -EPERM;
88	item++;
89	}
90	bpf_reset_run_ctx(old_ctx: old_run_ctx);
91	rcu_read_unlock_migrate();
92	return run_ctx.retval;
93	}
94
95	unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
96	const struct bpf_insn *insn)
97	{
98	const struct bpf_prog *shim_prog;
99	struct sock *sk;
100	struct cgroup *cgrp;
101	int ret = `0`;
102	u64 *args;
103
104	args = (u64 *)ctx;
105	sk = (void )(unsigned* long)args[`0`];
106	/shim_prog = container_of(insn, struct bpf_prog, insnsi);/
107	shim_prog = (const struct bpf_prog )((void* )insn - offsetof(struct* bpf_prog, insnsi));
108
109	cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
110	if (likely(cgrp))
111	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf,
112	atype: shim_prog->aux->cgroup_atype,
113	ctx, run_prog: bpf_prog_run, retval: `0`, NULL);
114	return ret;
115	}
116
117	unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
118	const struct bpf_insn *insn)
119	{
120	const struct bpf_prog *shim_prog;
121	struct socket *sock;
122	struct cgroup *cgrp;
123	int ret = `0`;
124	u64 *args;
125
126	args = (u64 *)ctx;
127	sock = (void )(unsigned* long)args[`0`];
128	/shim_prog = container_of(insn, struct bpf_prog, insnsi);/
129	shim_prog = (const struct bpf_prog )((void* )insn - offsetof(struct* bpf_prog, insnsi));
130
131	cgrp = sock_cgroup_ptr(skcd: &sock->sk->sk_cgrp_data);
132	if (likely(cgrp))
133	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf,
134	atype: shim_prog->aux->cgroup_atype,
135	ctx, run_prog: bpf_prog_run, retval: `0`, NULL);
136	return ret;
137	}
138
139	unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
140	const struct bpf_insn *insn)
141	{
142	const struct bpf_prog *shim_prog;
143	struct cgroup *cgrp;
144	int ret = `0`;
145
146	/shim_prog = container_of(insn, struct bpf_prog, insnsi);/
147	shim_prog = (const struct bpf_prog )((void* )insn - offsetof(struct* bpf_prog, insnsi));
148
149	/ We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. /
150	cgrp = task_dfl_cgroup(current);
151	if (likely(cgrp))
152	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf,
153	atype: shim_prog->aux->cgroup_atype,
154	ctx, run_prog: bpf_prog_run, retval: `0`, NULL);
155	return ret;
156	}
157
158	#ifdef CONFIG_BPF_LSM
159	struct cgroup_lsm_atype {
160	u32 attach_btf_id;
161	int refcnt;
162	};
163
164	static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
165
166	static enum cgroup_bpf_attach_type
167	bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
168	{
169	int i;
170
171	lockdep_assert_held(&cgroup_mutex);
172
173	if (attach_type != BPF_LSM_CGROUP)
174	return to_cgroup_bpf_attach_type(attach_type);
175
176	for (i = `0`; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
177	if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
178	return CGROUP_LSM_START + i;
179
180	for (i = `0`; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
181	if (cgroup_lsm_atype[i].attach_btf_id == `0`)
182	return CGROUP_LSM_START + i;
183
184	return -E2BIG;
185
186	}
187
188	void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
189	{
190	int i = cgroup_atype - CGROUP_LSM_START;
191
192	lockdep_assert_held(&cgroup_mutex);
193
194	WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
195	cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
196
197	cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
198	cgroup_lsm_atype[i].refcnt++;
199	}
200
201	void bpf_cgroup_atype_put(int cgroup_atype)
202	{
203	int i = cgroup_atype - CGROUP_LSM_START;
204
205	cgroup_lock();
206	if (--cgroup_lsm_atype[i].refcnt <= `0`)
207	cgroup_lsm_atype[i].attach_btf_id = `0`;
208	WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < `0`);
209	cgroup_unlock();
210	}
211	#else
212	static enum cgroup_bpf_attach_type
213	bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
214	{
215	if (attach_type != BPF_LSM_CGROUP)
216	return to_cgroup_bpf_attach_type(attach_type);
217	return -EOPNOTSUPP;
218	}
219	#endif /* CONFIG_BPF_LSM */
220
221	static void cgroup_bpf_offline(struct cgroup *cgrp)
222	{
223	cgroup_get(cgrp);
224	percpu_ref_kill(ref: &cgrp->bpf.refcnt);
225	}
226
227	static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
228	{
229	enum bpf_cgroup_storage_type stype;
230
231	for_each_cgroup_storage_type(stype)
232	bpf_cgroup_storage_free(storage: storages[stype]);
233	}
234
235	static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
236	struct bpf_cgroup_storage *new_storages[],
237	enum bpf_attach_type type,
238	struct bpf_prog *prog,
239	struct cgroup *cgrp)
240	{
241	enum bpf_cgroup_storage_type stype;
242	struct bpf_cgroup_storage_key key;
243	struct bpf_map *map;
244
245	key.cgroup_inode_id = cgroup_id(cgrp);
246	key.attach_type = type;
247
248	for_each_cgroup_storage_type(stype) {
249	map = prog->aux->cgroup_storage[stype];
250	if (!map)
251	continue;
252
253	storages[stype] = cgroup_storage_lookup(map: (void *)map, key: &key, locked: false);
254	if (storages[stype])
255	continue;
256
257	storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
258	if (IS_ERR(ptr: storages[stype])) {
259	bpf_cgroup_storages_free(storages: new_storages);
260	return -ENOMEM;
261	}
262
263	new_storages[stype] = storages[stype];
264	}
265
266	return `0`;
267	}
268
269	static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
270	struct bpf_cgroup_storage *src[])
271	{
272	enum bpf_cgroup_storage_type stype;
273
274	for_each_cgroup_storage_type(stype)
275	dst[stype] = src[stype];
276	}
277
278	static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
279	struct cgroup *cgrp,
280	enum bpf_attach_type attach_type)
281	{
282	enum bpf_cgroup_storage_type stype;
283
284	for_each_cgroup_storage_type(stype)
285	bpf_cgroup_storage_link(storage: storages[stype], cgroup: cgrp, type: attach_type);
286	}
287
288	/ Called when bpf_cgroup_link is auto-detached from dying cgroup.*
289	* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
290	* doesn't free link memory, which will eventually be done by bpf_link's
291	* release() callback, when its last FD is closed.
292	*/
293	static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
294	{
295	cgroup_put(cgrp: link->cgroup);
296	link->cgroup = NULL;
297	}
298
299	/**
300	* cgroup_bpf_release() - put references of all bpf programs and
301	* release all cgroup bpf data
302	* @work: work structure embedded into the cgroup to modify
303	*/
304	static void cgroup_bpf_release(struct work_struct *work)
305	{
306	struct cgroup p, cgrp = container_of(work, struct cgroup,
307	bpf.release_work);
308	struct bpf_prog_array *old_array;
309	struct list_head *storages = &cgrp->bpf.storages;
310	struct bpf_cgroup_storage storage, stmp;
311
312	unsigned int atype;
313
314	cgroup_lock();
315
316	for (atype = `0`; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
317	struct hlist_head *progs = &cgrp->bpf.progs[atype];
318	struct bpf_prog_list *pl;
319	struct hlist_node *pltmp;
320
321	hlist_for_each_entry_safe(pl, pltmp, progs, node) {
322	hlist_del(n: &pl->node);
323	if (pl->prog) {
324	if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
325	bpf_trampoline_unlink_cgroup_shim(prog: pl->prog);
326	bpf_prog_put(prog: pl->prog);
327	}
328	if (pl->link) {
329	if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
330	bpf_trampoline_unlink_cgroup_shim(prog: pl->link->link.prog);
331	bpf_cgroup_link_auto_detach(link: pl->link);
332	}
333	kfree(objp: pl);
334	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
335	}
336	old_array = rcu_dereference_protected(
337	cgrp->bpf.effective[atype],
338	lockdep_is_held(&cgroup_mutex));
339	bpf_prog_array_free(progs: old_array);
340	}
341
342	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
343	bpf_cgroup_storage_unlink(storage);
344	bpf_cgroup_storage_free(storage);
345	}
346
347	cgroup_unlock();
348
349	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(cgrp: p))
350	cgroup_bpf_put(cgrp: p);
351
352	percpu_ref_exit(ref: &cgrp->bpf.refcnt);
353	cgroup_put(cgrp);
354	}
355
356	/**
357	* cgroup_bpf_release_fn() - callback used to schedule releasing
358	* of bpf cgroup data
359	* @ref: percpu ref counter structure
360	*/
361	static void cgroup_bpf_release_fn(struct percpu_ref *ref)
362	{
363	struct cgroup cgrp = container_of(ref, struct* cgroup, bpf.refcnt);
364
365	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
366	queue_work(wq: cgroup_bpf_destroy_wq, work: &cgrp->bpf.release_work);
367	}
368
369	/ Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through*
370	* link or direct prog.
371	*/
372	static struct bpf_prog prog_list_prog(struct* bpf_prog_list *pl)
373	{
374	if (pl->prog)
375	return pl->prog;
376	if (pl->link)
377	return pl->link->link.prog;
378	return NULL;
379	}
380
381	/ count number of elements in the list.*
382	* it's slow but the list cannot be long
383	*/
384	static u32 prog_list_length(struct hlist_head head, int* *preorder_cnt)
385	{
386	struct bpf_prog_list *pl;
387	u32 cnt = `0`;
388
389	hlist_for_each_entry(pl, head, node) {
390	if (!prog_list_prog(pl))
391	continue;
392	if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
393	(*preorder_cnt)++;
394	cnt++;
395	}
396	return cnt;
397	}
398
399	/ if parent has non-overridable prog attached,*
400	* disallow attaching new programs to the descendent cgroup.
401	* if parent has overridable or multi-prog, allow attaching
402	*/
403	static bool hierarchy_allows_attach(struct cgroup *cgrp,
404	enum cgroup_bpf_attach_type atype)
405	{
406	struct cgroup *p;
407
408	p = cgroup_parent(cgrp);
409	if (!p)
410	return true;
411	do {
412	u32 flags = p->bpf.flags[atype];
413	u32 cnt;
414
415	if (flags & BPF_F_ALLOW_MULTI)
416	return true;
417	cnt = prog_list_length(head: &p->bpf.progs[atype], NULL);
418	WARN_ON_ONCE(cnt > `1`);
419	if (cnt == `1`)
420	return !!(flags & BPF_F_ALLOW_OVERRIDE);
421	p = cgroup_parent(cgrp: p);
422	} while (p);
423	return true;
424	}
425
426	/ compute a chain of effective programs for a given cgroup:*
427	* start from the list of programs in this cgroup and add
428	* all parent programs.
429	* Note that parent's F_ALLOW_OVERRIDE-type program is yielding
430	* to programs in this cgroup
431	*/
432	static int compute_effective_progs(struct cgroup *cgrp,
433	enum cgroup_bpf_attach_type atype,
434	struct bpf_prog_array **array)
435	{
436	struct bpf_prog_array_item *item;
437	struct bpf_prog_array *progs;
438	struct bpf_prog_list *pl;
439	struct cgroup *p = cgrp;
440	int i, j, cnt = `0`, preorder_cnt = `0`, fstart, bstart, init_bstart;
441
442	/ count number of effective programs by walking parents /
443	do {
444	if (cnt == `0` \|\| (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
445	cnt += prog_list_length(head: &p->bpf.progs[atype], preorder_cnt: &preorder_cnt);
446	p = cgroup_parent(cgrp: p);
447	} while (p);
448
449	progs = bpf_prog_array_alloc(prog_cnt: cnt, GFP_KERNEL);
450	if (!progs)
451	return -ENOMEM;
452
453	/ populate the array with effective progs /
454	cnt = `0`;
455	p = cgrp;
456	fstart = preorder_cnt;
457	bstart = preorder_cnt - `1`;
458	do {
459	if (cnt > `0` && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
460	continue;
461
462	init_bstart = bstart;
463	hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
464	if (!prog_list_prog(pl))
465	continue;
466
467	if (pl->flags & BPF_F_PREORDER) {
468	item = &progs->items[bstart];
469	bstart--;
470	} else {
471	item = &progs->items[fstart];
472	fstart++;
473	}
474	item->prog = prog_list_prog(pl);
475	bpf_cgroup_storages_assign(dst: item->cgroup_storage,
476	src: pl->storage);
477	cnt++;
478	}
479
480	/ reverse pre-ordering progs at this cgroup level /
481	for (i = bstart + `1`, j = init_bstart; i < j; i++, j--)
482	swap(progs->items[i], progs->items[j]);
483
484	} while ((p = cgroup_parent(cgrp: p)));
485
486	*array = progs;
487	return `0`;
488	}
489
490	static void activate_effective_progs(struct cgroup *cgrp,
491	enum cgroup_bpf_attach_type atype,
492	struct bpf_prog_array *old_array)
493	{
494	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
495	lockdep_is_held(&cgroup_mutex));
496	/ free prog array after grace period, since __cgroup_bpf_run_()
497	* might be still walking the array
498	*/
499	bpf_prog_array_free(progs: old_array);
500	}
501
502	/**
503	* cgroup_bpf_inherit() - inherit effective programs from parent
504	* @cgrp: the cgroup to modify
505	*/
506	static int cgroup_bpf_inherit(struct cgroup *cgrp)
507	{
508	/ has to use marco instead of const int, since compiler thinks*
509	* that array below is variable length
510	*/
511	#define NR ARRAY_SIZE(cgrp->bpf.effective)
512	struct bpf_prog_array *arrays[NR] = {};
513	struct cgroup *p;
514	int ret, i;
515
516	ret = percpu_ref_init(ref: &cgrp->bpf.refcnt, release: cgroup_bpf_release_fn, flags: `0`,
517	GFP_KERNEL);
518	if (ret)
519	return ret;
520
521	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(cgrp: p))
522	cgroup_bpf_get(cgrp: p);
523
524	for (i = `0`; i < NR; i++)
525	INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
526
527	INIT_LIST_HEAD(list: &cgrp->bpf.storages);
528
529	for (i = `0`; i < NR; i++)
530	if (compute_effective_progs(cgrp, atype: i, array: &arrays[i]))
531	goto cleanup;
532
533	for (i = `0`; i < NR; i++)
534	activate_effective_progs(cgrp, atype: i, old_array: arrays[i]);
535
536	return `0`;
537	cleanup:
538	for (i = `0`; i < NR; i++)
539	bpf_prog_array_free(progs: arrays[i]);
540
541	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(cgrp: p))
542	cgroup_bpf_put(cgrp: p);
543
544	percpu_ref_exit(ref: &cgrp->bpf.refcnt);
545
546	return -ENOMEM;
547	}
548
549	static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
550	unsigned long action, void *data)
551	{
552	struct cgroup *cgrp = data;
553	int ret = `0`;
554
555	if (cgrp->root != &cgrp_dfl_root)
556	return NOTIFY_OK;
557
558	switch (action) {
559	case CGROUP_LIFETIME_ONLINE:
560	ret = cgroup_bpf_inherit(cgrp);
561	break;
562	case CGROUP_LIFETIME_OFFLINE:
563	cgroup_bpf_offline(cgrp);
564	break;
565	}
566
567	return notifier_from_errno(err: ret);
568	}
569
570	static int update_effective_progs(struct cgroup *cgrp,
571	enum cgroup_bpf_attach_type atype)
572	{
573	struct cgroup_subsys_state *css;
574	int err;
575
576	/ allocate and recompute effective prog arrays /
577	css_for_each_descendant_pre(css, &cgrp->self) {
578	struct cgroup desc = container_of(css, struct* cgroup, self);
579
580	if (percpu_ref_is_zero(ref: &desc->bpf.refcnt))
581	continue;
582
583	err = compute_effective_progs(cgrp: desc, atype, array: &desc->bpf.inactive);
584	if (err)
585	goto cleanup;
586	}
587
588	/ all allocations were successful. Activate all prog arrays /
589	css_for_each_descendant_pre(css, &cgrp->self) {
590	struct cgroup desc = container_of(css, struct* cgroup, self);
591
592	if (percpu_ref_is_zero(ref: &desc->bpf.refcnt)) {
593	if (unlikely(desc->bpf.inactive)) {
594	bpf_prog_array_free(progs: desc->bpf.inactive);
595	desc->bpf.inactive = NULL;
596	}
597	continue;
598	}
599
600	activate_effective_progs(cgrp: desc, atype, old_array: desc->bpf.inactive);
601	desc->bpf.inactive = NULL;
602	}
603
604	return `0`;
605
606	cleanup:
607	/ oom while computing effective. Free all computed effective arrays*
608	* since they were not activated
609	*/
610	css_for_each_descendant_pre(css, &cgrp->self) {
611	struct cgroup desc = container_of(css, struct* cgroup, self);
612
613	bpf_prog_array_free(progs: desc->bpf.inactive);
614	desc->bpf.inactive = NULL;
615	}
616
617	return err;
618	}
619
/*
 * Limit on the number of entries in one per-attach-type program list of a
 * cgroup; __cgroup_bpf_attach() returns -E2BIG once it is reached.
 */
#define BPF_CGROUP_MAX_PROGS 64
621
622	static struct bpf_prog_list find_attach_entry(struct* hlist_head *progs,
623	struct bpf_prog *prog,
624	struct bpf_cgroup_link *link,
625	struct bpf_prog *replace_prog,
626	bool allow_multi)
627	{
628	struct bpf_prog_list *pl;
629
630	/ single-attach case /
631	if (!allow_multi) {
632	if (hlist_empty(h: progs))
633	return NULL;
634	return hlist_entry(progs->first, typeof(*pl), node);
635	}
636
637	hlist_for_each_entry(pl, progs, node) {
638	if (prog && pl->prog == prog && prog != replace_prog)
639	/ disallow attaching the same prog twice /
640	return ERR_PTR(error: -EINVAL);
641	if (link && pl->link == link)
642	/ disallow attaching the same link twice /
643	return ERR_PTR(error: -EINVAL);
644	}
645
646	/ direct prog multi-attach w/ replacement case /
647	if (replace_prog) {
648	hlist_for_each_entry(pl, progs, node) {
649	if (pl->prog == replace_prog)
650	/ a match found /
651	return pl;
652	}
653	/ prog to replace not found for cgroup /
654	return ERR_PTR(error: -ENOENT);
655	}
656
657	return NULL;
658	}
659
660	static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
661	{
662	struct bpf_link *link = ERR_PTR(error: -EINVAL);
663
664	if (flags & BPF_F_ID)
665	link = bpf_link_by_id(id: id_or_fd);
666	else if (id_or_fd)
667	link = bpf_link_get_from_fd(ufd: id_or_fd);
668	return link;
669	}
670
671	static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
672	{
673	struct bpf_prog *prog = ERR_PTR(error: -EINVAL);
674
675	if (flags & BPF_F_ID)
676	prog = bpf_prog_by_id(id: id_or_fd);
677	else if (id_or_fd)
678	prog = bpf_prog_get(ufd: id_or_fd);
679	return prog;
680	}
681
682	static struct bpf_prog_list get_prog_list(struct* hlist_head progs, struct* bpf_prog *prog,
683	struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
684	{
685	bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
686	struct bpf_prog_list pltmp, pl = ERR_PTR(error: -EINVAL);
687	bool preorder = flags & BPF_F_PREORDER;
688	struct bpf_link *anchor_link = NULL;
689	struct bpf_prog *anchor_prog = NULL;
690	bool is_before, is_after;
691
692	is_before = flags & BPF_F_BEFORE;
693	is_after = flags & BPF_F_AFTER;
694	if (is_link \|\| is_id \|\| id_or_fd) {
695	/ flags must have either BPF_F_BEFORE or BPF_F_AFTER /
696	if (is_before == is_after)
697	return ERR_PTR(error: -EINVAL);
698	if ((is_link && !link) \|\| (!is_link && !prog))
699	return ERR_PTR(error: -EINVAL);
700	} else if (!hlist_empty(h: progs)) {
701	/ flags cannot have both BPF_F_BEFORE and BPF_F_AFTER /
702	if (is_before && is_after)
703	return ERR_PTR(error: -EINVAL);
704	}
705
706	if (is_link) {
707	anchor_link = bpf_get_anchor_link(flags, id_or_fd);
708	if (IS_ERR(ptr: anchor_link))
709	return ERR_CAST(ptr: anchor_link);
710	} else if (is_id \|\| id_or_fd) {
711	anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
712	if (IS_ERR(ptr: anchor_prog))
713	return ERR_CAST(ptr: anchor_prog);
714	}
715
716	if (!anchor_prog && !anchor_link) {
717	/ if there is no anchor_prog/anchor_link, then BPF_F_PREORDER*
718	* doesn't matter since either prepend or append to a combined
719	* list of progs will end up with correct result.
720	*/
721	hlist_for_each_entry(pltmp, progs, node) {
722	if (is_before)
723	return pltmp;
724	if (pltmp->node.next)
725	continue;
726	return pltmp;
727	}
728	return NULL;
729	}
730
731	hlist_for_each_entry(pltmp, progs, node) {
732	if ((anchor_prog && anchor_prog == pltmp->prog) \|\|
733	(anchor_link && anchor_link == &pltmp->link->link)) {
734	if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
735	goto out;
736	pl = pltmp;
737	goto out;
738	}
739	}
740
741	pl = ERR_PTR(error: -ENOENT);
742	out:
743	if (anchor_link)
744	bpf_link_put(link: anchor_link);
745	else
746	bpf_prog_put(prog: anchor_prog);
747	return pl;
748	}
749
750	static int insert_pl_to_hlist(struct bpf_prog_list pl, struct* hlist_head *progs,
751	struct bpf_prog prog, struct* bpf_cgroup_link *link,
752	u32 flags, u32 id_or_fd)
753	{
754	struct bpf_prog_list *pltmp;
755
756	pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
757	if (IS_ERR(ptr: pltmp))
758	return PTR_ERR(ptr: pltmp);
759
760	if (!pltmp)
761	hlist_add_head(n: &pl->node, h: progs);
762	else if (flags & BPF_F_BEFORE)
763	hlist_add_before(n: &pl->node, next: &pltmp->node);
764	else
765	hlist_add_behind(n: &pl->node, prev: &pltmp->node);
766
767	return `0`;
768	}
769
770	/**
771	* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
772	* propagate the change to descendants
773	* @cgrp: The cgroup which descendants to traverse
774	* @prog: A program to attach
775	* @link: A link to attach
776	* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
777	* @type: Type of attach operation
778	* @flags: Option flags
779	* @id_or_fd: Relative prog id or fd
780	* @revision: bpf_prog_list revision
781	*
782	* Exactly one of @prog or @link can be non-null.
783	* Must be called with cgroup_mutex held.
784	*/
785	static int __cgroup_bpf_attach(struct cgroup *cgrp,
786	struct bpf_prog prog, struct* bpf_prog *replace_prog,
787	struct bpf_cgroup_link *link,
788	enum bpf_attach_type type, u32 flags, u32 id_or_fd,
789	u64 revision)
790	{
791	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE \| BPF_F_ALLOW_MULTI));
792	struct bpf_prog *old_prog = NULL;
793	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
794	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
795	struct bpf_prog *new_prog = prog ? : link->link.prog;
796	enum cgroup_bpf_attach_type atype;
797	struct bpf_prog_list *pl;
798	struct hlist_head *progs;
799	int err;
800
801	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) \|\|
802	((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
803	/ invalid combination /
804	return -EINVAL;
805	if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE \| BPF_F_AFTER)))
806	/ only either replace or insertion with before/after /
807	return -EINVAL;
808	if (link && (prog \|\| replace_prog))
809	/ only either link or prog/replace_prog can be specified /
810	return -EINVAL;
811	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
812	/ replace_prog implies BPF_F_REPLACE, and vice versa /
813	return -EINVAL;
814
815	atype = bpf_cgroup_atype_find(attach_type: type, attach_btf_id: new_prog->aux->attach_btf_id);
816	if (atype < `0`)
817	return -EINVAL;
818	if (revision && revision != cgrp->bpf.revisions[atype])
819	return -ESTALE;
820
821	progs = &cgrp->bpf.progs[atype];
822
823	if (!hierarchy_allows_attach(cgrp, atype))
824	return -EPERM;
825
826	if (!hlist_empty(h: progs) && cgrp->bpf.flags[atype] != saved_flags)
827	/ Disallow attaching non-overridable on top*
828	* of existing overridable in this cgroup.
829	* Disallow attaching multi-prog if overridable or none
830	*/
831	return -EPERM;
832
833	if (prog_list_length(head: progs, NULL) >= BPF_CGROUP_MAX_PROGS)
834	return -E2BIG;
835
836	pl = find_attach_entry(progs, prog, link, replace_prog,
837	allow_multi: flags & BPF_F_ALLOW_MULTI);
838	if (IS_ERR(ptr: pl))
839	return PTR_ERR(ptr: pl);
840
841	if (bpf_cgroup_storages_alloc(storages: storage, new_storages: new_storage, type,
842	prog: prog ? : link->link.prog, cgrp))
843	return -ENOMEM;
844
845	if (pl) {
846	old_prog = pl->prog;
847	} else {
848	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
849	if (!pl) {
850	bpf_cgroup_storages_free(storages: new_storage);
851	return -ENOMEM;
852	}
853
854	err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
855	if (err) {
856	kfree(objp: pl);
857	bpf_cgroup_storages_free(storages: new_storage);
858	return err;
859	}
860	}
861
862	pl->prog = prog;
863	pl->link = link;
864	pl->flags = flags;
865	bpf_cgroup_storages_assign(dst: pl->storage, src: storage);
866	cgrp->bpf.flags[atype] = saved_flags;
867
868	if (type == BPF_LSM_CGROUP) {
869	err = bpf_trampoline_link_cgroup_shim(prog: new_prog, cgroup_atype: atype, attach_type: type);
870	if (err)
871	goto cleanup;
872	}
873
874	err = update_effective_progs(cgrp, atype);
875	if (err)
876	goto cleanup_trampoline;
877
878	cgrp->bpf.revisions[atype] += `1`;
879	if (old_prog) {
880	if (type == BPF_LSM_CGROUP)
881	bpf_trampoline_unlink_cgroup_shim(prog: old_prog);
882	bpf_prog_put(prog: old_prog);
883	} else {
884	static_branch_inc(&cgroup_bpf_enabled_key[atype]);
885	}
886	bpf_cgroup_storages_link(storages: new_storage, cgrp, attach_type: type);
887	return `0`;
888
889	cleanup_trampoline:
890	if (type == BPF_LSM_CGROUP)
891	bpf_trampoline_unlink_cgroup_shim(prog: new_prog);
892
893	cleanup:
894	if (old_prog) {
895	pl->prog = old_prog;
896	pl->link = NULL;
897	}
898	bpf_cgroup_storages_free(storages: new_storage);
899	if (!old_prog) {
900	hlist_del(n: &pl->node);
901	kfree(objp: pl);
902	}
903	return err;
904	}
905
906	static int cgroup_bpf_attach(struct cgroup *cgrp,
907	struct bpf_prog prog, struct* bpf_prog *replace_prog,
908	struct bpf_cgroup_link *link,
909	enum bpf_attach_type type,
910	u32 flags, u32 id_or_fd, u64 revision)
911	{
912	int ret;
913
914	cgroup_lock();
915	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
916	id_or_fd, revision);
917	cgroup_unlock();
918	return ret;
919	}
920
921	/ Swap updated BPF program for given link in effective program arrays across*
922	* all descendant cgroups. This function is guaranteed to succeed.
923	*/
924	static void replace_effective_prog(struct cgroup *cgrp,
925	enum cgroup_bpf_attach_type atype,
926	struct bpf_cgroup_link *link)
927	{
928	struct bpf_prog_array_item *item;
929	struct cgroup_subsys_state *css;
930	struct bpf_prog_array *progs;
931	struct bpf_prog_list *pl;
932	struct hlist_head *head;
933	struct cgroup *cg;
934	int pos;
935
936	css_for_each_descendant_pre(css, &cgrp->self) {
937	struct cgroup desc = container_of(css, struct* cgroup, self);
938
939	if (percpu_ref_is_zero(ref: &desc->bpf.refcnt))
940	continue;
941
942	/ find position of link in effective progs array /
943	for (pos = `0`, cg = desc; cg; cg = cgroup_parent(cgrp: cg)) {
944	if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
945	continue;
946
947	head = &cg->bpf.progs[atype];
948	hlist_for_each_entry(pl, head, node) {
949	if (!prog_list_prog(pl))
950	continue;
951	if (pl->link == link)
952	goto found;
953	pos++;
954	}
955	}
956	found:
957	BUG_ON(!cg);
958	progs = rcu_dereference_protected(
959	desc->bpf.effective[atype],
960	lockdep_is_held(&cgroup_mutex));
961	item = &progs->items[pos];
962	WRITE_ONCE(item->prog, link->link.prog);
963	}
964	}
965
966	/**
967	* __cgroup_bpf_replace() - Replace link's program and propagate the change
968	* to descendants
969	* @cgrp: The cgroup which descendants to traverse
970	* @link: A link for which to replace BPF program
971	* @new_prog: &struct bpf_prog for the target BPF program with its refcnt
972	* incremented
973	*
974	* Must be called with cgroup_mutex held.
975	*/
976	static int __cgroup_bpf_replace(struct cgroup *cgrp,
977	struct bpf_cgroup_link *link,
978	struct bpf_prog *new_prog)
979	{
980	enum cgroup_bpf_attach_type atype;
981	struct bpf_prog *old_prog;
982	struct bpf_prog_list *pl;
983	struct hlist_head *progs;
984	bool found = false;
985
986	atype = bpf_cgroup_atype_find(attach_type: link->link.attach_type, attach_btf_id: new_prog->aux->attach_btf_id);
987	if (atype < `0`)
988	return -EINVAL;
989
990	progs = &cgrp->bpf.progs[atype];
991
992	if (link->link.prog->type != new_prog->type)
993	return -EINVAL;
994
995	hlist_for_each_entry(pl, progs, node) {
996	if (pl->link == link) {
997	found = true;
998	break;
999	}
1000	}
1001	if (!found)
1002	return -ENOENT;
1003
1004	cgrp->bpf.revisions[atype] += `1`;
1005	old_prog = xchg(&link->link.prog, new_prog);
1006	replace_effective_prog(cgrp, atype, link);
1007	bpf_prog_put(prog: old_prog);
1008	return `0`;
1009	}
1010
1011	static int cgroup_bpf_replace(struct bpf_link link, struct* bpf_prog *new_prog,
1012	struct bpf_prog *old_prog)
1013	{
1014	struct bpf_cgroup_link *cg_link;
1015	int ret;
1016
1017	cg_link = container_of(link, struct bpf_cgroup_link, link);
1018
1019	cgroup_lock();
1020	/ link might have been auto-released by dying cgroup, so fail /
1021	if (!cg_link->cgroup) {
1022	ret = -ENOLINK;
1023	goto out_unlock;
1024	}
1025	if (old_prog && link->prog != old_prog) {
1026	ret = -EPERM;
1027	goto out_unlock;
1028	}
1029	ret = __cgroup_bpf_replace(cgrp: cg_link->cgroup, link: cg_link, new_prog);
1030	out_unlock:
1031	cgroup_unlock();
1032	return ret;
1033	}
1034
1035	static struct bpf_prog_list find_detach_entry(struct* hlist_head *progs,
1036	struct bpf_prog *prog,
1037	struct bpf_cgroup_link *link,
1038	bool allow_multi)
1039	{
1040	struct bpf_prog_list *pl;
1041
1042	if (!allow_multi) {
1043	if (hlist_empty(h: progs))
1044	/ report error when trying to detach and nothing is attached /
1045	return ERR_PTR(error: -ENOENT);
1046
1047	/ to maintain backward compatibility NONE and OVERRIDE cgroups*
1048	* allow detaching with invalid FD (prog==NULL) in legacy mode
1049	*/
1050	return hlist_entry(progs->first, typeof(*pl), node);
1051	}
1052
1053	if (!prog && !link)
1054	/ to detach MULTI prog the user has to specify valid FD*
1055	* of the program or link to be detached
1056	*/
1057	return ERR_PTR(error: -EINVAL);
1058
1059	/ find the prog or link and detach it /
1060	hlist_for_each_entry(pl, progs, node) {
1061	if (pl->prog == prog && pl->link == link)
1062	return pl;
1063	}
1064	return ERR_PTR(error: -ENOENT);
1065	}
1066
1067	/**
1068	* purge_effective_progs() - After compute_effective_progs fails to alloc new
1069	* cgrp->bpf.inactive table we can recover by
1070	* recomputing the array in place.
1071	*
1072	* @cgrp: The cgroup which descendants to travers
1073	* @prog: A program to detach or NULL
1074	* @link: A link to detach or NULL
1075	* @atype: Type of detach operation
1076	*/
1077	static void purge_effective_progs(struct cgroup cgrp, struct* bpf_prog *prog,
1078	struct bpf_cgroup_link *link,
1079	enum cgroup_bpf_attach_type atype)
1080	{
1081	struct cgroup_subsys_state *css;
1082	struct bpf_prog_array *progs;
1083	struct bpf_prog_list *pl;
1084	struct hlist_head *head;
1085	struct cgroup *cg;
1086	int pos;
1087
1088	/ recompute effective prog array in place /
1089	css_for_each_descendant_pre(css, &cgrp->self) {
1090	struct cgroup desc = container_of(css, struct* cgroup, self);
1091
1092	if (percpu_ref_is_zero(ref: &desc->bpf.refcnt))
1093	continue;
1094
1095	/ find position of link or prog in effective progs array /
1096	for (pos = `0`, cg = desc; cg; cg = cgroup_parent(cgrp: cg)) {
1097	if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
1098	continue;
1099
1100	head = &cg->bpf.progs[atype];
1101	hlist_for_each_entry(pl, head, node) {
1102	if (!prog_list_prog(pl))
1103	continue;
1104	if (pl->prog == prog && pl->link == link)
1105	goto found;
1106	pos++;
1107	}
1108	}
1109
1110	/ no link or prog match, skip the cgroup of this layer /
1111	continue;
1112	found:
1113	progs = rcu_dereference_protected(
1114	desc->bpf.effective[atype],
1115	lockdep_is_held(&cgroup_mutex));
1116
1117	/ Remove the program from the array /
1118	WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
1119	"Failed to purge a prog from array at index %d", pos);
1120	}
1121	}
1122
1123	/**
1124	* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
1125	* propagate the change to descendants
1126	* @cgrp: The cgroup which descendants to traverse
1127	* @prog: A program to detach or NULL
1128	* @link: A link to detach or NULL
1129	* @type: Type of detach operation
1130	* @revision: bpf_prog_list revision
1131	*
1132	* At most one of @prog or @link can be non-NULL.
1133	* Must be called with cgroup_mutex held.
1134	*/
1135	static int __cgroup_bpf_detach(struct cgroup cgrp, struct* bpf_prog *prog,
1136	struct bpf_cgroup_link link, enum* bpf_attach_type type,
1137	u64 revision)
1138	{
1139	enum cgroup_bpf_attach_type atype;
1140	struct bpf_prog *old_prog;
1141	struct bpf_prog_list *pl;
1142	struct hlist_head *progs;
1143	u32 attach_btf_id = `0`;
1144	u32 flags;
1145
1146	if (prog)
1147	attach_btf_id = prog->aux->attach_btf_id;
1148	if (link)
1149	attach_btf_id = link->link.prog->aux->attach_btf_id;
1150
1151	atype = bpf_cgroup_atype_find(attach_type: type, attach_btf_id);
1152	if (atype < `0`)
1153	return -EINVAL;
1154
1155	if (revision && revision != cgrp->bpf.revisions[atype])
1156	return -ESTALE;
1157
1158	progs = &cgrp->bpf.progs[atype];
1159	flags = cgrp->bpf.flags[atype];
1160
1161	if (prog && link)
1162	/ only one of prog or link can be specified /
1163	return -EINVAL;
1164
1165	pl = find_detach_entry(progs, prog, link, allow_multi: flags & BPF_F_ALLOW_MULTI);
1166	if (IS_ERR(ptr: pl))
1167	return PTR_ERR(ptr: pl);
1168
1169	/ mark it deleted, so it's ignored while recomputing effective /
1170	old_prog = pl->prog;
1171	pl->prog = NULL;
1172	pl->link = NULL;
1173
1174	if (update_effective_progs(cgrp, atype)) {
1175	/ if update effective array failed replace the prog with a dummy prog/
1176	pl->prog = old_prog;
1177	pl->link = link;
1178	purge_effective_progs(cgrp, prog: old_prog, link, atype);
1179	}
1180
1181	/ now can actually delete it from this cgroup list /
1182	hlist_del(n: &pl->node);
1183	cgrp->bpf.revisions[atype] += `1`;
1184
1185	kfree(objp: pl);
1186	if (hlist_empty(h: progs))
1187	/ last program was detached, reset flags to zero /
1188	cgrp->bpf.flags[atype] = `0`;
1189	if (old_prog) {
1190	if (type == BPF_LSM_CGROUP)
1191	bpf_trampoline_unlink_cgroup_shim(prog: old_prog);
1192	bpf_prog_put(prog: old_prog);
1193	}
1194	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1195	return `0`;
1196	}
1197
1198	static int cgroup_bpf_detach(struct cgroup cgrp, struct* bpf_prog *prog,
1199	enum bpf_attach_type type, u64 revision)
1200	{
1201	int ret;
1202
1203	cgroup_lock();
1204	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
1205	cgroup_unlock();
1206	return ret;
1207	}
1208
1209	/ Must be called with cgroup_mutex held to avoid races. /
1210	static int __cgroup_bpf_query(struct cgroup cgrp, const* union bpf_attr *attr,
1211	union bpf_attr __user *uattr)
1212	{
1213	__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1214	bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1215	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1216	enum bpf_attach_type type = attr->query.attach_type;
1217	enum cgroup_bpf_attach_type from_atype, to_atype;
1218	enum cgroup_bpf_attach_type atype;
1219	struct bpf_prog_array *effective;
1220	int cnt, ret = `0`, i;
1221	int total_cnt = `0`;
1222	u64 revision = `0`;
1223	u32 flags;
1224
1225	if (effective_query && prog_attach_flags)
1226	return -EINVAL;
1227
1228	if (type == BPF_LSM_CGROUP) {
1229	if (!effective_query && attr->query.prog_cnt &&
1230	prog_ids && !prog_attach_flags)
1231	return -EINVAL;
1232
1233	from_atype = CGROUP_LSM_START;
1234	to_atype = CGROUP_LSM_END;
1235	flags = `0`;
1236	} else {
1237	from_atype = to_cgroup_bpf_attach_type(attach_type: type);
1238	if (from_atype < `0`)
1239	return -EINVAL;
1240	to_atype = from_atype;
1241	flags = cgrp->bpf.flags[from_atype];
1242	}
1243
1244	for (atype = from_atype; atype <= to_atype; atype++) {
1245	if (effective_query) {
1246	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1247	lockdep_is_held(&cgroup_mutex));
1248	total_cnt += bpf_prog_array_length(progs: effective);
1249	} else {
1250	total_cnt += prog_list_length(head: &cgrp->bpf.progs[atype], NULL);
1251	}
1252	}
1253
1254	/ always output uattr->query.attach_flags as 0 during effective query /
1255	flags = effective_query ? `0` : flags;
1256	if (copy_to_user(to: &uattr->query.attach_flags, from: &flags, n: sizeof(flags)))
1257	return -EFAULT;
1258	if (copy_to_user(to: &uattr->query.prog_cnt, from: &total_cnt, n: sizeof(total_cnt)))
1259	return -EFAULT;
1260	if (!effective_query && from_atype == to_atype)
1261	revision = cgrp->bpf.revisions[from_atype];
1262	if (copy_to_user(to: &uattr->query.revision, from: &revision, n: sizeof(revision)))
1263	return -EFAULT;
1264	if (attr->query.prog_cnt == `0` \|\| !prog_ids \|\| !total_cnt)
1265	/ return early if user requested only program count + flags /
1266	return `0`;
1267
1268	if (attr->query.prog_cnt < total_cnt) {
1269	total_cnt = attr->query.prog_cnt;
1270	ret = -ENOSPC;
1271	}
1272
1273	for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1274	if (effective_query) {
1275	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1276	lockdep_is_held(&cgroup_mutex));
1277	cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1278	ret = bpf_prog_array_copy_to_user(progs: effective, prog_ids, cnt);
1279	} else {
1280	struct hlist_head *progs;
1281	struct bpf_prog_list *pl;
1282	struct bpf_prog *prog;
1283	u32 id;
1284
1285	progs = &cgrp->bpf.progs[atype];
1286	cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
1287	i = `0`;
1288	hlist_for_each_entry(pl, progs, node) {
1289	prog = prog_list_prog(pl);
1290	id = prog->aux->id;
1291	if (copy_to_user(to: prog_ids + i, from: &id, n: sizeof(id)))
1292	return -EFAULT;
1293	if (++i == cnt)
1294	break;
1295	}
1296
1297	if (prog_attach_flags) {
1298	flags = cgrp->bpf.flags[atype];
1299
1300	for (i = `0`; i < cnt; i++)
1301	if (copy_to_user(to: prog_attach_flags + i,
1302	from: &flags, n: sizeof(flags)))
1303	return -EFAULT;
1304	prog_attach_flags += cnt;
1305	}
1306	}
1307
1308	prog_ids += cnt;
1309	total_cnt -= cnt;
1310	}
1311	return ret;
1312	}
1313
1314	static int cgroup_bpf_query(struct cgroup cgrp, const* union bpf_attr *attr,
1315	union bpf_attr __user *uattr)
1316	{
1317	int ret;
1318
1319	cgroup_lock();
1320	ret = __cgroup_bpf_query(cgrp, attr, uattr);
1321	cgroup_unlock();
1322	return ret;
1323	}
1324
1325	int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1326	enum bpf_prog_type ptype, struct bpf_prog *prog)
1327	{
1328	struct bpf_prog *replace_prog = NULL;
1329	struct cgroup *cgrp;
1330	int ret;
1331
1332	cgrp = cgroup_get_from_fd(fd: attr->target_fd);
1333	if (IS_ERR(ptr: cgrp))
1334	return PTR_ERR(ptr: cgrp);
1335
1336	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1337	(attr->attach_flags & BPF_F_REPLACE)) {
1338	replace_prog = bpf_prog_get_type(ufd: attr->replace_bpf_fd, type: ptype);
1339	if (IS_ERR(ptr: replace_prog)) {
1340	cgroup_put(cgrp);
1341	return PTR_ERR(ptr: replace_prog);
1342	}
1343	}
1344
1345	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1346	type: attr->attach_type, flags: attr->attach_flags,
1347	id_or_fd: attr->relative_fd, revision: attr->expected_revision);
1348
1349	if (replace_prog)
1350	bpf_prog_put(prog: replace_prog);
1351	cgroup_put(cgrp);
1352	return ret;
1353	}
1354
1355	int cgroup_bpf_prog_detach(const union bpf_attr attr, enum* bpf_prog_type ptype)
1356	{
1357	struct bpf_prog *prog;
1358	struct cgroup *cgrp;
1359	int ret;
1360
1361	cgrp = cgroup_get_from_fd(fd: attr->target_fd);
1362	if (IS_ERR(ptr: cgrp))
1363	return PTR_ERR(ptr: cgrp);
1364
1365	prog = bpf_prog_get_type(ufd: attr->attach_bpf_fd, type: ptype);
1366	if (IS_ERR(ptr: prog))
1367	prog = NULL;
1368
1369	ret = cgroup_bpf_detach(cgrp, prog, type: attr->attach_type, revision: attr->expected_revision);
1370	if (prog)
1371	bpf_prog_put(prog);
1372
1373	cgroup_put(cgrp);
1374	return ret;
1375	}
1376
1377	static void bpf_cgroup_link_release(struct bpf_link *link)
1378	{
1379	struct bpf_cgroup_link *cg_link =
1380	container_of(link, struct bpf_cgroup_link, link);
1381	struct cgroup *cg;
1382
1383	/ link might have been auto-detached by dying cgroup already,*
1384	* in that case our work is done here
1385	*/
1386	if (!cg_link->cgroup)
1387	return;
1388
1389	cgroup_lock();
1390
1391	/ re-check cgroup under lock again /
1392	if (!cg_link->cgroup) {
1393	cgroup_unlock();
1394	return;
1395	}
1396
1397	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1398	link->attach_type, `0`));
1399	if (link->attach_type == BPF_LSM_CGROUP)
1400	bpf_trampoline_unlink_cgroup_shim(prog: cg_link->link.prog);
1401
1402	cg = cg_link->cgroup;
1403	cg_link->cgroup = NULL;
1404
1405	cgroup_unlock();
1406
1407	cgroup_put(cgrp: cg);
1408	}
1409
1410	static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1411	{
1412	struct bpf_cgroup_link *cg_link =
1413	container_of(link, struct bpf_cgroup_link, link);
1414
1415	kfree(objp: cg_link);
1416	}
1417
/*
 * bpf_link_ops.detach callback: performs the same work as release and
 * always reports success.
 */
static int bpf_cgroup_link_detach(struct bpf_link *link)
{
	bpf_cgroup_link_release(link);
	return 0;
}
1424
1425	static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1426	struct seq_file *seq)
1427	{
1428	struct bpf_cgroup_link *cg_link =
1429	container_of(link, struct bpf_cgroup_link, link);
1430	u64 cg_id = `0`;
1431
1432	cgroup_lock();
1433	if (cg_link->cgroup)
1434	cg_id = cgroup_id(cgrp: cg_link->cgroup);
1435	cgroup_unlock();
1436
1437	seq_printf(m: seq,
1438	fmt: "cgroup_id:\t%llu\n"
1439	"attach_type:\t%d\n",
1440	cg_id,
1441	link->attach_type);
1442	}
1443
1444	static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1445	struct bpf_link_info *info)
1446	{
1447	struct bpf_cgroup_link *cg_link =
1448	container_of(link, struct bpf_cgroup_link, link);
1449	u64 cg_id = `0`;
1450
1451	cgroup_lock();
1452	if (cg_link->cgroup)
1453	cg_id = cgroup_id(cgrp: cg_link->cgroup);
1454	cgroup_unlock();
1455
1456	info->cgroup.cgroup_id = cg_id;
1457	info->cgroup.attach_type = link->attach_type;
1458	return `0`;
1459	}
1460
1461	static const struct bpf_link_ops bpf_cgroup_link_lops = {
1462	.release = bpf_cgroup_link_release,
1463	.dealloc = bpf_cgroup_link_dealloc,
1464	.detach = bpf_cgroup_link_detach,
1465	.update_prog = cgroup_bpf_replace,
1466	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1467	.fill_link_info = bpf_cgroup_link_fill_link_info,
1468	};
1469
/* link_create.flags bits that cgroup_bpf_link_attach() accepts. */
#define BPF_F_LINK_ATTACH_MASK					\
	(BPF_F_ID | BPF_F_BEFORE | BPF_F_AFTER |		\
	 BPF_F_PREORDER | BPF_F_LINK)
1476
1477	int cgroup_bpf_link_attach(const union bpf_attr attr, struct* bpf_prog *prog)
1478	{
1479	struct bpf_link_primer link_primer;
1480	struct bpf_cgroup_link *link;
1481	struct cgroup *cgrp;
1482	int err;
1483
1484	if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
1485	return -EINVAL;
1486
1487	cgrp = cgroup_get_from_fd(fd: attr->link_create.target_fd);
1488	if (IS_ERR(ptr: cgrp))
1489	return PTR_ERR(ptr: cgrp);
1490
1491	link = kzalloc(sizeof(*link), GFP_USER);
1492	if (!link) {
1493	err = -ENOMEM;
1494	goto out_put_cgroup;
1495	}
1496	bpf_link_init(link: &link->link, type: BPF_LINK_TYPE_CGROUP, ops: &bpf_cgroup_link_lops,
1497	prog, attach_type: attr->link_create.attach_type);
1498	link->cgroup = cgrp;
1499
1500	err = bpf_link_prime(link: &link->link, primer: &link_primer);
1501	if (err) {
1502	kfree(objp: link);
1503	goto out_put_cgroup;
1504	}
1505
1506	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1507	type: link->link.attach_type, BPF_F_ALLOW_MULTI \| attr->link_create.flags,
1508	id_or_fd: attr->link_create.cgroup.relative_fd,
1509	revision: attr->link_create.cgroup.expected_revision);
1510	if (err) {
1511	bpf_link_cleanup(primer: &link_primer);
1512	goto out_put_cgroup;
1513	}
1514
1515	return bpf_link_settle(primer: &link_primer);
1516
1517	out_put_cgroup:
1518	cgroup_put(cgrp);
1519	return err;
1520	}
1521
1522	int cgroup_bpf_prog_query(const union bpf_attr *attr,
1523	union bpf_attr __user *uattr)
1524	{
1525	struct cgroup *cgrp;
1526	int ret;
1527
1528	cgrp = cgroup_get_from_fd(fd: attr->query.target_fd);
1529	if (IS_ERR(ptr: cgrp))
1530	return PTR_ERR(ptr: cgrp);
1531
1532	ret = cgroup_bpf_query(cgrp, attr, uattr);
1533
1534	cgroup_put(cgrp);
1535	return ret;
1536	}
1537
1538	/**
1539	* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1540	* @sk: The socket sending or receiving traffic
1541	* @skb: The skb that is being sent or received
1542	* @atype: The type of program to be executed
1543	*
1544	* If no socket is passed, or the socket is not of type INET or INET6,
1545	* this function does nothing and returns 0.
1546	*
1547	* The program type passed in via @type must be suitable for network
1548	* filtering. No further check is performed to assert that.
1549	*
1550	* For egress packets, this function can return:
1551	* NET_XMIT_SUCCESS (0) - continue with packet output
1552	* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
1553	* NET_XMIT_CN (2) - continue with packet output and notify TCP
1554	* to call cwr
1555	* -err - drop packet
1556	*
1557	* For ingress packets, this function will return -EPERM if any
1558	* attached program was found and if it returned != 1 during execution.
1559	* Otherwise 0 is returned.
1560	*/
1561	int __cgroup_bpf_run_filter_skb(struct sock *sk,
1562	struct sk_buff *skb,
1563	enum cgroup_bpf_attach_type atype)
1564	{
1565	unsigned int offset = -skb_network_offset(skb);
1566	struct sock *save_sk;
1567	void *saved_data_end;
1568	struct cgroup *cgrp;
1569	int ret;
1570
1571	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1572	return `0`;
1573
1574	cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
1575	save_sk = skb->sk;
1576	skb->sk = sk;
1577	__skb_push(skb, len: offset);
1578
1579	/ compute pointers for the bpf prog /
1580	bpf_compute_and_save_data_end(skb, saved_data_end: &saved_data_end);
1581
1582	if (atype == CGROUP_INET_EGRESS) {
1583	u32 flags = `0`;
1584	bool cn;
1585
1586	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: skb,
1587	run_prog: __bpf_prog_run_save_cb, retval: `0`, ret_flags: &flags);
1588
1589	/ Return values of CGROUP EGRESS BPF programs are:*
1590	* 0: drop packet
1591	* 1: keep packet
1592	* 2: drop packet and cn
1593	* 3: keep packet and cn
1594	*
1595	* The returned value is then converted to one of the NET_XMIT
1596	* or an error code that is then interpreted as drop packet
1597	* (and no cn):
1598	* 0: NET_XMIT_SUCCESS skb should be transmitted
1599	* 1: NET_XMIT_DROP skb should be dropped and cn
1600	* 2: NET_XMIT_CN skb should be transmitted and cn
1601	* 3: -err skb should be dropped
1602	*/
1603
1604	cn = flags & BPF_RET_SET_CN;
1605	if (ret && !IS_ERR_VALUE((long)ret))
1606	ret = -EFAULT;
1607	if (!ret)
1608	ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1609	else
1610	ret = (cn ? NET_XMIT_DROP : ret);
1611	} else {
1612	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype,
1613	ctx: skb, run_prog: __bpf_prog_run_save_cb, retval: `0`,
1614	NULL);
1615	if (ret && !IS_ERR_VALUE((long)ret))
1616	ret = -EFAULT;
1617	}
1618	bpf_restore_data_end(skb, saved_data_end);
1619	__skb_pull(skb, len: offset);
1620	skb->sk = save_sk;
1621
1622	return ret;
1623	}
1624	EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1625
1626	/**
1627	* __cgroup_bpf_run_filter_sk() - Run a program on a sock
1628	* @sk: sock structure to manipulate
1629	* @atype: The type of program to be executed
1630	*
1631	* socket is passed is expected to be of type INET or INET6.
1632	*
1633	* The program type passed in via @type must be suitable for sock
1634	* filtering. No further check is performed to assert that.
1635	*
1636	* This function will return %-EPERM if any if an attached program was found
1637	* and if it returned != 1 during execution. In all other cases, 0 is returned.
1638	*/
1639	int __cgroup_bpf_run_filter_sk(struct sock *sk,
1640	enum cgroup_bpf_attach_type atype)
1641	{
1642	struct cgroup *cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
1643
1644	return bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: sk, run_prog: bpf_prog_run, retval: `0`,
1645	NULL);
1646	}
1647	EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1648
1649	/**
1650	* __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1651	* provided by user sockaddr
1652	* @sk: sock struct that will use sockaddr
1653	* @uaddr: sockaddr struct provided by user
1654	* @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
1655	* read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
1656	* uaddr.
1657	* @atype: The type of program to be executed
1658	* @t_ctx: Pointer to attach type specific context
1659	* @flags: Pointer to u32 which contains higher bits of BPF program
1660	* return value (OR'ed together).
1661	*
1662	* socket is expected to be of type INET, INET6 or UNIX.
1663	*
1664	* This function will return %-EPERM if an attached program is found and
1665	* returned value != 1 during execution. In all other cases, 0 is returned.
1666	*/
1667	int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1668	struct sockaddr_unsized *uaddr,
1669	int *uaddrlen,
1670	enum cgroup_bpf_attach_type atype,
1671	void *t_ctx,
1672	u32 *flags)
1673	{
1674	struct bpf_sock_addr_kern ctx = {
1675	.sk = sk,
1676	.uaddr = uaddr,
1677	.t_ctx = t_ctx,
1678	};
1679	struct sockaddr_storage storage;
1680	struct cgroup *cgrp;
1681	int ret;
1682
1683	/ Check socket family since not all sockets represent network*
1684	* endpoint (e.g. AF_UNIX).
1685	*/
1686	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
1687	sk->sk_family != AF_UNIX)
1688	return `0`;
1689
1690	if (!ctx.uaddr) {
1691	memset(&storage, `0`, sizeof(storage));
1692	ctx.uaddr = (struct sockaddr_unsized *)&storage;
1693	ctx.uaddrlen = `0`;
1694	} else {
1695	ctx.uaddrlen = *uaddrlen;
1696	}
1697
1698	cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
1699	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: &ctx, run_prog: bpf_prog_run,
1700	retval: `0`, ret_flags: flags);
1701
1702	if (!ret && uaddr)
1703	*uaddrlen = ctx.uaddrlen;
1704
1705	return ret;
1706	}
1707	EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1708
1709	/**
1710	* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1711	* @sk: socket to get cgroup from
1712	* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1713	* sk with connection information (IP addresses, etc.) May not contain
1714	* cgroup info if it is a req sock.
1715	* @atype: The type of program to be executed
1716	*
1717	* socket passed is expected to be of type INET or INET6.
1718	*
1719	* The program type passed in via @type must be suitable for sock_ops
1720	* filtering. No further check is performed to assert that.
1721	*
1722	* This function will return %-EPERM if any if an attached program was found
1723	* and if it returned != 1 during execution. In all other cases, 0 is returned.
1724	*/
1725	int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1726	struct bpf_sock_ops_kern *sock_ops,
1727	enum cgroup_bpf_attach_type atype)
1728	{
1729	struct cgroup *cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
1730
1731	return bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: sock_ops, run_prog: bpf_prog_run,
1732	retval: `0`, NULL);
1733	}
1734	EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1735
1736	int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1737	short access, enum cgroup_bpf_attach_type atype)
1738	{
1739	struct cgroup *cgrp;
1740	struct bpf_cgroup_dev_ctx ctx = {
1741	.access_type = (access << `16`) \| dev_type,
1742	.major = major,
1743	.minor = minor,
1744	};
1745	int ret;
1746
1747	rcu_read_lock();
1748	cgrp = task_dfl_cgroup(current);
1749	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: &ctx, run_prog: bpf_prog_run, retval: `0`,
1750	NULL);
1751	rcu_read_unlock();
1752
1753	return ret;
1754	}
1755
1756	BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
1757	{
1758	/ flags argument is not used now,*
1759	* but provides an ability to extend the API.
1760	* verifier checks that its value is correct.
1761	*/
1762	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
1763	struct bpf_cgroup_storage *storage;
1764	struct bpf_cg_run_ctx *ctx;
1765	void *ptr;
1766
1767	/ get current cgroup storage from BPF run context /
1768	ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1769	storage = ctx->prog_item->cgroup_storage[stype];
1770
1771	if (stype == BPF_CGROUP_STORAGE_SHARED)
1772	ptr = &READ_ONCE(storage->buf)->data[`0`];
1773	else
1774	ptr = this_cpu_ptr(storage->percpu_buf);
1775
1776	return (unsigned long)ptr;
1777	}
1778
1779	const struct bpf_func_proto bpf_get_local_storage_proto = {
1780	.func = bpf_get_local_storage,
1781	.gpl_only = false,
1782	.ret_type = RET_PTR_TO_MAP_VALUE,
1783	.arg1_type = ARG_CONST_MAP_PTR,
1784	.arg2_type = ARG_ANYTHING,
1785	};
1786
1787	BPF_CALL_0(bpf_get_retval)
1788	{
1789	struct bpf_cg_run_ctx *ctx =
1790	container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1791
1792	return ctx->retval;
1793	}
1794
1795	const struct bpf_func_proto bpf_get_retval_proto = {
1796	.func = bpf_get_retval,
1797	.gpl_only = false,
1798	.ret_type = RET_INTEGER,
1799	};
1800
1801	BPF_CALL_1(bpf_set_retval, int, retval)
1802	{
1803	struct bpf_cg_run_ctx *ctx =
1804	container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1805
1806	ctx->retval = retval;
1807	return `0`;
1808	}
1809
1810	const struct bpf_func_proto bpf_set_retval_proto = {
1811	.func = bpf_set_retval,
1812	.gpl_only = false,
1813	.ret_type = RET_INTEGER,
1814	.arg1_type = ARG_ANYTHING,
1815	};
1816
1817	static const struct bpf_func_proto *
1818	cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1819	{
1820	const struct bpf_func_proto *func_proto;
1821
1822	func_proto = cgroup_common_func_proto(func_id, prog);
1823	if (func_proto)
1824	return func_proto;
1825
1826	switch (func_id) {
1827	case BPF_FUNC_perf_event_output:
1828	return &bpf_event_output_data_proto;
1829	default:
1830	return bpf_base_func_proto(func_id, prog);
1831	}
1832	}
1833
1834	static bool cgroup_dev_is_valid_access(int off, int size,
1835	enum bpf_access_type type,
1836	const struct bpf_prog *prog,
1837	struct bpf_insn_access_aux *info)
1838	{
1839	const int size_default = sizeof(__u32);
1840
1841	if (type == BPF_WRITE)
1842	return false;
1843
1844	if (off < `0` \|\| off + size > sizeof(struct bpf_cgroup_dev_ctx))
1845	return false;
1846	/ The verifier guarantees that size > 0. /
1847	if (off % size != `0`)
1848	return false;
1849
1850	switch (off) {
1851	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1852	bpf_ctx_record_field_size(aux: info, size: size_default);
1853	if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1854	return false;
1855	break;
1856	default:
1857	if (size != size_default)
1858	return false;
1859	}
1860
1861	return true;
1862	}
1863
1864	const struct bpf_prog_ops cg_dev_prog_ops = {
1865	};
1866
1867	const struct bpf_verifier_ops cg_dev_verifier_ops = {
1868	.get_func_proto = cgroup_dev_func_proto,
1869	.is_valid_access = cgroup_dev_is_valid_access,
1870	};
1871
1872	/**
1873	* __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1874	*
1875	* @head: sysctl table header
1876	* @table: sysctl table
1877	* @write: sysctl is being read (= 0) or written (= 1)
1878	* @buf: pointer to buffer (in and out)
1879	* @pcount: value-result argument: value is size of buffer pointed to by @buf,
1880	* result is size of @new_buf if program set new value, initial value
1881	* otherwise
1882	* @ppos: value-result argument: value is position at which read from or write
1883	* to sysctl is happening, result is new position if program overrode it,
1884	* initial value otherwise
1885	* @atype: type of program to be executed
1886	*
1887	* Program is run when sysctl is being accessed, either read or written, and
1888	* can allow or deny such access.
1889	*
1890	* This function will return %-EPERM if an attached program is found and
1891	* returned value != 1 during execution. In all other cases 0 is returned.
1892	*/
1893	int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1894	const struct ctl_table table, int* write,
1895	char *buf, size_t pcount, loff_t *ppos,
1896	enum cgroup_bpf_attach_type atype)
1897	{
1898	struct bpf_sysctl_kern ctx = {
1899	.head = head,
1900	.table = table,
1901	.write = write,
1902	.ppos = ppos,
1903	.cur_val = NULL,
1904	.cur_len = PAGE_SIZE,
1905	.new_val = NULL,
1906	.new_len = `0`,
1907	.new_updated = `0`,
1908	};
1909	struct cgroup *cgrp;
1910	loff_t pos = `0`;
1911	int ret;
1912
1913	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1914	if (!ctx.cur_val \|\|
1915	table->proc_handler(table, `0`, ctx.cur_val, &ctx.cur_len, &pos)) {
1916	/ Let BPF program decide how to proceed. /
1917	ctx.cur_len = `0`;
1918	}
1919
1920	if (write && buf && pcount) {
1921	/ BPF program should be able to override new value with a*
1922	* buffer bigger than provided by user.
1923	*/
1924	ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1925	ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1926	if (ctx.new_val) {
1927	memcpy(ctx.new_val, *buf, ctx.new_len);
1928	} else {
1929	/ Let BPF program decide how to proceed. /
1930	ctx.new_len = `0`;
1931	}
1932	}
1933
1934	rcu_read_lock();
1935	cgrp = task_dfl_cgroup(current);
1936	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype, ctx: &ctx, run_prog: bpf_prog_run, retval: `0`,
1937	NULL);
1938	rcu_read_unlock();
1939
1940	kfree(objp: ctx.cur_val);
1941
1942	if (ret == `1` && ctx.new_updated) {
1943	kfree(objp: *buf);
1944	*buf = ctx.new_val;
1945	*pcount = ctx.new_len;
1946	} else {
1947	kfree(objp: ctx.new_val);
1948	}
1949
1950	return ret;
1951	}
1952
1953	#ifdef CONFIG_NET
1954	static int sockopt_alloc_buf(struct bpf_sockopt_kern ctx, int* max_optlen,
1955	struct bpf_sockopt_buf *buf)
1956	{
1957	if (unlikely(max_optlen < `0`))
1958	return -EINVAL;
1959
1960	if (unlikely(max_optlen > PAGE_SIZE)) {
1961	/ We don't expose optvals that are greater than PAGE_SIZE*
1962	* to the BPF program.
1963	*/
1964	max_optlen = PAGE_SIZE;
1965	}
1966
1967	if (max_optlen <= sizeof(buf->data)) {
1968	/ When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE*
1969	* bytes avoid the cost of kzalloc.
1970	*/
1971	ctx->optval = buf->data;
1972	ctx->optval_end = ctx->optval + max_optlen;
1973	return max_optlen;
1974	}
1975
1976	ctx->optval = kzalloc(max_optlen, GFP_USER);
1977	if (!ctx->optval)
1978	return -ENOMEM;
1979
1980	ctx->optval_end = ctx->optval + max_optlen;
1981
1982	return max_optlen;
1983	}
1984
1985	static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1986	struct bpf_sockopt_buf *buf)
1987	{
1988	if (ctx->optval == buf->data)
1989	return;
1990	kfree(objp: ctx->optval);
1991	}
1992
1993	static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1994	struct bpf_sockopt_buf *buf)
1995	{
1996	return ctx->optval != buf->data;
1997	}
1998
1999	int __cgroup_bpf_run_filter_setsockopt(struct sock sk, int* *level,
2000	int *optname, sockptr_t optval,
2001	int optlen, char* **kernel_optval)
2002	{
2003	struct cgroup *cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
2004	struct bpf_sockopt_buf buf = {};
2005	struct bpf_sockopt_kern ctx = {
2006	.sk = sk,
2007	.level = *level,
2008	.optname = *optname,
2009	};
2010	int ret, max_optlen;
2011
2012	/ Allocate a bit more than the initial user buffer for*
2013	* BPF program. The canonical use case is overriding
2014	* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
2015	*/
2016	max_optlen = max_t(int, `16`, *optlen);
2017	max_optlen = sockopt_alloc_buf(ctx: &ctx, max_optlen, buf: &buf);
2018	if (max_optlen < `0`)
2019	return max_optlen;
2020
2021	ctx.optlen = *optlen;
2022
2023	if (copy_from_sockptr(dst: ctx.optval, src: optval,
2024	min(*optlen, max_optlen))) {
2025	ret = -EFAULT;
2026	goto out;
2027	}
2028
2029	lock_sock(sk);
2030	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype: CGROUP_SETSOCKOPT,
2031	ctx: &ctx, run_prog: bpf_prog_run, retval: `0`, NULL);
2032	release_sock(sk);
2033
2034	if (ret)
2035	goto out;
2036
2037	if (ctx.optlen == -`1`) {
2038	/ optlen set to -1, bypass kernel /
2039	ret = `1`;
2040	} else if (ctx.optlen > max_optlen \|\| ctx.optlen < -`1`) {
2041	/ optlen is out of bounds /
2042	if (*optlen > PAGE_SIZE && ctx.optlen >= `0`) {
2043	pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
2044	ctx.optlen, max_optlen);
2045	ret = `0`;
2046	goto out;
2047	}
2048	ret = -EFAULT;
2049	} else {
2050	/ optlen within bounds, run kernel handler /
2051	ret = `0`;
2052
2053	/ export any potential modifications /
2054	*level = ctx.level;
2055	*optname = ctx.optname;
2056
2057	/ optlen == 0 from BPF indicates that we should*
2058	* use original userspace data.
2059	*/
2060	if (ctx.optlen != `0`) {
2061	*optlen = ctx.optlen;
2062	/ We've used bpf_sockopt_kern->buf as an intermediary*
2063	* storage, but the BPF program indicates that we need
2064	* to pass this data to the kernel setsockopt handler.
2065	* No way to export on-stack buf, have to allocate a
2066	* new buffer.
2067	*/
2068	if (!sockopt_buf_allocated(ctx: &ctx, buf: &buf)) {
2069	void *p = kmalloc(ctx.optlen, GFP_USER);
2070
2071	if (!p) {
2072	ret = -ENOMEM;
2073	goto out;
2074	}
2075	memcpy(p, ctx.optval, ctx.optlen);
2076	*kernel_optval = p;
2077	} else {
2078	*kernel_optval = ctx.optval;
2079	}
2080	/ export and don't free sockopt buf /
2081	return `0`;
2082	}
2083	}
2084
2085	out:
2086	sockopt_free_buf(ctx: &ctx, buf: &buf);
2087	return ret;
2088	}
2089
2090	int __cgroup_bpf_run_filter_getsockopt(struct sock sk, int* level,
2091	int optname, sockptr_t optval,
2092	sockptr_t optlen, int max_optlen,
2093	int retval)
2094	{
2095	struct cgroup *cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
2096	struct bpf_sockopt_buf buf = {};
2097	struct bpf_sockopt_kern ctx = {
2098	.sk = sk,
2099	.level = level,
2100	.optname = optname,
2101	.current_task = current,
2102	};
2103	int orig_optlen;
2104	int ret;
2105
2106	orig_optlen = max_optlen;
2107	ctx.optlen = max_optlen;
2108	max_optlen = sockopt_alloc_buf(ctx: &ctx, max_optlen, buf: &buf);
2109	if (max_optlen < `0`)
2110	return max_optlen;
2111
2112	if (!retval) {
2113	/ If kernel getsockopt finished successfully,*
2114	* copy whatever was returned to the user back
2115	* into our temporary buffer. Set optlen to the
2116	* one that kernel returned as well to let
2117	* BPF programs inspect the value.
2118	*/
2119	if (copy_from_sockptr(dst: &ctx.optlen, src: optlen,
2120	size: sizeof(ctx.optlen))) {
2121	ret = -EFAULT;
2122	goto out;
2123	}
2124
2125	if (ctx.optlen < `0`) {
2126	ret = -EFAULT;
2127	goto out;
2128	}
2129	orig_optlen = ctx.optlen;
2130
2131	if (copy_from_sockptr(dst: ctx.optval, src: optval,
2132	min(ctx.optlen, max_optlen))) {
2133	ret = -EFAULT;
2134	goto out;
2135	}
2136	}
2137
2138	lock_sock(sk);
2139	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype: CGROUP_GETSOCKOPT,
2140	ctx: &ctx, run_prog: bpf_prog_run, retval, NULL);
2141	release_sock(sk);
2142
2143	if (ret < `0`)
2144	goto out;
2145
2146	if (!sockptr_is_null(sockptr: optval) &&
2147	(ctx.optlen > max_optlen \|\| ctx.optlen < `0`)) {
2148	if (orig_optlen > PAGE_SIZE && ctx.optlen >= `0`) {
2149	pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
2150	ctx.optlen, max_optlen);
2151	ret = retval;
2152	goto out;
2153	}
2154	ret = -EFAULT;
2155	goto out;
2156	}
2157
2158	if (ctx.optlen != `0`) {
2159	if (!sockptr_is_null(sockptr: optval) &&
2160	copy_to_sockptr(dst: optval, src: ctx.optval, size: ctx.optlen)) {
2161	ret = -EFAULT;
2162	goto out;
2163	}
2164	if (copy_to_sockptr(dst: optlen, src: &ctx.optlen, size: sizeof(ctx.optlen))) {
2165	ret = -EFAULT;
2166	goto out;
2167	}
2168	}
2169
2170	out:
2171	sockopt_free_buf(ctx: &ctx, buf: &buf);
2172	return ret;
2173	}
2174
2175	int __cgroup_bpf_run_filter_getsockopt_kern(struct sock sk, int* level,
2176	int optname, void *optval,
2177	int optlen, int* retval)
2178	{
2179	struct cgroup *cgrp = sock_cgroup_ptr(skcd: &sk->sk_cgrp_data);
2180	struct bpf_sockopt_kern ctx = {
2181	.sk = sk,
2182	.level = level,
2183	.optname = optname,
2184	.optlen = *optlen,
2185	.optval = optval,
2186	.optval_end = optval + *optlen,
2187	.current_task = current,
2188	};
2189	int ret;
2190
2191	/ Note that __cgroup_bpf_run_filter_getsockopt doesn't copy*
2192	* user data back into BPF buffer when reval != 0. This is
2193	* done as an optimization to avoid extra copy, assuming
2194	* kernel won't populate the data in case of an error.
2195	* Here we always pass the data and memset() should
2196	* be called if that data shouldn't be "exported".
2197	*/
2198
2199	ret = bpf_prog_run_array_cg(cgrp: &cgrp->bpf, atype: CGROUP_GETSOCKOPT,
2200	ctx: &ctx, run_prog: bpf_prog_run, retval, NULL);
2201	if (ret < `0`)
2202	return ret;
2203
2204	if (ctx.optlen > *optlen)
2205	return -EFAULT;
2206
2207	/ BPF programs can shrink the buffer, export the modifications.*
2208	*/
2209	if (ctx.optlen != `0`)
2210	*optlen = ctx.optlen;
2211
2212	return ret;
2213	}
2214	#endif
2215
2216	static ssize_t sysctl_cpy_dir(const struct ctl_dir dir, char* **bufp,
2217	size_t *lenp)
2218	{
2219	ssize_t tmp_ret = `0`, ret;
2220
2221	if (dir->header.parent) {
2222	tmp_ret = sysctl_cpy_dir(dir: dir->header.parent, bufp, lenp);
2223	if (tmp_ret < `0`)
2224	return tmp_ret;
2225	}
2226
2227	ret = strscpy(bufp, dir->header.ctl_table[`0`].procname, lenp);
2228	if (ret < `0`)
2229	return ret;
2230	*bufp += ret;
2231	*lenp -= ret;
2232	ret += tmp_ret;
2233
2234	/ Avoid leading slash. /
2235	if (!ret)
2236	return ret;
2237
2238	tmp_ret = strscpy(bufp, "/", lenp);
2239	if (tmp_ret < `0`)
2240	return tmp_ret;
2241	*bufp += tmp_ret;
2242	*lenp -= tmp_ret;
2243
2244	return ret + tmp_ret;
2245	}
2246
2247	BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern , ctx, char* *, buf,
2248	size_t, buf_len, u64, flags)
2249	{
2250	ssize_t tmp_ret = `0`, ret;
2251
2252	if (!buf)
2253	return -EINVAL;
2254
2255	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
2256	if (!ctx->head)
2257	return -EINVAL;
2258	tmp_ret = sysctl_cpy_dir(dir: ctx->head->parent, bufp: &buf, lenp: &buf_len);
2259	if (tmp_ret < `0`)
2260	return tmp_ret;
2261	}
2262
2263	ret = strscpy(buf, ctx->table->procname, buf_len);
2264
2265	return ret < `0` ? ret : tmp_ret + ret;
2266	}
2267
2268	static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
2269	.func = bpf_sysctl_get_name,
2270	.gpl_only = false,
2271	.ret_type = RET_INTEGER,
2272	.arg1_type = ARG_PTR_TO_CTX,
2273	.arg2_type = ARG_PTR_TO_MEM \| MEM_WRITE,
2274	.arg3_type = ARG_CONST_SIZE,
2275	.arg4_type = ARG_ANYTHING,
2276	};
2277
/*
 * Copy a sysctl value into a caller-supplied buffer.
 *
 * @dst:     destination buffer
 * @dst_len: size of @dst in bytes
 * @src:     value to copy
 * @src_len: length of the value in bytes
 *
 * Whenever @dst is written it ends up NUL-terminated: a value that fits
 * is zero-padded to @dst_len, a value that does not fit is truncated and
 * the last byte of @dst is overwritten with '\0'.
 *
 * Return: @src_len when the value fits with at least one byte to spare;
 * -EINVAL when @dst is NULL, or when @src is NULL or empty (in which case
 * @dst is zeroed); -E2BIG when @dst_len is zero or the value was truncated.
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	if (dst_len > src_len) {
		/* Whole value fits; zero the tail so no stale bytes remain. */
		memcpy(dst, src, src_len);
		memset(dst + src_len, '\0', dst_len - src_len);
		return src_len;
	}

	/* Truncate, sacrificing the last byte for the terminator. */
	memcpy(dst, src, dst_len);
	dst[dst_len - 1] = '\0';

	return -E2BIG;
}
2303
2304	BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
2305	char *, buf, size_t, buf_len)
2306	{
2307	return copy_sysctl_value(dst: buf, dst_len: buf_len, src: ctx->cur_val, src_len: ctx->cur_len);
2308	}
2309
2310	static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
2311	.func = bpf_sysctl_get_current_value,
2312	.gpl_only = false,
2313	.ret_type = RET_INTEGER,
2314	.arg1_type = ARG_PTR_TO_CTX,
2315	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
2316	.arg3_type = ARG_CONST_SIZE,
2317	};
2318
2319	BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern , ctx, char* *, buf,
2320	size_t, buf_len)
2321	{
2322	if (!ctx->write) {
2323	if (buf && buf_len)
2324	memset(buf, `'\0'`, buf_len);
2325	return -EINVAL;
2326	}
2327	return copy_sysctl_value(dst: buf, dst_len: buf_len, src: ctx->new_val, src_len: ctx->new_len);
2328	}
2329
2330	static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
2331	.func = bpf_sysctl_get_new_value,
2332	.gpl_only = false,
2333	.ret_type = RET_INTEGER,
2334	.arg1_type = ARG_PTR_TO_CTX,
2335	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
2336	.arg3_type = ARG_CONST_SIZE,
2337	};
2338
2339	BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
2340	const char *, buf, size_t, buf_len)
2341	{
2342	if (!ctx->write \|\| !ctx->new_val \|\| !ctx->new_len \|\| !buf \|\| !buf_len)
2343	return -EINVAL;
2344
2345	if (buf_len > PAGE_SIZE - `1`)
2346	return -E2BIG;
2347
2348	memcpy(ctx->new_val, buf, buf_len);
2349	ctx->new_len = buf_len;
2350	ctx->new_updated = `1`;
2351
2352	return `0`;
2353	}
2354
2355	static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2356	.func = bpf_sysctl_set_new_value,
2357	.gpl_only = false,
2358	.ret_type = RET_INTEGER,
2359	.arg1_type = ARG_PTR_TO_CTX,
2360	.arg2_type = ARG_PTR_TO_MEM \| MEM_RDONLY,
2361	.arg3_type = ARG_CONST_SIZE,
2362	};
2363
2364	static const struct bpf_func_proto *
2365	sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2366	{
2367	const struct bpf_func_proto *func_proto;
2368
2369	func_proto = cgroup_common_func_proto(func_id, prog);
2370	if (func_proto)
2371	return func_proto;
2372
2373	switch (func_id) {
2374	case BPF_FUNC_sysctl_get_name:
2375	return &bpf_sysctl_get_name_proto;
2376	case BPF_FUNC_sysctl_get_current_value:
2377	return &bpf_sysctl_get_current_value_proto;
2378	case BPF_FUNC_sysctl_get_new_value:
2379	return &bpf_sysctl_get_new_value_proto;
2380	case BPF_FUNC_sysctl_set_new_value:
2381	return &bpf_sysctl_set_new_value_proto;
2382	case BPF_FUNC_ktime_get_coarse_ns:
2383	return &bpf_ktime_get_coarse_ns_proto;
2384	case BPF_FUNC_perf_event_output:
2385	return &bpf_event_output_data_proto;
2386	default:
2387	return bpf_base_func_proto(func_id, prog);
2388	}
2389	}
2390
2391	static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
2392	const struct bpf_prog *prog,
2393	struct bpf_insn_access_aux *info)
2394	{
2395	const int size_default = sizeof(__u32);
2396
2397	if (off < `0` \|\| off + size > sizeof(struct bpf_sysctl) \|\| off % size)
2398	return false;
2399
2400	switch (off) {
2401	case bpf_ctx_range(struct bpf_sysctl, write):
2402	if (type != BPF_READ)
2403	return false;
2404	bpf_ctx_record_field_size(aux: info, size: size_default);
2405	return bpf_ctx_narrow_access_ok(off, size, size_default);
2406	case bpf_ctx_range(struct bpf_sysctl, file_pos):
2407	if (type == BPF_READ) {
2408	bpf_ctx_record_field_size(aux: info, size: size_default);
2409	return bpf_ctx_narrow_access_ok(off, size, size_default);
2410	} else {
2411	return size == size_default;
2412	}
2413	default:
2414	return false;
2415	}
2416	}
2417
2418	static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
2419	const struct bpf_insn *si,
2420	struct bpf_insn *insn_buf,
2421	struct bpf_prog prog, u32 target_size)
2422	{
2423	struct bpf_insn *insn = insn_buf;
2424	u32 read_size;
2425
2426	switch (si->off) {
2427	case offsetof(struct bpf_sysctl, write):
2428	*insn++ = BPF_LDX_MEM(
2429	BPF_SIZE(si->code), si->dst_reg, si->src_reg,
2430	bpf_target_off(struct bpf_sysctl_kern, write,
2431	sizeof_field(struct bpf_sysctl_kern,
2432	write),
2433	target_size));
2434	break;
2435	case offsetof(struct bpf_sysctl, file_pos):
2436	/ ppos is a pointer so it should be accessed via indirect*
2437	* loads and stores. Also for stores additional temporary
2438	* register is used since neither src_reg nor dst_reg can be
2439	* overridden.
2440	*/
2441	if (type == BPF_WRITE) {
2442	int treg = BPF_REG_9;
2443
2444	if (si->src_reg == treg \|\| si->dst_reg == treg)
2445	--treg;
2446	if (si->src_reg == treg \|\| si->dst_reg == treg)
2447	--treg;
2448	*insn++ = BPF_STX_MEM(
2449	BPF_DW, si->dst_reg, treg,
2450	offsetof(struct bpf_sysctl_kern, tmp_reg));
2451	*insn++ = BPF_LDX_MEM(
2452	BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2453	treg, si->dst_reg,
2454	offsetof(struct bpf_sysctl_kern, ppos));
2455	*insn++ = BPF_RAW_INSN(
2456	BPF_CLASS(si->code) \| BPF_MEM \| BPF_SIZEOF(u32),
2457	treg, si->src_reg,
2458	bpf_ctx_narrow_access_offset(
2459	`0`, sizeof(u32), sizeof(loff_t)),
2460	si->imm);
2461	*insn++ = BPF_LDX_MEM(
2462	BPF_DW, treg, si->dst_reg,
2463	offsetof(struct bpf_sysctl_kern, tmp_reg));
2464	} else {
2465	*insn++ = BPF_LDX_MEM(
2466	BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2467	si->dst_reg, si->src_reg,
2468	offsetof(struct bpf_sysctl_kern, ppos));
2469	read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2470	*insn++ = BPF_LDX_MEM(
2471	BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2472	bpf_ctx_narrow_access_offset(
2473	`0`, read_size, sizeof(loff_t)));
2474	}
2475	target_size = sizeof*(u32);
2476	break;
2477	}
2478
2479	return insn - insn_buf;
2480	}
2481
2482	const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2483	.get_func_proto = sysctl_func_proto,
2484	.is_valid_access = sysctl_is_valid_access,
2485	.convert_ctx_access = sysctl_convert_ctx_access,
2486	};
2487
2488	const struct bpf_prog_ops cg_sysctl_prog_ops = {
2489	};
2490
2491	#ifdef CONFIG_NET
2492	BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2493	{
2494	const struct net *net = ctx ? sock_net(sk: ctx->sk) : &init_net;
2495
2496	return net->net_cookie;
2497	}
2498
2499	static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2500	.func = bpf_get_netns_cookie_sockopt,
2501	.gpl_only = false,
2502	.ret_type = RET_INTEGER,
2503	.arg1_type = ARG_PTR_TO_CTX_OR_NULL,
2504	};
2505	#endif
2506
2507	static const struct bpf_func_proto *
2508	cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2509	{
2510	const struct bpf_func_proto *func_proto;
2511
2512	func_proto = cgroup_common_func_proto(func_id, prog);
2513	if (func_proto)
2514	return func_proto;
2515
2516	switch (func_id) {
2517	#ifdef CONFIG_NET
2518	case BPF_FUNC_get_netns_cookie:
2519	return &bpf_get_netns_cookie_sockopt_proto;
2520	case BPF_FUNC_sk_storage_get:
2521	return &bpf_sk_storage_get_proto;
2522	case BPF_FUNC_sk_storage_delete:
2523	return &bpf_sk_storage_delete_proto;
2524	case BPF_FUNC_setsockopt:
2525	if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2526	return &bpf_sk_setsockopt_proto;
2527	return NULL;
2528	case BPF_FUNC_getsockopt:
2529	if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2530	return &bpf_sk_getsockopt_proto;
2531	return NULL;
2532	#endif
2533	#ifdef CONFIG_INET
2534	case BPF_FUNC_tcp_sock:
2535	return &bpf_tcp_sock_proto;
2536	#endif
2537	case BPF_FUNC_perf_event_output:
2538	return &bpf_event_output_data_proto;
2539	default:
2540	return bpf_base_func_proto(func_id, prog);
2541	}
2542	}
2543
2544	static bool cg_sockopt_is_valid_access(int off, int size,
2545	enum bpf_access_type type,
2546	const struct bpf_prog *prog,
2547	struct bpf_insn_access_aux *info)
2548	{
2549	const int size_default = sizeof(__u32);
2550
2551	if (off < `0` \|\| off >= sizeof(struct bpf_sockopt))
2552	return false;
2553
2554	if (off % size != `0`)
2555	return false;
2556
2557	if (type == BPF_WRITE) {
2558	switch (off) {
2559	case offsetof(struct bpf_sockopt, retval):
2560	if (size != size_default)
2561	return false;
2562	return prog->expected_attach_type ==
2563	BPF_CGROUP_GETSOCKOPT;
2564	case offsetof(struct bpf_sockopt, optname):
2565	fallthrough;
2566	case offsetof(struct bpf_sockopt, level):
2567	if (size != size_default)
2568	return false;
2569	return prog->expected_attach_type ==
2570	BPF_CGROUP_SETSOCKOPT;
2571	case offsetof(struct bpf_sockopt, optlen):
2572	return size == size_default;
2573	default:
2574	return false;
2575	}
2576	}
2577
2578	switch (off) {
2579	case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
2580	if (size != sizeof(__u64))
2581	return false;
2582	info->reg_type = PTR_TO_SOCKET;
2583	break;
2584	case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
2585	if (size != sizeof(__u64))
2586	return false;
2587	info->reg_type = PTR_TO_PACKET;
2588	break;
2589	case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
2590	if (size != sizeof(__u64))
2591	return false;
2592	info->reg_type = PTR_TO_PACKET_END;
2593	break;
2594	case bpf_ctx_range(struct bpf_sockopt, retval):
2595	if (size != size_default)
2596	return false;
2597	return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2598	default:
2599	if (size != size_default)
2600	return false;
2601	break;
2602	}
2603	return true;
2604	}
2605
/*
 * Emit a load of bpf_sockopt_kern field F from the context in si->src_reg
 * into si->dst_reg. Expects a local 'si' in the expanding scope.
 */
#define CG_SOCKOPT_READ_FIELD(F)					\
	BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),	\
		    si->dst_reg, si->src_reg,				\
		    offsetof(struct bpf_sockopt_kern, F))
2610
/*
 * Emit a store to bpf_sockopt_kern field F, keeping the instruction class
 * and immediate of the original instruction 'si'. Expects a local 'si' in
 * the expanding scope.
 */
#define CG_SOCKOPT_WRITE_FIELD(F)					\
	BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) |	\
		      BPF_MEM | BPF_CLASS(si->code)),			\
		     si->dst_reg, si->src_reg,				\
		     offsetof(struct bpf_sockopt_kern, F),		\
		     si->imm)
2617
2618	static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2619	const struct bpf_insn *si,
2620	struct bpf_insn *insn_buf,
2621	struct bpf_prog *prog,
2622	u32 *target_size)
2623	{
2624	struct bpf_insn *insn = insn_buf;
2625
2626	switch (si->off) {
2627	case offsetof(struct bpf_sockopt, sk):
2628	*insn++ = CG_SOCKOPT_READ_FIELD(sk);
2629	break;
2630	case offsetof(struct bpf_sockopt, level):
2631	if (type == BPF_WRITE)
2632	*insn++ = CG_SOCKOPT_WRITE_FIELD(level);
2633	else
2634	*insn++ = CG_SOCKOPT_READ_FIELD(level);
2635	break;
2636	case offsetof(struct bpf_sockopt, optname):
2637	if (type == BPF_WRITE)
2638	*insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
2639	else
2640	*insn++ = CG_SOCKOPT_READ_FIELD(optname);
2641	break;
2642	case offsetof(struct bpf_sockopt, optlen):
2643	if (type == BPF_WRITE)
2644	*insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
2645	else
2646	*insn++ = CG_SOCKOPT_READ_FIELD(optlen);
2647	break;
2648	case offsetof(struct bpf_sockopt, retval):
2649	BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != `0`);
2650
2651	if (type == BPF_WRITE) {
2652	int treg = BPF_REG_9;
2653
2654	if (si->src_reg == treg \|\| si->dst_reg == treg)
2655	--treg;
2656	if (si->src_reg == treg \|\| si->dst_reg == treg)
2657	--treg;
2658	*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2659	offsetof(struct bpf_sockopt_kern, tmp_reg));
2660	insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct* bpf_sockopt_kern, current_task),
2661	treg, si->dst_reg,
2662	offsetof(struct bpf_sockopt_kern, current_task));
2663	insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct* task_struct, bpf_ctx),
2664	treg, treg,
2665	offsetof(struct task_struct, bpf_ctx));
2666	*insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) \| BPF_MEM \|
2667	BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2668	treg, si->src_reg,
2669	offsetof(struct bpf_cg_run_ctx, retval),
2670	si->imm);
2671	*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2672	offsetof(struct bpf_sockopt_kern, tmp_reg));
2673	} else {
2674	insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct* bpf_sockopt_kern, current_task),
2675	si->dst_reg, si->src_reg,
2676	offsetof(struct bpf_sockopt_kern, current_task));
2677	insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct* task_struct, bpf_ctx),
2678	si->dst_reg, si->dst_reg,
2679	offsetof(struct task_struct, bpf_ctx));
2680	insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct* bpf_cg_run_ctx, retval),
2681	si->dst_reg, si->dst_reg,
2682	offsetof(struct bpf_cg_run_ctx, retval));
2683	}
2684	break;
2685	case offsetof(struct bpf_sockopt, optval):
2686	*insn++ = CG_SOCKOPT_READ_FIELD(optval);
2687	break;
2688	case offsetof(struct bpf_sockopt, optval_end):
2689	*insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
2690	break;
2691	}
2692
2693	return insn - insn_buf;
2694	}
2695
2696	static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2697	bool direct_write,
2698	const struct bpf_prog *prog)
2699	{
2700	/ Nothing to do for sockopt argument. The data is kzalloc'ated.*
2701	*/
2702	return `0`;
2703	}
2704
2705	const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2706	.get_func_proto = cg_sockopt_func_proto,
2707	.is_valid_access = cg_sockopt_is_valid_access,
2708	.convert_ctx_access = cg_sockopt_convert_ctx_access,
2709	.gen_prologue = cg_sockopt_get_prologue,
2710	};
2711
2712	const struct bpf_prog_ops cg_sockopt_prog_ops = {
2713	};
2714
2715	/ Common helpers for cgroup hooks. /
2716	const struct bpf_func_proto *
2717	cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2718	{
2719	switch (func_id) {
2720	case BPF_FUNC_get_local_storage:
2721	return &bpf_get_local_storage_proto;
2722	case BPF_FUNC_get_retval:
2723	switch (prog->expected_attach_type) {
2724	case BPF_CGROUP_INET_INGRESS:
2725	case BPF_CGROUP_INET_EGRESS:
2726	case BPF_CGROUP_SOCK_OPS:
2727	case BPF_CGROUP_UDP4_RECVMSG:
2728	case BPF_CGROUP_UDP6_RECVMSG:
2729	case BPF_CGROUP_UNIX_RECVMSG:
2730	case BPF_CGROUP_INET4_GETPEERNAME:
2731	case BPF_CGROUP_INET6_GETPEERNAME:
2732	case BPF_CGROUP_UNIX_GETPEERNAME:
2733	case BPF_CGROUP_INET4_GETSOCKNAME:
2734	case BPF_CGROUP_INET6_GETSOCKNAME:
2735	case BPF_CGROUP_UNIX_GETSOCKNAME:
2736	return NULL;
2737	default:
2738	return &bpf_get_retval_proto;
2739	}
2740	case BPF_FUNC_set_retval:
2741	switch (prog->expected_attach_type) {
2742	case BPF_CGROUP_INET_INGRESS:
2743	case BPF_CGROUP_INET_EGRESS:
2744	case BPF_CGROUP_SOCK_OPS:
2745	case BPF_CGROUP_UDP4_RECVMSG:
2746	case BPF_CGROUP_UDP6_RECVMSG:
2747	case BPF_CGROUP_UNIX_RECVMSG:
2748	case BPF_CGROUP_INET4_GETPEERNAME:
2749	case BPF_CGROUP_INET6_GETPEERNAME:
2750	case BPF_CGROUP_UNIX_GETPEERNAME:
2751	case BPF_CGROUP_INET4_GETSOCKNAME:
2752	case BPF_CGROUP_INET6_GETSOCKNAME:
2753	case BPF_CGROUP_UNIX_GETSOCKNAME:
2754	return NULL;
2755	default:
2756	return &bpf_set_retval_proto;
2757	}
2758	default:
2759	return NULL;
2760	}
2761	}
2762

source code of linux/kernel/bpf/cgroup.c