uprobes.c source code [linux/kernel/events/uprobes.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* User-space Probes (UProbes)
4	*
5	* Copyright (C) IBM Corporation, 2008-2012
6	* Authors:
7	* Srikar Dronamraju
8	* Jim Keniston
9	* Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
10	*/
11
12	#include <linux/kernel.h>
13	#include <linux/highmem.h>
14	#include <linux/pagemap.h> /* read_mapping_page */
15	#include <linux/slab.h>
16	#include <linux/sched.h>
17	#include <linux/sched/mm.h>
18	#include <linux/export.h>
19	#include <linux/rmap.h> /* anon_vma_prepare */
20	#include <linux/mmu_notifier.h>
21	#include <linux/swap.h> /* folio_free_swap */
22	#include <linux/ptrace.h> /* user_enable_single_step */
23	#include <linux/kdebug.h> /* notifier mechanism */
24	#include <linux/percpu-rwsem.h>
25	#include <linux/task_work.h>
26	#include <linux/shmem_fs.h>
27	#include <linux/khugepaged.h>
28	#include <linux/rcupdate_trace.h>
29	#include <linux/workqueue.h>
30	#include <linux/srcu.h>
31	#include <linux/oom.h> /* check_stable_address_space */
32	#include <linux/pagewalk.h>
33
34	#include <linux/uprobes.h>
35
36	#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
37	#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
38
39	static struct rb_root uprobes_tree = RB_ROOT;
40	/*
41	* allows us to skip the uprobe_mmap if there are no uprobe events active
42	* at this time. Probably a fine grained per inode count is better?
43	*/
44	#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
45
46	static DEFINE_RWLOCK(uprobes_treelock); / serialize rbtree access /
47	static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
48
49	#define UPROBES_HASH_SZ 13
50	/ serialize uprobe->pending_list /
51	static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
52	#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
53
54	DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
55
56	/ Covers return_instance's uprobe lifetime. /
57	DEFINE_STATIC_SRCU(uretprobes_srcu);
58
59	/ Have a copy of original instruction /
60	#define UPROBE_COPY_INSN 0
61
62	struct uprobe {
63	struct rb_node rb_node; / node in the rb tree /
64	refcount_t ref;
65	struct rw_semaphore register_rwsem;
66	struct rw_semaphore consumer_rwsem;
67	struct list_head pending_list;
68	struct list_head consumers;
69	struct inode inode; /* Also hold a ref to inode /
70	union {
71	struct rcu_head rcu;
72	struct work_struct work;
73	};
74	loff_t offset;
75	loff_t ref_ctr_offset;
76	unsigned long flags; / "unsigned long" so bitops work /
77
78	/*
79	* The generic code assumes that it has two members of unknown type
80	* owned by the arch-specific code:
81	*
82	* insn - copy_insn() saves the original instruction here for
83	* arch_uprobe_analyze_insn().
84	*
85	* ixol - potentially modified instruction to execute out of
86	* line, copied to xol_area by xol_get_insn_slot().
87	*/
88	struct arch_uprobe arch;
89	};
90
91	struct delayed_uprobe {
92	struct list_head list;
93	struct uprobe *uprobe;
94	struct mm_struct *mm;
95	};
96
97	static DEFINE_MUTEX(delayed_uprobe_lock);
98	static LIST_HEAD(delayed_uprobe_list);
99
100	/*
101	* Execute out of line area: anonymous executable mapping installed
102	* by the probed task to execute the copy of the original instruction
103	* mangled by set_swbp().
104	*
105	* On a breakpoint hit, thread contests for a slot. It frees the
106	* slot after singlestep. Currently a fixed number of slots are
107	* allocated.
108	*/
109	struct xol_area {
110	wait_queue_head_t wq; / if all slots are busy /
111	unsigned long bitmap; /* 0 = free slot /
112
113	struct page *page;
114	/*
115	* We keep the vma's vm_start rather than a pointer to the vma
116	* itself. The probed process or a naughty kernel module could make
117	* the vma go away, and we must handle that reasonably gracefully.
118	*/
119	unsigned long vaddr; / Page(s) of instruction slots /
120	};
121
122	static void uprobe_warn(struct task_struct t, const* char *msg)
123	{
124	pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
125	}
126
127	/*
128	* valid_vma: Verify if the specified vma is an executable vma
129	* Relax restrictions while unregistering: vm_flags might have
130	* changed after breakpoint was inserted.
131	* - is_register: indicates if we are in register context.
132	* - Return 1 if the specified virtual address is in an
133	* executable vma.
134	*/
135	static bool valid_vma(struct vm_area_struct *vma, bool is_register)
136	{
137	vm_flags_t flags = VM_HUGETLB \| VM_MAYEXEC \| VM_MAYSHARE;
138
139	if (is_register)
140	flags \|= VM_WRITE;
141
142	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
143	}
144
145	static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
146	{
147	return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
148	}
149
150	static loff_t vaddr_to_offset(struct vm_area_struct vma, unsigned* long vaddr)
151	{
152	return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
153	}
154
155	/**
156	* is_swbp_insn - check if instruction is breakpoint instruction.
157	* @insn: instruction to be checked.
158	* Default implementation of is_swbp_insn
159	* Returns true if @insn is a breakpoint instruction.
160	*/
161	bool __weak is_swbp_insn(uprobe_opcode_t *insn)
162	{
163	return *insn == UPROBE_SWBP_INSN;
164	}
165
166	/**
167	* is_trap_insn - check if instruction is breakpoint instruction.
168	* @insn: instruction to be checked.
169	* Default implementation of is_trap_insn
170	* Returns true if @insn is a breakpoint instruction.
171	*
172	* This function is needed for the case where an architecture has multiple
173	* trap instructions (like powerpc).
174	*/
175	bool __weak is_trap_insn(uprobe_opcode_t *insn)
176	{
177	return is_swbp_insn(insn);
178	}
179
180	void uprobe_copy_from_page(struct page page, unsigned* long vaddr, void dst, int* len)
181	{
182	void *kaddr = kmap_atomic(page);
183	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
184	kunmap_atomic(kaddr);
185	}
186
187	static void copy_to_page(struct page page, unsigned* long vaddr, const void src, int* len)
188	{
189	void *kaddr = kmap_atomic(page);
190	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
191	kunmap_atomic(kaddr);
192	}
193
194	static int verify_opcode(struct page page, unsigned* long vaddr, uprobe_opcode_t *insn,
195	int nbytes, void *data)
196	{
197	uprobe_opcode_t old_opcode;
198	bool is_swbp;
199
200	/*
201	* Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
202	* We do not check if it is any other 'trap variant' which could
203	* be conditional trap instruction such as the one powerpc supports.
204	*
205	* The logic is that we do not care if the underlying instruction
206	* is a trap variant; uprobes always wins over any other (gdb)
207	* breakpoint.
208	*/
209	uprobe_copy_from_page(page, vaddr, dst: &old_opcode, UPROBE_SWBP_INSN_SIZE);
210	is_swbp = is_swbp_insn(insn: &old_opcode);
211
212	if (is_swbp_insn(insn)) {
213	if (is_swbp) / register: already installed? /
214	return `0`;
215	} else {
216	if (!is_swbp) / unregister: was it changed by us? /
217	return `0`;
218	}
219
220	return `1`;
221	}
222
223	static struct delayed_uprobe *
224	delayed_uprobe_check(struct uprobe uprobe, struct* mm_struct *mm)
225	{
226	struct delayed_uprobe *du;
227
228	list_for_each_entry(du, &delayed_uprobe_list, list)
229	if (du->uprobe == uprobe && du->mm == mm)
230	return du;
231	return NULL;
232	}
233
234	static int delayed_uprobe_add(struct uprobe uprobe, struct* mm_struct *mm)
235	{
236	struct delayed_uprobe *du;
237
238	if (delayed_uprobe_check(uprobe, mm))
239	return `0`;
240
241	du = kzalloc(sizeof(*du), GFP_KERNEL);
242	if (!du)
243	return -ENOMEM;
244
245	du->uprobe = uprobe;
246	du->mm = mm;
247	list_add(new: &du->list, head: &delayed_uprobe_list);
248	return `0`;
249	}
250
251	static void delayed_uprobe_delete(struct delayed_uprobe *du)
252	{
253	if (WARN_ON(!du))
254	return;
255	list_del(entry: &du->list);
256	kfree(objp: du);
257	}
258
259	static void delayed_uprobe_remove(struct uprobe uprobe, struct* mm_struct *mm)
260	{
261	struct list_head pos, q;
262	struct delayed_uprobe *du;
263
264	if (!uprobe && !mm)
265	return;
266
267	list_for_each_safe(pos, q, &delayed_uprobe_list) {
268	du = list_entry(pos, struct delayed_uprobe, list);
269
270	if (uprobe && du->uprobe != uprobe)
271	continue;
272	if (mm && du->mm != mm)
273	continue;
274
275	delayed_uprobe_delete(du);
276	}
277	}
278
279	static bool valid_ref_ctr_vma(struct uprobe *uprobe,
280	struct vm_area_struct *vma)
281	{
282	unsigned long vaddr = offset_to_vaddr(vma, offset: uprobe->ref_ctr_offset);
283
284	return uprobe->ref_ctr_offset &&
285	vma->vm_file &&
286	file_inode(f: vma->vm_file) == uprobe->inode &&
287	(vma->vm_flags & (VM_WRITE\|VM_SHARED)) == VM_WRITE &&
288	vma->vm_start <= vaddr &&
289	vma->vm_end > vaddr;
290	}
291
292	static struct vm_area_struct *
293	find_ref_ctr_vma(struct uprobe uprobe, struct* mm_struct *mm)
294	{
295	VMA_ITERATOR(vmi, mm, `0`);
296	struct vm_area_struct *tmp;
297
298	for_each_vma(vmi, tmp)
299	if (valid_ref_ctr_vma(uprobe, vma: tmp))
300	return tmp;
301
302	return NULL;
303	}
304
305	static int
306	__update_ref_ctr(struct mm_struct mm, unsigned* long vaddr, short d)
307	{
308	void *kaddr;
309	struct page *page;
310	int ret;
311	short *ptr;
312
313	if (!vaddr \|\| !d)
314	return -EINVAL;
315
316	ret = get_user_pages_remote(mm, start: vaddr, nr_pages: `1`,
317	gup_flags: FOLL_WRITE, pages: &page, NULL);
318	if (unlikely(ret <= `0`)) {
319	/*
320	* We are asking for 1 page. If get_user_pages_remote() fails,
321	* it may return 0, in that case we have to return error.
322	*/
323	return ret == `0` ? -EBUSY : ret;
324	}
325
326	kaddr = kmap_atomic(page);
327	ptr = kaddr + (vaddr & ~PAGE_MASK);
328
329	if (unlikely(*ptr + d < `0`)) {
330	pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
331	"curr val: %d, delta: %d\n", vaddr, *ptr, d);
332	ret = -EINVAL;
333	goto out;
334	}
335
336	*ptr += d;
337	ret = `0`;
338	out:
339	kunmap_atomic(kaddr);
340	put_page(page);
341	return ret;
342	}
343
344	static void update_ref_ctr_warn(struct uprobe *uprobe,
345	struct mm_struct mm, short* d)
346	{
347	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
348	"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
349	d > `0` ? "increment" : "decrement", uprobe->inode->i_ino,
350	(unsigned long long) uprobe->offset,
351	(unsigned long long) uprobe->ref_ctr_offset, mm);
352	}
353
354	static int update_ref_ctr(struct uprobe uprobe, struct* mm_struct *mm,
355	short d)
356	{
357	struct vm_area_struct *rc_vma;
358	unsigned long rc_vaddr;
359	int ret = `0`;
360
361	rc_vma = find_ref_ctr_vma(uprobe, mm);
362
363	if (rc_vma) {
364	rc_vaddr = offset_to_vaddr(vma: rc_vma, offset: uprobe->ref_ctr_offset);
365	ret = __update_ref_ctr(mm, vaddr: rc_vaddr, d);
366	if (ret)
367	update_ref_ctr_warn(uprobe, mm, d);
368
369	if (d > `0`)
370	return ret;
371	}
372
373	mutex_lock(&delayed_uprobe_lock);
374	if (d > `0`)
375	ret = delayed_uprobe_add(uprobe, mm);
376	else
377	delayed_uprobe_remove(uprobe, mm);
378	mutex_unlock(lock: &delayed_uprobe_lock);
379
380	return ret;
381	}
382
383	static bool orig_page_is_identical(struct vm_area_struct *vma,
384	unsigned long vaddr, struct page page, bool pmd_mappable)
385	{
386	const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
387	struct folio *orig_folio = filemap_get_folio(mapping: vma->vm_file->f_mapping,
388	index);
389	struct page *orig_page;
390	bool identical;
391
392	if (IS_ERR(ptr: orig_folio))
393	return false;
394	orig_page = folio_file_page(folio: orig_folio, index);
395
396	*pmd_mappable = folio_test_pmd_mappable(folio: orig_folio);
397	identical = folio_test_uptodate(folio: orig_folio) &&
398	pages_identical(page1: page, page2: orig_page);
399	folio_put(folio: orig_folio);
400	return identical;
401	}
402
403	static int __uprobe_write(struct vm_area_struct *vma,
404	struct folio_walk fw, struct* folio *folio,
405	unsigned long insn_vaddr, uprobe_opcode_t insn, int* nbytes,
406	bool is_register)
407	{
408	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
409	bool pmd_mappable;
410
411	/ For now, we'll only handle PTE-mapped folios. /
412	if (fw->level != FW_LEVEL_PTE)
413	return -EFAULT;
414
415	/*
416	* See can_follow_write_pte(): we'd actually prefer a writable PTE here,
417	* but the VMA might not be writable.
418	*/
419	if (!pte_write(pte: fw->pte)) {
420	if (!PageAnonExclusive(page: fw->page))
421	return -EFAULT;
422	if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
423	return -EFAULT;
424	/ SOFTDIRTY is handled via pte_mkdirty() below. /
425	}
426
427	/*
428	* We'll temporarily unmap the page and flush the TLB, such that we can
429	* modify the page atomically.
430	*/
431	flush_cache_page(vma, vmaddr: vaddr, pfn: pte_pfn(pte: fw->pte));
432	fw->pte = ptep_clear_flush(vma, address: vaddr, ptep: fw->ptep);
433	copy_to_page(page: fw->page, vaddr: insn_vaddr, src: insn, len: nbytes);
434
435	/*
436	* When unregistering, we may only zap a PTE if uffd is disabled and
437	* there are no unexpected folio references ...
438	*/
439	if (is_register \|\| userfaultfd_missing(vma) \|\|
440	(folio_ref_count(folio) != folio_expected_ref_count(folio) + `1`))
441	goto remap;
442
443	/*
444	* ... and the mapped page is identical to the original page that
445	* would get faulted in on next access.
446	*/
447	if (!orig_page_is_identical(vma, vaddr, page: fw->page, pmd_mappable: &pmd_mappable))
448	goto remap;
449
450	dec_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES);
451	folio_remove_rmap_pte(folio, fw->page, vma);
452	if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
453	folio_trylock(folio)) {
454	folio_free_swap(folio);
455	folio_unlock(folio);
456	}
457	folio_put(folio);
458
459	return pmd_mappable;
460	remap:
461	/*
462	* Make sure that our copy_to_page() changes become visible before the
463	* set_pte_at() write.
464	*/
465	smp_wmb();
466	/ We modified the page. Make sure to mark the PTE dirty. /
467	set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
468	return `0`;
469	}
470
471	/*
472	* NOTE:
473	* Expect the breakpoint instruction to be the smallest size instruction for
474	* the architecture. If an arch has variable length instruction and the
475	* breakpoint instruction is not of the smallest length instruction
476	* supported by that architecture then we need to modify is_trap_at_addr and
477	* uprobe_write_opcode accordingly. This would never be a problem for archs
478	* that have fixed length instructions.
479	*
480	* uprobe_write_opcode - write the opcode at a given virtual address.
481	* @auprobe: arch specific probepoint information.
482	* @vma: the probed virtual memory area.
483	* @opcode_vaddr: the virtual address to store the opcode.
484	* @opcode: opcode to be written at @opcode_vaddr.
485	*
486	* Called with mm->mmap_lock held for write.
487	* Return 0 (success) or a negative errno.
488	*/
489	int uprobe_write_opcode(struct arch_uprobe auprobe, struct* vm_area_struct *vma,
490	const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
491	bool is_register)
492	{
493	return uprobe_write(auprobe, vma, opcode_vaddr, insn: &opcode, UPROBE_SWBP_INSN_SIZE,
494	verify: verify_opcode, is_register, do_update_ref_ctr: true / do_update_ref_ctr /, NULL);
495	}
496
497	int uprobe_write(struct arch_uprobe auprobe, struct* vm_area_struct *vma,
498	const unsigned long insn_vaddr, uprobe_opcode_t insn, int* nbytes,
499	uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
500	void *data)
501	{
502	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
503	struct mm_struct *mm = vma->vm_mm;
504	struct uprobe *uprobe;
505	int ret, ref_ctr_updated = `0`;
506	unsigned int gup_flags = FOLL_FORCE;
507	struct mmu_notifier_range range;
508	struct folio_walk fw;
509	struct folio *folio;
510	struct page *page;
511
512	uprobe = container_of(auprobe, struct uprobe, arch);
513
514	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
515	return -EINVAL;
516
517	/*
518	* When registering, we have to break COW to get an exclusive anonymous
519	* page that we can safely modify. Use FOLL_WRITE to trigger a write
520	* fault if required. When unregistering, we might be lucky and the
521	* anon page is already gone. So defer write faults until really
522	* required. Use FOLL_SPLIT_PMD, because __uprobe_write()
523	* cannot deal with PMDs yet.
524	*/
525	if (is_register)
526	gup_flags \|= FOLL_WRITE \| FOLL_SPLIT_PMD;
527
528	retry:
529	ret = get_user_pages_remote(mm, start: vaddr, nr_pages: `1`, gup_flags, pages: &page, NULL);
530	if (ret <= `0`)
531	goto out;
532	folio = page_folio(page);
533
534	ret = verify(page, insn_vaddr, insn, nbytes, data);
535	if (ret <= `0`) {
536	folio_put(folio);
537	goto out;
538	}
539
540	/ We are going to replace instruction, update ref_ctr. /
541	if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
542	ret = update_ref_ctr(uprobe, mm, d: is_register ? `1` : -`1`);
543	if (ret) {
544	folio_put(folio);
545	goto out;
546	}
547
548	ref_ctr_updated = `1`;
549	}
550
551	ret = `0`;
552	if (unlikely(!folio_test_anon(folio) \|\| folio_is_zone_device(folio))) {
553	VM_WARN_ON_ONCE(is_register);
554	folio_put(folio);
555	goto out;
556	}
557
558	if (!is_register) {
559	/*
560	* In the common case, we'll be able to zap the page when
561	* unregistering. So trigger MMU notifiers now, as we won't
562	* be able to do it under PTL.
563	*/
564	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
565	start: vaddr, end: vaddr + PAGE_SIZE);
566	mmu_notifier_invalidate_range_start(range: &range);
567	}
568
569	ret = -EAGAIN;
570	/ Walk the page tables again, to perform the actual update. /
571	if (folio_walk_start(fw: &fw, vma, addr: vaddr, flags: `0`)) {
572	if (fw.page == page)
573	ret = __uprobe_write(vma, fw: &fw, folio, insn_vaddr, insn, nbytes, is_register);
574	folio_walk_end(&fw, vma);
575	}
576
577	if (!is_register)
578	mmu_notifier_invalidate_range_end(range: &range);
579
580	folio_put(folio);
581	switch (ret) {
582	case -EFAULT:
583	gup_flags \|= FOLL_WRITE \| FOLL_SPLIT_PMD;
584	fallthrough;
585	case -EAGAIN:
586	goto retry;
587	default:
588	break;
589	}
590
591	out:
592	/ Revert back reference counter if instruction update failed. /
593	if (do_update_ref_ctr && ret < `0` && ref_ctr_updated)
594	update_ref_ctr(uprobe, mm, d: is_register ? -`1` : `1`);
595
596	/ try collapse pmd for compound page /
597	if (ret > `0`)
598	collapse_pte_mapped_thp(mm, addr: vaddr, install_pmd: false);
599
600	return ret < `0` ? ret : `0`;
601	}
602
603	/**
604	* set_swbp - store breakpoint at a given address.
605	* @auprobe: arch specific probepoint information.
606	* @vma: the probed virtual memory area.
607	* @vaddr: the virtual address to insert the opcode.
608	*
609	* For mm @mm, store the breakpoint instruction at @vaddr.
610	* Return 0 (success) or a negative errno.
611	*/
612	int __weak set_swbp(struct arch_uprobe auprobe, struct* vm_area_struct *vma,
613	unsigned long vaddr)
614	{
615	return uprobe_write_opcode(auprobe, vma, opcode_vaddr: vaddr, UPROBE_SWBP_INSN, is_register: true);
616	}
617
618	/**
619	* set_orig_insn - Restore the original instruction.
620	* @vma: the probed virtual memory area.
621	* @auprobe: arch specific probepoint information.
622	* @vaddr: the virtual address to insert the opcode.
623	*
624	* For mm @mm, restore the original opcode (opcode) at @vaddr.
625	* Return 0 (success) or a negative errno.
626	*/
627	int __weak set_orig_insn(struct arch_uprobe *auprobe,
628	struct vm_area_struct vma, unsigned* long vaddr)
629	{
630	return uprobe_write_opcode(auprobe, vma, opcode_vaddr: vaddr,
631	opcode: (uprobe_opcode_t )&auprobe->insn, is_register: false);
632	}
633
634	/ uprobe should have guaranteed positive refcount /
635	static struct uprobe get_uprobe(struct* uprobe *uprobe)
636	{
637	refcount_inc(r: &uprobe->ref);
638	return uprobe;
639	}
640
641	/*
642	* uprobe should have guaranteed lifetime, which can be either of:
643	* - caller already has refcount taken (and wants an extra one);
644	* - uprobe is RCU protected and won't be freed until after grace period;
645	* - we are holding uprobes_treelock (for read or write, doesn't matter).
646	*/
647	static struct uprobe try_get_uprobe(struct* uprobe *uprobe)
648	{
649	if (refcount_inc_not_zero(r: &uprobe->ref))
650	return uprobe;
651	return NULL;
652	}
653
654	static inline bool uprobe_is_active(struct uprobe *uprobe)
655	{
656	return !RB_EMPTY_NODE(&uprobe->rb_node);
657	}
658
659	static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
660	{
661	struct uprobe uprobe = container_of(rcu, struct* uprobe, rcu);
662
663	kfree(objp: uprobe);
664	}
665
666	static void uprobe_free_srcu(struct rcu_head *rcu)
667	{
668	struct uprobe uprobe = container_of(rcu, struct* uprobe, rcu);
669
670	call_rcu_tasks_trace(rhp: &uprobe->rcu, func: uprobe_free_rcu_tasks_trace);
671	}
672
673	static void uprobe_free_deferred(struct work_struct *work)
674	{
675	struct uprobe uprobe = container_of(work, struct* uprobe, work);
676
677	write_lock(&uprobes_treelock);
678
679	if (uprobe_is_active(uprobe)) {
680	write_seqcount_begin(&uprobes_seqcount);
681	rb_erase(&uprobe->rb_node, &uprobes_tree);
682	write_seqcount_end(&uprobes_seqcount);
683	}
684
685	write_unlock(&uprobes_treelock);
686
687	/*
688	* If application munmap(exec_vma) before uprobe_unregister()
689	* gets called, we don't get a chance to remove uprobe from
690	* delayed_uprobe_list from remove_breakpoint(). Do it here.
691	*/
692	mutex_lock(&delayed_uprobe_lock);
693	delayed_uprobe_remove(uprobe, NULL);
694	mutex_unlock(lock: &delayed_uprobe_lock);
695
696	/ start srcu -> rcu_tasks_trace -> kfree chain /
697	call_srcu(ssp: &uretprobes_srcu, head: &uprobe->rcu, func: uprobe_free_srcu);
698	}
699
700	static void put_uprobe(struct uprobe *uprobe)
701	{
702	if (!refcount_dec_and_test(r: &uprobe->ref))
703	return;
704
705	INIT_WORK(&uprobe->work, uprobe_free_deferred);
706	schedule_work(work: &uprobe->work);
707	}
708
709	/ Initialize hprobe as SRCU-protected "leased" uprobe /
710	static void hprobe_init_leased(struct hprobe hprobe, struct* uprobe uprobe, int* srcu_idx)
711	{
712	WARN_ON(!uprobe);
713	hprobe->state = HPROBE_LEASED;
714	hprobe->uprobe = uprobe;
715	hprobe->srcu_idx = srcu_idx;
716	}
717
718	/ Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). /
719	static void hprobe_init_stable(struct hprobe hprobe, struct* uprobe *uprobe)
720	{
721	hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
722	hprobe->uprobe = uprobe;
723	hprobe->srcu_idx = -`1`;
724	}
725
726	/*
727	* hprobe_consume() fetches hprobe's underlying uprobe and detects whether
728	* uprobe is SRCU protected or is refcounted. hprobe_consume() can be
729	* used only once for a given hprobe.
730	*
731	* Caller has to call hprobe_finalize() and pass previous hprobe_state, so
732	* that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
733	* is appropriate.
734	*/
735	static inline struct uprobe hprobe_consume(struct* hprobe hprobe, enum* hprobe_state *hstate)
736	{
737	*hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
738	switch (*hstate) {
739	case HPROBE_LEASED:
740	case HPROBE_STABLE:
741	return hprobe->uprobe;
742	case HPROBE_GONE: / uprobe is NULL, no SRCU /
743	case HPROBE_CONSUMED: / uprobe was finalized already, do nothing /
744	return NULL;
745	default:
746	WARN(`1`, "hprobe invalid state %d", *hstate);
747	return NULL;
748	}
749	}
750
751	/*
752	* Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
753	* hprobe_finalize() can only be used from current context after
754	* hprobe_consume() call (which determines uprobe and hstate value).
755	*/
756	static void hprobe_finalize(struct hprobe hprobe, enum* hprobe_state hstate)
757	{
758	switch (hstate) {
759	case HPROBE_LEASED:
760	__srcu_read_unlock(ssp: &uretprobes_srcu, idx: hprobe->srcu_idx);
761	break;
762	case HPROBE_STABLE:
763	put_uprobe(uprobe: hprobe->uprobe);
764	break;
765	case HPROBE_GONE:
766	case HPROBE_CONSUMED:
767	break;
768	default:
769	WARN(`1`, "hprobe invalid state %d", hstate);
770	break;
771	}
772	}
773
774	/*
775	* Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
776	* to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
777	* them can win the race to perform SRCU unlocking. Whoever wins must perform
778	* SRCU unlock.
779	*
780	* Returns underlying valid uprobe or NULL, if there was no underlying uprobe
781	* to begin with or we failed to bump its refcount and it's going away.
782	*
783	* Returned non-NULL uprobe can be still safely used within an ongoing SRCU
784	* locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
785	* an extra refcount for caller to assume and use. Otherwise, it's not
786	* guaranteed that returned uprobe has a positive refcount, so caller has to
787	* attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
788	* SRCU lock region. See dup_utask().
789	*/
790	static struct uprobe hprobe_expire(struct* hprobe *hprobe, bool get)
791	{
792	enum hprobe_state hstate;
793
794	/*
795	* Caller should guarantee that return_instance is not going to be
796	* freed from under us. This can be achieved either through holding
797	* rcu_read_lock() or by owning return_instance in the first place.
798	*
799	* Underlying uprobe is itself protected from reuse by SRCU, so ensure
800	* SRCU lock is held properly.
801	*/
802	lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
803
804	hstate = READ_ONCE(hprobe->state);
805	switch (hstate) {
806	case HPROBE_STABLE:
807	/ uprobe has positive refcount, bump refcount, if necessary /
808	return get ? get_uprobe(uprobe: hprobe->uprobe) : hprobe->uprobe;
809	case HPROBE_GONE:
810	/*
811	* SRCU was unlocked earlier and we didn't manage to take
812	* uprobe refcnt, so it's effectively NULL
813	*/
814	return NULL;
815	case HPROBE_CONSUMED:
816	/*
817	* uprobe was consumed, so it's effectively NULL as far as
818	* uretprobe processing logic is concerned
819	*/
820	return NULL;
821	case HPROBE_LEASED: {
822	struct uprobe *uprobe = try_get_uprobe(uprobe: hprobe->uprobe);
823	/*
824	* Try to switch hprobe state, guarding against
825	* hprobe_consume() or another hprobe_expire() racing with us.
826	* Note, if we failed to get uprobe refcount, we use special
827	* HPROBE_GONE state to signal that hprobe->uprobe shouldn't
828	* be used as it will be freed after SRCU is unlocked.
829	*/
830	if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
831	/ We won the race, we are the ones to unlock SRCU /
832	__srcu_read_unlock(ssp: &uretprobes_srcu, idx: hprobe->srcu_idx);
833	return get ? get_uprobe(uprobe) : uprobe;
834	}
835
836	/*
837	* We lost the race, undo refcount bump (if it ever happened),
838	* unless caller would like an extra refcount anyways.
839	*/
840	if (uprobe && !get)
841	put_uprobe(uprobe);
842	/*
843	* Even if hprobe_consume() or another hprobe_expire() wins
844	* the state update race and unlocks SRCU from under us, we
845	* still have a guarantee that underyling uprobe won't be
846	* freed due to ongoing caller's SRCU lock region, so we can
847	* return it regardless. Also, if `get` was true, we also have
848	* an extra ref for the caller to own. This is used in dup_utask().
849	*/
850	return uprobe;
851	}
852	default:
853	WARN(`1`, "unknown hprobe state %d", hstate);
854	return NULL;
855	}
856	}
857
858	static __always_inline
859	int uprobe_cmp(const struct inode l_inode, const* loff_t l_offset,
860	const struct uprobe *r)
861	{
862	if (l_inode < r->inode)
863	return -`1`;
864
865	if (l_inode > r->inode)
866	return `1`;
867
868	if (l_offset < r->offset)
869	return -`1`;
870
871	if (l_offset > r->offset)
872	return `1`;
873
874	return `0`;
875	}
876
877	#define __node_2_uprobe(node) \
878	rb_entry((node), struct uprobe, rb_node)
879
880	struct __uprobe_key {
881	struct inode *inode;
882	loff_t offset;
883	};
884
885	static inline int __uprobe_cmp_key(const void key, const* struct rb_node *b)
886	{
887	const struct __uprobe_key *a = key;
888	return uprobe_cmp(l_inode: a->inode, l_offset: a->offset, __node_2_uprobe(b));
889	}
890
891	static inline int __uprobe_cmp(struct rb_node a, const* struct rb_node *b)
892	{
893	struct uprobe *u = __node_2_uprobe(a);
894	return uprobe_cmp(l_inode: u->inode, l_offset: u->offset, __node_2_uprobe(b));
895	}
896
897	/*
898	* Assumes being inside RCU protected region.
899	* No refcount is taken on returned uprobe.
900	*/
901	static struct uprobe find_uprobe_rcu(struct* inode *inode, loff_t offset)
902	{
903	struct __uprobe_key key = {
904	.inode = inode,
905	.offset = offset,
906	};
907	struct rb_node *node;
908	unsigned int seq;
909
910	lockdep_assert(rcu_read_lock_trace_held());
911
912	do {
913	seq = read_seqcount_begin(&uprobes_seqcount);
914	node = rb_find_rcu(key: &key, tree: &uprobes_tree, cmp: __uprobe_cmp_key);
915	/*
916	* Lockless RB-tree lookups can result only in false negatives.
917	* If the element is found, it is correct and can be returned
918	* under RCU protection. If we find nothing, we need to
919	* validate that seqcount didn't change. If it did, we have to
920	* try again as we might have missed the element (false
921	* negative). If seqcount is unchanged, search truly failed.
922	*/
923	if (node)
924	return __node_2_uprobe(node);
925	} while (read_seqcount_retry(&uprobes_seqcount, seq));
926
927	return NULL;
928	}
929
930	/*
931	* Attempt to insert a new uprobe into uprobes_tree.
932	*
933	* If uprobe already exists (for given inode+offset), we just increment
934	* refcount of previously existing uprobe.
935	*
936	* If not, a provided new instance of uprobe is inserted into the tree (with
937	* assumed initial refcount == 1).
938	*
939	* In any case, we return a uprobe instance that ends up being in uprobes_tree.
940	* Caller has to clean up new uprobe instance, if it ended up not being
941	* inserted into the tree.
942	*
943	* We assume that uprobes_treelock is held for writing.
944	*/
945	static struct uprobe __insert_uprobe(struct* uprobe *uprobe)
946	{
947	struct rb_node *node;
948	again:
949	node = rb_find_add_rcu(node: &uprobe->rb_node, tree: &uprobes_tree, cmp: __uprobe_cmp);
950	if (node) {
951	struct uprobe *u = __node_2_uprobe(node);
952
953	if (!try_get_uprobe(uprobe: u)) {
954	rb_erase(node, &uprobes_tree);
955	RB_CLEAR_NODE(&u->rb_node);
956	goto again;
957	}
958
959	return u;
960	}
961
962	return uprobe;
963	}
964
965	/*
966	* Acquire uprobes_treelock and insert uprobe into uprobes_tree
967	* (or reuse existing one, see __insert_uprobe() comments above).
968	*/
969	static struct uprobe insert_uprobe(struct* uprobe *uprobe)
970	{
971	struct uprobe *u;
972
973	write_lock(&uprobes_treelock);
974	write_seqcount_begin(&uprobes_seqcount);
975	u = __insert_uprobe(uprobe);
976	write_seqcount_end(&uprobes_seqcount);
977	write_unlock(&uprobes_treelock);
978
979	return u;
980	}
981
982	static void
983	ref_ctr_mismatch_warn(struct uprobe cur_uprobe, struct* uprobe *uprobe)
984	{
985	pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
986	"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
987	uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
988	(unsigned long long) cur_uprobe->ref_ctr_offset,
989	(unsigned long long) uprobe->ref_ctr_offset);
990	}
991
992	static struct uprobe alloc_uprobe(struct* inode *inode, loff_t offset,
993	loff_t ref_ctr_offset)
994	{
995	struct uprobe uprobe, cur_uprobe;
996
997	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
998	if (!uprobe)
999	return ERR_PTR(error: -ENOMEM);
1000
1001	uprobe->inode = inode;
1002	uprobe->offset = offset;
1003	uprobe->ref_ctr_offset = ref_ctr_offset;
1004	INIT_LIST_HEAD(list: &uprobe->consumers);
1005	init_rwsem(&uprobe->register_rwsem);
1006	init_rwsem(&uprobe->consumer_rwsem);
1007	RB_CLEAR_NODE(&uprobe->rb_node);
1008	refcount_set(r: &uprobe->ref, n: `1`);
1009
1010	/ add to uprobes_tree, sorted on inode:offset /
1011	cur_uprobe = insert_uprobe(uprobe);
1012	/ a uprobe exists for this inode:offset combination /
1013	if (cur_uprobe != uprobe) {
1014	if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
1015	ref_ctr_mismatch_warn(cur_uprobe, uprobe);
1016	put_uprobe(uprobe: cur_uprobe);
1017	kfree(objp: uprobe);
1018	return ERR_PTR(error: -EINVAL);
1019	}
1020	kfree(objp: uprobe);
1021	uprobe = cur_uprobe;
1022	}
1023
1024	return uprobe;
1025	}
1026
1027	static void consumer_add(struct uprobe uprobe, struct* uprobe_consumer *uc)
1028	{
1029	static atomic64_t id;
1030
1031	down_write(sem: &uprobe->consumer_rwsem);
1032	list_add_rcu(new: &uc->cons_node, head: &uprobe->consumers);
1033	uc->id = (__u64) atomic64_inc_return(v: &id);
1034	up_write(sem: &uprobe->consumer_rwsem);
1035	}
1036
1037	/*
1038	* For uprobe @uprobe, delete the consumer @uc.
1039	* Should never be called with consumer that's not part of @uprobe->consumers.
1040	*/
1041	static void consumer_del(struct uprobe uprobe, struct* uprobe_consumer *uc)
1042	{
1043	down_write(sem: &uprobe->consumer_rwsem);
1044	list_del_rcu(entry: &uc->cons_node);
1045	up_write(sem: &uprobe->consumer_rwsem);
1046	}
1047
1048	static int __copy_insn(struct address_space mapping, struct* file *filp,
1049	void insn, int* nbytes, loff_t offset)
1050	{
1051	struct page *page;
1052	/*
1053	* Ensure that the page that has the original instruction is populated
1054	* and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
1055	* see uprobe_register().
1056	*/
1057	if (mapping->a_ops->read_folio)
1058	page = read_mapping_page(mapping, index: offset >> PAGE_SHIFT, file: filp);
1059	else
1060	page = shmem_read_mapping_page(mapping, index: offset >> PAGE_SHIFT);
1061	if (IS_ERR(ptr: page))
1062	return PTR_ERR(ptr: page);
1063
1064	uprobe_copy_from_page(page, vaddr: offset, dst: insn, len: nbytes);
1065	put_page(page);
1066
1067	return `0`;
1068	}
1069
1070	static int copy_insn(struct uprobe uprobe, struct* file *filp)
1071	{
1072	struct address_space *mapping = uprobe->inode->i_mapping;
1073	loff_t offs = uprobe->offset;
1074	void *insn = &uprobe->arch.insn;
1075	int size = sizeof(uprobe->arch.insn);
1076	int len, err = -EIO;
1077
1078	/ Copy only available bytes, -EIO if nothing was read /
1079	do {
1080	if (offs >= i_size_read(inode: uprobe->inode))
1081	break;
1082
1083	len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
1084	err = __copy_insn(mapping, filp, insn, nbytes: len, offset: offs);
1085	if (err)
1086	break;
1087
1088	insn += len;
1089	offs += len;
1090	size -= len;
1091	} while (size);
1092
1093	return err;
1094	}
1095
1096	static int prepare_uprobe(struct uprobe uprobe, struct* file *file,
1097	struct mm_struct mm, unsigned* long vaddr)
1098	{
1099	int ret = `0`;
1100
1101	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1102	return ret;
1103
1104	/ TODO: move this into _register, until then we abuse this sem. /
1105	down_write(sem: &uprobe->consumer_rwsem);
1106	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1107	goto out;
1108
1109	ret = copy_insn(uprobe, filp: file);
1110	if (ret)
1111	goto out;
1112
1113	ret = -ENOTSUPP;
1114	if (is_trap_insn(insn: (uprobe_opcode_t *)&uprobe->arch.insn))
1115	goto out;
1116
1117	ret = arch_uprobe_analyze_insn(aup: &uprobe->arch, mm, addr: vaddr);
1118	if (ret)
1119	goto out;
1120
1121	smp_wmb(); / pairs with the smp_rmb() in handle_swbp() /
1122	set_bit(UPROBE_COPY_INSN, addr: &uprobe->flags);
1123
1124	out:
1125	up_write(sem: &uprobe->consumer_rwsem);
1126
1127	return ret;
1128	}
1129
1130	static inline bool consumer_filter(struct uprobe_consumer uc, struct* mm_struct *mm)
1131	{
1132	return !uc->filter \|\| uc->filter(uc, mm);
1133	}
1134
1135	static bool filter_chain(struct uprobe uprobe, struct* mm_struct *mm)
1136	{
1137	struct uprobe_consumer *uc;
1138	bool ret = false;
1139
1140	down_read(sem: &uprobe->consumer_rwsem);
1141	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1142	ret = consumer_filter(uc, mm);
1143	if (ret)
1144	break;
1145	}
1146	up_read(sem: &uprobe->consumer_rwsem);
1147
1148	return ret;
1149	}
1150
1151	static int install_breakpoint(struct uprobe uprobe, struct* vm_area_struct *vma,
1152	unsigned long vaddr)
1153	{
1154	struct mm_struct *mm = vma->vm_mm;
1155	bool first_uprobe;
1156	int ret;
1157
1158	ret = prepare_uprobe(uprobe, file: vma->vm_file, mm, vaddr);
1159	if (ret)
1160	return ret;
1161
1162	/*
1163	* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
1164	* the task can hit this breakpoint right after __replace_page().
1165	*/
1166	first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm);
1167	if (first_uprobe)
1168	mm_flags_set(MMF_HAS_UPROBES, mm);
1169
1170	ret = set_swbp(auprobe: &uprobe->arch, vma, vaddr);
1171	if (!ret)
1172	mm_flags_clear(MMF_RECALC_UPROBES, mm);
1173	else if (first_uprobe)
1174	mm_flags_clear(MMF_HAS_UPROBES, mm);
1175
1176	return ret;
1177	}
1178
1179	static int remove_breakpoint(struct uprobe uprobe, struct* vm_area_struct *vma,
1180	unsigned long vaddr)
1181	{
1182	struct mm_struct *mm = vma->vm_mm;
1183
1184	mm_flags_set(MMF_RECALC_UPROBES, mm);
1185	return set_orig_insn(auprobe: &uprobe->arch, vma, vaddr);
1186	}
1187
/* Singly linked list node recording one (mm, virtual address) mapping. */
struct map_info {
	struct map_info *next;	/* next entry, NULL terminates the list */
	struct mm_struct *mm;	/* address space containing the mapping */
	unsigned long vaddr;	/* virtual address within @mm */
};
1193
1194	static inline struct map_info free_map_info(struct* map_info *info)
1195	{
1196	struct map_info *next = info->next;
1197	kfree(objp: info);
1198	return next;
1199	}
1200
1201	static struct map_info *
1202	build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
1203	{
1204	unsigned long pgoff = offset >> PAGE_SHIFT;
1205	struct vm_area_struct *vma;
1206	struct map_info *curr = NULL;
1207	struct map_info *prev = NULL;
1208	struct map_info *info;
1209	int more = `0`;
1210
1211	again:
1212	i_mmap_lock_read(mapping);
1213	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1214	if (!valid_vma(vma, is_register))
1215	continue;
1216
1217	if (!prev && !more) {
1218	/*
1219	* Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
1220	* reclaim. This is optimistic, no harm done if it fails.
1221	*/
1222	prev = kmalloc(sizeof(struct map_info),
1223	GFP_NOWAIT \| __GFP_NOMEMALLOC);
1224	if (prev)
1225	prev->next = NULL;
1226	}
1227	if (!prev) {
1228	more++;
1229	continue;
1230	}
1231
1232	if (!mmget_not_zero(mm: vma->vm_mm))
1233	continue;
1234
1235	info = prev;
1236	prev = prev->next;
1237	info->next = curr;
1238	curr = info;
1239
1240	info->mm = vma->vm_mm;
1241	info->vaddr = offset_to_vaddr(vma, offset);
1242	}
1243	i_mmap_unlock_read(mapping);
1244
1245	if (!more)
1246	goto out;
1247
1248	prev = curr;
1249	while (curr) {
1250	mmput(curr->mm);
1251	curr = curr->next;
1252	}
1253
1254	do {
1255	info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
1256	if (!info) {
1257	curr = ERR_PTR(error: -ENOMEM);
1258	goto out;
1259	}
1260	info->next = prev;
1261	prev = info;
1262	} while (--more);
1263
1264	goto again;
1265	out:
1266	while (prev)
1267	prev = free_map_info(info: prev);
1268	return curr;
1269	}
1270
1271	static int
1272	register_for_each_vma(struct uprobe uprobe, struct* uprobe_consumer *new)
1273	{
1274	bool is_register = !!new;
1275	struct map_info *info;
1276	int err = `0`;
1277
1278	percpu_down_write(&dup_mmap_sem);
1279	info = build_map_info(mapping: uprobe->inode->i_mapping,
1280	offset: uprobe->offset, is_register);
1281	if (IS_ERR(ptr: info)) {
1282	err = PTR_ERR(ptr: info);
1283	goto out;
1284	}
1285
1286	while (info) {
1287	struct mm_struct *mm = info->mm;
1288	struct vm_area_struct *vma;
1289
1290	if (err && is_register)
1291	goto free;
1292	/*
1293	* We take mmap_lock for writing to avoid the race with
1294	* find_active_uprobe_rcu() which takes mmap_lock for reading.
1295	* Thus this install_breakpoint() can not make
1296	* is_trap_at_addr() true right after find_uprobe_rcu()
1297	* returns NULL in find_active_uprobe_rcu().
1298	*/
1299	mmap_write_lock(mm);
1300	if (check_stable_address_space(mm))
1301	goto unlock;
1302
1303	vma = find_vma(mm, addr: info->vaddr);
1304	if (!vma \|\| !valid_vma(vma, is_register) \|\|
1305	file_inode(f: vma->vm_file) != uprobe->inode)
1306	goto unlock;
1307
1308	if (vma->vm_start > info->vaddr \|\|
1309	vaddr_to_offset(vma, vaddr: info->vaddr) != uprobe->offset)
1310	goto unlock;
1311
1312	if (is_register) {
1313	/ consult only the "caller", new consumer. /
1314	if (consumer_filter(uc: new, mm))
1315	err = install_breakpoint(uprobe, vma, vaddr: info->vaddr);
1316	} else if (mm_flags_test(MMF_HAS_UPROBES, mm)) {
1317	if (!filter_chain(uprobe, mm))
1318	err \|= remove_breakpoint(uprobe, vma, vaddr: info->vaddr);
1319	}
1320
1321	unlock:
1322	mmap_write_unlock(mm);
1323	free:
1324	mmput(mm);
1325	info = free_map_info(info);
1326	}
1327	out:
1328	percpu_up_write(&dup_mmap_sem);
1329	return err;
1330	}
1331
1332	/**
1333	* uprobe_unregister_nosync - unregister an already registered probe.
1334	* @uprobe: uprobe to remove
1335	* @uc: identify which probe if multiple probes are colocated.
1336	*/
1337	void uprobe_unregister_nosync(struct uprobe uprobe, struct* uprobe_consumer *uc)
1338	{
1339	int err;
1340
1341	down_write(sem: &uprobe->register_rwsem);
1342	consumer_del(uprobe, uc);
1343	err = register_for_each_vma(uprobe, NULL);
1344	up_write(sem: &uprobe->register_rwsem);
1345
1346	/ TODO : cant unregister? schedule a worker thread /
1347	if (unlikely(err)) {
1348	uprobe_warn(current, msg: "unregister, leaking uprobe");
1349	return;
1350	}
1351
1352	put_uprobe(uprobe);
1353	}
1354	EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
1355
1356	void uprobe_unregister_sync(void)
1357	{
1358	/*
1359	* Now that handler_chain() and handle_uretprobe_chain() iterate over
1360	* uprobe->consumers list under RCU protection without holding
1361	* uprobe->register_rwsem, we need to wait for RCU grace period to
1362	* make sure that we can't call into just unregistered
1363	* uprobe_consumer's callbacks anymore. If we don't do that, fast and
1364	* unlucky enough caller can free consumer's memory and cause
1365	* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
1366	*/
1367	synchronize_rcu_tasks_trace();
1368	synchronize_srcu(ssp: &uretprobes_srcu);
1369	}
1370	EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
1371
1372	/**
1373	* uprobe_register - register a probe
1374	* @inode: the file in which the probe has to be placed.
1375	* @offset: offset from the start of the file.
1376	* @ref_ctr_offset: offset of SDT marker / reference counter
1377	* @uc: information on howto handle the probe..
1378	*
1379	* Apart from the access refcount, uprobe_register() takes a creation
1380	* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
1381	* inserted into the rbtree (i.e first consumer for a @inode:@offset
1382	* tuple). Creation refcount stops uprobe_unregister from freeing the
1383	* @uprobe even before the register operation is complete. Creation
1384	* refcount is released when the last @uc for the @uprobe
1385	* unregisters. Caller of uprobe_register() is required to keep @inode
1386	* (and the containing mount) referenced.
1387	*
1388	* Return: pointer to the new uprobe on success or an ERR_PTR on failure.
1389	*/
1390	struct uprobe uprobe_register(struct* inode *inode,
1391	loff_t offset, loff_t ref_ctr_offset,
1392	struct uprobe_consumer *uc)
1393	{
1394	struct uprobe *uprobe;
1395	int ret;
1396
1397	/ Uprobe must have at least one set consumer /
1398	if (!uc->handler && !uc->ret_handler)
1399	return ERR_PTR(error: -EINVAL);
1400
1401	/ copy_insn() uses read_mapping_page() or shmem_read_mapping_page() /
1402	if (!inode->i_mapping->a_ops->read_folio &&
1403	!shmem_mapping(mapping: inode->i_mapping))
1404	return ERR_PTR(error: -EIO);
1405	/ Racy, just to catch the obvious mistakes /
1406	if (offset > i_size_read(inode))
1407	return ERR_PTR(error: -EINVAL);
1408
1409	/*
1410	* This ensures that uprobe_copy_from_page(), copy_to_page() and
1411	* __update_ref_ctr() can't cross page boundary.
1412	*/
1413	if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
1414	return ERR_PTR(error: -EINVAL);
1415	if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
1416	return ERR_PTR(error: -EINVAL);
1417
1418	uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
1419	if (IS_ERR(ptr: uprobe))
1420	return uprobe;
1421
1422	down_write(sem: &uprobe->register_rwsem);
1423	consumer_add(uprobe, uc);
1424	ret = register_for_each_vma(uprobe, new: uc);
1425	up_write(sem: &uprobe->register_rwsem);
1426
1427	if (ret) {
1428	uprobe_unregister_nosync(uprobe, uc);
1429	/*
1430	* Registration might have partially succeeded, so we can have
1431	* this consumer being called right at this time. We need to
1432	* sync here. It's ok, it's unlikely slow path.
1433	*/
1434	uprobe_unregister_sync();
1435	return ERR_PTR(error: ret);
1436	}
1437
1438	return uprobe;
1439	}
1440	EXPORT_SYMBOL_GPL(uprobe_register);
1441
1442	/**
1443	* uprobe_apply - add or remove the breakpoints according to @uc->filter
1444	* @uprobe: uprobe which "owns" the breakpoint
1445	* @uc: consumer which wants to add more or remove some breakpoints
1446	* @add: add or remove the breakpoints
1447	* Return: 0 on success or negative error code.
1448	*/
1449	int uprobe_apply(struct uprobe uprobe, struct* uprobe_consumer *uc, bool add)
1450	{
1451	struct uprobe_consumer *con;
1452	int ret = -ENOENT;
1453
1454	down_write(sem: &uprobe->register_rwsem);
1455
1456	rcu_read_lock_trace();
1457	list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1458	if (con == uc) {
1459	ret = register_for_each_vma(uprobe, new: add ? uc : NULL);
1460	break;
1461	}
1462	}
1463	rcu_read_unlock_trace();
1464
1465	up_write(sem: &uprobe->register_rwsem);
1466
1467	return ret;
1468	}
1469
1470	static int unapply_uprobe(struct uprobe uprobe, struct* mm_struct *mm)
1471	{
1472	VMA_ITERATOR(vmi, mm, `0`);
1473	struct vm_area_struct *vma;
1474	int err = `0`;
1475
1476	mmap_write_lock(mm);
1477	for_each_vma(vmi, vma) {
1478	unsigned long vaddr;
1479	loff_t offset;
1480
1481	if (!valid_vma(vma, is_register: false) \|\|
1482	file_inode(f: vma->vm_file) != uprobe->inode)
1483	continue;
1484
1485	offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1486	if (uprobe->offset < offset \|\|
1487	uprobe->offset >= offset + vma->vm_end - vma->vm_start)
1488	continue;
1489
1490	vaddr = offset_to_vaddr(vma, offset: uprobe->offset);
1491	err \|= remove_breakpoint(uprobe, vma, vaddr);
1492	}
1493	mmap_write_unlock(mm);
1494
1495	return err;
1496	}
1497
1498	static struct rb_node *
1499	find_node_in_range(struct inode *inode, loff_t min, loff_t max)
1500	{
1501	struct rb_node *n = uprobes_tree.rb_node;
1502
1503	while (n) {
1504	struct uprobe u = rb_entry(n, struct* uprobe, rb_node);
1505
1506	if (inode < u->inode) {
1507	n = n->rb_left;
1508	} else if (inode > u->inode) {
1509	n = n->rb_right;
1510	} else {
1511	if (max < u->offset)
1512	n = n->rb_left;
1513	else if (min > u->offset)
1514	n = n->rb_right;
1515	else
1516	break;
1517	}
1518	}
1519
1520	return n;
1521	}
1522
1523	/*
1524	* For a given range in vma, build a list of probes that need to be inserted.
1525	*/
1526	static void build_probe_list(struct inode *inode,
1527	struct vm_area_struct *vma,
1528	unsigned long start, unsigned long end,
1529	struct list_head *head)
1530	{
1531	loff_t min, max;
1532	struct rb_node n, t;
1533	struct uprobe *u;
1534
1535	INIT_LIST_HEAD(list: head);
1536	min = vaddr_to_offset(vma, vaddr: start);
1537	max = min + (end - start) - `1`;
1538
1539	read_lock(&uprobes_treelock);
1540	n = find_node_in_range(inode, min, max);
1541	if (n) {
1542	for (t = n; t; t = rb_prev(t)) {
1543	u = rb_entry(t, struct uprobe, rb_node);
1544	if (u->inode != inode \|\| u->offset < min)
1545	break;
1546	/ if uprobe went away, it's safe to ignore it /
1547	if (try_get_uprobe(uprobe: u))
1548	list_add(new: &u->pending_list, head);
1549	}
1550	for (t = n; (t = rb_next(t)); ) {
1551	u = rb_entry(t, struct uprobe, rb_node);
1552	if (u->inode != inode \|\| u->offset > max)
1553	break;
1554	/ if uprobe went away, it's safe to ignore it /
1555	if (try_get_uprobe(uprobe: u))
1556	list_add(new: &u->pending_list, head);
1557	}
1558	}
1559	read_unlock(&uprobes_treelock);
1560	}
1561
1562	/ @vma contains reference counter, not the probed instruction. /
1563	static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
1564	{
1565	struct list_head pos, q;
1566	struct delayed_uprobe *du;
1567	unsigned long vaddr;
1568	int ret = `0`, err = `0`;
1569
1570	mutex_lock(&delayed_uprobe_lock);
1571	list_for_each_safe(pos, q, &delayed_uprobe_list) {
1572	du = list_entry(pos, struct delayed_uprobe, list);
1573
1574	if (du->mm != vma->vm_mm \|\|
1575	!valid_ref_ctr_vma(uprobe: du->uprobe, vma))
1576	continue;
1577
1578	vaddr = offset_to_vaddr(vma, offset: du->uprobe->ref_ctr_offset);
1579	ret = __update_ref_ctr(mm: vma->vm_mm, vaddr, d: `1`);
1580	if (ret) {
1581	update_ref_ctr_warn(uprobe: du->uprobe, mm: vma->vm_mm, d: `1`);
1582	if (!err)
1583	err = ret;
1584	}
1585	delayed_uprobe_delete(du);
1586	}
1587	mutex_unlock(lock: &delayed_uprobe_lock);
1588	return err;
1589	}
1590
1591	/*
1592	* Called from mmap_region/vma_merge with mm->mmap_lock acquired.
1593	*
1594	* Currently we ignore all errors and always return 0, the callers
1595	* can't handle the failure anyway.
1596	*/
1597	int uprobe_mmap(struct vm_area_struct *vma)
1598	{
1599	struct list_head tmp_list;
1600	struct uprobe uprobe, u;
1601	struct inode *inode;
1602
1603	if (no_uprobe_events())
1604	return `0`;
1605
1606	if (vma->vm_file &&
1607	(vma->vm_flags & (VM_WRITE\|VM_SHARED)) == VM_WRITE &&
1608	mm_flags_test(MMF_HAS_UPROBES, mm: vma->vm_mm))
1609	delayed_ref_ctr_inc(vma);
1610
1611	if (!valid_vma(vma, is_register: true))
1612	return `0`;
1613
1614	inode = file_inode(f: vma->vm_file);
1615	if (!inode)
1616	return `0`;
1617
1618	mutex_lock(uprobes_mmap_hash(inode));
1619	build_probe_list(inode, vma, start: vma->vm_start, end: vma->vm_end, head: &tmp_list);
1620	/*
1621	* We can race with uprobe_unregister(), this uprobe can be already
1622	* removed. But in this case filter_chain() must return false, all
1623	* consumers have gone away.
1624	*/
1625	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1626	if (!fatal_signal_pending(current) &&
1627	filter_chain(uprobe, mm: vma->vm_mm)) {
1628	unsigned long vaddr = offset_to_vaddr(vma, offset: uprobe->offset);
1629	install_breakpoint(uprobe, vma, vaddr);
1630	}
1631	put_uprobe(uprobe);
1632	}
1633	mutex_unlock(uprobes_mmap_hash(inode));
1634
1635	return `0`;
1636	}
1637
1638	static bool
1639	vma_has_uprobes(struct vm_area_struct vma, unsigned* long start, unsigned long end)
1640	{
1641	loff_t min, max;
1642	struct inode *inode;
1643	struct rb_node *n;
1644
1645	inode = file_inode(f: vma->vm_file);
1646
1647	min = vaddr_to_offset(vma, vaddr: start);
1648	max = min + (end - start) - `1`;
1649
1650	read_lock(&uprobes_treelock);
1651	n = find_node_in_range(inode, min, max);
1652	read_unlock(&uprobes_treelock);
1653
1654	return !!n;
1655	}
1656
1657	/*
1658	* Called in context of a munmap of a vma.
1659	*/
1660	void uprobe_munmap(struct vm_area_struct vma, unsigned* long start, unsigned long end)
1661	{
1662	if (no_uprobe_events() \|\| !valid_vma(vma, is_register: false))
1663	return;
1664
1665	if (!atomic_read(v: &vma->vm_mm->mm_users)) / called by mmput() ? /
1666	return;
1667
1668	if (!mm_flags_test(MMF_HAS_UPROBES, mm: vma->vm_mm) \|\|
1669	mm_flags_test(MMF_RECALC_UPROBES, mm: vma->vm_mm))
1670	return;
1671
1672	if (vma_has_uprobes(vma, start, end))
1673	mm_flags_set(MMF_RECALC_UPROBES, mm: vma->vm_mm);
1674	}
1675
1676	static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
1677	struct vm_area_struct vma, struct* vm_fault *vmf)
1678	{
1679	struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
1680
1681	vmf->page = area->page;
1682	get_page(page: vmf->page);
1683	return `0`;
1684	}
1685
/* mremap handler of the xol special mapping: always refuses with -EPERM. */
static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
	return -EPERM;
}
1690
1691	static const struct vm_special_mapping xol_mapping = {
1692	.name = "[uprobes]",
1693	.fault = xol_fault,
1694	.mremap = xol_mremap,
1695	};
1696
1697	/ Slot allocation for XOL /
1698	static int xol_add_vma(struct mm_struct mm, struct* xol_area *area)
1699	{
1700	struct vm_area_struct *vma;
1701	int ret;
1702
1703	if (mmap_write_lock_killable(mm))
1704	return -EINTR;
1705
1706	if (mm->uprobes_state.xol_area) {
1707	ret = -EALREADY;
1708	goto fail;
1709	}
1710
1711	if (!area->vaddr) {
1712	/ Try to map as high as possible, this is only a hint. /
1713	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1714	PAGE_SIZE, pgoff: `0`, flags: `0`);
1715	if (IS_ERR_VALUE(area->vaddr)) {
1716	ret = area->vaddr;
1717	goto fail;
1718	}
1719	}
1720
1721	vma = _install_special_mapping(mm, addr: area->vaddr, PAGE_SIZE,
1722	VM_EXEC\|VM_MAYEXEC\|VM_DONTCOPY\|VM_IO\|
1723	VM_SEALED_SYSMAP,
1724	spec: &xol_mapping);
1725	if (IS_ERR(ptr: vma)) {
1726	ret = PTR_ERR(ptr: vma);
1727	goto fail;
1728	}
1729
1730	ret = `0`;
1731	/ pairs with get_xol_area() /
1732	smp_store_release(&mm->uprobes_state.xol_area, area); / ^^^ /
1733	fail:
1734	mmap_write_unlock(mm);
1735
1736	return ret;
1737	}
1738
1739	void * __weak arch_uretprobe_trampoline(unsigned long *psize)
1740	{
1741	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1742
1743	*psize = UPROBE_SWBP_INSN_SIZE;
1744	return &insn;
1745	}
1746
1747	static struct xol_area __create_xol_area(unsigned* long vaddr)
1748	{
1749	struct mm_struct *mm = current->mm;
1750	unsigned long insns_size;
1751	struct xol_area *area;
1752	void *insns;
1753
1754	area = kzalloc(sizeof(*area), GFP_KERNEL);
1755	if (unlikely(!area))
1756	goto out;
1757
1758	area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
1759	GFP_KERNEL);
1760	if (!area->bitmap)
1761	goto free_area;
1762
1763	area->page = alloc_page(GFP_HIGHUSER \| __GFP_ZERO);
1764	if (!area->page)
1765	goto free_bitmap;
1766
1767	area->vaddr = vaddr;
1768	init_waitqueue_head(&area->wq);
1769	/ Reserve the 1st slot for get_trampoline_vaddr() /
1770	set_bit(nr: `0`, addr: area->bitmap);
1771	insns = arch_uretprobe_trampoline(psize: &insns_size);
1772	arch_uprobe_copy_ixol(page: area->page, vaddr: `0`, src: insns, len: insns_size);
1773
1774	if (!xol_add_vma(mm, area))
1775	return area;
1776
1777	__free_page(area->page);
1778	free_bitmap:
1779	kfree(objp: area->bitmap);
1780	free_area:
1781	kfree(objp: area);
1782	out:
1783	return NULL;
1784	}
1785
1786	/*
1787	* get_xol_area - Allocate process's xol_area if necessary.
1788	* This area will be used for storing instructions for execution out of line.
1789	*
1790	* Returns the allocated area or NULL.
1791	*/
1792	static struct xol_area get_xol_area(void*)
1793	{
1794	struct mm_struct *mm = current->mm;
1795	struct xol_area *area;
1796
1797	if (!mm->uprobes_state.xol_area)
1798	__create_xol_area(vaddr: `0`);
1799
1800	/ Pairs with xol_add_vma() smp_store_release() /
1801	area = READ_ONCE(mm->uprobes_state.xol_area); / ^^^ /
1802	return area;
1803	}
1804
1805	void __weak arch_uprobe_clear_state(struct mm_struct *mm)
1806	{
1807	}
1808
1809	void __weak arch_uprobe_init_state(struct mm_struct *mm)
1810	{
1811	}
1812
1813	/*
1814	* uprobe_clear_state - Free the area allocated for slots.
1815	*/
1816	void uprobe_clear_state(struct mm_struct *mm)
1817	{
1818	struct xol_area *area = mm->uprobes_state.xol_area;
1819
1820	mutex_lock(&delayed_uprobe_lock);
1821	delayed_uprobe_remove(NULL, mm);
1822	mutex_unlock(lock: &delayed_uprobe_lock);
1823
1824	arch_uprobe_clear_state(mm);
1825
1826	if (!area)
1827	return;
1828
1829	put_page(page: area->page);
1830	kfree(objp: area->bitmap);
1831	kfree(objp: area);
1832	}
1833
1834	void uprobe_start_dup_mmap(void)
1835	{
1836	percpu_down_read(sem: &dup_mmap_sem);
1837	}
1838
1839	void uprobe_end_dup_mmap(void)
1840	{
1841	percpu_up_read(sem: &dup_mmap_sem);
1842	}
1843
1844	void uprobe_dup_mmap(struct mm_struct oldmm, struct* mm_struct *newmm)
1845	{
1846	if (mm_flags_test(MMF_HAS_UPROBES, mm: oldmm)) {
1847	mm_flags_set(MMF_HAS_UPROBES, mm: newmm);
1848	/ unconditionally, dup_mmap() skips VM_DONTCOPY vmas /
1849	mm_flags_set(MMF_RECALC_UPROBES, mm: newmm);
1850	}
1851	}
1852
1853	static unsigned long xol_get_slot_nr(struct xol_area *area)
1854	{
1855	unsigned long slot_nr;
1856
1857	slot_nr = find_first_zero_bit(addr: area->bitmap, UINSNS_PER_PAGE);
1858	if (slot_nr < UINSNS_PER_PAGE) {
1859	if (!test_and_set_bit(nr: slot_nr, addr: area->bitmap))
1860	return slot_nr;
1861	}
1862
1863	return UINSNS_PER_PAGE;
1864	}
1865
1866	/*
1867	* xol_get_insn_slot - allocate a slot for xol.
1868	*/
1869	static bool xol_get_insn_slot(struct uprobe uprobe, struct* uprobe_task *utask)
1870	{
1871	struct xol_area *area = get_xol_area();
1872	unsigned long slot_nr;
1873
1874	if (!area)
1875	return false;
1876
1877	wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
1878
1879	utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
1880	arch_uprobe_copy_ixol(page: area->page, vaddr: utask->xol_vaddr,
1881	src: &uprobe->arch.ixol, len: sizeof(uprobe->arch.ixol));
1882	return true;
1883	}
1884
1885	/*
1886	* xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
1887	*/
1888	static void xol_free_insn_slot(struct uprobe_task *utask)
1889	{
1890	struct xol_area *area = current->mm->uprobes_state.xol_area;
1891	unsigned long offset = utask->xol_vaddr - area->vaddr;
1892	unsigned int slot_nr;
1893
1894	utask->xol_vaddr = `0`;
1895	/ xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) /
1896	if (WARN_ON_ONCE(offset >= PAGE_SIZE))
1897	return;
1898
1899	slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1900	clear_bit(nr: slot_nr, addr: area->bitmap);
1901	smp_mb__after_atomic(); / pairs with prepare_to_wait() /
1902	if (waitqueue_active(wq_head: &area->wq))
1903	wake_up(&area->wq);
1904	}
1905
1906	void __weak arch_uprobe_copy_ixol(struct page page, unsigned* long vaddr,
1907	void src, unsigned* long len)
1908	{
1909	/ Initialize the slot /
1910	copy_to_page(page, vaddr, src, len);
1911
1912	/*
1913	* We probably need flush_icache_user_page() but it needs vma.
1914	* This should work on most of architectures by default. If
1915	* architecture needs to do something different it can define
1916	* its own version of the function.
1917	*/
1918	flush_dcache_page(page);
1919	}
1920
1921	/**
1922	* uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1923	* @regs: Reflects the saved state of the task after it has hit a breakpoint
1924	* instruction.
1925	* Return the address of the breakpoint instruction.
1926	*/
1927	unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1928	{
1929	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1930	}
1931
1932	unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1933	{
1934	struct uprobe_task *utask = current->utask;
1935
1936	if (unlikely(utask && utask->active_uprobe))
1937	return utask->vaddr;
1938
1939	return instruction_pointer(regs);
1940	}
1941
1942	static void ri_pool_push(struct uprobe_task utask, struct* return_instance *ri)
1943	{
1944	ri->cons_cnt = `0`;
1945	ri->next = utask->ri_pool;
1946	utask->ri_pool = ri;
1947	}
1948
1949	static struct return_instance ri_pool_pop(struct* uprobe_task *utask)
1950	{
1951	struct return_instance *ri = utask->ri_pool;
1952
1953	if (likely(ri))
1954	utask->ri_pool = ri->next;
1955
1956	return ri;
1957	}
1958
1959	static void ri_free(struct return_instance *ri)
1960	{
1961	kfree(objp: ri->extra_consumers);
1962	kfree_rcu(ri, rcu);
1963	}
1964
1965	static void free_ret_instance(struct uprobe_task *utask,
1966	struct return_instance *ri, bool cleanup_hprobe)
1967	{
1968	unsigned seq;
1969
1970	if (cleanup_hprobe) {
1971	enum hprobe_state hstate;
1972
1973	(void)hprobe_consume(hprobe: &ri->hprobe, hstate: &hstate);
1974	hprobe_finalize(hprobe: &ri->hprobe, hstate);
1975	}
1976
1977	/*
1978	* At this point return_instance is unlinked from utask's
1979	* return_instances list and this has become visible to ri_timer().
1980	* If seqcount now indicates that ri_timer's return instance
1981	* processing loop isn't active, we can return ri into the pool of
1982	* to-be-reused return instances for future uretprobes. If ri_timer()
1983	* happens to be running right now, though, we fallback to safety and
1984	* just perform RCU-delated freeing of ri.
1985	* Admittedly, this is a rather simple use of seqcount, but it nicely
1986	* abstracts away all the necessary memory barriers, so we use
1987	* a well-supported kernel primitive here.
1988	*/
1989	if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
1990	/ immediate reuse of ri without RCU GP is OK /
1991	ri_pool_push(utask, ri);
1992	} else {
1993	/ we might be racing with ri_timer(), so play it safe /
1994	ri_free(ri);
1995	}
1996	}
1997
1998	/*
1999	* Called with no locks held.
2000	* Called in context of an exiting or an exec-ing thread.
2001	*/
2002	void uprobe_free_utask(struct task_struct *t)
2003	{
2004	struct uprobe_task *utask = t->utask;
2005	struct return_instance ri, ri_next;
2006
2007	if (!utask)
2008	return;
2009
2010	t->utask = NULL;
2011	WARN_ON_ONCE(utask->active_uprobe \|\| utask->xol_vaddr);
2012
2013	timer_delete_sync(timer: &utask->ri_timer);
2014
2015	ri = utask->return_instances;
2016	while (ri) {
2017	ri_next = ri->next;
2018	free_ret_instance(utask, ri, cleanup_hprobe: true / cleanup_hprobe /);
2019	ri = ri_next;
2020	}
2021
2022	/ free_ret_instance() above might add to ri_pool, so this loop should come last /
2023	ri = utask->ri_pool;
2024	while (ri) {
2025	ri_next = ri->next;
2026	ri_free(ri);
2027	ri = ri_next;
2028	}
2029
2030	kfree(objp: utask);
2031	}
2032
#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */

/* Iterate @pos over a return_instance list starting at @head via ->next. */
#define for_each_ret_instance_rcu(pos, head) \
	for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
2037
2038	static void ri_timer(struct timer_list *timer)
2039	{
2040	struct uprobe_task utask = container_of(timer, struct* uprobe_task, ri_timer);
2041	struct return_instance *ri;
2042
2043	/ SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). /
2044	guard(srcu)(l: &uretprobes_srcu);
2045	/ RCU protects return_instance from freeing. /
2046	guard(rcu)();
2047
2048	/*
2049	* See free_ret_instance() for notes on seqcount use.
2050	* We also employ raw API variants to avoid lockdep false-positive
2051	* warning complaining about enabled preemption. The timer can only be
2052	* invoked once for a uprobe_task. Therefore there can only be one
2053	* writer. The reader does not require an even sequence count to make
2054	* progress, so it is OK to remain preemptible on PREEMPT_RT.
2055	*/
2056	raw_write_seqcount_begin(&utask->ri_seqcount);
2057
2058	for_each_ret_instance_rcu(ri, utask->return_instances)
2059	hprobe_expire(hprobe: &ri->hprobe, get: false);
2060
2061	raw_write_seqcount_end(&utask->ri_seqcount);
2062	}
2063
2064	static struct uprobe_task alloc_utask(void*)
2065	{
2066	struct uprobe_task *utask;
2067
2068	utask = kzalloc(sizeof(*utask), GFP_KERNEL);
2069	if (!utask)
2070	return NULL;
2071
2072	timer_setup(&utask->ri_timer, ri_timer, `0`);
2073	seqcount_init(&utask->ri_seqcount);
2074
2075	return utask;
2076	}
2077
2078	/*
2079	* Allocate a uprobe_task object for the task if necessary.
2080	* Called when the thread hits a breakpoint.
2081	*
2082	* Returns:
2083	* - pointer to new uprobe_task on success
2084	* - NULL otherwise
2085	*/
2086	static struct uprobe_task get_utask(void*)
2087	{
2088	if (!current->utask)
2089	current->utask = alloc_utask();
2090	return current->utask;
2091	}
2092
2093	static struct return_instance alloc_return_instance(struct* uprobe_task *utask)
2094	{
2095	struct return_instance *ri;
2096
2097	ri = ri_pool_pop(utask);
2098	if (ri)
2099	return ri;
2100
2101	ri = kzalloc(sizeof(*ri), GFP_KERNEL);
2102	if (!ri)
2103	return ZERO_SIZE_PTR;
2104
2105	return ri;
2106	}
2107
2108	static struct return_instance dup_return_instance(struct* return_instance *old)
2109	{
2110	struct return_instance *ri;
2111
2112	ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
2113	if (!ri)
2114	return NULL;
2115
2116	if (unlikely(old->cons_cnt > `1`)) {
2117	ri->extra_consumers = kmemdup(old->extra_consumers,
2118	sizeof(ri->extra_consumers[`0`]) * (old->cons_cnt - `1`),
2119	GFP_KERNEL);
2120	if (!ri->extra_consumers) {
2121	kfree(objp: ri);
2122	return NULL;
2123	}
2124	}
2125
2126	return ri;
2127	}
2128
2129	static int dup_utask(struct task_struct t, struct* uprobe_task *o_utask)
2130	{
2131	struct uprobe_task *n_utask;
2132	struct return_instance *p, o, *n;
2133	struct uprobe *uprobe;
2134
2135	n_utask = alloc_utask();
2136	if (!n_utask)
2137	return -ENOMEM;
2138	t->utask = n_utask;
2139
2140	/ protect uprobes from freeing, we'll need try_get_uprobe() them /
2141	guard(srcu)(l: &uretprobes_srcu);
2142
2143	p = &n_utask->return_instances;
2144	for (o = o_utask->return_instances; o; o = o->next) {
2145	n = dup_return_instance(old: o);
2146	if (!n)
2147	return -ENOMEM;
2148
2149	/ if uprobe is non-NULL, we'll have an extra refcount for uprobe /
2150	uprobe = hprobe_expire(hprobe: &o->hprobe, get: true);
2151
2152	/*
2153	* New utask will have stable properly refcounted uprobe or
2154	* NULL. Even if we failed to get refcounted uprobe, we still
2155	* need to preserve full set of return_instances for proper
2156	* uretprobe handling and nesting in forked task.
2157	*/
2158	hprobe_init_stable(hprobe: &n->hprobe, uprobe);
2159
2160	n->next = NULL;
2161	rcu_assign_pointer(*p, n);
2162	p = &n->next;
2163
2164	n_utask->depth++;
2165	}
2166
2167	return `0`;
2168	}
2169
2170	static void dup_xol_work(struct callback_head *work)
2171	{
2172	if (current->flags & PF_EXITING)
2173	return;
2174
2175	if (!__create_xol_area(current->utask->dup_xol_addr) &&
2176	!fatal_signal_pending(current))
2177	uprobe_warn(current, msg: "dup xol area");
2178	}
2179
2180	/*
2181	* Called in context of a new clone/fork from copy_process.
2182	*/
2183	void uprobe_copy_process(struct task_struct *t, u64 flags)
2184	{
2185	struct uprobe_task *utask = current->utask;
2186	struct mm_struct *mm = current->mm;
2187	struct xol_area *area;
2188
2189	t->utask = NULL;
2190
2191	if (!utask \|\| !utask->return_instances)
2192	return;
2193
2194	if (mm == t->mm && !(flags & CLONE_VFORK))
2195	return;
2196
2197	if (dup_utask(t, o_utask: utask))
2198	return uprobe_warn(t, msg: "dup ret instances");
2199
2200	/ The task can fork() after dup_xol_work() fails /
2201	area = mm->uprobes_state.xol_area;
2202	if (!area)
2203	return uprobe_warn(t, msg: "dup xol area");
2204
2205	if (mm == t->mm)
2206	return;
2207
2208	t->utask->dup_xol_addr = area->vaddr;
2209	init_task_work(twork: &t->utask->dup_xol_work, func: dup_xol_work);
2210	task_work_add(task: t, twork: &t->utask->dup_xol_work, mode: TWA_RESUME);
2211	}
2212
2213	/*
2214	* Current area->vaddr notion assume the trampoline address is always
2215	* equal area->vaddr.
2216	*
2217	* Returns -1 in case the xol_area is not allocated.
2218	*/
2219	unsigned long uprobe_get_trampoline_vaddr(void)
2220	{
2221	unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
2222	struct xol_area *area;
2223
2224	/ Pairs with xol_add_vma() smp_store_release() /
2225	area = READ_ONCE(current->mm->uprobes_state.xol_area); / ^^^ /
2226	if (area)
2227	trampoline_vaddr = area->vaddr;
2228
2229	return trampoline_vaddr;
2230	}
2231
2232	static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
2233	struct pt_regs *regs)
2234	{
2235	struct return_instance ri = utask->return_instances, ri_next;
2236	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
2237
2238	while (ri && !arch_uretprobe_is_alive(ret: ri, ctx, regs)) {
2239	ri_next = ri->next;
2240	rcu_assign_pointer(utask->return_instances, ri_next);
2241	utask->depth--;
2242
2243	free_ret_instance(utask, ri, cleanup_hprobe: true / cleanup_hprobe /);
2244	ri = ri_next;
2245	}
2246	}
2247
2248	static void prepare_uretprobe(struct uprobe uprobe, struct* pt_regs *regs,
2249	struct return_instance *ri)
2250	{
2251	struct uprobe_task *utask = current->utask;
2252	unsigned long orig_ret_vaddr, trampoline_vaddr;
2253	bool chained;
2254	int srcu_idx;
2255
2256	if (!get_xol_area())
2257	goto free;
2258
2259	if (utask->depth >= MAX_URETPROBE_DEPTH) {
2260	printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
2261	" nestedness limit pid/tgid=%d/%d\n",
2262	current->pid, current->tgid);
2263	goto free;
2264	}
2265
2266	trampoline_vaddr = uprobe_get_trampoline_vaddr();
2267	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
2268	if (orig_ret_vaddr == -`1`)
2269	goto free;
2270
2271	/ drop the entries invalidated by longjmp() /
2272	chained = (orig_ret_vaddr == trampoline_vaddr);
2273	cleanup_return_instances(utask, chained, regs);
2274
2275	/*
2276	* We don't want to keep trampoline address in stack, rather keep the
2277	* original return address of first caller thru all the consequent
2278	* instances. This also makes breakpoint unwrapping easier.
2279	*/
2280	if (chained) {
2281	if (!utask->return_instances) {
2282	/*
2283	* This situation is not possible. Likely we have an
2284	* attack from user-space.
2285	*/
2286	uprobe_warn(current, msg: "handle tail call");
2287	goto free;
2288	}
2289	orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
2290	}
2291
2292	/ __srcu_read_lock() because SRCU lock survives switch to user space /
2293	srcu_idx = __srcu_read_lock(ssp: &uretprobes_srcu);
2294
2295	ri->func = instruction_pointer(regs);
2296	ri->stack = user_stack_pointer(regs);
2297	ri->orig_ret_vaddr = orig_ret_vaddr;
2298	ri->chained = chained;
2299
2300	utask->depth++;
2301
2302	hprobe_init_leased(hprobe: &ri->hprobe, uprobe, srcu_idx);
2303	ri->next = utask->return_instances;
2304	rcu_assign_pointer(utask->return_instances, ri);
2305
2306	mod_timer(timer: &utask->ri_timer, expires: jiffies + RI_TIMER_PERIOD);
2307
2308	return;
2309	free:
2310	ri_free(ri);
2311	}
2312
2313	/ Prepare to single-step probed instruction out of line. /
2314	static int
2315	pre_ssout(struct uprobe uprobe, struct* pt_regs regs, unsigned* long bp_vaddr)
2316	{
2317	struct uprobe_task *utask = current->utask;
2318	int err;
2319
2320	if (!try_get_uprobe(uprobe))
2321	return -EINVAL;
2322
2323	if (!xol_get_insn_slot(uprobe, utask)) {
2324	err = -ENOMEM;
2325	goto err_out;
2326	}
2327
2328	utask->vaddr = bp_vaddr;
2329	err = arch_uprobe_pre_xol(aup: &uprobe->arch, regs);
2330	if (unlikely(err)) {
2331	xol_free_insn_slot(utask);
2332	goto err_out;
2333	}
2334
2335	utask->active_uprobe = uprobe;
2336	utask->state = UTASK_SSTEP;
2337	return `0`;
2338	err_out:
2339	put_uprobe(uprobe);
2340	return err;
2341	}
2342
2343	/*
2344	* If we are singlestepping, then ensure this thread is not connected to
2345	* non-fatal signals until completion of singlestep. When xol insn itself
2346	* triggers the signal, restart the original insn even if the task is
2347	* already SIGKILL'ed (since coredump should report the correct ip). This
2348	* is even more important if the task has a handler for SIGSEGV/etc, The
2349	* _same_ instruction should be repeated again after return from the signal
2350	* handler, and SSTEP can never finish in this case.
2351	*/
2352	bool uprobe_deny_signal(void)
2353	{
2354	struct task_struct *t = current;
2355	struct uprobe_task *utask = t->utask;
2356
2357	if (likely(!utask \|\| !utask->active_uprobe))
2358	return false;
2359
2360	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
2361
2362	if (task_sigpending(p: t)) {
2363	utask->signal_denied = true;
2364	clear_tsk_thread_flag(tsk: t, TIF_SIGPENDING);
2365
2366	if (__fatal_signal_pending(p: t) \|\| arch_uprobe_xol_was_trapped(tsk: t)) {
2367	utask->state = UTASK_SSTEP_TRAPPED;
2368	set_tsk_thread_flag(tsk: t, TIF_UPROBE);
2369	}
2370	}
2371
2372	return true;
2373	}
2374
2375	static void mmf_recalc_uprobes(struct mm_struct *mm)
2376	{
2377	VMA_ITERATOR(vmi, mm, `0`);
2378	struct vm_area_struct *vma;
2379
2380	for_each_vma(vmi, vma) {
2381	if (!valid_vma(vma, is_register: false))
2382	continue;
2383	/*
2384	* This is not strictly accurate, we can race with
2385	* uprobe_unregister() and see the already removed
2386	* uprobe if delete_uprobe() was not yet called.
2387	* Or this uprobe can be filtered out.
2388	*/
2389	if (vma_has_uprobes(vma, start: vma->vm_start, end: vma->vm_end))
2390	return;
2391	}
2392
2393	mm_flags_clear(MMF_HAS_UPROBES, mm);
2394	}
2395
2396	static int is_trap_at_addr(struct mm_struct mm, unsigned* long vaddr)
2397	{
2398	struct page *page;
2399	uprobe_opcode_t opcode;
2400	int result;
2401
2402	if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
2403	return -EINVAL;
2404
2405	pagefault_disable();
2406	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
2407	pagefault_enable();
2408
2409	if (likely(result == `0`))
2410	goto out;
2411
2412	result = get_user_pages(start: vaddr, nr_pages: `1`, gup_flags: FOLL_FORCE, pages: &page);
2413	if (result < `0`)
2414	return result;
2415
2416	uprobe_copy_from_page(page, vaddr, dst: &opcode, UPROBE_SWBP_INSN_SIZE);
2417	put_page(page);
2418	out:
2419	/ This needs to return true for any variant of the trap insn /
2420	return is_trap_insn(insn: &opcode);
2421	}
2422
2423	static struct uprobe find_active_uprobe_speculative(unsigned* long bp_vaddr)
2424	{
2425	struct mm_struct *mm = current->mm;
2426	struct uprobe *uprobe = NULL;
2427	struct vm_area_struct *vma;
2428	struct file *vm_file;
2429	loff_t offset;
2430	unsigned int seq;
2431
2432	guard(rcu)();
2433
2434	if (!mmap_lock_speculate_try_begin(mm, seq: &seq))
2435	return NULL;
2436
2437	vma = vma_lookup(mm, addr: bp_vaddr);
2438	if (!vma)
2439	return NULL;
2440
2441	/*
2442	* vm_file memory can be reused for another instance of struct file,
2443	* but can't be freed from under us, so it's safe to read fields from
2444	* it, even if the values are some garbage values; ultimately
2445	* find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
2446	* that whatever we speculatively found is correct
2447	*/
2448	vm_file = READ_ONCE(vma->vm_file);
2449	if (!vm_file)
2450	return NULL;
2451
2452	offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
2453	uprobe = find_uprobe_rcu(inode: vm_file->f_inode, offset);
2454	if (!uprobe)
2455	return NULL;
2456
2457	/ now double check that nothing about MM changed /
2458	if (mmap_lock_speculate_retry(mm, seq))
2459	return NULL;
2460
2461	return uprobe;
2462	}
2463
2464	/ assumes being inside RCU protected region /
2465	static struct uprobe find_active_uprobe_rcu(unsigned* long bp_vaddr, int *is_swbp)
2466	{
2467	struct mm_struct *mm = current->mm;
2468	struct uprobe *uprobe = NULL;
2469	struct vm_area_struct *vma;
2470
2471	uprobe = find_active_uprobe_speculative(bp_vaddr);
2472	if (uprobe)
2473	return uprobe;
2474
2475	mmap_read_lock(mm);
2476	vma = vma_lookup(mm, addr: bp_vaddr);
2477	if (vma) {
2478	if (vma->vm_file) {
2479	struct inode *inode = file_inode(f: vma->vm_file);
2480	loff_t offset = vaddr_to_offset(vma, vaddr: bp_vaddr);
2481
2482	uprobe = find_uprobe_rcu(inode, offset);
2483	}
2484
2485	if (!uprobe)
2486	*is_swbp = is_trap_at_addr(mm, vaddr: bp_vaddr);
2487	} else {
2488	*is_swbp = -EFAULT;
2489	}
2490
2491	if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm))
2492	mmf_recalc_uprobes(mm);
2493	mmap_read_unlock(mm);
2494
2495	return uprobe;
2496	}
2497
2498	static struct return_instance push_consumer(struct* return_instance *ri, __u64 id, __u64 cookie)
2499	{
2500	struct return_consumer *ric;
2501
2502	if (unlikely(ri == ZERO_SIZE_PTR))
2503	return ri;
2504
2505	if (unlikely(ri->cons_cnt > `0`)) {
2506	ric = krealloc(ri->extra_consumers, sizeof(ric) ri->cons_cnt, GFP_KERNEL);
2507	if (!ric) {
2508	ri_free(ri);
2509	return ZERO_SIZE_PTR;
2510	}
2511	ri->extra_consumers = ric;
2512	}
2513
2514	ric = likely(ri->cons_cnt == `0`) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - `1`];
2515	ric->id = id;
2516	ric->cookie = cookie;
2517
2518	ri->cons_cnt++;
2519	return ri;
2520	}
2521
2522	static struct return_consumer *
2523	return_consumer_find(struct return_instance ri, int* iter, int* id)
2524	{
2525	struct return_consumer *ric;
2526	int idx;
2527
2528	for (idx = *iter; idx < ri->cons_cnt; idx++)
2529	{
2530	ric = likely(idx == `0`) ? &ri->consumer : &ri->extra_consumers[idx - `1`];
2531	if (ric->id == id) {
2532	*iter = idx + `1`;
2533	return ric;
2534	}
2535	}
2536
2537	return NULL;
2538	}
2539
2540	static bool ignore_ret_handler(int rc)
2541	{
2542	return rc == UPROBE_HANDLER_REMOVE \|\| rc == UPROBE_HANDLER_IGNORE;
2543	}
2544
2545	static void handler_chain(struct uprobe uprobe, struct* pt_regs *regs)
2546	{
2547	struct uprobe_consumer *uc;
2548	bool has_consumers = false, remove = true;
2549	struct return_instance *ri = NULL;
2550	struct uprobe_task *utask = current->utask;
2551
2552	utask->auprobe = &uprobe->arch;
2553
2554	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
2555	bool session = uc->handler && uc->ret_handler;
2556	__u64 cookie = `0`;
2557	int rc = `0`;
2558
2559	if (uc->handler) {
2560	rc = uc->handler(uc, regs, &cookie);
2561	WARN(rc < `0` \|\| rc > `2`,
2562	"bad rc=0x%x from %ps()\n", rc, uc->handler);
2563	}
2564
2565	remove &= rc == UPROBE_HANDLER_REMOVE;
2566	has_consumers = true;
2567
2568	if (!uc->ret_handler \|\| ignore_ret_handler(rc))
2569	continue;
2570
2571	if (!ri)
2572	ri = alloc_return_instance(utask);
2573
2574	if (session)
2575	ri = push_consumer(ri, id: uc->id, cookie);
2576	}
2577	utask->auprobe = NULL;
2578
2579	if (!ZERO_OR_NULL_PTR(ri))
2580	prepare_uretprobe(uprobe, regs, ri);
2581
2582	if (remove && has_consumers) {
2583	down_read(sem: &uprobe->register_rwsem);
2584
2585	/ re-check that removal is still required, this time under lock /
2586	if (!filter_chain(uprobe, current->mm)) {
2587	WARN_ON(!uprobe_is_active(uprobe));
2588	unapply_uprobe(uprobe, current->mm);
2589	}
2590
2591	up_read(sem: &uprobe->register_rwsem);
2592	}
2593	}
2594
2595	static void
2596	handle_uretprobe_chain(struct return_instance ri, struct* uprobe uprobe, struct* pt_regs *regs)
2597	{
2598	struct return_consumer *ric;
2599	struct uprobe_consumer *uc;
2600	int ric_idx = `0`;
2601
2602	/ all consumers unsubscribed meanwhile /
2603	if (unlikely(!uprobe))
2604	return;
2605
2606	rcu_read_lock_trace();
2607	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
2608	bool session = uc->handler && uc->ret_handler;
2609
2610	if (uc->ret_handler) {
2611	ric = return_consumer_find(ri, iter: &ric_idx, id: uc->id);
2612	if (!session \|\| ric)
2613	uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
2614	}
2615	}
2616	rcu_read_unlock_trace();
2617	}
2618
2619	static struct return_instance find_next_ret_chain(struct* return_instance *ri)
2620	{
2621	bool chained;
2622
2623	do {
2624	chained = ri->chained;
2625	ri = ri->next; / can't be NULL if chained /
2626	} while (chained);
2627
2628	return ri;
2629	}
2630
2631	void uprobe_handle_trampoline(struct pt_regs *regs)
2632	{
2633	struct uprobe_task *utask;
2634	struct return_instance ri, ri_next, *next_chain;
2635	struct uprobe *uprobe;
2636	enum hprobe_state hstate;
2637	bool valid;
2638
2639	utask = current->utask;
2640	if (!utask)
2641	goto sigill;
2642
2643	ri = utask->return_instances;
2644	if (!ri)
2645	goto sigill;
2646
2647	do {
2648	/*
2649	* We should throw out the frames invalidated by longjmp().
2650	* If this chain is valid, then the next one should be alive
2651	* or NULL; the latter case means that nobody but ri->func
2652	* could hit this trampoline on return. TODO: sigaltstack().
2653	*/
2654	next_chain = find_next_ret_chain(ri);
2655	valid = !next_chain \|\| arch_uretprobe_is_alive(ret: next_chain, ctx: RP_CHECK_RET, regs);
2656
2657	instruction_pointer_set(regs, val: ri->orig_ret_vaddr);
2658	do {
2659	/ pop current instance from the stack of pending return instances,*
2660	* as it's not pending anymore: we just fixed up original
2661	* instruction pointer in regs and are about to call handlers;
2662	* this allows fixup_uretprobe_trampoline_entries() to properly fix up
2663	* captured stack traces from uretprobe handlers, in which pending
2664	* trampoline addresses on the stack are replaced with correct
2665	* original return addresses
2666	*/
2667	ri_next = ri->next;
2668	rcu_assign_pointer(utask->return_instances, ri_next);
2669	utask->depth--;
2670
2671	uprobe = hprobe_consume(hprobe: &ri->hprobe, hstate: &hstate);
2672	if (valid)
2673	handle_uretprobe_chain(ri, uprobe, regs);
2674	hprobe_finalize(hprobe: &ri->hprobe, hstate);
2675
2676	/ We already took care of hprobe, no need to waste more time on that. /
2677	free_ret_instance(utask, ri, cleanup_hprobe: false / !cleanup_hprobe /);
2678	ri = ri_next;
2679	} while (ri != next_chain);
2680	} while (!valid);
2681
2682	return;
2683
2684	sigill:
2685	uprobe_warn(current, msg: "handle uretprobe, sending SIGILL.");
2686	force_sig(SIGILL);
2687	}
2688
2689	bool __weak arch_uprobe_ignore(struct arch_uprobe aup, struct* pt_regs *regs)
2690	{
2691	return false;
2692	}
2693
2694	bool __weak arch_uretprobe_is_alive(struct return_instance ret, enum* rp_check ctx,
2695	struct pt_regs *regs)
2696	{
2697	return true;
2698	}
2699
2700	void __weak arch_uprobe_optimize(struct arch_uprobe auprobe, unsigned* long vaddr)
2701	{
2702	}
2703
2704	/*
2705	* Run handler and ask thread to singlestep.
2706	* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
2707	*/
2708	static void handle_swbp(struct pt_regs *regs)
2709	{
2710	struct uprobe *uprobe;
2711	unsigned long bp_vaddr;
2712	int is_swbp;
2713
2714	bp_vaddr = uprobe_get_swbp_addr(regs);
2715	if (bp_vaddr == uprobe_get_trampoline_vaddr())
2716	return uprobe_handle_trampoline(regs);
2717
2718	rcu_read_lock_trace();
2719
2720	uprobe = find_active_uprobe_rcu(bp_vaddr, is_swbp: &is_swbp);
2721	if (!uprobe) {
2722	if (is_swbp > `0`) {
2723	/ No matching uprobe; signal SIGTRAP. /
2724	force_sig(SIGTRAP);
2725	} else {
2726	/*
2727	* Either we raced with uprobe_unregister() or we can't
2728	* access this memory. The latter is only possible if
2729	* another thread plays with our ->mm. In both cases
2730	* we can simply restart. If this vma was unmapped we
2731	* can pretend this insn was not executed yet and get
2732	* the (correct) SIGSEGV after restart.
2733	*/
2734	instruction_pointer_set(regs, val: bp_vaddr);
2735	}
2736	goto out;
2737	}
2738
2739	/ change it in advance for ->handler() and restart /
2740	instruction_pointer_set(regs, val: bp_vaddr);
2741
2742	/*
2743	* TODO: move copy_insn/etc into _register and remove this hack.
2744	* After we hit the bp, _unregister + _register can install the
2745	* new and not-yet-analyzed uprobe at the same address, restart.
2746	*/
2747	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
2748	goto out;
2749
2750	/*
2751	* Pairs with the smp_wmb() in prepare_uprobe().
2752	*
2753	* Guarantees that if we see the UPROBE_COPY_INSN bit set, then
2754	* we must also see the stores to &uprobe->arch performed by the
2755	* prepare_uprobe() call.
2756	*/
2757	smp_rmb();
2758
2759	/ Tracing handlers use ->utask to communicate with fetch methods /
2760	if (!get_utask())
2761	goto out;
2762
2763	if (arch_uprobe_ignore(aup: &uprobe->arch, regs))
2764	goto out;
2765
2766	handler_chain(uprobe, regs);
2767
2768	/ Try to optimize after first hit. /
2769	arch_uprobe_optimize(auprobe: &uprobe->arch, vaddr: bp_vaddr);
2770
2771	/*
2772	* If user decided to take execution elsewhere, it makes little sense
2773	* to execute the original instruction, so let's skip it.
2774	*/
2775	if (instruction_pointer(regs) != bp_vaddr)
2776	goto out;
2777
2778	if (arch_uprobe_skip_sstep(aup: &uprobe->arch, regs))
2779	goto out;
2780
2781	if (pre_ssout(uprobe, regs, bp_vaddr))
2782	goto out;
2783
2784	out:
2785	/ arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep /
2786	rcu_read_unlock_trace();
2787	}
2788
2789	void handle_syscall_uprobe(struct pt_regs regs, unsigned* long bp_vaddr)
2790	{
2791	struct uprobe *uprobe;
2792	int is_swbp;
2793
2794	guard(rcu_tasks_trace)();
2795
2796	uprobe = find_active_uprobe_rcu(bp_vaddr, is_swbp: &is_swbp);
2797	if (!uprobe)
2798	return;
2799	if (!get_utask())
2800	return;
2801	if (arch_uprobe_ignore(aup: &uprobe->arch, regs))
2802	return;
2803	handler_chain(uprobe, regs);
2804	}
2805
2806	/*
2807	* Perform required fix-ups and disable singlestep.
2808	* Allow pending signals to take effect.
2809	*/
2810	static void handle_singlestep(struct uprobe_task utask, struct* pt_regs *regs)
2811	{
2812	struct uprobe *uprobe;
2813	int err = `0`;
2814
2815	uprobe = utask->active_uprobe;
2816	if (utask->state == UTASK_SSTEP_ACK)
2817	err = arch_uprobe_post_xol(aup: &uprobe->arch, regs);
2818	else if (utask->state == UTASK_SSTEP_TRAPPED)
2819	arch_uprobe_abort_xol(aup: &uprobe->arch, regs);
2820	else
2821	WARN_ON_ONCE(`1`);
2822
2823	put_uprobe(uprobe);
2824	utask->active_uprobe = NULL;
2825	utask->state = UTASK_RUNNING;
2826	xol_free_insn_slot(utask);
2827
2828	if (utask->signal_denied) {
2829	set_thread_flag(TIF_SIGPENDING);
2830	utask->signal_denied = false;
2831	}
2832
2833	if (unlikely(err)) {
2834	uprobe_warn(current, msg: "execute the probed insn, sending SIGILL.");
2835	force_sig(SIGILL);
2836	}
2837	}
2838
2839	/*
2840	* On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
2841	* allows the thread to return from interrupt. After that handle_swbp()
2842	* sets utask->active_uprobe.
2843	*
2844	* On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
2845	* and allows the thread to return from interrupt.
2846	*
2847	* While returning to userspace, thread notices the TIF_UPROBE flag and calls
2848	* uprobe_notify_resume().
2849	*/
2850	void uprobe_notify_resume(struct pt_regs *regs)
2851	{
2852	struct uprobe_task *utask;
2853
2854	clear_thread_flag(TIF_UPROBE);
2855
2856	utask = current->utask;
2857	if (utask && utask->active_uprobe)
2858	handle_singlestep(utask, regs);
2859	else
2860	handle_swbp(regs);
2861	}
2862
2863	/*
2864	* uprobe_pre_sstep_notifier gets called from interrupt context as part of
2865	* notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
2866	*/
2867	int uprobe_pre_sstep_notifier(struct pt_regs *regs)
2868	{
2869	if (!current->mm)
2870	return `0`;
2871
2872	if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
2873	(!current->utask \|\| !current->utask->return_instances))
2874	return `0`;
2875
2876	set_thread_flag(TIF_UPROBE);
2877	return `1`;
2878	}
2879
2880	/*
2881	* uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
2882	* mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
2883	*/
2884	int uprobe_post_sstep_notifier(struct pt_regs *regs)
2885	{
2886	struct uprobe_task *utask = current->utask;
2887
2888	if (!current->mm \|\| !utask \|\| !utask->active_uprobe)
2889	/ task is currently not uprobed /
2890	return `0`;
2891
2892	utask->state = UTASK_SSTEP_ACK;
2893	set_thread_flag(TIF_UPROBE);
2894	return `1`;
2895	}
2896
2897	static struct notifier_block uprobe_exception_nb = {
2898	.notifier_call = arch_uprobe_exception_notify,
2899	.priority = INT_MAX-`1`, / notified after kprobes, kgdb /
2900	};
2901
2902	void __init uprobes_init(void)
2903	{
2904	int i;
2905
2906	for (i = `0`; i < UPROBES_HASH_SZ; i++)
2907	mutex_init(&uprobes_mmap_mutex[i]);
2908
2909	BUG_ON(register_die_notifier(&uprobe_exception_nb));
2910	}
2911

source code of linux/kernel/events/uprobes.c