userfaultfd.c source code [linux/mm/userfaultfd.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* mm/userfaultfd.c
4	*
5	* Copyright (C) 2015 Red Hat, Inc.
6	*/
7
8	#include <linux/mm.h>
9	#include <linux/sched/signal.h>
10	#include <linux/pagemap.h>
11	#include <linux/rmap.h>
12	#include <linux/swap.h>
13	#include <linux/leafops.h>
14	#include <linux/userfaultfd_k.h>
15	#include <linux/mmu_notifier.h>
16	#include <linux/hugetlb.h>
17	#include <linux/shmem_fs.h>
18	#include <asm/tlbflush.h>
19	#include <asm/tlb.h>
20	#include "internal.h"
21	#include "swap.h"
22
23	static __always_inline
24	bool validate_dst_vma(struct vm_area_struct dst_vma, unsigned* long dst_end)
25	{
26	/ Make sure that the dst range is fully within dst_vma. /
27	if (dst_end > dst_vma->vm_end)
28	return false;
29
30	/*
31	* Check the vma is registered in uffd, this is required to
32	* enforce the VM_MAYWRITE check done at uffd registration
33	* time.
34	*/
35	if (!dst_vma->vm_userfaultfd_ctx.ctx)
36	return false;
37
38	return true;
39	}
40
41	static __always_inline
42	struct vm_area_struct find_vma_and_prepare_anon(struct* mm_struct *mm,
43	unsigned long addr)
44	{
45	struct vm_area_struct *vma;
46
47	mmap_assert_locked(mm);
48	vma = vma_lookup(mm, addr);
49	if (!vma)
50	vma = ERR_PTR(error: -ENOENT);
51	else if (!(vma->vm_flags & VM_SHARED) &&
52	unlikely(anon_vma_prepare(vma)))
53	vma = ERR_PTR(error: -ENOMEM);
54
55	return vma;
56	}
57
58	#ifdef CONFIG_PER_VMA_LOCK
59	/*
60	* uffd_lock_vma() - Lookup and lock vma corresponding to @address.
61	* @mm: mm to search vma in.
62	* @address: address that the vma should contain.
63	*
64	* Should be called without holding mmap_lock.
65	*
66	* Return: A locked vma containing @address, -ENOENT if no vma is found, or
67	* -ENOMEM if anon_vma couldn't be allocated.
68	*/
69	static struct vm_area_struct uffd_lock_vma(struct* mm_struct *mm,
70	unsigned long address)
71	{
72	struct vm_area_struct *vma;
73
74	vma = lock_vma_under_rcu(mm, address);
75	if (vma) {
76	/*
77	* We know we're going to need to use anon_vma, so check
78	* that early.
79	*/
80	if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
81	vma_end_read(vma);
82	else
83	return vma;
84	}
85
86	mmap_read_lock(mm);
87	vma = find_vma_and_prepare_anon(mm, addr: address);
88	if (!IS_ERR(ptr: vma)) {
89	bool locked = vma_start_read_locked(vma);
90
91	if (!locked)
92	vma = ERR_PTR(error: -EAGAIN);
93	}
94
95	mmap_read_unlock(mm);
96	return vma;
97	}
98
99	static struct vm_area_struct uffd_mfill_lock(struct* mm_struct *dst_mm,
100	unsigned long dst_start,
101	unsigned long len)
102	{
103	struct vm_area_struct *dst_vma;
104
105	dst_vma = uffd_lock_vma(mm: dst_mm, address: dst_start);
106	if (IS_ERR(ptr: dst_vma) \|\| validate_dst_vma(dst_vma, dst_end: dst_start + len))
107	return dst_vma;
108
109	vma_end_read(vma: dst_vma);
110	return ERR_PTR(error: -ENOENT);
111	}
112
/* Release the vma read lock (per-VMA-lock variant of the mfill unlock). */
static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}
117
118	#else
119
120	static struct vm_area_struct uffd_mfill_lock(struct* mm_struct *dst_mm,
121	unsigned long dst_start,
122	unsigned long len)
123	{
124	struct vm_area_struct *dst_vma;
125
126	mmap_read_lock(dst_mm);
127	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
128	if (IS_ERR(dst_vma))
129	goto out_unlock;
130
131	if (validate_dst_vma(dst_vma, dst_start + len))
132	return dst_vma;
133
134	dst_vma = ERR_PTR(-ENOENT);
135	out_unlock:
136	mmap_read_unlock(dst_mm);
137	return dst_vma;
138	}
139
140	static void uffd_mfill_unlock(struct vm_area_struct *vma)
141	{
142	mmap_read_unlock(vma->vm_mm);
143	}
144	#endif
145
146	/ Check if dst_addr is outside of file's size. Must be called with ptl held. /
147	static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
148	unsigned long dst_addr)
149	{
150	struct inode *inode;
151	pgoff_t offset, max_off;
152
153	if (!dst_vma->vm_file)
154	return false;
155
156	inode = dst_vma->vm_file->f_inode;
157	offset = linear_page_index(vma: dst_vma, address: dst_addr);
158	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
159	return offset >= max_off;
160	}
161
162	/*
163	* Install PTEs, to map dst_addr (within dst_vma) to page.
164	*
165	* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
166	* and anon, and for both shared and private VMAs.
167	*/
168	int mfill_atomic_install_pte(pmd_t *dst_pmd,
169	struct vm_area_struct *dst_vma,
170	unsigned long dst_addr, struct page *page,
171	bool newly_allocated, uffd_flags_t flags)
172	{
173	int ret;
174	struct mm_struct *dst_mm = dst_vma->vm_mm;
175	pte_t _dst_pte, *dst_pte;
176	bool writable = dst_vma->vm_flags & VM_WRITE;
177	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
178	spinlock_t *ptl;
179	struct folio *folio = page_folio(page);
180	bool page_in_cache = folio_mapping(folio);
181	pte_t dst_ptep;
182
183	_dst_pte = mk_pte(page, pgprot: dst_vma->vm_page_prot);
184	_dst_pte = pte_mkdirty(pte: _dst_pte);
185	if (page_in_cache && !vm_shared)
186	writable = false;
187	if (writable)
188	_dst_pte = pte_mkwrite(pte: _dst_pte, vma: dst_vma);
189	if (flags & MFILL_ATOMIC_WP)
190	_dst_pte = pte_mkuffd_wp(pte: _dst_pte);
191
192	ret = -EAGAIN;
193	dst_pte = pte_offset_map_lock(mm: dst_mm, pmd: dst_pmd, addr: dst_addr, ptlp: &ptl);
194	if (!dst_pte)
195	goto out;
196
197	if (mfill_file_over_size(dst_vma, dst_addr)) {
198	ret = -EFAULT;
199	goto out_unlock;
200	}
201
202	ret = -EEXIST;
203
204	dst_ptep = ptep_get(ptep: dst_pte);
205
206	/*
207	* We are allowed to overwrite a UFFD pte marker: consider when both
208	* MISSING\|WP registered, we firstly wr-protect a none pte which has no
209	* page cache page backing it, then access the page.
210	*/
211	if (!pte_none(pte: dst_ptep) && !pte_is_uffd_marker(pte: dst_ptep))
212	goto out_unlock;
213
214	if (page_in_cache) {
215	/ Usually, cache pages are already added to LRU /
216	if (newly_allocated)
217	folio_add_lru(folio);
218	folio_add_file_rmap_pte(folio, page, dst_vma);
219	} else {
220	folio_add_new_anon_rmap(folio, dst_vma, address: dst_addr, RMAP_EXCLUSIVE);
221	folio_add_lru_vma(folio, dst_vma);
222	}
223
224	/*
225	* Must happen after rmap, as mm_counter() checks mapping (via
226	* PageAnon()), which is set by __page_set_anon_rmap().
227	*/
228	inc_mm_counter(mm: dst_mm, member: mm_counter(folio));
229
230	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
231
232	/ No need to invalidate - it was non-present before /
233	update_mmu_cache(vma: dst_vma, addr: dst_addr, ptep: dst_pte);
234	ret = `0`;
235	out_unlock:
236	pte_unmap_unlock(dst_pte, ptl);
237	out:
238	return ret;
239	}
240
241	static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
242	struct vm_area_struct *dst_vma,
243	unsigned long dst_addr,
244	unsigned long src_addr,
245	uffd_flags_t flags,
246	struct folio **foliop)
247	{
248	void *kaddr;
249	int ret;
250	struct folio *folio;
251
252	if (!*foliop) {
253	ret = -ENOMEM;
254	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, `0`, dst_vma,
255	dst_addr);
256	if (!folio)
257	goto out;
258
259	kaddr = kmap_local_folio(folio, offset: `0`);
260	/*
261	* The read mmap_lock is held here. Despite the
262	* mmap_lock being read recursive a deadlock is still
263	* possible if a writer has taken a lock. For example:
264	*
265	* process A thread 1 takes read lock on own mmap_lock
266	* process A thread 2 calls mmap, blocks taking write lock
267	* process B thread 1 takes page fault, read lock on own mmap lock
268	* process B thread 2 calls mmap, blocks taking write lock
269	* process A thread 1 blocks taking read lock on process B
270	* process B thread 1 blocks taking read lock on process A
271	*
272	* Disable page faults to prevent potential deadlock
273	* and retry the copy outside the mmap_lock.
274	*/
275	pagefault_disable();
276	ret = copy_from_user(to: kaddr, from: (const void __user *) src_addr,
277	PAGE_SIZE);
278	pagefault_enable();
279	kunmap_local(kaddr);
280
281	/ fallback to copy_from_user outside mmap_lock /
282	if (unlikely(ret)) {
283	ret = -ENOENT;
284	*foliop = folio;
285	/ don't free the page /
286	goto out;
287	}
288
289	flush_dcache_folio(folio);
290	} else {
291	folio = *foliop;
292	*foliop = NULL;
293	}
294
295	/*
296	* The memory barrier inside __folio_mark_uptodate makes sure that
297	* preceding stores to the page contents become visible before
298	* the set_pte_at() write.
299	*/
300	__folio_mark_uptodate(folio);
301
302	ret = -ENOMEM;
303	if (mem_cgroup_charge(folio, mm: dst_vma->vm_mm, GFP_KERNEL))
304	goto out_release;
305
306	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
307	page: &folio->page, newly_allocated: true, flags);
308	if (ret)
309	goto out_release;
310	out:
311	return ret;
312	out_release:
313	folio_put(folio);
314	goto out;
315	}
316
317	static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
318	struct vm_area_struct *dst_vma,
319	unsigned long dst_addr)
320	{
321	struct folio *folio;
322	int ret = -ENOMEM;
323
324	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
325	if (!folio)
326	return ret;
327
328	if (mem_cgroup_charge(folio, mm: dst_vma->vm_mm, GFP_KERNEL))
329	goto out_put;
330
331	/*
332	* The memory barrier inside __folio_mark_uptodate makes sure that
333	* zeroing out the folio become visible before mapping the page
334	* using set_pte_at(). See do_anonymous_page().
335	*/
336	__folio_mark_uptodate(folio);
337
338	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
339	page: &folio->page, newly_allocated: true, flags: `0`);
340	if (ret)
341	goto out_put;
342
343	return `0`;
344	out_put:
345	folio_put(folio);
346	return ret;
347	}
348
349	static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
350	struct vm_area_struct *dst_vma,
351	unsigned long dst_addr)
352	{
353	pte_t _dst_pte, *dst_pte;
354	spinlock_t *ptl;
355	int ret;
356
357	if (mm_forbids_zeropage(dst_vma->vm_mm))
358	return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
359
360	_dst_pte = pte_mkspecial(pte: pfn_pte(page_nr: my_zero_pfn(addr: dst_addr),
361	pgprot: dst_vma->vm_page_prot));
362	ret = -EAGAIN;
363	dst_pte = pte_offset_map_lock(mm: dst_vma->vm_mm, pmd: dst_pmd, addr: dst_addr, ptlp: &ptl);
364	if (!dst_pte)
365	goto out;
366	if (mfill_file_over_size(dst_vma, dst_addr)) {
367	ret = -EFAULT;
368	goto out_unlock;
369	}
370	ret = -EEXIST;
371	if (!pte_none(pte: ptep_get(ptep: dst_pte)))
372	goto out_unlock;
373	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
374	/ No need to invalidate - it was non-present before /
375	update_mmu_cache(vma: dst_vma, addr: dst_addr, ptep: dst_pte);
376	ret = `0`;
377	out_unlock:
378	pte_unmap_unlock(dst_pte, ptl);
379	out:
380	return ret;
381	}
382
383	/ Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). /
384	static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
385	struct vm_area_struct *dst_vma,
386	unsigned long dst_addr,
387	uffd_flags_t flags)
388	{
389	struct inode *inode = file_inode(f: dst_vma->vm_file);
390	pgoff_t pgoff = linear_page_index(vma: dst_vma, address: dst_addr);
391	struct folio *folio;
392	struct page *page;
393	int ret;
394
395	ret = shmem_get_folio(inode, index: pgoff, write_end: `0`, foliop: &folio, sgp: SGP_NOALLOC);
396	/ Our caller expects us to return -EFAULT if we failed to find folio /
397	if (ret == -ENOENT)
398	ret = -EFAULT;
399	if (ret)
400	goto out;
401	if (!folio) {
402	ret = -EFAULT;
403	goto out;
404	}
405
406	page = folio_file_page(folio, index: pgoff);
407	if (PageHWPoison(page)) {
408	ret = -EIO;
409	goto out_release;
410	}
411
412	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
413	page, newly_allocated: false, flags);
414	if (ret)
415	goto out_release;
416
417	folio_unlock(folio);
418	ret = `0`;
419	out:
420	return ret;
421	out_release:
422	folio_unlock(folio);
423	folio_put(folio);
424	goto out;
425	}
426
427	/ Handles UFFDIO_POISON for all non-hugetlb VMAs. /
428	static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
429	struct vm_area_struct *dst_vma,
430	unsigned long dst_addr,
431	uffd_flags_t flags)
432	{
433	int ret;
434	struct mm_struct *dst_mm = dst_vma->vm_mm;
435	pte_t _dst_pte, *dst_pte;
436	spinlock_t *ptl;
437
438	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
439	ret = -EAGAIN;
440	dst_pte = pte_offset_map_lock(mm: dst_mm, pmd: dst_pmd, addr: dst_addr, ptlp: &ptl);
441	if (!dst_pte)
442	goto out;
443
444	if (mfill_file_over_size(dst_vma, dst_addr)) {
445	ret = -EFAULT;
446	goto out_unlock;
447	}
448
449	ret = -EEXIST;
450	/ Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). /
451	if (!pte_none(pte: ptep_get(ptep: dst_pte)))
452	goto out_unlock;
453
454	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
455
456	/ No need to invalidate - it was non-present before /
457	update_mmu_cache(vma: dst_vma, addr: dst_addr, ptep: dst_pte);
458	ret = `0`;
459	out_unlock:
460	pte_unmap_unlock(dst_pte, ptl);
461	out:
462	return ret;
463	}
464
465	static pmd_t mm_alloc_pmd(struct* mm_struct mm, unsigned* long address)
466	{
467	pgd_t *pgd;
468	p4d_t *p4d;
469	pud_t *pud;
470
471	pgd = pgd_offset(mm, address);
472	p4d = p4d_alloc(mm, pgd, address);
473	if (!p4d)
474	return NULL;
475	pud = pud_alloc(mm, p4d, address);
476	if (!pud)
477	return NULL;
478	/*
479	* Note that we didn't run this because the pmd was
480	* missing, the *pmd may be already established and in
481	* turn it may also be a trans_huge_pmd.
482	*/
483	return pmd_alloc(mm, pud, address);
484	}
485
486	#ifdef CONFIG_HUGETLB_PAGE
487	/*
488	* mfill_atomic processing for HUGETLB vmas. Note that this routine is
489	* called with either vma-lock or mmap_lock held, it will release the lock
490	* before returning.
491	*/
492	static __always_inline ssize_t mfill_atomic_hugetlb(
493	struct userfaultfd_ctx *ctx,
494	struct vm_area_struct *dst_vma,
495	unsigned long dst_start,
496	unsigned long src_start,
497	unsigned long len,
498	uffd_flags_t flags)
499	{
500	struct mm_struct *dst_mm = dst_vma->vm_mm;
501	ssize_t err;
502	pte_t *dst_pte;
503	unsigned long src_addr, dst_addr;
504	long copied;
505	struct folio *folio;
506	unsigned long vma_hpagesize;
507	pgoff_t idx;
508	u32 hash;
509	struct address_space *mapping;
510
511	/*
512	* There is no default zero huge page for all huge page sizes as
513	* supported by hugetlb. A PMD_SIZE huge pages may exist as used
514	* by THP. Since we can not reliably insert a zero page, this
515	* feature is not supported.
516	*/
517	if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_ZEROPAGE)) {
518	up_read(sem: &ctx->map_changing_lock);
519	uffd_mfill_unlock(vma: dst_vma);
520	return -EINVAL;
521	}
522
523	src_addr = src_start;
524	dst_addr = dst_start;
525	copied = `0`;
526	folio = NULL;
527	vma_hpagesize = vma_kernel_pagesize(vma: dst_vma);
528
529	/*
530	* Validate alignment based on huge page size
531	*/
532	err = -EINVAL;
533	if (dst_start & (vma_hpagesize - `1`) \|\| len & (vma_hpagesize - `1`))
534	goto out_unlock;
535
536	retry:
537	/*
538	* On routine entry dst_vma is set. If we had to drop mmap_lock and
539	* retry, dst_vma will be set to NULL and we must lookup again.
540	*/
541	if (!dst_vma) {
542	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
543	if (IS_ERR(ptr: dst_vma)) {
544	err = PTR_ERR(ptr: dst_vma);
545	goto out;
546	}
547
548	err = -ENOENT;
549	if (!is_vm_hugetlb_page(vma: dst_vma))
550	goto out_unlock_vma;
551
552	err = -EINVAL;
553	if (vma_hpagesize != vma_kernel_pagesize(vma: dst_vma))
554	goto out_unlock_vma;
555
556	/*
557	* If memory mappings are changing because of non-cooperative
558	* operation (e.g. mremap) running in parallel, bail out and
559	* request the user to retry later
560	*/
561	down_read(sem: &ctx->map_changing_lock);
562	err = -EAGAIN;
563	if (atomic_read(v: &ctx->mmap_changing))
564	goto out_unlock;
565	}
566
567	while (src_addr < src_start + len) {
568	VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
569
570	/*
571	* Serialize via vma_lock and hugetlb_fault_mutex.
572	* vma_lock ensures the dst_pte remains valid even
573	* in the case of shared pmds. fault mutex prevents
574	* races with other faulting threads.
575	*/
576	idx = linear_page_index(vma: dst_vma, address: dst_addr);
577	mapping = dst_vma->vm_file->f_mapping;
578	hash = hugetlb_fault_mutex_hash(mapping, idx);
579	mutex_lock(&hugetlb_fault_mutex_table[hash]);
580	hugetlb_vma_lock_read(vma: dst_vma);
581
582	err = -ENOMEM;
583	dst_pte = huge_pte_alloc(mm: dst_mm, vma: dst_vma, addr: dst_addr, sz: vma_hpagesize);
584	if (!dst_pte) {
585	hugetlb_vma_unlock_read(vma: dst_vma);
586	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
587	goto out_unlock;
588	}
589
590	if (!uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_CONTINUE)) {
591	const pte_t ptep = huge_ptep_get(mm: dst_mm, addr: dst_addr, ptep: dst_pte);
592
593	if (!huge_pte_none(pte: ptep) && !pte_is_uffd_marker(pte: ptep)) {
594	err = -EEXIST;
595	hugetlb_vma_unlock_read(vma: dst_vma);
596	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
597	goto out_unlock;
598	}
599	}
600
601	err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
602	src_addr, flags, foliop: &folio);
603
604	hugetlb_vma_unlock_read(vma: dst_vma);
605	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
606
607	cond_resched();
608
609	if (unlikely(err == -ENOENT)) {
610	up_read(sem: &ctx->map_changing_lock);
611	uffd_mfill_unlock(vma: dst_vma);
612	VM_WARN_ON_ONCE(!folio);
613
614	err = copy_folio_from_user(dst_folio: folio,
615	usr_src: (const void __user *)src_addr, allow_pagefault: true);
616	if (unlikely(err)) {
617	err = -EFAULT;
618	goto out;
619	}
620
621	dst_vma = NULL;
622	goto retry;
623	} else
624	VM_WARN_ON_ONCE(folio);
625
626	if (!err) {
627	dst_addr += vma_hpagesize;
628	src_addr += vma_hpagesize;
629	copied += vma_hpagesize;
630
631	if (fatal_signal_pending(current))
632	err = -EINTR;
633	}
634	if (err)
635	break;
636	}
637
638	out_unlock:
639	up_read(sem: &ctx->map_changing_lock);
640	out_unlock_vma:
641	uffd_mfill_unlock(vma: dst_vma);
642	out:
643	if (folio)
644	folio_put(folio);
645	VM_WARN_ON_ONCE(copied < `0`);
646	VM_WARN_ON_ONCE(err > `0`);
647	VM_WARN_ON_ONCE(!copied && !err);
648	return copied ? copied : err;
649	}
650	#else /* !CONFIG_HUGETLB_PAGE */
651	/ fail at build time if gcc attempts to use this /
652	extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
653	struct vm_area_struct *dst_vma,
654	unsigned long dst_start,
655	unsigned long src_start,
656	unsigned long len,
657	uffd_flags_t flags);
658	#endif /* CONFIG_HUGETLB_PAGE */
659
660	static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
661	struct vm_area_struct *dst_vma,
662	unsigned long dst_addr,
663	unsigned long src_addr,
664	uffd_flags_t flags,
665	struct folio **foliop)
666	{
667	ssize_t err;
668
669	if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_CONTINUE)) {
670	return mfill_atomic_pte_continue(dst_pmd, dst_vma,
671	dst_addr, flags);
672	} else if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_POISON)) {
673	return mfill_atomic_pte_poison(dst_pmd, dst_vma,
674	dst_addr, flags);
675	}
676
677	/*
678	* The normal page fault path for a shmem will invoke the
679	* fault, fill the hole in the file and COW it right away. The
680	* result generates plain anonymous memory. So when we are
681	* asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
682	* generate anonymous memory directly without actually filling
683	* the hole. For the MAP_PRIVATE case the robustness check
684	* only happens in the pagetable (to verify it's still none)
685	* and not in the radix tree.
686	*/
687	if (!(dst_vma->vm_flags & VM_SHARED)) {
688	if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_COPY))
689	err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
690	dst_addr, src_addr,
691	flags, foliop);
692	else
693	err = mfill_atomic_pte_zeropage(dst_pmd,
694	dst_vma, dst_addr);
695	} else {
696	err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
697	dst_addr, src_addr,
698	flags, foliop);
699	}
700
701	return err;
702	}
703
704	static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
705	unsigned long dst_start,
706	unsigned long src_start,
707	unsigned long len,
708	uffd_flags_t flags)
709	{
710	struct mm_struct *dst_mm = ctx->mm;
711	struct vm_area_struct *dst_vma;
712	ssize_t err;
713	pmd_t *dst_pmd;
714	unsigned long src_addr, dst_addr;
715	long copied;
716	struct folio *folio;
717
718	/*
719	* Sanitize the command parameters:
720	*/
721	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
722	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
723
724	/ Does the address range wrap, or is the span zero-sized? /
725	VM_WARN_ON_ONCE(src_start + len <= src_start);
726	VM_WARN_ON_ONCE(dst_start + len <= dst_start);
727
728	src_addr = src_start;
729	dst_addr = dst_start;
730	copied = `0`;
731	folio = NULL;
732	retry:
733	/*
734	* Make sure the vma is not shared, that the dst range is
735	* both valid and fully within a single existing vma.
736	*/
737	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
738	if (IS_ERR(ptr: dst_vma)) {
739	err = PTR_ERR(ptr: dst_vma);
740	goto out;
741	}
742
743	/*
744	* If memory mappings are changing because of non-cooperative
745	* operation (e.g. mremap) running in parallel, bail out and
746	* request the user to retry later
747	*/
748	down_read(sem: &ctx->map_changing_lock);
749	err = -EAGAIN;
750	if (atomic_read(v: &ctx->mmap_changing))
751	goto out_unlock;
752
753	err = -EINVAL;
754	/*
755	* shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS\|MAP_SHARED but
756	* it will overwrite vm_ops, so vma_is_anonymous must return false.
757	*/
758	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
759	dst_vma->vm_flags & VM_SHARED))
760	goto out_unlock;
761
762	/*
763	* validate 'mode' now that we know the dst_vma: don't allow
764	* a wrprotect copy if the userfaultfd didn't register as WP.
765	*/
766	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
767	goto out_unlock;
768
769	/*
770	* If this is a HUGETLB vma, pass off to appropriate routine
771	*/
772	if (is_vm_hugetlb_page(vma: dst_vma))
773	return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
774	src_start, len, flags);
775
776	if (!vma_is_anonymous(vma: dst_vma) && !vma_is_shmem(vma: dst_vma))
777	goto out_unlock;
778	if (!vma_is_shmem(vma: dst_vma) &&
779	uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_CONTINUE))
780	goto out_unlock;
781
782	while (src_addr < src_start + len) {
783	pmd_t dst_pmdval;
784
785	VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
786
787	dst_pmd = mm_alloc_pmd(mm: dst_mm, address: dst_addr);
788	if (unlikely(!dst_pmd)) {
789	err = -ENOMEM;
790	break;
791	}
792
793	dst_pmdval = pmdp_get_lockless(pmdp: dst_pmd);
794	if (unlikely(pmd_none(dst_pmdval)) &&
795	unlikely(__pte_alloc(dst_mm, dst_pmd))) {
796	err = -ENOMEM;
797	break;
798	}
799	dst_pmdval = pmdp_get_lockless(pmdp: dst_pmd);
800	/*
801	* If the dst_pmd is THP don't override it and just be strict.
802	* (This includes the case where the PMD used to be THP and
803	* changed back to none after __pte_alloc().)
804	*/
805	if (unlikely(!pmd_present(dst_pmdval) \|\|
806	pmd_trans_huge(dst_pmdval))) {
807	err = -EEXIST;
808	break;
809	}
810	if (unlikely(pmd_bad(dst_pmdval))) {
811	err = -EFAULT;
812	break;
813	}
814	/*
815	* For shmem mappings, khugepaged is allowed to remove page
816	* tables under us; pte_offset_map_lock() will deal with that.
817	*/
818
819	err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
820	src_addr, flags, foliop: &folio);
821	cond_resched();
822
823	if (unlikely(err == -ENOENT)) {
824	void *kaddr;
825
826	up_read(sem: &ctx->map_changing_lock);
827	uffd_mfill_unlock(vma: dst_vma);
828	VM_WARN_ON_ONCE(!folio);
829
830	kaddr = kmap_local_folio(folio, offset: `0`);
831	err = copy_from_user(to: kaddr,
832	from: (const void __user *) src_addr,
833	PAGE_SIZE);
834	kunmap_local(kaddr);
835	if (unlikely(err)) {
836	err = -EFAULT;
837	goto out;
838	}
839	flush_dcache_folio(folio);
840	goto retry;
841	} else
842	VM_WARN_ON_ONCE(folio);
843
844	if (!err) {
845	dst_addr += PAGE_SIZE;
846	src_addr += PAGE_SIZE;
847	copied += PAGE_SIZE;
848
849	if (fatal_signal_pending(current))
850	err = -EINTR;
851	}
852	if (err)
853	break;
854	}
855
856	out_unlock:
857	up_read(sem: &ctx->map_changing_lock);
858	uffd_mfill_unlock(vma: dst_vma);
859	out:
860	if (folio)
861	folio_put(folio);
862	VM_WARN_ON_ONCE(copied < `0`);
863	VM_WARN_ON_ONCE(err > `0`);
864	VM_WARN_ON_ONCE(!copied && !err);
865	return copied ? copied : err;
866	}
867
868	ssize_t mfill_atomic_copy(struct userfaultfd_ctx ctx, unsigned* long dst_start,
869	unsigned long src_start, unsigned long len,
870	uffd_flags_t flags)
871	{
872	return mfill_atomic(ctx, dst_start, src_start, len,
873	flags: uffd_flags_set_mode(flags, mode: MFILL_ATOMIC_COPY));
874	}
875
876	ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
877	unsigned long start,
878	unsigned long len)
879	{
880	return mfill_atomic(ctx, dst_start: start, src_start: `0`, len,
881	flags: uffd_flags_set_mode(flags: `0`, mode: MFILL_ATOMIC_ZEROPAGE));
882	}
883
884	ssize_t mfill_atomic_continue(struct userfaultfd_ctx ctx, unsigned* long start,
885	unsigned long len, uffd_flags_t flags)
886	{
887
888	/*
889	* A caller might reasonably assume that UFFDIO_CONTINUE contains an
890	* smp_wmb() to ensure that any writes to the about-to-be-mapped page by
891	* the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
892	* subsequent loads from the page through the newly mapped address range.
893	*/
894	smp_wmb();
895
896	return mfill_atomic(ctx, dst_start: start, src_start: `0`, len,
897	flags: uffd_flags_set_mode(flags, mode: MFILL_ATOMIC_CONTINUE));
898	}
899
900	ssize_t mfill_atomic_poison(struct userfaultfd_ctx ctx, unsigned* long start,
901	unsigned long len, uffd_flags_t flags)
902	{
903	return mfill_atomic(ctx, dst_start: start, src_start: `0`, len,
904	flags: uffd_flags_set_mode(flags, mode: MFILL_ATOMIC_POISON));
905	}
906
907	long uffd_wp_range(struct vm_area_struct *dst_vma,
908	unsigned long start, unsigned long len, bool enable_wp)
909	{
910	unsigned int mm_cp_flags;
911	struct mmu_gather tlb;
912	long ret;
913
914	VM_WARN_ONCE(start < dst_vma->vm_start \|\| start + len > dst_vma->vm_end,
915	"The address range exceeds VMA boundary.\n");
916	if (enable_wp)
917	mm_cp_flags = MM_CP_UFFD_WP;
918	else
919	mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
920
921	/*
922	* vma->vm_page_prot already reflects that uffd-wp is enabled for this
923	* VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
924	* to be write-protected as default whenever protection changes.
925	* Try upgrading write permissions manually.
926	*/
927	if (!enable_wp && vma_wants_manual_pte_write_upgrade(vma: dst_vma))
928	mm_cp_flags \|= MM_CP_TRY_CHANGE_WRITABLE;
929	tlb_gather_mmu(tlb: &tlb, mm: dst_vma->vm_mm);
930	ret = change_protection(tlb: &tlb, vma: dst_vma, start, end: start + len, cp_flags: mm_cp_flags);
931	tlb_finish_mmu(tlb: &tlb);
932
933	return ret;
934	}
935
936	int mwriteprotect_range(struct userfaultfd_ctx ctx, unsigned* long start,
937	unsigned long len, bool enable_wp)
938	{
939	struct mm_struct *dst_mm = ctx->mm;
940	unsigned long end = start + len;
941	unsigned long _start, _end;
942	struct vm_area_struct *dst_vma;
943	unsigned long page_mask;
944	long err;
945	VMA_ITERATOR(vmi, dst_mm, start);
946
947	/*
948	* Sanitize the command parameters:
949	*/
950	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
951	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
952
953	/ Does the address range wrap, or is the span zero-sized? /
954	VM_WARN_ON_ONCE(start + len <= start);
955
956	mmap_read_lock(mm: dst_mm);
957
958	/*
959	* If memory mappings are changing because of non-cooperative
960	* operation (e.g. mremap) running in parallel, bail out and
961	* request the user to retry later
962	*/
963	down_read(sem: &ctx->map_changing_lock);
964	err = -EAGAIN;
965	if (atomic_read(v: &ctx->mmap_changing))
966	goto out_unlock;
967
968	err = -ENOENT;
969	for_each_vma_range(vmi, dst_vma, end) {
970
971	if (!userfaultfd_wp(vma: dst_vma)) {
972	err = -ENOENT;
973	break;
974	}
975
976	if (is_vm_hugetlb_page(vma: dst_vma)) {
977	err = -EINVAL;
978	page_mask = vma_kernel_pagesize(vma: dst_vma) - `1`;
979	if ((start & page_mask) \|\| (len & page_mask))
980	break;
981	}
982
983	_start = max(dst_vma->vm_start, start);
984	_end = min(dst_vma->vm_end, end);
985
986	err = uffd_wp_range(dst_vma, start: _start, len: _end - _start, enable_wp);
987
988	/ Return 0 on success, <0 on failures /
989	if (err < `0`)
990	break;
991	err = `0`;
992	}
993	out_unlock:
994	up_read(sem: &ctx->map_changing_lock);
995	mmap_read_unlock(mm: dst_mm);
996	return err;
997	}
998
999
1000	void double_pt_lock(spinlock_t *ptl1,
1001	spinlock_t *ptl2)
1002	__acquires(ptl1)
1003	__acquires(ptl2)
1004	{
1005	if (ptl1 > ptl2)
1006	swap(ptl1, ptl2);
1007	/ lock in virtual address order to avoid lock inversion /
1008	spin_lock(lock: ptl1);
1009	if (ptl1 != ptl2)
1010	spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
1011	else
1012	__acquire(ptl2);
1013	}
1014
1015	void double_pt_unlock(spinlock_t *ptl1,
1016	spinlock_t *ptl2)
1017	__releases(ptl1)
1018	__releases(ptl2)
1019	{
1020	spin_unlock(lock: ptl1);
1021	if (ptl1 != ptl2)
1022	spin_unlock(lock: ptl2);
1023	else
1024	__release(ptl2);
1025	}
1026
1027	static inline bool is_pte_pages_stable(pte_t dst_pte, pte_t src_pte,
1028	pte_t orig_dst_pte, pte_t orig_src_pte,
1029	pmd_t *dst_pmd, pmd_t dst_pmdval)
1030	{
1031	return pte_same(a: ptep_get(ptep: src_pte), b: orig_src_pte) &&
1032	pte_same(a: ptep_get(ptep: dst_pte), b: orig_dst_pte) &&
1033	pmd_same(pmd_a: dst_pmdval, pmd_b: pmdp_get_lockless(pmdp: dst_pmd));
1034	}
1035
1036	/*
1037	* Checks if the two ptes and the corresponding folio are eligible for batched
1038	* move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
1039	*
1040	* NOTE: folio's reference is not required as the whole operation is within
1041	* PTL's critical section.
1042	*/
1043	static struct folio check_ptes_for_batched_move(struct* vm_area_struct *src_vma,
1044	unsigned long src_addr,
1045	pte_t src_pte, pte_t dst_pte)
1046	{
1047	pte_t orig_dst_pte, orig_src_pte;
1048	struct folio *folio;
1049
1050	orig_dst_pte = ptep_get(ptep: dst_pte);
1051	if (!pte_none(pte: orig_dst_pte))
1052	return NULL;
1053
1054	orig_src_pte = ptep_get(ptep: src_pte);
1055	if (!pte_present(a: orig_src_pte) \|\| is_zero_pfn(pfn: pte_pfn(pte: orig_src_pte)))
1056	return NULL;
1057
1058	folio = vm_normal_folio(vma: src_vma, addr: src_addr, pte: orig_src_pte);
1059	if (!folio \|\| !folio_trylock(folio))
1060	return NULL;
1061	if (!PageAnonExclusive(page: &folio->page) \|\| folio_test_large(folio)) {
1062	folio_unlock(folio);
1063	return NULL;
1064	}
1065	return folio;
1066	}
1067
1068	/*
1069	* Moves src folios to dst in a batch as long as they are not large, and can
1070	* successfully take the lock via folio_trylock().
1071	*/
1072	static long move_present_ptes(struct mm_struct *mm,
1073	struct vm_area_struct *dst_vma,
1074	struct vm_area_struct *src_vma,
1075	unsigned long dst_addr, unsigned long src_addr,
1076	pte_t dst_pte, pte_t src_pte,
1077	pte_t orig_dst_pte, pte_t orig_src_pte,
1078	pmd_t *dst_pmd, pmd_t dst_pmdval,
1079	spinlock_t dst_ptl, spinlock_t src_ptl,
1080	struct folio *first_src_folio, unsigned* long len)
1081	{
1082	int err = `0`;
1083	struct folio src_folio = first_src_folio;
1084	unsigned long src_start = src_addr;
1085	unsigned long src_end;
1086
1087	len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
1088	src_end = pmd_addr_end(src_addr, src_addr + len);
1089	flush_cache_range(vma: src_vma, start: src_addr, end: src_end);
1090	double_pt_lock(ptl1: dst_ptl, ptl2: src_ptl);
1091
1092	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1093	dst_pmd, dst_pmdval)) {
1094	err = -EAGAIN;
1095	goto out;
1096	}
1097	if (folio_test_large(folio: src_folio) \|\|
1098	folio_maybe_dma_pinned(folio: src_folio) \|\|
1099	!PageAnonExclusive(page: &src_folio->page)) {
1100	err = -EBUSY;
1101	goto out;
1102	}
1103	/ It's safe to drop the reference now as the page-table is holding one. /
1104	folio_put(folio: *first_src_folio);
1105	*first_src_folio = NULL;
1106	arch_enter_lazy_mmu_mode();
1107
1108	while (true) {
1109	orig_src_pte = ptep_get_and_clear(mm, addr: src_addr, ptep: src_pte);
1110	/ Folio got pinned from under us. Put it back and fail the move. /
1111	if (folio_maybe_dma_pinned(folio: src_folio)) {
1112	set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1113	err = -EBUSY;
1114	break;
1115	}
1116
1117	folio_move_anon_rmap(src_folio, dst_vma);
1118	src_folio->index = linear_page_index(vma: dst_vma, address: dst_addr);
1119
1120	orig_dst_pte = folio_mk_pte(folio: src_folio, pgprot: dst_vma->vm_page_prot);
1121	/ Set soft dirty bit so userspace can notice the pte was moved /
1122	if (pgtable_supports_soft_dirty())
1123	orig_dst_pte = pte_mksoft_dirty(pte: orig_dst_pte);
1124	if (pte_dirty(pte: orig_src_pte))
1125	orig_dst_pte = pte_mkdirty(pte: orig_dst_pte);
1126	orig_dst_pte = pte_mkwrite(pte: orig_dst_pte, vma: dst_vma);
1127	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1128
1129	src_addr += PAGE_SIZE;
1130	if (src_addr == src_end)
1131	break;
1132	dst_addr += PAGE_SIZE;
1133	dst_pte++;
1134	src_pte++;
1135
1136	folio_unlock(folio: src_folio);
1137	src_folio = check_ptes_for_batched_move(src_vma, src_addr,
1138	src_pte, dst_pte);
1139	if (!src_folio)
1140	break;
1141	}
1142
1143	arch_leave_lazy_mmu_mode();
1144	if (src_addr > src_start)
1145	flush_tlb_range(src_vma, src_start, src_addr);
1146
1147	if (src_folio)
1148	folio_unlock(folio: src_folio);
1149	out:
1150	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1151	return src_addr > src_start ? src_addr - src_start : err;
1152	}
1153
1154	static int move_swap_pte(struct mm_struct mm, struct* vm_area_struct *dst_vma,
1155	unsigned long dst_addr, unsigned long src_addr,
1156	pte_t dst_pte, pte_t src_pte,
1157	pte_t orig_dst_pte, pte_t orig_src_pte,
1158	pmd_t *dst_pmd, pmd_t dst_pmdval,
1159	spinlock_t dst_ptl, spinlock_t src_ptl,
1160	struct folio *src_folio,
1161	struct swap_info_struct *si, swp_entry_t entry)
1162	{
1163	/*
1164	* Check if the folio still belongs to the target swap entry after
1165	* acquiring the lock. Folio can be freed in the swap cache while
1166	* not locked.
1167	*/
1168	if (src_folio && unlikely(!folio_test_swapcache(src_folio) \|\|
1169	entry.val != src_folio->swap.val))
1170	return -EAGAIN;
1171
1172	double_pt_lock(ptl1: dst_ptl, ptl2: src_ptl);
1173
1174	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1175	dst_pmd, dst_pmdval)) {
1176	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1177	return -EAGAIN;
1178	}
1179
1180	/*
1181	* The src_folio resides in the swapcache, requiring an update to its
1182	* index and mapping to align with the dst_vma, where a swap-in may
1183	* occur and hit the swapcache after moving the PTE.
1184	*/
1185	if (src_folio) {
1186	folio_move_anon_rmap(src_folio, dst_vma);
1187	src_folio->index = linear_page_index(vma: dst_vma, address: dst_addr);
1188	} else {
1189	/*
1190	* Check if the swap entry is cached after acquiring the src_pte
1191	* lock. Otherwise, we might miss a newly loaded swap cache folio.
1192	*
1193	* Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
1194	* We are trying to catch newly added swap cache, the only possible case is
1195	* when a folio is swapped in and out again staying in swap cache, using the
1196	* same entry before the PTE check above. The PTL is acquired and released
1197	* twice, each time after updating the swap_map's flag. So holding
1198	* the PTL here ensures we see the updated value. False positive is possible,
1199	* e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
1200	* cache, or during the tiny synchronization window between swap cache and
1201	* swap_map, but it will be gone very quickly, worst result is retry jitters.
1202	*/
1203	if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
1204	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1205	return -EAGAIN;
1206	}
1207	}
1208
1209	orig_src_pte = ptep_get_and_clear(mm, addr: src_addr, ptep: src_pte);
1210	if (pgtable_supports_soft_dirty())
1211	orig_src_pte = pte_swp_mksoft_dirty(pte: orig_src_pte);
1212	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1213	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1214
1215	return PAGE_SIZE;
1216	}
1217
1218	static int move_zeropage_pte(struct mm_struct *mm,
1219	struct vm_area_struct *dst_vma,
1220	struct vm_area_struct *src_vma,
1221	unsigned long dst_addr, unsigned long src_addr,
1222	pte_t dst_pte, pte_t src_pte,
1223	pte_t orig_dst_pte, pte_t orig_src_pte,
1224	pmd_t *dst_pmd, pmd_t dst_pmdval,
1225	spinlock_t dst_ptl, spinlock_t src_ptl)
1226	{
1227	pte_t zero_pte;
1228
1229	double_pt_lock(ptl1: dst_ptl, ptl2: src_ptl);
1230	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1231	dst_pmd, dst_pmdval)) {
1232	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1233	return -EAGAIN;
1234	}
1235
1236	zero_pte = pte_mkspecial(pte: pfn_pte(page_nr: my_zero_pfn(addr: dst_addr),
1237	pgprot: dst_vma->vm_page_prot));
1238	ptep_clear_flush(vma: src_vma, address: src_addr, ptep: src_pte);
1239	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1240	double_pt_unlock(ptl1: dst_ptl, ptl2: src_ptl);
1241
1242	return PAGE_SIZE;
1243	}
1244
1245
1246	/*
1247	* The mmap_lock for reading is held by the caller. Just move the page(s)
1248	* from src_pmd to dst_pmd if possible, and return number of bytes moved.
1249	* On failure, an error code is returned.
1250	*/
1251	static long move_pages_ptes(struct mm_struct mm, pmd_t dst_pmd, pmd_t *src_pmd,
1252	struct vm_area_struct *dst_vma,
1253	struct vm_area_struct *src_vma,
1254	unsigned long dst_addr, unsigned long src_addr,
1255	unsigned long len, __u64 mode)
1256	{
1257	struct swap_info_struct *si = NULL;
1258	pte_t orig_src_pte, orig_dst_pte;
1259	pte_t src_folio_pte;
1260	spinlock_t src_ptl, dst_ptl;
1261	pte_t *src_pte = NULL;
1262	pte_t *dst_pte = NULL;
1263	pmd_t dummy_pmdval;
1264	pmd_t dst_pmdval;
1265	struct folio *src_folio = NULL;
1266	struct mmu_notifier_range range;
1267	long ret = `0`;
1268
1269	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
1270	start: src_addr, end: src_addr + len);
1271	mmu_notifier_invalidate_range_start(range: &range);
1272	retry:
1273	/*
1274	* Use the maywrite version to indicate that dst_pte will be modified,
1275	* since dst_pte needs to be none, the subsequent pte_same() check
1276	* cannot prevent the dst_pte page from being freed concurrently, so we
1277	* also need to abtain dst_pmdval and recheck pmd_same() later.
1278	*/
1279	dst_pte = pte_offset_map_rw_nolock(mm, pmd: dst_pmd, addr: dst_addr, pmdvalp: &dst_pmdval,
1280	ptlp: &dst_ptl);
1281
1282	/ Retry if a huge pmd materialized from under us /
1283	if (unlikely(!dst_pte)) {
1284	ret = -EAGAIN;
1285	goto out;
1286	}
1287
1288	/*
1289	* Unlike dst_pte, the subsequent pte_same() check can ensure the
1290	* stability of the src_pte page, so there is no need to get pmdval,
1291	* just pass a dummy variable to it.
1292	*/
1293	src_pte = pte_offset_map_rw_nolock(mm, pmd: src_pmd, addr: src_addr, pmdvalp: &dummy_pmdval,
1294	ptlp: &src_ptl);
1295
1296	/*
1297	* We held the mmap_lock for reading so MADV_DONTNEED
1298	* can zap transparent huge pages under us, or the
1299	* transparent huge page fault can establish new
1300	* transparent huge pages under us.
1301	*/
1302	if (unlikely(!src_pte)) {
1303	ret = -EAGAIN;
1304	goto out;
1305	}
1306
1307	/ Sanity checks before the operation /
1308	if (pmd_none(pmd: dst_pmd) \|\| pmd_none(pmd: src_pmd) \|\|
1309	pmd_trans_huge(pmd: dst_pmd) \|\| pmd_trans_huge(pmd: src_pmd)) {
1310	ret = -EINVAL;
1311	goto out;
1312	}
1313
1314	spin_lock(lock: dst_ptl);
1315	orig_dst_pte = ptep_get(ptep: dst_pte);
1316	spin_unlock(lock: dst_ptl);
1317	if (!pte_none(pte: orig_dst_pte)) {
1318	ret = -EEXIST;
1319	goto out;
1320	}
1321
1322	spin_lock(lock: src_ptl);
1323	orig_src_pte = ptep_get(ptep: src_pte);
1324	spin_unlock(lock: src_ptl);
1325	if (pte_none(pte: orig_src_pte)) {
1326	if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1327	ret = -ENOENT;
1328	else / nothing to do to move a hole /
1329	ret = PAGE_SIZE;
1330	goto out;
1331	}
1332
1333	/ If PTE changed after we locked the folio them start over /
1334	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1335	ret = -EAGAIN;
1336	goto out;
1337	}
1338
1339	if (pte_present(a: orig_src_pte)) {
1340	if (is_zero_pfn(pfn: pte_pfn(pte: orig_src_pte))) {
1341	ret = move_zeropage_pte(mm, dst_vma, src_vma,
1342	dst_addr, src_addr, dst_pte, src_pte,
1343	orig_dst_pte, orig_src_pte,
1344	dst_pmd, dst_pmdval, dst_ptl, src_ptl);
1345	goto out;
1346	}
1347
1348	/*
1349	* Pin and lock source folio. Since we are in RCU read section,
1350	* we can't block, so on contention have to unmap the ptes,
1351	* obtain the lock and retry.
1352	*/
1353	if (!src_folio) {
1354	struct folio *folio;
1355	bool locked;
1356
1357	/*
1358	* Pin the page while holding the lock to be sure the
1359	* page isn't freed under us
1360	*/
1361	spin_lock(lock: src_ptl);
1362	if (!pte_same(a: orig_src_pte, b: ptep_get(ptep: src_pte))) {
1363	spin_unlock(lock: src_ptl);
1364	ret = -EAGAIN;
1365	goto out;
1366	}
1367
1368	folio = vm_normal_folio(vma: src_vma, addr: src_addr, pte: orig_src_pte);
1369	if (!folio \|\| !PageAnonExclusive(page: &folio->page)) {
1370	spin_unlock(lock: src_ptl);
1371	ret = -EBUSY;
1372	goto out;
1373	}
1374
1375	locked = folio_trylock(folio);
1376	/*
1377	* We avoid waiting for folio lock with a raised
1378	* refcount for large folios because extra refcounts
1379	* will result in split_folio() failing later and
1380	* retrying. If multiple tasks are trying to move a
1381	* large folio we can end up livelocking.
1382	*/
1383	if (!locked && folio_test_large(folio)) {
1384	spin_unlock(lock: src_ptl);
1385	ret = -EAGAIN;
1386	goto out;
1387	}
1388
1389	folio_get(folio);
1390	src_folio = folio;
1391	src_folio_pte = orig_src_pte;
1392	spin_unlock(lock: src_ptl);
1393
1394	if (!locked) {
1395	pte_unmap(pte: src_pte);
1396	pte_unmap(pte: dst_pte);
1397	src_pte = dst_pte = NULL;
1398	/ now we can block and wait /
1399	folio_lock(folio: src_folio);
1400	goto retry;
1401	}
1402
1403	if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1404	ret = -EBUSY;
1405	goto out;
1406	}
1407	}
1408
1409	/ at this point we have src_folio locked /
1410	if (folio_test_large(folio: src_folio)) {
1411	/ split_folio() can block /
1412	pte_unmap(pte: src_pte);
1413	pte_unmap(pte: dst_pte);
1414	src_pte = dst_pte = NULL;
1415	ret = split_folio(src_folio);
1416	if (ret)
1417	goto out;
1418	/ have to reacquire the folio after it got split /
1419	folio_unlock(folio: src_folio);
1420	folio_put(folio: src_folio);
1421	src_folio = NULL;
1422	goto retry;
1423	}
1424
1425	ret = move_present_ptes(mm, dst_vma, src_vma,
1426	dst_addr, src_addr, dst_pte, src_pte,
1427	orig_dst_pte, orig_src_pte, dst_pmd,
1428	dst_pmdval, dst_ptl, src_ptl, first_src_folio: &src_folio,
1429	len);
1430	} else { / !pte_present() /
1431	struct folio *folio = NULL;
1432	const softleaf_t entry = softleaf_from_pte(pte: orig_src_pte);
1433
1434	if (softleaf_is_migration(entry)) {
1435	pte_unmap(pte: src_pte);
1436	pte_unmap(pte: dst_pte);
1437	src_pte = dst_pte = NULL;
1438	migration_entry_wait(mm, pmd: src_pmd, address: src_addr);
1439
1440	ret = -EAGAIN;
1441	goto out;
1442	} else if (!softleaf_is_swap(entry)) {
1443	ret = -EFAULT;
1444	goto out;
1445	}
1446
1447	if (!pte_swp_exclusive(pte: orig_src_pte)) {
1448	ret = -EBUSY;
1449	goto out;
1450	}
1451
1452	si = get_swap_device(entry);
1453	if (unlikely(!si)) {
1454	ret = -EAGAIN;
1455	goto out;
1456	}
1457	/*
1458	* Verify the existence of the swapcache. If present, the folio's
1459	* index and mapping must be updated even when the PTE is a swap
1460	* entry. The anon_vma lock is not taken during this process since
1461	* the folio has already been unmapped, and the swap entry is
1462	* exclusive, preventing rmap walks.
1463	*
1464	* For large folios, return -EBUSY immediately, as split_folio()
1465	* also returns -EBUSY when attempting to split unmapped large
1466	* folios in the swapcache. This issue needs to be resolved
1467	* separately to allow proper handling.
1468	*/
1469	if (!src_folio)
1470	folio = swap_cache_get_folio(entry);
1471	if (folio) {
1472	if (folio_test_large(folio)) {
1473	ret = -EBUSY;
1474	folio_put(folio);
1475	goto out;
1476	}
1477	src_folio = folio;
1478	src_folio_pte = orig_src_pte;
1479	if (!folio_trylock(folio: src_folio)) {
1480	pte_unmap(pte: src_pte);
1481	pte_unmap(pte: dst_pte);
1482	src_pte = dst_pte = NULL;
1483	put_swap_device(si);
1484	si = NULL;
1485	/ now we can block and wait /
1486	folio_lock(folio: src_folio);
1487	goto retry;
1488	}
1489	}
1490	ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
1491	orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
1492	dst_ptl, src_ptl, src_folio, si, entry);
1493	}
1494
1495	out:
1496	if (src_folio) {
1497	folio_unlock(folio: src_folio);
1498	folio_put(folio: src_folio);
1499	}
1500	/*
1501	* Unmap in reverse order (LIFO) to maintain proper kmap_local
1502	* index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
1503	* first, then src_pte, so we must unmap src_pte first, then dst_pte.
1504	*/
1505	if (src_pte)
1506	pte_unmap(pte: src_pte);
1507	if (dst_pte)
1508	pte_unmap(pte: dst_pte);
1509	mmu_notifier_invalidate_range_end(range: &range);
1510	if (si)
1511	put_swap_device(si);
1512
1513	return ret;
1514	}
1515
1516	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1517	static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1518	unsigned long src_addr,
1519	unsigned long src_end)
1520	{
1521	return (src_addr & ~HPAGE_PMD_MASK) \|\| (dst_addr & ~HPAGE_PMD_MASK) \|\|
1522	src_end - src_addr < HPAGE_PMD_SIZE;
1523	}
1524	#else
/* Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE: always false. */
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
1532	#endif
1533
1534	static inline bool vma_move_compatible(struct vm_area_struct *vma)
1535	{
1536	return !(vma->vm_flags & (VM_PFNMAP \| VM_IO \| VM_HUGETLB \|
1537	VM_MIXEDMAP \| VM_SHADOW_STACK));
1538	}
1539
1540	static int validate_move_areas(struct userfaultfd_ctx *ctx,
1541	struct vm_area_struct *src_vma,
1542	struct vm_area_struct *dst_vma)
1543	{
1544	/ Only allow moving if both have the same access and protection /
1545	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) \|\|
1546	pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1547	return -EINVAL;
1548
1549	/ Only allow moving if both are mlocked or both aren't /
1550	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1551	return -EINVAL;
1552
1553	/*
1554	* For now, we keep it simple and only move between writable VMAs.
1555	* Access flags are equal, therefore checking only the source is enough.
1556	*/
1557	if (!(src_vma->vm_flags & VM_WRITE))
1558	return -EINVAL;
1559
1560	/ Check if vma flags indicate content which can be moved /
1561	if (!vma_move_compatible(vma: src_vma) \|\| !vma_move_compatible(vma: dst_vma))
1562	return -EINVAL;
1563
1564	/ Ensure dst_vma is registered in uffd we are operating on /
1565	if (!dst_vma->vm_userfaultfd_ctx.ctx \|\|
1566	dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1567	return -EINVAL;
1568
1569	/ Only allow moving across anonymous vmas /
1570	if (!vma_is_anonymous(vma: src_vma) \|\| !vma_is_anonymous(vma: dst_vma))
1571	return -EINVAL;
1572
1573	return `0`;
1574	}
1575
1576	static __always_inline
1577	int find_vmas_mm_locked(struct mm_struct *mm,
1578	unsigned long dst_start,
1579	unsigned long src_start,
1580	struct vm_area_struct **dst_vmap,
1581	struct vm_area_struct **src_vmap)
1582	{
1583	struct vm_area_struct *vma;
1584
1585	mmap_assert_locked(mm);
1586	vma = find_vma_and_prepare_anon(mm, addr: dst_start);
1587	if (IS_ERR(ptr: vma))
1588	return PTR_ERR(ptr: vma);
1589
1590	*dst_vmap = vma;
1591	/ Skip finding src_vma if src_start is in dst_vma /
1592	if (src_start >= vma->vm_start && src_start < vma->vm_end)
1593	goto out_success;
1594
1595	vma = vma_lookup(mm, addr: src_start);
1596	if (!vma)
1597	return -ENOENT;
1598	out_success:
1599	*src_vmap = vma;
1600	return `0`;
1601	}
1602
1603	#ifdef CONFIG_PER_VMA_LOCK
1604	static int uffd_move_lock(struct mm_struct *mm,
1605	unsigned long dst_start,
1606	unsigned long src_start,
1607	struct vm_area_struct **dst_vmap,
1608	struct vm_area_struct **src_vmap)
1609	{
1610	struct vm_area_struct *vma;
1611	int err;
1612
1613	vma = uffd_lock_vma(mm, address: dst_start);
1614	if (IS_ERR(ptr: vma))
1615	return PTR_ERR(ptr: vma);
1616
1617	*dst_vmap = vma;
1618	/*
1619	* Skip finding src_vma if src_start is in dst_vma. This also ensures
1620	* that we don't lock the same vma twice.
1621	*/
1622	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1623	*src_vmap = vma;
1624	return `0`;
1625	}
1626
1627	/*
1628	* Using uffd_lock_vma() to get src_vma can lead to following deadlock:
1629	*
1630	* Thread1 Thread2
1631	* ------- -------
1632	* vma_start_read(dst_vma)
1633	* mmap_write_lock(mm)
1634	* vma_start_write(src_vma)
1635	* vma_start_read(src_vma)
1636	* mmap_read_lock(mm)
1637	* vma_start_write(dst_vma)
1638	*/
1639	*src_vmap = lock_vma_under_rcu(mm, address: src_start);
1640	if (likely(*src_vmap))
1641	return `0`;
1642
1643	/ Undo any locking and retry in mmap_lock critical section /
1644	vma_end_read(vma: *dst_vmap);
1645
1646	mmap_read_lock(mm);
1647	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1648	if (err)
1649	goto out;
1650
1651	if (!vma_start_read_locked(vma: *dst_vmap)) {
1652	err = -EAGAIN;
1653	goto out;
1654	}
1655
1656	/ Nothing further to do if both vmas are locked. /
1657	if (dst_vmap == src_vmap)
1658	goto out;
1659
1660	if (!vma_start_read_locked_nested(vma: *src_vmap, SINGLE_DEPTH_NESTING)) {
1661	/ Undo dst_vmap locking if src_vmap failed to lock /
1662	vma_end_read(vma: *dst_vmap);
1663	err = -EAGAIN;
1664	}
1665	out:
1666	mmap_read_unlock(mm);
1667	return err;
1668	}
1669
/*
 * Drop the per-vma read locks taken for a move; a vma that serves as both
 * source and destination is unlocked only once.
 */
static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}
1677
1678	#else
1679
/*
 * Variant without per-vma locks: take mmap_read_lock() and look up both vmas.
 *
 * Return: 0 with mmap_lock still held for reading, or the lookup error with
 * mmap_lock already released.
 */
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}
1694
1695	static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1696	struct vm_area_struct *src_vma)
1697	{
1698	mmap_assert_locked(src_vma->vm_mm);
1699	mmap_read_unlock(dst_vma->vm_mm);
1700	}
1701	#endif
1702
1703	/**
1704	* move_pages - move arbitrary anonymous pages of an existing vma
1705	* @ctx: pointer to the userfaultfd context
1706	* @dst_start: start of the destination virtual memory range
1707	* @src_start: start of the source virtual memory range
1708	* @len: length of the virtual memory range
1709	* @mode: flags from uffdio_move.mode
1710	*
1711	* It will either use the mmap_lock in read mode or per-vma locks
1712	*
1713	* move_pages() remaps arbitrary anonymous pages atomically in zero
1714	* copy. It only works on non shared anonymous pages because those can
1715	* be relocated without generating non linear anon_vmas in the rmap
1716	* code.
1717	*
1718	* It provides a zero copy mechanism to handle userspace page faults.
1719	* The source vma pages should have mapcount == 1, which can be
1720	* enforced by using madvise(MADV_DONTFORK) on src vma.
1721	*
1722	* The thread receiving the page during the userland page fault
1723	* will receive the faulting page in the source vma through the network,
1724	* storage or any other I/O device (MADV_DONTFORK in the source vma
1725	* avoids move_pages() to fail with -EBUSY if the process forks before
1726	* move_pages() is called), then it will call move_pages() to map the
1727	* page in the faulting address in the destination vma.
1728	*
1729	* This userfaultfd command works purely via pagetables, so it's the
1730	* most efficient way to move physical non shared anonymous pages
1731	* across different virtual addresses. Unlike mremap()/mmap()/munmap()
1732	* it does not create any new vmas. The mapping in the destination
1733	* address is atomic.
1734	*
1735	* It only works if the vma protection bits are identical from the
1736	* source and destination vma.
1737	*
1738	* It can remap non shared anonymous pages within the same vma too.
1739	*
1740	* If the source virtual memory range has any unmapped holes, or if
1741	* the destination virtual memory range is not a whole unmapped hole,
1742	* move_pages() will fail respectively with -ENOENT or -EEXIST. This
1743	* provides a very strict behavior to avoid any chance of memory
1744	* corruption going unnoticed if there are userland race conditions.
1745	* Only one thread should resolve the userland page fault at any given
1746	* time for any given faulting address. This means that if two threads
1747	* try to both call move_pages() on the same destination address at the
1748	* same time, the second thread will get an explicit error from this
1749	* command.
1750	*
1751	* The command retval will return "len" is successful. The command
1752	* however can be interrupted by fatal signals or errors. If
1753	* interrupted it will return the number of bytes successfully
1754	* remapped before the interruption if any, or the negative error if
1755	* none. It will never return zero. Either it will return an error or
1756	* an amount of bytes successfully moved. If the retval reports a
1757	* "short" remap, the move_pages() command should be repeated by
1758	* userland with src+retval, dst+reval, len-retval if it wants to know
1759	* about the error that interrupted it.
1760	*
1761	* The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
1762	* prevent -ENOENT errors to materialize if there are holes in the
1763	* source virtual range that is being remapped. The holes will be
1764	* accounted as successfully remapped in the retval of the
1765	* command. This is mostly useful to remap hugepage naturally aligned
1766	* virtual regions without knowing if there are transparent hugepage
1767	* in the regions or not, but preventing the risk of having to split
1768	* the hugepmd during the remap.
1769	*/
1770	ssize_t move_pages(struct userfaultfd_ctx ctx, unsigned* long dst_start,
1771	unsigned long src_start, unsigned long len, __u64 mode)
1772	{
1773	struct mm_struct *mm = ctx->mm;
1774	struct vm_area_struct src_vma, dst_vma;
1775	unsigned long src_addr, dst_addr, src_end;
1776	pmd_t src_pmd, dst_pmd;
1777	long err = -EINVAL;
1778	ssize_t moved = `0`;
1779
1780	/ Sanitize the command parameters. /
1781	VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
1782	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
1783	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
1784
1785	/ Does the address range wrap, or is the span zero-sized? /
1786	VM_WARN_ON_ONCE(src_start + len < src_start);
1787	VM_WARN_ON_ONCE(dst_start + len < dst_start);
1788
1789	err = uffd_move_lock(mm, dst_start, src_start, dst_vmap: &dst_vma, src_vmap: &src_vma);
1790	if (err)
1791	goto out;
1792
1793	/ Re-check after taking map_changing_lock /
1794	err = -EAGAIN;
1795	down_read(sem: &ctx->map_changing_lock);
1796	if (likely(atomic_read(&ctx->mmap_changing)))
1797	goto out_unlock;
1798	/*
1799	* Make sure the vma is not shared, that the src and dst remap
1800	* ranges are both valid and fully within a single existing
1801	* vma.
1802	*/
1803	err = -EINVAL;
1804	if (src_vma->vm_flags & VM_SHARED)
1805	goto out_unlock;
1806	if (src_start + len > src_vma->vm_end)
1807	goto out_unlock;
1808
1809	if (dst_vma->vm_flags & VM_SHARED)
1810	goto out_unlock;
1811	if (dst_start + len > dst_vma->vm_end)
1812	goto out_unlock;
1813
1814	err = validate_move_areas(ctx, src_vma, dst_vma);
1815	if (err)
1816	goto out_unlock;
1817
1818	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
1819	src_addr < src_end;) {
1820	spinlock_t *ptl;
1821	pmd_t dst_pmdval;
1822	unsigned long step_size;
1823
1824	/*
1825	* Below works because anonymous area would not have a
1826	* transparent huge PUD. If file-backed support is added,
1827	* that case would need to be handled here.
1828	*/
1829	src_pmd = mm_find_pmd(mm, address: src_addr);
1830	if (unlikely(!src_pmd)) {
1831	if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1832	err = -ENOENT;
1833	break;
1834	}
1835	src_pmd = mm_alloc_pmd(mm, address: src_addr);
1836	if (unlikely(!src_pmd)) {
1837	err = -ENOMEM;
1838	break;
1839	}
1840	}
1841	dst_pmd = mm_alloc_pmd(mm, address: dst_addr);
1842	if (unlikely(!dst_pmd)) {
1843	err = -ENOMEM;
1844	break;
1845	}
1846
1847	dst_pmdval = pmdp_get_lockless(pmdp: dst_pmd);
1848	/*
1849	* If the dst_pmd is mapped as THP don't override it and just
1850	* be strict. If dst_pmd changes into TPH after this check, the
1851	* move_pages_huge_pmd() will detect the change and retry
1852	* while move_pages_pte() will detect the change and fail.
1853	*/
1854	if (unlikely(pmd_trans_huge(dst_pmdval))) {
1855	err = -EEXIST;
1856	break;
1857	}
1858
1859	ptl = pmd_trans_huge_lock(pmd: src_pmd, vma: src_vma);
1860	if (ptl) {
1861	/ Check if we can move the pmd without splitting it. /
1862	if (move_splits_huge_pmd(dst_addr, src_addr, src_end: src_start + len) \|\|
1863	!pmd_none(pmd: dst_pmdval)) {
1864	/ Can be a migration entry /
1865	if (pmd_present(pmd: *src_pmd)) {
1866	struct folio folio = pmd_folio(src_pmd);
1867
1868	if (!is_huge_zero_folio(folio) &&
1869	!PageAnonExclusive(page: &folio->page)) {
1870	spin_unlock(lock: ptl);
1871	err = -EBUSY;
1872	break;
1873	}
1874	}
1875
1876	spin_unlock(lock: ptl);
1877	split_huge_pmd(src_vma, src_pmd, src_addr);
1878	/ The folio will be split by move_pages_pte() /
1879	continue;
1880	}
1881
1882	err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1883	dst_pmdval, dst_vma, src_vma,
1884	dst_addr, src_addr);
1885	step_size = HPAGE_PMD_SIZE;
1886	} else {
1887	long ret;
1888
1889	if (pmd_none(pmd: *src_pmd)) {
1890	if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1891	err = -ENOENT;
1892	break;
1893	}
1894	if (unlikely(__pte_alloc(mm, src_pmd))) {
1895	err = -ENOMEM;
1896	break;
1897	}
1898	}
1899
1900	if (unlikely(pte_alloc(mm, dst_pmd))) {
1901	err = -ENOMEM;
1902	break;
1903	}
1904
1905	ret = move_pages_ptes(mm, dst_pmd, src_pmd,
1906	dst_vma, src_vma, dst_addr,
1907	src_addr, len: src_end - src_addr, mode);
1908	if (ret < `0`)
1909	err = ret;
1910	else
1911	step_size = ret;
1912	}
1913
1914	cond_resched();
1915
1916	if (fatal_signal_pending(current)) {
1917	/ Do not override an error /
1918	if (!err \|\| err == -EAGAIN)
1919	err = -EINTR;
1920	break;
1921	}
1922
1923	if (err) {
1924	if (err == -EAGAIN)
1925	continue;
1926	break;
1927	}
1928
1929	/ Proceed to the next page /
1930	dst_addr += step_size;
1931	src_addr += step_size;
1932	moved += step_size;
1933	}
1934
1935	out_unlock:
1936	up_read(sem: &ctx->map_changing_lock);
1937	uffd_move_unlock(dst_vma, src_vma);
1938	out:
1939	VM_WARN_ON_ONCE(moved < `0`);
1940	VM_WARN_ON_ONCE(err > `0`);
1941	VM_WARN_ON_ONCE(!moved && !err);
1942	return moved ? moved : err;
1943	}
1944
1945	static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
1946	vm_flags_t vm_flags)
1947	{
1948	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
1949
1950	vm_flags_reset(vma, flags: vm_flags);
1951	/*
1952	* For shared mappings, we want to enable writenotify while
1953	* userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
1954	* recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
1955	*/
1956	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
1957	vma_set_page_prot(vma);
1958	}
1959
1960	static void userfaultfd_set_ctx(struct vm_area_struct *vma,
1961	struct userfaultfd_ctx *ctx,
1962	vm_flags_t vm_flags)
1963	{
1964	vma_start_write(vma);
1965	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
1966	userfaultfd_set_vm_flags(vma,
1967	vm_flags: (vma->vm_flags & ~__VM_UFFD_FLAGS) \| vm_flags);
1968	}
1969
1970	void userfaultfd_reset_ctx(struct vm_area_struct *vma)
1971	{
1972	userfaultfd_set_ctx(vma, NULL, vm_flags: `0`);
1973	}
1974
1975	struct vm_area_struct userfaultfd_clear_vma(struct* vma_iterator *vmi,
1976	struct vm_area_struct *prev,
1977	struct vm_area_struct *vma,
1978	unsigned long start,
1979	unsigned long end)
1980	{
1981	struct vm_area_struct *ret;
1982	bool give_up_on_oom = false;
1983
1984	/*
1985	* If we are modifying only and not splitting, just give up on the merge
1986	* if OOM prevents us from merging successfully.
1987	*/
1988	if (start == vma->vm_start && end == vma->vm_end)
1989	give_up_on_oom = true;
1990
1991	/ Reset ptes for the whole vma range if wr-protected /
1992	if (userfaultfd_wp(vma))
1993	uffd_wp_range(dst_vma: vma, start, len: end - start, enable_wp: false);
1994
1995	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
1996	vm_flags: vma->vm_flags & ~__VM_UFFD_FLAGS,
1997	NULL_VM_UFFD_CTX, give_up_on_oom);
1998
1999	/*
2000	* In the vma_merge() successful mprotect-like case 8:
2001	* the next vma was merged into the current one and
2002	* the current one has not been updated yet.
2003	*/
2004	if (!IS_ERR(ptr: ret))
2005	userfaultfd_reset_ctx(vma: ret);
2006
2007	return ret;
2008	}
2009
2010	/ Assumes mmap write lock taken, and mm_struct pinned. /
2011	int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
2012	struct vm_area_struct *vma,
2013	vm_flags_t vm_flags,
2014	unsigned long start, unsigned long end,
2015	bool wp_async)
2016	{
2017	VMA_ITERATOR(vmi, ctx->mm, start);
2018	struct vm_area_struct *prev = vma_prev(vmi: &vmi);
2019	unsigned long vma_end;
2020	vm_flags_t new_flags;
2021
2022	if (vma->vm_start < start)
2023	prev = vma;
2024
2025	for_each_vma_range(vmi, vma, end) {
2026	cond_resched();
2027
2028	VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
2029	VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
2030	vma->vm_userfaultfd_ctx.ctx != ctx);
2031	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
2032
2033	/*
2034	* Nothing to do: this vma is already registered into this
2035	* userfaultfd and with the right tracking mode too.
2036	*/
2037	if (vma->vm_userfaultfd_ctx.ctx == ctx &&
2038	(vma->vm_flags & vm_flags) == vm_flags)
2039	goto skip;
2040
2041	if (vma->vm_start > start)
2042	start = vma->vm_start;
2043	vma_end = min(end, vma->vm_end);
2044
2045	new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) \| vm_flags;
2046	vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start, end: vma_end,
2047	vm_flags: new_flags,
2048	new_ctx: (struct vm_userfaultfd_ctx){ctx},
2049	/ give_up_on_oom = /false);
2050	if (IS_ERR(ptr: vma))
2051	return PTR_ERR(ptr: vma);
2052
2053	/*
2054	* In the vma_merge() successful mprotect-like case 8:
2055	* the next vma was merged into the current one and
2056	* the current one has not been updated yet.
2057	*/
2058	userfaultfd_set_ctx(vma, ctx, vm_flags);
2059
2060	if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
2061	hugetlb_unshare_all_pmds(vma);
2062
2063	skip:
2064	prev = vma;
2065	start = vma->vm_end;
2066	}
2067
2068	return `0`;
2069	}
2070
2071	void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
2072	{
2073	struct mm_struct *mm = ctx->mm;
2074	struct vm_area_struct *vma;
2075	VMA_ITERATOR(vmi, mm, `0`);
2076
2077	/ the various vma->vm_userfaultfd_ctx still points to it /
2078	mmap_write_lock(mm);
2079	for_each_vma(vmi, vma) {
2080	if (vma->vm_userfaultfd_ctx.ctx == ctx)
2081	userfaultfd_reset_ctx(vma);
2082	}
2083	mmap_write_unlock(mm);
2084	}
2085
2086	void userfaultfd_release_all(struct mm_struct *mm,
2087	struct userfaultfd_ctx *ctx)
2088	{
2089	struct vm_area_struct vma, prev;
2090	VMA_ITERATOR(vmi, mm, `0`);
2091
2092	if (!mmget_not_zero(mm))
2093	return;
2094
2095	/*
2096	* Flush page faults out of all CPUs. NOTE: all page faults
2097	* must be retried without returning VM_FAULT_SIGBUS if
2098	* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
2099	* changes while handle_userfault released the mmap_lock. So
2100	* it's critical that released is set to true (above), before
2101	* taking the mmap_lock for writing.
2102	*/
2103	mmap_write_lock(mm);
2104	prev = NULL;
2105	for_each_vma(vmi, vma) {
2106	cond_resched();
2107	VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
2108	!!(vma->vm_flags & __VM_UFFD_FLAGS));
2109	if (vma->vm_userfaultfd_ctx.ctx != ctx) {
2110	prev = vma;
2111	continue;
2112	}
2113
2114	vma = userfaultfd_clear_vma(vmi: &vmi, prev, vma,
2115	start: vma->vm_start, end: vma->vm_end);
2116	prev = vma;
2117	}
2118	mmap_write_unlock(mm);
2119	mmput(mm);
2120	}
2121

source code of linux/mm/userfaultfd.c