migrate_device.c source code [linux/mm/migrate_device.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Device Memory Migration functionality.
4	*
5	* Originally written by Jérôme Glisse.
6	*/
7	#include <linux/export.h>
8	#include <linux/memremap.h>
9	#include <linux/migrate.h>
10	#include <linux/mm.h>
11	#include <linux/mm_inline.h>
12	#include <linux/mmu_notifier.h>
13	#include <linux/oom.h>
14	#include <linux/pagewalk.h>
15	#include <linux/rmap.h>
16	#include <linux/leafops.h>
17	#include <linux/pgalloc.h>
18	#include <asm/tlbflush.h>
19	#include "internal.h"
20
21	static int migrate_vma_collect_skip(unsigned long start,
22	unsigned long end,
23	struct mm_walk *walk)
24	{
25	struct migrate_vma *migrate = walk->private;
26	unsigned long addr;
27
28	for (addr = start; addr < end; addr += PAGE_SIZE) {
29	migrate->dst[migrate->npages] = `0`;
30	migrate->src[migrate->npages++] = `0`;
31	}
32
33	return `0`;
34	}
35
36	static int migrate_vma_collect_hole(unsigned long start,
37	unsigned long end,
38	__always_unused int depth,
39	struct mm_walk *walk)
40	{
41	struct migrate_vma *migrate = walk->private;
42	unsigned long addr;
43
44	/ Only allow populating anonymous memory. /
45	if (!vma_is_anonymous(vma: walk->vma))
46	return migrate_vma_collect_skip(start, end, walk);
47
48	if (thp_migration_supported() &&
49	(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
50	(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
51	IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
52	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE \|
53	MIGRATE_PFN_COMPOUND;
54	migrate->dst[migrate->npages] = `0`;
55	migrate->npages++;
56	migrate->cpages++;
57
58	/*
59	* Collect the remaining entries as holes, in case we
60	* need to split later
61	*/
62	return migrate_vma_collect_skip(start: start + PAGE_SIZE, end, walk);
63	}
64
65	for (addr = start; addr < end; addr += PAGE_SIZE) {
66	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
67	migrate->dst[migrate->npages] = `0`;
68	migrate->npages++;
69	migrate->cpages++;
70	}
71
72	return `0`;
73	}
74
75	/**
76	* migrate_vma_split_folio() - Helper function to split a THP folio
77	* @folio: the folio to split
78	* @fault_page: struct page associated with the fault if any
79	*
80	* Returns 0 on success
81	*/
82	static int migrate_vma_split_folio(struct folio *folio,
83	struct page *fault_page)
84	{
85	int ret;
86	struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
87	struct folio *new_fault_folio = NULL;
88
89	if (folio != fault_folio) {
90	folio_get(folio);
91	folio_lock(folio);
92	}
93
94	ret = split_folio(folio);
95	if (ret) {
96	if (folio != fault_folio) {
97	folio_unlock(folio);
98	folio_put(folio);
99	}
100	return ret;
101	}
102
103	new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
104
105	/*
106	* Ensure the lock is held on the correct
107	* folio after the split
108	*/
109	if (!new_fault_folio) {
110	folio_unlock(folio);
111	folio_put(folio);
112	} else if (folio != new_fault_folio) {
113	if (new_fault_folio != fault_folio) {
114	folio_get(folio: new_fault_folio);
115	folio_lock(folio: new_fault_folio);
116	}
117	folio_unlock(folio);
118	folio_put(folio);
119	}
120
121	return `0`;
122	}
123
124	/* migrate_vma_collect_huge_pmd - collect THP pages without splitting the*
125	* folio for device private pages.
126	* @pmdp: pointer to pmd entry
127	* @start: start address of the range for migration
128	* @end: end address of the range for migration
129	* @walk: mm_walk callback structure
130	* @fault_folio: folio associated with the fault if any
131	*
132	* Collect the huge pmd entry at @pmdp for migration and set the
133	* MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
134	* migration will occur at HPAGE_PMD granularity
135	*/
136	static int migrate_vma_collect_huge_pmd(pmd_t pmdp, unsigned* long start,
137	unsigned long end, struct mm_walk *walk,
138	struct folio *fault_folio)
139	{
140	struct mm_struct *mm = walk->mm;
141	struct folio *folio;
142	struct migrate_vma *migrate = walk->private;
143	spinlock_t *ptl;
144	int ret;
145	unsigned long write = `0`;
146
147	ptl = pmd_lock(mm, pmd: pmdp);
148	if (pmd_none(pmd: *pmdp)) {
149	spin_unlock(lock: ptl);
150	return migrate_vma_collect_hole(start, end, depth: -`1`, walk);
151	}
152
153	if (pmd_trans_huge(pmd: *pmdp)) {
154	if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
155	spin_unlock(lock: ptl);
156	return migrate_vma_collect_skip(start, end, walk);
157	}
158
159	folio = pmd_folio(*pmdp);
160	if (is_huge_zero_folio(folio)) {
161	spin_unlock(lock: ptl);
162	return migrate_vma_collect_hole(start, end, depth: -`1`, walk);
163	}
164	if (pmd_write(pmd: *pmdp))
165	write = MIGRATE_PFN_WRITE;
166	} else if (!pmd_present(pmd: *pmdp)) {
167	const softleaf_t entry = softleaf_from_pmd(pmd: *pmdp);
168
169	folio = softleaf_to_folio(entry);
170
171	if (!softleaf_is_device_private(entry) \|\|
172	!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
173	(folio->pgmap->owner != migrate->pgmap_owner)) {
174	spin_unlock(lock: ptl);
175	return migrate_vma_collect_skip(start, end, walk);
176	}
177
178	if (softleaf_is_migration(entry)) {
179	migration_entry_wait_on_locked(entry, ptl);
180	spin_unlock(lock: ptl);
181	return -EAGAIN;
182	}
183
184	if (softleaf_is_device_private_write(entry))
185	write = MIGRATE_PFN_WRITE;
186	} else {
187	spin_unlock(lock: ptl);
188	return -EAGAIN;
189	}
190
191	folio_get(folio);
192	if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
193	spin_unlock(lock: ptl);
194	folio_put(folio);
195	return migrate_vma_collect_skip(start, end, walk);
196	}
197
198	if (thp_migration_supported() &&
199	(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
200	(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
201	IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
202
203	struct page_vma_mapped_walk pvmw = {
204	.ptl = ptl,
205	.address = start,
206	.pmd = pmdp,
207	.vma = walk->vma,
208	};
209
210	unsigned long pfn = page_to_pfn(folio_page(folio, `0`));
211
212	migrate->src[migrate->npages] = migrate_pfn(pfn) \| write
213	\| MIGRATE_PFN_MIGRATE
214	\| MIGRATE_PFN_COMPOUND;
215	migrate->dst[migrate->npages++] = `0`;
216	migrate->cpages++;
217	ret = set_pmd_migration_entry(pvmw: &pvmw, folio_page(folio, `0`));
218	if (ret) {
219	migrate->npages--;
220	migrate->cpages--;
221	migrate->src[migrate->npages] = `0`;
222	migrate->dst[migrate->npages] = `0`;
223	goto fallback;
224	}
225	migrate_vma_collect_skip(start: start + PAGE_SIZE, end, walk);
226	spin_unlock(lock: ptl);
227	return `0`;
228	}
229
230	fallback:
231	spin_unlock(lock: ptl);
232	if (!folio_test_large(folio))
233	goto done;
234	ret = split_folio(folio);
235	if (fault_folio != folio)
236	folio_unlock(folio);
237	folio_put(folio);
238	if (ret)
239	return migrate_vma_collect_skip(start, end, walk);
240	if (pmd_none(pmd: pmdp_get_lockless(pmdp)))
241	return migrate_vma_collect_hole(start, end, depth: -`1`, walk);
242
243	done:
244	return -ENOENT;
245	}
246
247	static int migrate_vma_collect_pmd(pmd_t *pmdp,
248	unsigned long start,
249	unsigned long end,
250	struct mm_walk *walk)
251	{
252	struct migrate_vma *migrate = walk->private;
253	struct vm_area_struct *vma = walk->vma;
254	struct mm_struct *mm = vma->vm_mm;
255	unsigned long addr = start, unmapped = `0`;
256	spinlock_t *ptl;
257	struct folio *fault_folio = migrate->fault_page ?
258	page_folio(migrate->fault_page) : NULL;
259	pte_t *ptep;
260
261	again:
262	if (pmd_trans_huge(pmd: pmdp) \|\| !pmd_present(pmd: pmdp)) {
263	int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
264
265	if (ret == -EAGAIN)
266	goto again;
267	if (ret == `0`)
268	return `0`;
269	}
270
271	ptep = pte_offset_map_lock(mm, pmd: pmdp, addr: start, ptlp: &ptl);
272	if (!ptep)
273	goto again;
274	arch_enter_lazy_mmu_mode();
275	ptep += (addr - start) / PAGE_SIZE;
276
277	for (; addr < end; addr += PAGE_SIZE, ptep++) {
278	struct dev_pagemap *pgmap;
279	unsigned long mpfn = `0`, pfn;
280	struct folio *folio;
281	struct page *page;
282	softleaf_t entry;
283	pte_t pte;
284
285	pte = ptep_get(ptep);
286
287	if (pte_none(pte)) {
288	if (vma_is_anonymous(vma)) {
289	mpfn = MIGRATE_PFN_MIGRATE;
290	migrate->cpages++;
291	}
292	goto next;
293	}
294
295	if (!pte_present(a: pte)) {
296	/*
297	* Only care about unaddressable device page special
298	* page table entry. Other special swap entries are not
299	* migratable, and we ignore regular swapped page.
300	*/
301	entry = softleaf_from_pte(pte);
302	if (!softleaf_is_device_private(entry))
303	goto next;
304
305	page = softleaf_to_page(entry);
306	pgmap = page_pgmap(page);
307	if (!(migrate->flags &
308	MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
309	pgmap->owner != migrate->pgmap_owner)
310	goto next;
311
312	folio = page_folio(page);
313	if (folio_test_large(folio)) {
314	int ret;
315
316	arch_leave_lazy_mmu_mode();
317	pte_unmap_unlock(ptep, ptl);
318	ret = migrate_vma_split_folio(folio,
319	fault_page: migrate->fault_page);
320
321	if (ret) {
322	if (unmapped)
323	flush_tlb_range(walk->vma, start, end);
324
325	return migrate_vma_collect_skip(start: addr, end, walk);
326	}
327
328	goto again;
329	}
330
331	mpfn = migrate_pfn(page_to_pfn(page)) \|
332	MIGRATE_PFN_MIGRATE;
333	if (softleaf_is_device_private_write(entry))
334	mpfn \|= MIGRATE_PFN_WRITE;
335	} else {
336	pfn = pte_pfn(pte);
337	if (is_zero_pfn(pfn) &&
338	(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
339	mpfn = MIGRATE_PFN_MIGRATE;
340	migrate->cpages++;
341	goto next;
342	}
343	page = vm_normal_page(vma: migrate->vma, addr, pte);
344	if (page && !is_zone_device_page(page) &&
345	!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
346	goto next;
347	} else if (page && is_device_coherent_page(page)) {
348	pgmap = page_pgmap(page);
349
350	if (!(migrate->flags &
351	MIGRATE_VMA_SELECT_DEVICE_COHERENT) \|\|
352	pgmap->owner != migrate->pgmap_owner)
353	goto next;
354	}
355	folio = page ? page_folio(page) : NULL;
356	if (folio && folio_test_large(folio)) {
357	int ret;
358
359	arch_leave_lazy_mmu_mode();
360	pte_unmap_unlock(ptep, ptl);
361	ret = migrate_vma_split_folio(folio,
362	fault_page: migrate->fault_page);
363
364	if (ret) {
365	if (unmapped)
366	flush_tlb_range(walk->vma, start, end);
367
368	return migrate_vma_collect_skip(start: addr, end, walk);
369	}
370
371	goto again;
372	}
373	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
374	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : `0`;
375	}
376
377	if (!page \|\| !page->mapping) {
378	mpfn = `0`;
379	goto next;
380	}
381
382	/*
383	* By getting a reference on the folio we pin it and that blocks
384	* any kind of migration. Side effect is that it "freezes" the
385	* pte.
386	*
387	* We drop this reference after isolating the folio from the lru
388	* for non device folio (device folio are not on the lru and thus
389	* can't be dropped from it).
390	*/
391	folio = page_folio(page);
392	folio_get(folio);
393
394	/*
395	* We rely on folio_trylock() to avoid deadlock between
396	* concurrent migrations where each is waiting on the others
397	* folio lock. If we can't immediately lock the folio we fail this
398	* migration as it is only best effort anyway.
399	*
400	* If we can lock the folio it's safe to set up a migration entry
401	* now. In the common case where the folio is mapped once in a
402	* single process setting up the migration entry now is an
403	* optimisation to avoid walking the rmap later with
404	* try_to_migrate().
405	*/
406	if (fault_folio == folio \|\| folio_trylock(folio)) {
407	bool anon_exclusive;
408	pte_t swp_pte;
409
410	flush_cache_page(vma, vmaddr: addr, pfn: pte_pfn(pte));
411	anon_exclusive = folio_test_anon(folio) &&
412	PageAnonExclusive(page);
413	if (anon_exclusive) {
414	pte = ptep_clear_flush(vma, address: addr, ptep);
415
416	if (folio_try_share_anon_rmap_pte(folio, page)) {
417	set_pte_at(mm, addr, ptep, pte);
418	if (fault_folio != folio)
419	folio_unlock(folio);
420	folio_put(folio);
421	mpfn = `0`;
422	goto next;
423	}
424	} else {
425	pte = ptep_get_and_clear(mm, addr, ptep);
426	}
427
428	migrate->cpages++;
429
430	/ Set the dirty flag on the folio now the pte is gone. /
431	if (pte_dirty(pte))
432	folio_mark_dirty(folio);
433
434	/ Setup special migration page table entry /
435	if (mpfn & MIGRATE_PFN_WRITE)
436	entry = make_writable_migration_entry(
437	page_to_pfn(page));
438	else if (anon_exclusive)
439	entry = make_readable_exclusive_migration_entry(
440	page_to_pfn(page));
441	else
442	entry = make_readable_migration_entry(
443	page_to_pfn(page));
444	if (pte_present(a: pte)) {
445	if (pte_young(pte))
446	entry = make_migration_entry_young(entry);
447	if (pte_dirty(pte))
448	entry = make_migration_entry_dirty(entry);
449	}
450	swp_pte = swp_entry_to_pte(entry);
451	if (pte_present(a: pte)) {
452	if (pte_soft_dirty(pte))
453	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
454	if (pte_uffd_wp(pte))
455	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
456	} else {
457	if (pte_swp_soft_dirty(pte))
458	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
459	if (pte_swp_uffd_wp(pte))
460	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
461	}
462	set_pte_at(mm, addr, ptep, swp_pte);
463
464	/*
465	* This is like regular unmap: we remove the rmap and
466	* drop the folio refcount. The folio won't be freed, as
467	* we took a reference just above.
468	*/
469	folio_remove_rmap_pte(folio, page, vma);
470	folio_put(folio);
471
472	if (pte_present(a: pte))
473	unmapped++;
474	} else {
475	folio_put(folio);
476	mpfn = `0`;
477	}
478
479	next:
480	migrate->dst[migrate->npages] = `0`;
481	migrate->src[migrate->npages++] = mpfn;
482	}
483
484	/ Only flush the TLB if we actually modified any entries /
485	if (unmapped)
486	flush_tlb_range(walk->vma, start, end);
487
488	arch_leave_lazy_mmu_mode();
489	pte_unmap_unlock(ptep - `1`, ptl);
490
491	return `0`;
492	}
493
494	static const struct mm_walk_ops migrate_vma_walk_ops = {
495	.pmd_entry = migrate_vma_collect_pmd,
496	.pte_hole = migrate_vma_collect_hole,
497	.walk_lock = PGWALK_RDLOCK,
498	};
499
500	/*
501	* migrate_vma_collect() - collect pages over a range of virtual addresses
502	* @migrate: migrate struct containing all migration information
503	*
504	* This will walk the CPU page table. For each virtual address backed by a
505	* valid page, it updates the src array and takes a reference on the page, in
506	* order to pin the page until we lock it and unmap it.
507	*/
508	static void migrate_vma_collect(struct migrate_vma *migrate)
509	{
510	struct mmu_notifier_range range;
511
512	/*
513	* Note that the pgmap_owner is passed to the mmu notifier callback so
514	* that the registered device driver can skip invalidating device
515	* private page mappings that won't be migrated.
516	*/
517	mmu_notifier_range_init_owner(range: &range, event: MMU_NOTIFY_MIGRATE, flags: `0`,
518	mm: migrate->vma->vm_mm, start: migrate->start, end: migrate->end,
519	owner: migrate->pgmap_owner);
520	mmu_notifier_invalidate_range_start(range: &range);
521
522	walk_page_range(mm: migrate->vma->vm_mm, start: migrate->start, end: migrate->end,
523	ops: &migrate_vma_walk_ops, private: migrate);
524
525	mmu_notifier_invalidate_range_end(range: &range);
526	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
527	}
528
529	/*
530	* migrate_vma_check_page() - check if page is pinned or not
531	* @page: struct page to check
532	*
533	* Pinned pages cannot be migrated. This is the same test as in
534	* folio_migrate_mapping(), except that here we allow migration of a
535	* ZONE_DEVICE page.
536	*/
537	static bool migrate_vma_check_page(struct page page, struct* page *fault_page)
538	{
539	struct folio *folio = page_folio(page);
540
541	/*
542	* One extra ref because caller holds an extra reference, either from
543	* folio_isolate_lru() for a regular folio, or migrate_vma_collect() for
544	* a device folio.
545	*/
546	int extra = `1` + (page == fault_page);
547
548	/ Page from ZONE_DEVICE have one extra reference /
549	if (folio_is_zone_device(folio))
550	extra++;
551
552	/ For file back page /
553	if (folio_mapping(folio))
554	extra += `1` + folio_has_private(folio);
555
556	if ((folio_ref_count(folio) - extra) > folio_mapcount(folio))
557	return false;
558
559	return true;
560	}
561
562	/*
563	* Unmaps pages for migration. Returns number of source pfns marked as
564	* migrating.
565	*/
566	static unsigned long migrate_device_unmap(unsigned long *src_pfns,
567	unsigned long npages,
568	struct page *fault_page)
569	{
570	struct folio *fault_folio = fault_page ?
571	page_folio(fault_page) : NULL;
572	unsigned long i, restore = `0`;
573	bool allow_drain = true;
574	unsigned long unmapped = `0`;
575
576	lru_add_drain();
577
578	for (i = `0`; i < npages; ) {
579	struct page *page = migrate_pfn_to_page(mpfn: src_pfns[i]);
580	struct folio *folio;
581	unsigned int nr = `1`;
582
583	if (!page) {
584	if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
585	unmapped++;
586	goto next;
587	}
588
589	folio = page_folio(page);
590	nr = folio_nr_pages(folio);
591
592	if (nr > `1`)
593	src_pfns[i] \|= MIGRATE_PFN_COMPOUND;
594
595
596	/ ZONE_DEVICE folios are not on LRU /
597	if (!folio_is_zone_device(folio)) {
598	if (!folio_test_lru(folio) && allow_drain) {
599	/ Drain CPU's lru cache /
600	lru_add_drain_all();
601	allow_drain = false;
602	}
603
604	if (!folio_isolate_lru(folio)) {
605	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
606	restore++;
607	goto next;
608	}
609
610	/ Drop the reference we took in collect /
611	folio_put(folio);
612	}
613
614	if (folio_mapped(folio))
615	try_to_migrate(folio, flags: `0`);
616
617	if (folio_mapped(folio) \|\|
618	!migrate_vma_check_page(page, fault_page)) {
619	if (!folio_is_zone_device(folio)) {
620	folio_get(folio);
621	folio_putback_lru(folio);
622	}
623
624	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
625	restore++;
626	goto next;
627	}
628
629	unmapped++;
630	next:
631	i += nr;
632	}
633
634	for (i = `0`; i < npages && restore; i++) {
635	struct page *page = migrate_pfn_to_page(mpfn: src_pfns[i]);
636	struct folio *folio;
637
638	if (!page \|\| (src_pfns[i] & MIGRATE_PFN_MIGRATE))
639	continue;
640
641	folio = page_folio(page);
642	remove_migration_ptes(src: folio, dst: folio, flags: `0`);
643
644	src_pfns[i] = `0`;
645	if (fault_folio != folio)
646	folio_unlock(folio);
647	folio_put(folio);
648	restore--;
649	}
650
651	return unmapped;
652	}
653
654	/*
655	* migrate_vma_unmap() - replace page mapping with special migration pte entry
656	* @migrate: migrate struct containing all migration information
657	*
658	* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
659	* special migration pte entry and check if it has been pinned. Pinned pages are
660	* restored because we cannot migrate them.
661	*
662	* This is the last step before we call the device driver callback to allocate
663	* destination memory and copy contents of original page over to new page.
664	*/
665	static void migrate_vma_unmap(struct migrate_vma *migrate)
666	{
667	migrate->cpages = migrate_device_unmap(src_pfns: migrate->src, npages: migrate->npages,
668	fault_page: migrate->fault_page);
669	}
670
671	/**
672	* migrate_vma_setup() - prepare to migrate a range of memory
673	* @args: contains the vma, start, and pfns arrays for the migration
674	*
675	* Returns: negative errno on failures, 0 when 0 or more pages were migrated
676	* without an error.
677	*
678	* Prepare to migrate a range of memory virtual address range by collecting all
679	* the pages backing each virtual address in the range, saving them inside the
680	* src array. Then lock those pages and unmap them. Once the pages are locked
681	* and unmapped, check whether each page is pinned or not. Pages that aren't
682	* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
683	* corresponding src array entry. Then restores any pages that are pinned, by
684	* remapping and unlocking those pages.
685	*
686	* The caller should then allocate destination memory and copy source memory to
687	* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
688	* flag set). Once these are allocated and copied, the caller must update each
689	* corresponding entry in the dst array with the pfn value of the destination
690	* page and with MIGRATE_PFN_VALID. Destination pages must be locked via
691	* lock_page().
692	*
693	* Note that the caller does not have to migrate all the pages that are marked
694	* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
695	* device memory to system memory. If the caller cannot migrate a device page
696	* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
697	* consequences for the userspace process, so it must be avoided if at all
698	* possible.
699	*
700	* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
701	* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
702	* allowing the caller to allocate device memory for those unbacked virtual
703	* addresses. For this the caller simply has to allocate device memory and
704	* properly set the destination entry like for regular migration. Note that
705	* this can still fail, and thus inside the device driver you must check if the
706	* migration was successful for those entries after calling migrate_vma_pages(),
707	* just like for regular migration.
708	*
709	* After that, the callers must call migrate_vma_pages() to go over each entry
710	* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
711	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
712	* then migrate_vma_pages() to migrate struct page information from the source
713	* struct page to the destination struct page. If it fails to migrate the
714	* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
715	* src array.
716	*
717	* At this point all successfully migrated pages have an entry in the src
718	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
719	* array entry with MIGRATE_PFN_VALID flag set.
720	*
721	* Once migrate_vma_pages() returns the caller may inspect which pages were
722	* successfully migrated, and which were not. Successfully migrated pages will
723	* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
724	*
725	* It is safe to update device page table after migrate_vma_pages() because
726	* both destination and source page are still locked, and the mmap_lock is held
727	* in read mode (hence no one can unmap the range being migrated).
728	*
729	* Once the caller is done cleaning up things and updating its page table (if it
730	* chose to do so, this is not an obligation) it finally calls
731	* migrate_vma_finalize() to update the CPU page table to point to new pages
732	* for successfully migrated pages or otherwise restore the CPU page table to
733	* point to the original source pages.
734	*/
735	int migrate_vma_setup(struct migrate_vma *args)
736	{
737	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
738
739	args->start &= PAGE_MASK;
740	args->end &= PAGE_MASK;
741	if (!args->vma \|\| is_vm_hugetlb_page(vma: args->vma) \|\|
742	(args->vma->vm_flags & VM_SPECIAL) \|\| vma_is_dax(vma: args->vma))
743	return -EINVAL;
744	if (nr_pages <= `0`)
745	return -EINVAL;
746	if (args->start < args->vma->vm_start \|\|
747	args->start >= args->vma->vm_end)
748	return -EINVAL;
749	if (args->end <= args->vma->vm_start \|\| args->end > args->vma->vm_end)
750	return -EINVAL;
751	if (!args->src \|\| !args->dst)
752	return -EINVAL;
753	if (args->fault_page && !is_device_private_page(page: args->fault_page))
754	return -EINVAL;
755	if (args->fault_page && !PageLocked(page: args->fault_page))
756	return -EINVAL;
757
758	memset(args->src, `0`, sizeof(args->src) nr_pages);
759	args->cpages = `0`;
760	args->npages = `0`;
761
762	migrate_vma_collect(migrate: args);
763
764	if (args->cpages)
765	migrate_vma_unmap(migrate: args);
766
767	/*
768	* At this point pages are locked and unmapped, and thus they have
769	* stable content and can safely be copied to destination memory that
770	* is allocated by the drivers.
771	*/
772	return `0`;
773
774	}
775	EXPORT_SYMBOL(migrate_vma_setup);
776
777	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
778	/**
779	* migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
780	* at @addr. folio is already allocated as a part of the migration process with
781	* large page.
782	*
783	* @page needs to be initialized and setup after it's allocated. The code bits
784	* here follow closely the code in __do_huge_pmd_anonymous_page(). This API does
785	* not support THP zero pages.
786	*
787	* @migrate: migrate_vma arguments
788	* @addr: address where the folio will be inserted
789	* @page: page to be inserted at @addr
790	* @src: src pfn which is being migrated
791	* @pmdp: pointer to the pmd
792	*/
793	static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
794	unsigned long addr,
795	struct page *page,
796	unsigned long *src,
797	pmd_t *pmdp)
798	{
799	struct vm_area_struct *vma = migrate->vma;
800	gfp_t gfp = vma_thp_gfp_mask(vma);
801	struct folio *folio = page_folio(page);
802	int ret;
803	vm_fault_t csa_ret;
804	spinlock_t *ptl;
805	pgtable_t pgtable;
806	pmd_t entry;
807	bool flush = false;
808	unsigned long i;
809
810	VM_WARN_ON_FOLIO(!folio, folio);
811	VM_WARN_ON_ONCE(!pmd_none(pmdp) && !is_huge_zero_pmd(pmdp));
812
813	if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
814	return -EINVAL;
815
816	ret = anon_vma_prepare(vma);
817	if (ret)
818	return ret;
819
820	folio_set_order(folio, HPAGE_PMD_ORDER);
821	folio_set_large_rmappable(folio);
822
823	if (mem_cgroup_charge(folio, mm: migrate->vma->vm_mm, gfp)) {
824	count_vm_event(item: THP_FAULT_FALLBACK);
825	count_mthp_stat(HPAGE_PMD_ORDER, item: MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
826	ret = -ENOMEM;
827	goto abort;
828	}
829
830	__folio_mark_uptodate(folio);
831
832	pgtable = pte_alloc_one(vma->vm_mm);
833	if (unlikely(!pgtable))
834	goto abort;
835
836	if (folio_is_device_private(folio)) {
837	swp_entry_t swp_entry;
838
839	if (vma->vm_flags & VM_WRITE)
840	swp_entry = make_writable_device_private_entry(
841	page_to_pfn(page));
842	else
843	swp_entry = make_readable_device_private_entry(
844	page_to_pfn(page));
845	entry = swp_entry_to_pmd(entry: swp_entry);
846	} else {
847	if (folio_is_zone_device(folio) &&
848	!folio_is_device_coherent(folio)) {
849	goto abort;
850	}
851	entry = folio_mk_pmd(folio, pgprot: vma->vm_page_prot);
852	if (vma->vm_flags & VM_WRITE)
853	entry = pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
854	}
855
856	ptl = pmd_lock(mm: vma->vm_mm, pmd: pmdp);
857	csa_ret = check_stable_address_space(mm: vma->vm_mm);
858	if (csa_ret)
859	goto abort;
860
861	/*
862	* Check for userfaultfd but do not deliver the fault. Instead,
863	* just back off.
864	*/
865	if (userfaultfd_missing(vma))
866	goto unlock_abort;
867
868	if (!pmd_none(pmd: *pmdp)) {
869	if (!is_huge_zero_pmd(pmd: *pmdp))
870	goto unlock_abort;
871	flush = true;
872	} else if (!pmd_none(pmd: *pmdp))
873	goto unlock_abort;
874
875	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
876	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
877	if (!folio_is_zone_device(folio))
878	folio_add_lru_vma(folio, vma);
879	folio_get(folio);
880
881	if (flush) {
882	pte_free(mm: vma->vm_mm, pte_page: pgtable);
883	flush_cache_page(vma, vmaddr: addr, pfn: addr + HPAGE_PMD_SIZE);
884	pmdp_invalidate(vma, address: addr, pmdp);
885	} else {
886	pgtable_trans_huge_deposit(mm: vma->vm_mm, pmdp, pgtable);
887	mm_inc_nr_ptes(mm: vma->vm_mm);
888	}
889	set_pmd_at(mm: vma->vm_mm, addr, pmdp, pmd: entry);
890	update_mmu_cache_pmd(vma, addr, pmd: pmdp);
891
892	spin_unlock(lock: ptl);
893
894	count_vm_event(item: THP_FAULT_ALLOC);
895	count_mthp_stat(HPAGE_PMD_ORDER, item: MTHP_STAT_ANON_FAULT_ALLOC);
896	count_memcg_event_mm(mm: vma->vm_mm, idx: THP_FAULT_ALLOC);
897
898	return `0`;
899
900	unlock_abort:
901	spin_unlock(lock: ptl);
902	abort:
903	for (i = `0`; i < HPAGE_PMD_NR; i++)
904	src[i] &= ~MIGRATE_PFN_MIGRATE;
905	return `0`;
906	}
907
908	static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
909	unsigned long idx, unsigned long addr,
910	struct folio *folio)
911	{
912	unsigned long i;
913	unsigned long pfn;
914	unsigned long flags;
915	int ret = `0`;
916
917	folio_get(folio);
918	split_huge_pmd_address(vma: migrate->vma, address: addr, freeze: true);
919	ret = folio_split_unmapped(folio, new_order: `0`);
920	if (ret)
921	return ret;
922	migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
923	flags = migrate->src[idx] & ((`1UL` << MIGRATE_PFN_SHIFT) - `1`);
924	pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT;
925	for (i = `1`; i < HPAGE_PMD_NR; i++)
926	migrate->src[i+idx] = migrate_pfn(pfn: pfn + i) \| flags;
927	return ret;
928	}
929	#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
930	static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
931	unsigned long addr,
932	struct page *page,
933	unsigned long *src,
934	pmd_t *pmdp)
935	{
936	return `0`;
937	}
938
/* Stub for builds without THP migration support: does nothing, returns 0. */
static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
					    unsigned long idx, unsigned long addr,
					    struct folio *folio)
{
	return 0;
}
945	#endif
946
947	static unsigned long migrate_vma_nr_pages(unsigned long *src)
948	{
949	unsigned long nr = `1`;
950	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
951	if (*src & MIGRATE_PFN_COMPOUND)
952	nr = HPAGE_PMD_NR;
953	#else
954	if (*src & MIGRATE_PFN_COMPOUND)
955	VM_WARN_ON_ONCE(true);
956	#endif
957	return nr;
958	}
959
960	/*
961	* This code closely matches the code in:
962	* __handle_mm_fault()
963	* handle_pte_fault()
964	* do_anonymous_page()
965	* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
966	* private or coherent page.
967	*/
968	static void migrate_vma_insert_page(struct migrate_vma *migrate,
969	unsigned long addr,
970	unsigned long *dst,
971	unsigned long *src)
972	{
973	struct page page = migrate_pfn_to_page(mpfn: dst);
974	struct folio *folio = page_folio(page);
975	struct vm_area_struct *vma = migrate->vma;
976	struct mm_struct *mm = vma->vm_mm;
977	bool flush = false;
978	spinlock_t *ptl;
979	pte_t entry;
980	pgd_t *pgdp;
981	p4d_t *p4dp;
982	pud_t *pudp;
983	pmd_t *pmdp;
984	pte_t *ptep;
985	pte_t orig_pte;
986
987	/ Only allow populating anonymous memory /
988	if (!vma_is_anonymous(vma))
989	goto abort;
990
991	pgdp = pgd_offset(mm, addr);
992	p4dp = p4d_alloc(mm, pgd: pgdp, address: addr);
993	if (!p4dp)
994	goto abort;
995	pudp = pud_alloc(mm, p4d: p4dp, address: addr);
996	if (!pudp)
997	goto abort;
998	pmdp = pmd_alloc(mm, pud: pudp, address: addr);
999	if (!pmdp)
1000	goto abort;
1001
1002	if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
1003	int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
1004	src, pmdp);
1005	if (ret)
1006	goto abort;
1007	return;
1008	}
1009
1010	if (!pmd_none(pmd: *pmdp)) {
1011	if (pmd_trans_huge(pmd: *pmdp)) {
1012	if (!is_huge_zero_pmd(pmd: *pmdp))
1013	goto abort;
1014	split_huge_pmd(vma, pmdp, addr);
1015	} else if (pmd_leaf(pte: *pmdp))
1016	goto abort;
1017	}
1018
1019	if (pte_alloc(mm, pmdp))
1020	goto abort;
1021	if (unlikely(anon_vma_prepare(vma)))
1022	goto abort;
1023	if (mem_cgroup_charge(folio, mm: vma->vm_mm, GFP_KERNEL))
1024	goto abort;
1025
1026	/*
1027	* The memory barrier inside __folio_mark_uptodate makes sure that
1028	* preceding stores to the folio contents become visible before
1029	* the set_pte_at() write.
1030	*/
1031	__folio_mark_uptodate(folio);
1032
1033	if (folio_is_device_private(folio)) {
1034	swp_entry_t swp_entry;
1035
1036	if (vma->vm_flags & VM_WRITE)
1037	swp_entry = make_writable_device_private_entry(
1038	page_to_pfn(page));
1039	else
1040	swp_entry = make_readable_device_private_entry(
1041	page_to_pfn(page));
1042	entry = swp_entry_to_pte(entry: swp_entry);
1043	} else {
1044	if (folio_is_zone_device(folio) &&
1045	!folio_is_device_coherent(folio)) {
1046	pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
1047	goto abort;
1048	}
1049	entry = mk_pte(page, pgprot: vma->vm_page_prot);
1050	if (vma->vm_flags & VM_WRITE)
1051	entry = pte_mkwrite(pte: pte_mkdirty(pte: entry), vma);
1052	}
1053
1054	ptep = pte_offset_map_lock(mm, pmd: pmdp, addr, ptlp: &ptl);
1055	if (!ptep)
1056	goto abort;
1057	orig_pte = ptep_get(ptep);
1058
1059	if (check_stable_address_space(mm))
1060	goto unlock_abort;
1061
1062	if (pte_present(a: orig_pte)) {
1063	unsigned long pfn = pte_pfn(pte: orig_pte);
1064
1065	if (!is_zero_pfn(pfn))
1066	goto unlock_abort;
1067	flush = true;
1068	} else if (!pte_none(pte: orig_pte))
1069	goto unlock_abort;
1070
1071	/*
1072	* Check for userfaultfd but do not deliver the fault. Instead,
1073	* just back off.
1074	*/
1075	if (userfaultfd_missing(vma))
1076	goto unlock_abort;
1077
1078	inc_mm_counter(mm, member: MM_ANONPAGES);
1079	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
1080	if (!folio_is_zone_device(folio))
1081	folio_add_lru_vma(folio, vma);
1082	folio_get(folio);
1083
1084	if (flush) {
1085	flush_cache_page(vma, vmaddr: addr, pfn: pte_pfn(pte: orig_pte));
1086	ptep_clear_flush(vma, address: addr, ptep);
1087	}
1088	set_pte_at(mm, addr, ptep, entry);
1089	update_mmu_cache(vma, addr, ptep);
1090
1091	pte_unmap_unlock(ptep, ptl);
1092	*src = MIGRATE_PFN_MIGRATE;
1093	return;
1094
1095	unlock_abort:
1096	pte_unmap_unlock(ptep, ptl);
1097	abort:
1098	*src &= ~MIGRATE_PFN_MIGRATE;
1099	}
1100
1101	static void __migrate_device_pages(unsigned long *src_pfns,
1102	unsigned long dst_pfns, unsigned* long npages,
1103	struct migrate_vma *migrate)
1104	{
1105	struct mmu_notifier_range range;
1106	unsigned long i, j;
1107	bool notified = false;
1108	unsigned long addr;
1109
1110	for (i = `0`; i < npages; ) {
1111	struct page *newpage = migrate_pfn_to_page(mpfn: dst_pfns[i]);
1112	struct page *page = migrate_pfn_to_page(mpfn: src_pfns[i]);
1113	struct address_space *mapping;
1114	struct folio newfolio, folio;
1115	int r, extra_cnt = `0`;
1116	unsigned long nr = `1`;
1117
1118	if (!newpage) {
1119	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
1120	goto next;
1121	}
1122
1123	if (!page) {
1124	unsigned long addr;
1125
1126	if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
1127	goto next;
1128
1129	/*
1130	* The only time there is no vma is when called from
1131	* migrate_device_coherent_folio(). However this isn't
1132	* called if the page could not be unmapped.
1133	*/
1134	VM_BUG_ON(!migrate);
1135	addr = migrate->start + i*PAGE_SIZE;
1136	if (!notified) {
1137	notified = true;
1138
1139	mmu_notifier_range_init_owner(range: &range,
1140	event: MMU_NOTIFY_MIGRATE, flags: `0`,
1141	mm: migrate->vma->vm_mm, start: addr, end: migrate->end,
1142	owner: migrate->pgmap_owner);
1143	mmu_notifier_invalidate_range_start(range: &range);
1144	}
1145
1146	if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
1147	(!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
1148	nr = migrate_vma_nr_pages(src: &src_pfns[i]);
1149	src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
1150	} else {
1151	nr = `1`;
1152	}
1153
1154	for (j = `0`; j < nr && i + j < npages; j++) {
1155	src_pfns[i+j] \|= MIGRATE_PFN_MIGRATE;
1156	migrate_vma_insert_page(migrate,
1157	addr: addr + j * PAGE_SIZE,
1158	dst: &dst_pfns[i+j], src: &src_pfns[i+j]);
1159	}
1160	goto next;
1161	}
1162
1163	newfolio = page_folio(newpage);
1164	folio = page_folio(page);
1165	mapping = folio_mapping(folio);
1166
1167	/*
1168	* If THP migration is enabled, check if both src and dst
1169	* can migrate large pages
1170	*/
1171	if (thp_migration_supported()) {
1172	if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
1173	(src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
1174	!(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
1175
1176	if (!migrate) {
1177	src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE \|
1178	MIGRATE_PFN_COMPOUND);
1179	goto next;
1180	}
1181	nr = `1` << folio_order(folio);
1182	addr = migrate->start + i * PAGE_SIZE;
1183	if (migrate_vma_split_unmapped_folio(migrate, idx: i, addr, folio)) {
1184	src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE \|
1185	MIGRATE_PFN_COMPOUND);
1186	goto next;
1187	}
1188	} else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
1189	(dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
1190	!(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
1191	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
1192	}
1193	}
1194
1195
1196	if (folio_is_device_private(folio: newfolio) \|\|
1197	folio_is_device_coherent(folio: newfolio)) {
1198	if (mapping) {
1199	/*
1200	* For now only support anonymous memory migrating to
1201	* device private or coherent memory.
1202	*
1203	* Try to get rid of swap cache if possible.
1204	*/
1205	if (!folio_test_anon(folio) \|\|
1206	!folio_free_swap(folio)) {
1207	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
1208	goto next;
1209	}
1210	}
1211	} else if (folio_is_zone_device(folio: newfolio)) {
1212	/*
1213	* Other types of ZONE_DEVICE page are not supported.
1214	*/
1215	src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
1216	goto next;
1217	}
1218
1219	BUG_ON(folio_test_writeback(folio));
1220
1221	if (migrate && migrate->fault_page == page)
1222	extra_cnt = `1`;
1223	for (j = `0`; j < nr && i + j < npages; j++) {
1224	folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
1225	newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
1226
1227	r = folio_migrate_mapping(mapping, newfolio, folio, extra_count: extra_cnt);
1228	if (r)
1229	src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
1230	else
1231	folio_migrate_flags(newfolio, folio);
1232	}
1233	next:
1234	i += nr;
1235	}
1236
1237	if (notified)
1238	mmu_notifier_invalidate_range_end(range: &range);
1239	}
1240
1241	/**
1242	* migrate_device_pages() - migrate meta-data from src page to dst page
1243	* @src_pfns: src_pfns returned from migrate_device_range()
1244	* @dst_pfns: array of pfns allocated by the driver to migrate memory to
1245	* @npages: number of pages in the range
1246	*
1247	* Equivalent to migrate_vma_pages(). This is called to migrate struct page
1248	* meta-data from source struct page to destination.
1249	*/
1250	void migrate_device_pages(unsigned long src_pfns, unsigned* long *dst_pfns,
1251	unsigned long npages)
1252	{
1253	__migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
1254	}
1255	EXPORT_SYMBOL(migrate_device_pages);
1256
1257	/**
1258	* migrate_vma_pages() - migrate meta-data from src page to dst page
1259	* @migrate: migrate struct containing all migration information
1260	*
1261	* This migrates struct page meta-data from source struct page to destination
1262	* struct page. This effectively finishes the migration from source page to the
1263	* destination page.
1264	*/
1265	void migrate_vma_pages(struct migrate_vma *migrate)
1266	{
1267	__migrate_device_pages(src_pfns: migrate->src, dst_pfns: migrate->dst, npages: migrate->npages, migrate);
1268	}
1269	EXPORT_SYMBOL(migrate_vma_pages);
1270
1271	static void __migrate_device_finalize(unsigned long *src_pfns,
1272	unsigned long *dst_pfns,
1273	unsigned long npages,
1274	struct page *fault_page)
1275	{
1276	struct folio *fault_folio = fault_page ?
1277	page_folio(fault_page) : NULL;
1278	unsigned long i;
1279
1280	for (i = `0`; i < npages; i++) {
1281	struct folio dst = NULL, src = NULL;
1282	struct page *newpage = migrate_pfn_to_page(mpfn: dst_pfns[i]);
1283	struct page *page = migrate_pfn_to_page(mpfn: src_pfns[i]);
1284
1285	if (newpage)
1286	dst = page_folio(newpage);
1287
1288	if (!page) {
1289	if (dst) {
1290	WARN_ON_ONCE(fault_folio == dst);
1291	folio_unlock(folio: dst);
1292	folio_put(folio: dst);
1293	}
1294	continue;
1295	}
1296
1297	src = page_folio(page);
1298
1299	if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) \|\| !dst) {
1300	if (dst) {
1301	WARN_ON_ONCE(fault_folio == dst);
1302	folio_unlock(folio: dst);
1303	folio_put(folio: dst);
1304	}
1305	dst = src;
1306	}
1307
1308	if (!folio_is_zone_device(folio: dst))
1309	folio_add_lru(dst);
1310	remove_migration_ptes(src, dst, flags: `0`);
1311	if (fault_folio != src)
1312	folio_unlock(folio: src);
1313	folio_put(folio: src);
1314
1315	if (dst != src) {
1316	WARN_ON_ONCE(fault_folio == dst);
1317	folio_unlock(folio: dst);
1318	folio_put(folio: dst);
1319	}
1320	}
1321	}
1322
1323	/*
1324	* migrate_device_finalize() - complete page migration
1325	* @src_pfns: src_pfns returned from migrate_device_range()
1326	* @dst_pfns: array of pfns allocated by the driver to migrate memory to
1327	* @npages: number of pages in the range
1328	*
1329	* Completes migration of the page by removing special migration entries.
1330	* Drivers must ensure copying of page data is complete and visible to the CPU
1331	* before calling this.
1332	*/
1333	void migrate_device_finalize(unsigned long *src_pfns,
1334	unsigned long dst_pfns, unsigned* long npages)
1335	{
1336	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
1337	}
1338	EXPORT_SYMBOL(migrate_device_finalize);
1339
1340	/**
1341	* migrate_vma_finalize() - restore CPU page table entry
1342	* @migrate: migrate struct containing all migration information
1343	*
1344	* This replaces the special migration pte entry with either a mapping to the
1345	* new page if migration was successful for that page, or to the original page
1346	* otherwise.
1347	*
1348	* This also unlocks the pages and puts them back on the lru, or drops the extra
1349	* refcount, for device pages.
1350	*/
1351	void migrate_vma_finalize(struct migrate_vma *migrate)
1352	{
1353	__migrate_device_finalize(src_pfns: migrate->src, dst_pfns: migrate->dst, npages: migrate->npages,
1354	fault_page: migrate->fault_page);
1355	}
1356	EXPORT_SYMBOL(migrate_vma_finalize);
1357
1358	static unsigned long migrate_device_pfn_lock(unsigned long pfn)
1359	{
1360	struct folio *folio;
1361
1362	folio = folio_get_nontail_page(pfn_to_page(pfn));
1363	if (!folio)
1364	return `0`;
1365
1366	if (!folio_trylock(folio)) {
1367	folio_put(folio);
1368	return `0`;
1369	}
1370
1371	return migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
1372	}
1373
1374	/**
1375	* migrate_device_range() - migrate device private pfns to normal memory.
1376	* @src_pfns: array large enough to hold migrating source device private pfns.
1377	* @start: starting pfn in the range to migrate.
1378	* @npages: number of pages to migrate.
1379	*
1380	* migrate_vma_setup() is similar in concept to migrate_vma_setup() except that
1381	* instead of looking up pages based on virtual address mappings a range of
1382	* device pfns that should be migrated to system memory is used instead.
1383	*
1384	* This is useful when a driver needs to free device memory but doesn't know the
1385	* virtual mappings of every page that may be in device memory. For example this
1386	* is often the case when a driver is being unloaded or unbound from a device.
1387	*
1388	* Like migrate_vma_setup() this function will take a reference and lock any
1389	* migrating pages that aren't free before unmapping them. Drivers may then
1390	* allocate destination pages and start copying data from the device to CPU
1391	* memory before calling migrate_device_pages().
1392	*/
1393	int migrate_device_range(unsigned long src_pfns, unsigned* long start,
1394	unsigned long npages)
1395	{
1396	unsigned long i, j, pfn;
1397
1398	for (pfn = start, i = `0`; i < npages; pfn++, i++) {
1399	struct page *page = pfn_to_page(pfn);
1400	struct folio *folio = page_folio(page);
1401	unsigned int nr = `1`;
1402
1403	src_pfns[i] = migrate_device_pfn_lock(pfn);
1404	nr = folio_nr_pages(folio);
1405	if (nr > `1`) {
1406	src_pfns[i] \|= MIGRATE_PFN_COMPOUND;
1407	for (j = `1`; j < nr; j++)
1408	src_pfns[i+j] = `0`;
1409	i += j - `1`;
1410	pfn += j - `1`;
1411	}
1412	}
1413
1414	migrate_device_unmap(src_pfns, npages, NULL);
1415
1416	return `0`;
1417	}
1418	EXPORT_SYMBOL(migrate_device_range);
1419
1420	/**
1421	* migrate_device_pfns() - migrate device private pfns to normal memory.
1422	* @src_pfns: pre-popluated array of source device private pfns to migrate.
1423	* @npages: number of pages to migrate.
1424	*
1425	* Similar to migrate_device_range() but supports non-contiguous pre-popluated
1426	* array of device pages to migrate.
1427	*/
1428	int migrate_device_pfns(unsigned long src_pfns, unsigned* long npages)
1429	{
1430	unsigned long i, j;
1431
1432	for (i = `0`; i < npages; i++) {
1433	struct page *page = pfn_to_page(src_pfns[i]);
1434	struct folio *folio = page_folio(page);
1435	unsigned int nr = `1`;
1436
1437	src_pfns[i] = migrate_device_pfn_lock(pfn: src_pfns[i]);
1438	nr = folio_nr_pages(folio);
1439	if (nr > `1`) {
1440	src_pfns[i] \|= MIGRATE_PFN_COMPOUND;
1441	for (j = `1`; j < nr; j++)
1442	src_pfns[i+j] = `0`;
1443	i += j - `1`;
1444	}
1445	}
1446
1447	migrate_device_unmap(src_pfns, npages, NULL);
1448
1449	return `0`;
1450	}
1451	EXPORT_SYMBOL(migrate_device_pfns);
1452
1453	/*
1454	* Migrate a device coherent folio back to normal memory. The caller should have
1455	* a reference on folio which will be copied to the new folio if migration is
1456	* successful or dropped on failure.
1457	*/
1458	int migrate_device_coherent_folio(struct folio *folio)
1459	{
1460	unsigned long src_pfn, dst_pfn = `0`;
1461	struct folio *dfolio;
1462
1463	WARN_ON_ONCE(folio_test_large(folio));
1464
1465	folio_lock(folio);
1466	src_pfn = migrate_pfn(pfn: folio_pfn(folio)) \| MIGRATE_PFN_MIGRATE;
1467
1468	/*
1469	* We don't have a VMA and don't need to walk the page tables to find
1470	* the source folio. So call migrate_vma_unmap() directly to unmap the
1471	* folio as migrate_vma_setup() will fail if args.vma == NULL.
1472	*/
1473	migrate_device_unmap(src_pfns: &src_pfn, npages: `1`, NULL);
1474	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
1475	return -EBUSY;
1476
1477	dfolio = folio_alloc(GFP_USER \| __GFP_NOWARN, `0`);
1478	if (dfolio) {
1479	folio_lock(folio: dfolio);
1480	dst_pfn = migrate_pfn(pfn: folio_pfn(folio: dfolio));
1481	}
1482
1483	migrate_device_pages(&src_pfn, &dst_pfn, `1`);
1484	if (src_pfn & MIGRATE_PFN_MIGRATE)
1485	folio_copy(dst: dfolio, src: folio);
1486	migrate_device_finalize(&src_pfn, &dst_pfn, `1`);
1487
1488	if (src_pfn & MIGRATE_PFN_MIGRATE)
1489	return `0`;
1490	return -EBUSY;
1491	}
1492

source code of linux/mm/migrate_device.c