// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/pgalloc.h>

#include <asm/tlb.h>
#include "internal.h"
#include "mm_slot.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_NO_PTE_TABLE,
	SCAN_PMD_MAPPED,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PTE_MAPPED_HUGEPAGE,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
	SCAN_STORE_FAILED,
	SCAN_COPY_MC,
	SCAN_PAGE_FILLED,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse hugepages if there is at least one pte mapped,
 * just as would have happened at page fault time had the vma been
 * large enough.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __ro_after_init;

struct collapse_control {
	bool is_khugepaged;

	/* Num pages scanned per node */
	u32 node_load[MAX_NUMNODES];

	/* nodemask for allocation fallback */
	nodemask_t alloc_nmask;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
				       unsigned int *millisecs)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	*millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs);
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR_RW(scan_sleep_millisecs);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs);
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR_RW(alloc_sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned int pages;
	int err;

	err = kstrtouint(buf, 10, &pages);
	if (err || !pages)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR_RW(pages_to_scan);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR_RW(defrag);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages over
 * ranges containing unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t max_ptes_none_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t max_ptes_none_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR_RW(max_ptes_none);

static ssize_t max_ptes_swap_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t max_ptes_swap_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR_RW(max_ptes_swap);

static ssize_t max_ptes_shared_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t max_ptes_shared_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR_RW(max_ptes_shared);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

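/* Return true if @pte is either a none pte or maps the shared zero page. */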
static bool pte_none_or_zero(pte_t pte)
{
	if (pte_none(pte))
		return true;
	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}

int hugepage_madvise(struct vm_area_struct *vma,
		     vm_flags_t *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		khugepaged_enter_vma(vma, *vm_flags);
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}

int __init khugepaged_init(void)
{
	mm_slot_cache = KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

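/* An mm counts as exited once its last user reference is gone (mm_users == 0). */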
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
	return hpage_collapse_test_exit(mm) ||
	       mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}

static bool hugepage_pmd_enabled(void)
{
	/*
	 * We cover the anon, shmem and the file-backed case here; file-backed
	 * hugepages, when configured in, are determined by the global control.
	 * Anon pmd-sized hugepages are determined by the pmd-size control.
	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
	 */
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
	    hugepage_global_enabled())
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
	    hugepage_global_enabled())
		return true;
	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
		return true;
	return false;
}

void __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *slot;
	int wakeup;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
		return;

	slot = mm_slot_alloc(mm_slot_cache);
	if (!slot)
		return;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}

void khugepaged_enter_vma(struct vm_area_struct *vma,
			  vm_flags_t vm_flags)
{
	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
	    hugepage_pmd_enabled()) {
		if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
			__khugepaged_enter(vma->vm_mm);
	}
}

void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	if (slot && khugepaged_scan.mm_slot != slot) {
		hash_del(&slot->hash);
		list_del(&slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		mm_flags_clear(MMF_VM_HUGEPAGE, mm);
		mm_slot_free(mm_slot_cache, slot);
		mmdrop(mm);
	} else if (slot) {
		/*
		 * This is required to serialize against
		 * hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

static void release_pte_folio(struct folio *folio)
{
	node_stat_mod_folio(folio,
			NR_ISOLATED_ANON + folio_is_file_lru(folio),
			-folio_nr_pages(folio));
	folio_unlock(folio);
	folio_putback_lru(folio);
}

static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct folio *folio, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = ptep_get(_pte);
		unsigned long pfn;

		if (pte_none(pteval))
			continue;
		VM_WARN_ON_ONCE(!pte_present(pteval));
		pfn = pte_pfn(pteval);
		if (is_zero_pfn(pfn))
			continue;
		folio = pfn_folio(pfn);
		if (folio_test_large(folio))
			continue;
		release_pte_folio(folio);
	}

	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
		list_del(&folio->lru);
		release_pte_folio(folio);
	}
}

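/*
 * Lock and isolate from the LRU every page mapped by the HPAGE_PMD_NR ptes
 * starting at @pte, enforcing the max_ptes_none/shared limits for khugepaged.
 * On SCAN_SUCCEED the pages remain locked and isolated (large folios queued
 * on @compound_pagelist); on failure everything isolated so far is released.
 */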
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long start_addr,
					pte_t *pte,
					struct collapse_control *cc,
					struct list_head *compound_pagelist)
{
	struct page *page = NULL;
	struct folio *folio = NULL;
	unsigned long addr = start_addr;
	pte_t *_pte;
	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		if (pte_uffd_wp(pteval)) {
			result = SCAN_PTE_UFFD_WP;
			goto out;
		}
		page = vm_normal_page(vma, addr, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		folio = page_folio(page);
		VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

		/* See hpage_collapse_scan_pmd(). */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out;
			}
		}

		if (folio_test_large(folio)) {
			struct folio *f;

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(f, compound_pagelist, lru) {
				if (folio == f)
					goto next;
			}
		}

		/*
		 * We can do it before folio_isolate_lru because the
		 * folio can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!folio_trylock(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has been already unlinked
		 * from the page table tree and this process cannot get
		 * an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			folio_unlock(folio);
			result = SCAN_PAGE_COUNT;
			goto out;
		}

		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (!folio_isolate_lru(folio)) {
			folio_unlock(folio);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

		if (folio_test_large(folio))
			list_add_tail(&folio->lru, compound_pagelist);
next:
		/*
		 * If collapse was initiated by khugepaged, check that there are
		 * enough young ptes to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, addr)))
			referenced++;
	}

	if (unlikely(cc->is_khugepaged && !referenced)) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
		trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
						    referenced, result);
		return result;
	}
out:
	release_pte_pages(pte, _pte, compound_pagelist);
	trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
					    referenced, result);
	return result;
}

static void __collapse_huge_page_copy_succeeded(pte_t *pte,
						struct vm_area_struct *vma,
						unsigned long address,
						spinlock_t *ptl,
						struct list_head *compound_pagelist)
{
	unsigned long end = address + HPAGE_PMD_SIZE;
	struct folio *src, *tmp;
	pte_t pteval;
	pte_t *_pte;
	unsigned int nr_ptes;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
	     address += nr_ptes * PAGE_SIZE) {
		nr_ptes = 1;
		pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (pte_none(pteval))
				continue;
			/*
			 * ptl mostly unnecessary.
			 */
			spin_lock(ptl);
			ptep_clear(vma->vm_mm, address, _pte);
			spin_unlock(ptl);
			ksm_might_unmap_zero_page(vma->vm_mm, pteval);
		} else {
			struct page *src_page = pte_page(pteval);

			src = page_folio(src_page);

			if (folio_test_large(src)) {
				unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;

				nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
			} else {
				release_pte_folio(src);
			}

			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside folio_remove_rmap_pte().
			 */
			spin_lock(ptl);
			clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
			folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
			spin_unlock(ptl);
			free_swap_cache(src);
			folio_put_refs(src, nr_ptes);
		}
	}

	list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
		list_del(&src->lru);
		node_stat_sub_folio(src, NR_ISOLATED_ANON +
				folio_is_file_lru(src));
		folio_unlock(src);
		free_swap_cache(src);
		folio_putback_lru(src);
	}
}

static void __collapse_huge_page_copy_failed(pte_t *pte,
					     pmd_t *pmd,
					     pmd_t orig_pmd,
					     struct vm_area_struct *vma,
					     struct list_head *compound_pagelist)
{
	spinlock_t *pmd_ptl;

	/*
	 * Re-establish the PMD to point to the original page table
	 * entry. Restoring PMD needs to be done prior to releasing
	 * pages. Since pages are still isolated and locked here,
	 * acquiring anon_vma_lock_write is unnecessary.
	 */
	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
	spin_unlock(pmd_ptl);
	/*
	 * Release both raw and compound pages isolated
	 * in __collapse_huge_page_isolate.
	 */
	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - attempts to copy memory contents from raw
 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
 * otherwise restores the original page table and releases isolated raw pages.
 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
 *
 * @pte: starting of the PTEs to copy from
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
		unsigned long address, spinlock_t *ptl,
		struct list_head *compound_pagelist)
{
	unsigned int i;
	int result = SCAN_SUCCEED;

	/*
	 * Copying pages' contents is subject to memory poison at any iteration.
	 */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pte_t pteval = ptep_get(pte + i);
		struct page *page = folio_page(folio, i);
		unsigned long src_addr = address + i * PAGE_SIZE;
		struct page *src_page;

		if (pte_none_or_zero(pteval)) {
			clear_user_highpage(page, src_addr);
			continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
			result = SCAN_COPY_MC;
			break;
		}
	}

	if (likely(result == SCAN_SUCCEED))
		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
						    compound_pagelist);
	else
		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
						 compound_pagelist);

	return result;
}

static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

struct collapse_control khugepaged_collapse_control = {
	.is_khugepaged = true,
};

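/*
 * Abort the scan if the folios seen so far span NUMA nodes further apart
 * than node_reclaim_distance allows; only relevant when node reclaim is
 * enabled and allocating the hugepage locally is worth extra effort.
 */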
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
	int i;

	/*
	 * If node_reclaim_mode is disabled, then no extra effort is made to
	 * allocate memory locally.
	 */
	if (!node_reclaim_enabled())
		return false;

	/* If there is a count for this node already, it must be acceptable */
	if (cc->node_load[nid])
		return false;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!cc->node_load[i])
			continue;
		if (node_distance(nid, i) > node_reclaim_distance)
			return true;
	}
	return false;
}

#define khugepaged_defrag()					\
	(transparent_hugepage_flags &				\
	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (cc->node_load[nid] > max_value) {
			max_value = cc->node_load[nid];
			target_node = nid;
		}

	for_each_online_node(nid) {
		if (max_value == cc->node_load[nid])
			node_set(nid, cc->alloc_nmask);
	}

	return target_node;
}
#else
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	return 0;
}
#endif

/*
 * If the mmap_lock was temporarily dropped, revalidate the vma
 * before taking the mmap_lock again.
 * Returns an enum scan_result value.
 */

static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
				   bool expect_anon,
				   struct vm_area_struct **vmap,
				   struct collapse_control *cc)
{
	struct vm_area_struct *vma;
	enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
						 TVA_FORCED_COLLAPSE;

	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		return SCAN_ANY_PROCESS;

	*vmap = vma = find_vma(mm, address);
	if (!vma)
		return SCAN_VMA_NULL;

	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
		return SCAN_ADDRESS_RANGE;
	if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
		return SCAN_VMA_CHECK;
	/*
	 * Anon VMA expected, the address may be unmapped then
	 * remapped to file after khugepaged reacquired the mmap_lock.
	 *
	 * thp_vma_allowable_order may return true for qualified file
	 * vmas.
	 */
	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
		return SCAN_PAGE_ANON;
	return SCAN_SUCCEED;
}

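/*
 * Classify the state of a pmd entry: SCAN_SUCCEED if it points to a pte
 * table we can operate on, SCAN_PMD_MAPPED if it is already a huge mapping
 * (or a folio under migration), SCAN_NO_PTE_TABLE otherwise.
 */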
static inline int check_pmd_state(pmd_t *pmd)
{
	pmd_t pmde = pmdp_get_lockless(pmd);

	if (pmd_none(pmde))
		return SCAN_NO_PTE_TABLE;

	/*
	 * The folio may be under migration when khugepaged is trying to
	 * collapse it. Migration success or failure will eventually end
	 * up with a present PMD mapping a folio again.
	 */
	if (pmd_is_migration_entry(pmde))
		return SCAN_PMD_MAPPED;
	if (!pmd_present(pmde))
		return SCAN_NO_PTE_TABLE;
	if (pmd_trans_huge(pmde))
		return SCAN_PMD_MAPPED;
	if (pmd_bad(pmde))
		return SCAN_NO_PTE_TABLE;
	return SCAN_SUCCEED;
}

static int find_pmd_or_thp_or_none(struct mm_struct *mm,
				   unsigned long address,
				   pmd_t **pmd)
{
	*pmd = mm_find_pmd(mm, address);
	if (!*pmd)
		return SCAN_NO_PTE_TABLE;

	return check_pmd_state(*pmd);
}

static int check_pmd_still_valid(struct mm_struct *mm,
				 unsigned long address,
				 pmd_t *pmd)
{
	pmd_t *new_pmd;
	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);

	if (result != SCAN_SUCCEED)
		return result;
	if (new_pmd != pmd)
		return SCAN_FAIL;
	return SCAN_SUCCEED;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
				       struct vm_area_struct *vma,
				       unsigned long start_addr, pmd_t *pmd,
				       int referenced)
{
	int swapped_in = 0;
	vm_fault_t ret = 0;
	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
	int result;
	pte_t *pte = NULL;
	spinlock_t *ptl;

	for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
		struct vm_fault vmf = {
			.vma = vma,
			.address = addr,
			.pgoff = linear_page_index(vma, addr),
			.flags = FAULT_FLAG_ALLOW_RETRY,
			.pmd = pmd,
		};

		if (!pte++) {
			/*
			 * Here the ptl is only used to check pte_same() in
			 * do_swap_page(), so readonly version is enough.
			 */
			pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
			if (!pte) {
				mmap_read_unlock(mm);
				result = SCAN_NO_PTE_TABLE;
				goto out;
			}
		}

		vmf.orig_pte = ptep_get_lockless(pte);
		if (pte_none(vmf.orig_pte) ||
		    pte_present(vmf.orig_pte))
			continue;

		vmf.pte = pte;
		vmf.ptl = ptl;
		ret = do_swap_page(&vmf);
		/* Which unmaps pte (after perhaps re-checking the entry) */
		pte = NULL;

		/*
		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
		 * we do not retry here and swap entry will remain in pagetable
		 * resulting in later failure.
		 */
		if (ret & VM_FAULT_RETRY) {
			/* Likely, but not guaranteed, that page lock failed */
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		if (ret & VM_FAULT_ERROR) {
			mmap_read_unlock(mm);
			result = SCAN_FAIL;
			goto out;
		}
		swapped_in++;
	}

	if (pte)
		pte_unmap(pte);

	/* Drain LRU cache to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();

	result = SCAN_SUCCEED;
out:
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
	return result;
}

static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
			      struct collapse_control *cc)
{
	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
		     GFP_TRANSHUGE);
	int node = hpage_collapse_find_target_node(cc);
	struct folio *folio;

	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
	if (!folio) {
		*foliop = NULL;
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return SCAN_ALLOC_HUGE_PAGE_FAIL;
	}

	count_vm_event(THP_COLLAPSE_ALLOC);
	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*foliop = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
	}

	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

	*foliop = folio;
	return SCAN_SUCCEED;
}

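/*
 * Attempt to collapse the HPAGE_PMD_SIZE range at @address into one
 * PMD-mapped hugepage backed by a freshly allocated folio. Called with
 * mmap_lock held for read; the lock is dropped and re-taken (finally in
 * write mode) along the way, and is released again before returning.
 */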
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
			      int referenced, int unmapped,
			      struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	spinlock_t *pmd_ptl, *pte_ptl;
	int result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);

	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump out_nolock directly in
		 * that case. Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}

	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 *
	 * UFFDIO_MOVE is also prevented from racing, thanks to the
	 * mmap_lock.
	 */
	mmap_write_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	vma_start_write(vma);
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel GUP-fast is fine since GUP-fast will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();

	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_NO_PTE_TABLE;
	}

	if (unlikely(result != SCAN_SUCCEED)) {
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * point to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	/*
	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
	 * copy_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	pgtable = pmd_pgtable(_pmd);

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
	spin_unlock(pmd_ptl);

	folio = NULL;

	result = SCAN_SUCCEED;
out_up_write:
	mmap_write_unlock(mm);
out_nolock:
	if (folio)
		folio_put(folio);
	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
	return result;
}

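/*
 * Scan one PMD-sized, PMD-aligned range for collapse candidates. On
 * SCAN_SUCCEED the actual collapse is attempted via collapse_huge_page(),
 * which releases the mmap_lock, so *mmap_locked is cleared for the caller.
 */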
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   unsigned long start_addr, bool *mmap_locked,
				   struct collapse_control *cc)
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int result = SCAN_FAIL, referenced = 0;
	int none_or_zero = 0, shared = 0;
	struct page *page = NULL;
	struct folio *folio = NULL;
	unsigned long addr;
	spinlock_t *ptl;
	int node = NUMA_NO_NODE, unmapped = 0;

	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);

	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
	if (result != SCAN_SUCCEED)
		goto out;

	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
	if (!pte) {
		result = SCAN_NO_PTE_TABLE;
		goto out;
	}

	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (!pte_present(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries. Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we can also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked but that could bring unknown
			 * userfault messages that fall outside of
			 * the registered range. So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}

		page = vm_normal_page(vma, addr, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}
		folio = page_folio(page);

		if (!folio_test_anon(folio)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}

		/*
		 * We treat a single page as shared if any part of the THP
		 * is shared.
		 */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}

		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate the hugepage from the node that
		 * has the max hit record.
		 */
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (folio_test_locked(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see folio_mapcount() > folio_ref_count().
		 * But such a case is ephemeral; we could always retry collapse
		 * later. However it may report a false positive if the page
		 * has excessive GUP pins (i.e. 512). Anyway the same check
		 * will be done again later, so the risk seems low.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}

		/*
		 * If collapse was initiated by khugepaged, check that there are
		 * enough young ptes to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, addr)))
			referenced++;
	}
	if (cc->is_khugepaged &&
	    (!referenced ||
	     (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, start_addr, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*mmap_locked = false;
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
				     none_or_zero, result, unmapped);
	return result;
}

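/*
 * Drop an mm_slot from the scan list and hash once its mm has exited;
 * called with khugepaged_mm_lock held.
 */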
| 1418 | static void collect_mm_slot(struct mm_slot *slot) |
| 1419 | { |
| 1420 | struct mm_struct *mm = slot->mm; |
| 1421 | |
| 1422 | lockdep_assert_held(&khugepaged_mm_lock); |
| 1423 | |
| 1424 | if (hpage_collapse_test_exit(mm)) { |
| 1425 | /* free mm_slot */ |
| 1426 | hash_del(node: &slot->hash); |
| 1427 | list_del(entry: &slot->mm_node); |
| 1428 | |
| 1429 | /* |
| 1430 | * Not strictly needed because the mm exited already. |
| 1431 | * |
| 1432 | * mm_flags_clear(MMF_VM_HUGEPAGE, mm); |
| 1433 | */ |
| 1434 | |
| 1435 | /* khugepaged_mm_lock actually not necessary for the below */ |
| 1436 | mm_slot_free(cache: mm_slot_cache, objp: slot); |
| 1437 | mmdrop(mm); |
| 1438 | } |
| 1439 | } |
| 1440 | |
| 1441 | /* folio must be locked, and mmap_lock must be held */ |
| 1442 | static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, |
| 1443 | pmd_t *pmdp, struct folio *folio, struct page *page) |
| 1444 | { |
| 1445 | struct mm_struct *mm = vma->vm_mm; |
| 1446 | struct vm_fault vmf = { |
| 1447 | .vma = vma, |
| 1448 | .address = addr, |
| 1449 | .flags = 0, |
| 1450 | }; |
| 1451 | pgd_t *pgdp; |
| 1452 | p4d_t *p4dp; |
| 1453 | pud_t *pudp; |
| 1454 | |
| 1455 | mmap_assert_locked(mm: vma->vm_mm); |
| 1456 | |
| 1457 | if (!pmdp) { |
| 1458 | pgdp = pgd_offset(mm, addr); |
| 1459 | p4dp = p4d_alloc(mm, pgd: pgdp, address: addr); |
| 1460 | if (!p4dp) |
| 1461 | return SCAN_FAIL; |
| 1462 | pudp = pud_alloc(mm, p4d: p4dp, address: addr); |
| 1463 | if (!pudp) |
| 1464 | return SCAN_FAIL; |
| 1465 | pmdp = pmd_alloc(mm, pud: pudp, address: addr); |
| 1466 | if (!pmdp) |
| 1467 | return SCAN_FAIL; |
| 1468 | } |
| 1469 | |
| 1470 | vmf.pmd = pmdp; |
| 1471 | if (do_set_pmd(vmf: &vmf, folio, page)) |
| 1472 | return SCAN_FAIL; |
| 1473 | |
| 1474 | folio_get(folio); |
| 1475 | return SCAN_SUCCEED; |
| 1476 | } |
| 1477 | |
| 1478 | /** |
| 1479 | * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at |
| 1480 | * address haddr. |
| 1481 | * |
| 1482 | * @mm: process address space where collapse happens |
| 1483 | * @addr: THP collapse address |
| 1484 | * @install_pmd: If a huge PMD should be installed |
| 1485 | * |
| 1486 | * This function checks whether all the PTEs in the PMD are pointing to the |
| 1487 | * right THP. If so, retract the page table so the THP can refault in with |
| 1488 | * as pmd-mapped. Possibly install a huge PMD mapping the THP. |
| 1489 | */ |
| 1490 | int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, |
| 1491 | bool install_pmd) |
| 1492 | { |
| 1493 | int nr_mapped_ptes = 0, result = SCAN_FAIL; |
| 1494 | unsigned int nr_batch_ptes; |
| 1495 | struct mmu_notifier_range range; |
| 1496 | bool notified = false; |
| 1497 | unsigned long haddr = addr & HPAGE_PMD_MASK; |
| 1498 | unsigned long end = haddr + HPAGE_PMD_SIZE; |
| 1499 | struct vm_area_struct *vma = vma_lookup(mm, addr: haddr); |
| 1500 | struct folio *folio; |
| 1501 | pte_t *start_pte, *pte; |
| 1502 | pmd_t *pmd, pgt_pmd; |
| 1503 | spinlock_t *pml = NULL, *ptl; |
| 1504 | int i; |
| 1505 | |
| 1506 | mmap_assert_locked(mm); |
| 1507 | |
| 1508 | /* First check VMA found, in case page tables are being torn down */ |
| 1509 | if (!vma || !vma->vm_file || |
| 1510 | !range_in_vma(vma, start: haddr, end: haddr + HPAGE_PMD_SIZE)) |
| 1511 | return SCAN_VMA_CHECK; |
| 1512 | |
| 1513 | /* Fast check before locking page if already PMD-mapped */ |
| 1514 | result = find_pmd_or_thp_or_none(mm, address: haddr, pmd: &pmd); |
| 1515 | if (result == SCAN_PMD_MAPPED) |
| 1516 | return result; |
| 1517 | |
| 1518 | /* |
| 1519 | * If we are here, we've succeeded in replacing all the native pages |
| 1520 | * in the page cache with a single hugepage. If a mm were to fault-in |
| 1521 | * this memory (mapped by a suitably aligned VMA), we'd get the hugepage |
| 1522 | * and map it by a PMD, regardless of sysfs THP settings. As such, let's |
| 1523 | * analogously elide sysfs THP settings here and force collapse. |
| 1524 | */ |
| 1525 | if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) |
| 1526 | return SCAN_VMA_CHECK; |
| 1527 | |
| 1528 | /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ |
| 1529 | if (userfaultfd_wp(vma)) |
| 1530 | return SCAN_PTE_UFFD_WP; |
| 1531 | |
| 1532 | folio = filemap_lock_folio(mapping: vma->vm_file->f_mapping, |
| 1533 | index: linear_page_index(vma, address: haddr)); |
| 1534 | if (IS_ERR(ptr: folio)) |
| 1535 | return SCAN_PAGE_NULL; |
| 1536 | |
| 1537 | if (folio_order(folio) != HPAGE_PMD_ORDER) { |
| 1538 | result = SCAN_PAGE_COMPOUND; |
| 1539 | goto drop_folio; |
| 1540 | } |
| 1541 | |
| 1542 | result = find_pmd_or_thp_or_none(mm, address: haddr, pmd: &pmd); |
| 1543 | switch (result) { |
| 1544 | case SCAN_SUCCEED: |
| 1545 | break; |
| 1546 | case SCAN_NO_PTE_TABLE: |
| 1547 | /* |
| 1548 | * All pte entries have been removed and pmd cleared. |
| 1549 | * Skip all the pte checks and just update the pmd mapping. |
| 1550 | */ |
| 1551 | goto maybe_install_pmd; |
| 1552 | default: |
| 1553 | goto drop_folio; |
| 1554 | } |
| 1555 | |
| 1556 | result = SCAN_FAIL; |
| 1557 | start_pte = pte_offset_map_lock(mm, pmd, addr: haddr, ptlp: &ptl); |
| 1558 | if (!start_pte) /* mmap_lock + page lock should prevent this */ |
| 1559 | goto drop_folio; |
| 1560 | |
| 1561 | /* step 1: check all mapped PTEs are to the right huge page */ |
| 1562 | for (i = 0, addr = haddr, pte = start_pte; |
| 1563 | i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { |
| 1564 | struct page *page; |
| 1565 | pte_t ptent = ptep_get(ptep: pte); |
| 1566 | |
| 1567 | /* empty pte, skip */ |
| 1568 | if (pte_none(pte: ptent)) |
| 1569 | continue; |
| 1570 | |
| 1571 | /* page swapped out, abort */ |
| 1572 | if (!pte_present(a: ptent)) { |
| 1573 | result = SCAN_PTE_NON_PRESENT; |
| 1574 | goto abort; |
| 1575 | } |
| 1576 | |
| 1577 | page = vm_normal_page(vma, addr, pte: ptent); |
| 1578 | if (WARN_ON_ONCE(page && is_zone_device_page(page))) |
| 1579 | page = NULL; |
| 1580 | /* |
| 1581 | * Note that uprobe, debugger, or MAP_PRIVATE may change the |
| 1582 | * page table, but the new page will not be a subpage of hpage. |
| 1583 | */ |
| 1584 | if (folio_page(folio, i) != page) |
| 1585 | goto abort; |
| 1586 | } |
| 1587 | |
| 1588 | pte_unmap_unlock(start_pte, ptl); |
| 1589 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, |
| 1590 | haddr, haddr + HPAGE_PMD_SIZE); |
| 1591 | mmu_notifier_invalidate_range_start(&range); |
| 1592 | notified = true; |
| 1593 | |
| 1594 | /* |
| 1595 | * pmd_lock covers a wider range than ptl, and (if split from mm's |
| 1596 | * page_table_lock) ptl nests inside pml. The less time we hold pml, |
| 1597 | * the better; but userfaultfd's mfill_atomic_pte() on a private VMA |
| 1598 | * inserts a valid as-if-COWed PTE without even looking up page cache. |
| 1599 | * So page lock of folio does not protect from it, so we must not drop |
| 1600 | * ptl before pgt_pmd is removed, so uffd private needs pml taken now. |
| 1601 | */ |
| 1602 | if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) |
| 1603 | pml = pmd_lock(mm, pmd); |
| 1604 | |
| 1605 | start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); |
| 1606 | if (!start_pte) /* mmap_lock + page lock should prevent this */ |
| 1607 | goto abort; |
| 1608 | if (!pml) |
| 1609 | spin_lock(ptl); |
| 1610 | else if (ptl != pml) |
| 1611 | spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); |
| 1612 | |
| 1613 | if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) |
| 1614 | goto abort; |
| 1615 | |
| 1616 | /* step 2: clear page table and adjust rmap */ |
| 1617 | for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; |
| 1618 | i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, |
| 1619 | pte += nr_batch_ptes) { |
| 1620 | unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; |
| 1621 | struct page *page; |
| 1622 | pte_t ptent = ptep_get(pte); |
| 1623 | |
| 1624 | nr_batch_ptes = 1; |
| 1625 | |
| 1626 | if (pte_none(ptent)) |
| 1627 | continue; |
| 1628 | /* |
| 1629 | * We dropped ptl after the first scan, to do the mmu_notifier: |
| 1630 | * page lock stops more PTEs of the folio being faulted in, but |
| 1631 | * does not stop write faults COWing anon copies from existing |
| 1632 | * PTEs; and does not stop those being swapped out or migrated. |
| 1633 | */ |
| 1634 | if (!pte_present(ptent)) { |
| 1635 | result = SCAN_PTE_NON_PRESENT; |
| 1636 | goto abort; |
| 1637 | } |
| 1638 | page = vm_normal_page(vma, addr, ptent); |
| 1639 | |
| 1640 | if (folio_page(folio, i) != page) |
| 1641 | goto abort; |
| 1642 | |
| 1643 | nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); |
| 1644 | |
| 1645 | /* |
| 1646 | * Must clear entry, or a racing truncate may re-remove it. |
| 1647 | * TLB flush can be left until pmdp_collapse_flush() does it. |
| 1648 | * PTE dirty? Shmem page is already dirty; file is read-only. |
| 1649 | */ |
| 1650 | clear_ptes(mm, addr, pte, nr_batch_ptes); |
| 1651 | folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); |
| 1652 | nr_mapped_ptes += nr_batch_ptes; |
| 1653 | } |
| 1654 | |
| 1655 | if (!pml) |
| 1656 | spin_unlock(ptl); |
| 1657 | |
| 1658 | /* step 3: set proper refcount and mm_counters. */ |
| 1659 | if (nr_mapped_ptes) { |
| 1660 | folio_ref_sub(folio, nr_mapped_ptes); |
| 1661 | add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); |
| 1662 | } |
| 1663 | |
| 1664 | /* step 4: remove empty page table */ |
| 1665 | if (!pml) { |
| 1666 | pml = pmd_lock(mm, pmd); |
| 1667 | if (ptl != pml) { |
| 1668 | spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); |
| 1669 | if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { |
| 1670 | flush_tlb_mm(mm); |
| 1671 | goto unlock; |
| 1672 | } |
| 1673 | } |
| 1674 | } |
| 1675 | pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); |
| 1676 | pmdp_get_lockless_sync(); |
| 1677 | pte_unmap_unlock(start_pte, ptl); |
| 1678 | if (ptl != pml) |
| 1679 | spin_unlock(pml); |
| 1680 | |
| 1681 | mmu_notifier_invalidate_range_end(&range); |
| 1682 | |
| 1683 | mm_dec_nr_ptes(mm); |
| 1684 | page_table_check_pte_clear_range(mm, haddr, pgt_pmd); |
| 1685 | pte_free_defer(mm, pmd_pgtable(pgt_pmd)); |
| 1686 | |
| 1687 | maybe_install_pmd: |
| 1688 | /* step 5: install pmd entry */ |
| 1689 | result = install_pmd |
| 1690 | ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) |
| 1691 | : SCAN_SUCCEED; |
| 1692 | goto drop_folio; |
| 1693 | abort: |
| 1694 | if (nr_mapped_ptes) { |
| 1695 | flush_tlb_mm(mm); |
| 1696 | folio_ref_sub(folio, nr_mapped_ptes); |
| 1697 | add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); |
| 1698 | } |
| 1699 | unlock: |
| 1700 | if (start_pte) |
| 1701 | pte_unmap_unlock(start_pte, ptl); |
| 1702 | if (pml && pml != ptl) |
| 1703 | spin_unlock(pml); |
| 1704 | if (notified) |
| 1705 | mmu_notifier_invalidate_range_end(&range); |
| 1706 | drop_folio: |
| 1707 | folio_unlock(folio); |
| 1708 | folio_put(folio); |
| 1709 | return result; |
| 1710 | } |
| 1711 | |
| 1712 | /* Can we retract page tables for this file-backed VMA? */ |
| 1713 | static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) |
| 1714 | { |
| 1715 | /* |
| 1716 | * Check vma->anon_vma to exclude MAP_PRIVATE mappings that |
| 1717 | * got written to. These VMAs are likely not worth removing |
| 1718 | * page tables from, as PMD-mapping is likely to be split later. |
| 1719 | */ |
| 1720 | if (READ_ONCE(vma->anon_vma)) |
| 1721 | return false; |
| 1722 | |
| 1723 | /* |
| 1724 | * When a vma is registered with uffd-wp, we cannot recycle |
| 1725 | * the page table because there may be pte markers installed. |
| 1726 | * Other vmas can still have the same file mapped hugely, but |
| 1727 | * skip this one: it will always be mapped in small page size |
| 1728 | * for uffd-wp registered ranges. |
| 1729 | */ |
| 1730 | if (userfaultfd_wp(vma)) |
| 1731 | return false; |
| 1732 | |
| 1733 | /* |
| 1734 | * If the VMA contains guard regions then we can't collapse it. |
| 1735 | * |
| 1736 | * This is set atomically on guard marker installation under mmap/VMA |
| 1737 | * read lock, and here we may not hold any VMA or mmap lock at all. |
| 1738 | * |
| 1739 | * This is therefore serialised on the PTE page table lock, which is |
| 1740 | * obtained on guard region installation after the flag is set, so this |
| 1741 | * check being performed under this lock excludes races. |
| 1742 | */ |
| 1743 | if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) |
| 1744 | return false; |
| 1745 | |
| 1746 | return true; |
| 1747 | } |
| 1748 | |
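| | /* |
| | * Walk every VMA in @mapping that maps @pgoff and, where the range is |
| | * hugepage-aligned and file_backed_vma_is_retractable() allows it, |
| | * withdraw the now-empty PTE page table so a later fault can map the |
| | * new huge folio with a PMD. Runs without mmap_lock, so the checks are |
| | * repeated under the page table locks. |
| | */ |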
| 1749 | static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) |
| 1750 | { |
| 1751 | struct vm_area_struct *vma; |
| 1752 | |
| 1753 | i_mmap_lock_read(mapping); |
| 1754 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 1755 | struct mmu_notifier_range range; |
| 1756 | struct mm_struct *mm; |
| 1757 | unsigned long addr; |
| 1758 | pmd_t *pmd, pgt_pmd; |
| 1759 | spinlock_t *pml; |
| 1760 | spinlock_t *ptl; |
| 1761 | bool success = false; |
| 1762 | |
| 1763 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
| 1764 | if (addr & ~HPAGE_PMD_MASK || |
| 1765 | vma->vm_end < addr + HPAGE_PMD_SIZE) |
| 1766 | continue; |
| 1767 | |
| 1768 | mm = vma->vm_mm; |
| 1769 | if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) |
| 1770 | continue; |
| 1771 | |
| 1772 | if (hpage_collapse_test_exit(mm)) |
| 1773 | continue; |
| 1774 | |
| 1775 | if (!file_backed_vma_is_retractable(vma)) |
| 1776 | continue; |
| 1777 | |
| 1778 | /* PTEs were notified when unmapped; but now for the PMD? */ |
| 1779 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, |
| 1780 | addr, addr + HPAGE_PMD_SIZE); |
| 1781 | mmu_notifier_invalidate_range_start(&range); |
| 1782 | |
| 1783 | pml = pmd_lock(mm, pmd); |
| 1784 | /* |
| 1785 | * The lock of new_folio is still held, we will be blocked in |
| 1786 | * the page fault path, which prevents the pte entries from |
| 1787 | * being set again. So even though the old empty PTE page may be |
| 1788 | * concurrently freed and a new PTE page is filled into the pmd |
| 1789 | * entry, it is still empty and can be removed. |
| 1790 | * |
| 1791 | * So here we only need to recheck if the state of pmd entry |
| 1792 | * still meets our requirements, rather than checking pmd_same() |
| 1793 | * like elsewhere. |
| 1794 | */ |
| 1795 | if (check_pmd_state(pmd) != SCAN_SUCCEED) |
| 1796 | goto drop_pml; |
| 1797 | ptl = pte_lockptr(mm, pmd); |
| 1798 | if (ptl != pml) |
| 1799 | spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); |
| 1800 | |
| 1801 | /* |
| 1802 | * Huge page lock is still held, so normally the page table must |
| 1803 | * remain empty; and we have already skipped anon_vma and |
| 1804 | * userfaultfd_wp() vmas. But since the mmap_lock is not held, |
| 1805 | * it is still possible for a racing userfaultfd_ioctl() or |
| 1806 | * madvise() to have inserted ptes or markers. Now that we hold |
| 1807 | * ptlock, repeating the retractable checks protects us from |
| 1808 | * races against the prior checks. |
| 1809 | */ |
| 1810 | if (likely(file_backed_vma_is_retractable(vma))) { |
| 1811 | pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); |
| 1812 | pmdp_get_lockless_sync(); |
| 1813 | success = true; |
| 1814 | } |
| 1815 | |
| 1816 | if (ptl != pml) |
| 1817 | spin_unlock(ptl); |
| 1818 | drop_pml: |
| 1819 | spin_unlock(pml); |
| 1820 | |
| 1821 | mmu_notifier_invalidate_range_end(&range); |
| 1822 | |
| 1823 | if (success) { |
| 1824 | mm_dec_nr_ptes(mm); |
| 1825 | page_table_check_pte_clear_range(mm, addr, pgt_pmd); |
| 1826 | pte_free_defer(mm, pmd_pgtable(pgt_pmd)); |
| 1827 | } |
| 1828 | } |
| 1829 | i_mmap_unlock_read(mapping); |
| 1830 | } |
| 1831 | |
| 1832 | /** |
| 1833 | * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. |
| 1834 | * |
| 1835 | * @mm: process address space where collapse happens |
| 1836 | * @addr: virtual collapse start address |
| 1837 | * @file: file being collapsed |
| 1838 | * @start: collapse start page index in @file |
| 1839 | * @cc: collapse context and scratchpad |
| 1840 | * |
| 1841 | * Basic scheme is simple, details are more complex: |
| 1842 | * - allocate and lock a new huge page; |
| 1843 | * - scan page cache, locking old pages |
| 1844 | * + swap/gup in pages if necessary; |
| 1845 | * - copy data to new page |
| 1846 | * - handle shmem holes |
| 1847 | * + re-validate that holes weren't filled by someone else |
| 1848 | * + check for userfaultfd |
| 1849 | * - finalize updates to the page cache; |
| 1850 | * - if replacing succeeds: |
| 1851 | * + unlock huge page; |
| 1852 | * + free old pages; |
| 1853 | * - if replacing failed: |
| 1854 | * + unlock old pages |
| 1855 | * + unlock and free huge page; |
| 1856 | */ |
| 1857 | static int collapse_file(struct mm_struct *mm, unsigned long addr, |
| 1858 | struct file *file, pgoff_t start, |
| 1859 | struct collapse_control *cc) |
| 1860 | { |
| 1861 | struct address_space *mapping = file->f_mapping; |
| 1862 | struct page *dst; |
| 1863 | struct folio *folio, *tmp, *new_folio; |
| 1864 | pgoff_t index = 0, end = start + HPAGE_PMD_NR; |
| 1865 | LIST_HEAD(pagelist); |
| 1866 | XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); |
| 1867 | int nr_none = 0, result = SCAN_SUCCEED; |
| 1868 | bool is_shmem = shmem_file(file); |
| 1869 | |
| 1870 | VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); |
| 1871 | VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); |
| 1872 | |
| 1873 | result = alloc_charge_folio(&new_folio, mm, cc); |
| 1874 | if (result != SCAN_SUCCEED) |
| 1875 | goto out; |
| 1876 | |
| 1877 | mapping_set_update(&xas, mapping); |
| 1878 | |
| 1879 | __folio_set_locked(new_folio); |
| 1880 | if (is_shmem) |
| 1881 | __folio_set_swapbacked(new_folio); |
| 1882 | new_folio->index = start; |
| 1883 | new_folio->mapping = mapping; |
| 1884 | |
| 1885 | /* |
| 1886 | * Ensure we have slots for all the pages in the range. This is |
| 1887 | * almost certainly a no-op because most of the pages must be present |
| 1888 | */ |
| 1889 | do { |
| 1890 | xas_lock_irq(&xas); |
| 1891 | xas_create_range(&xas); |
| 1892 | if (!xas_error(&xas)) |
| 1893 | break; |
| 1894 | xas_unlock_irq(&xas); |
| 1895 | if (!xas_nomem(&xas, GFP_KERNEL)) { |
| 1896 | result = SCAN_FAIL; |
| 1897 | goto rollback; |
| 1898 | } |
| 1899 | } while (1); |
| 1900 | |
| 1901 | for (index = start; index < end;) { |
| 1902 | xas_set(&xas, index); |
| 1903 | folio = xas_load(&xas); |
| 1904 | |
| 1905 | VM_BUG_ON(index != xas.xa_index); |
| 1906 | if (is_shmem) { |
| 1907 | if (!folio) { |
| 1908 | /* |
| 1909 | * Stop if extent has been truncated or |
| 1910 | * hole-punched, and is now completely |
| 1911 | * empty. |
| 1912 | */ |
| 1913 | if (index == start) { |
| 1914 | if (!xas_next_entry(&xas, end - 1)) { |
| 1915 | result = SCAN_TRUNCATED; |
| 1916 | goto xa_locked; |
| 1917 | } |
| 1918 | } |
| 1919 | nr_none++; |
| 1920 | index++; |
| 1921 | continue; |
| 1922 | } |
| 1923 | |
| 1924 | if (xa_is_value(folio) || !folio_test_uptodate(folio)) { |
| 1925 | xas_unlock_irq(&xas); |
| 1926 | /* swap in or instantiate fallocated page */ |
| 1927 | if (shmem_get_folio(mapping->host, index, 0, |
| 1928 | &folio, SGP_NOALLOC)) { |
| 1929 | result = SCAN_FAIL; |
| 1930 | goto xa_unlocked; |
| 1931 | } |
| 1932 | /* drain lru cache to help folio_isolate_lru() */ |
| 1933 | lru_add_drain(); |
| 1934 | } else if (folio_trylock(folio)) { |
| 1935 | folio_get(folio); |
| 1936 | xas_unlock_irq(&xas); |
| 1937 | } else { |
| 1938 | result = SCAN_PAGE_LOCK; |
| 1939 | goto xa_locked; |
| 1940 | } |
| 1941 | } else { /* !is_shmem */ |
| 1942 | if (!folio || xa_is_value(folio)) { |
| 1943 | xas_unlock_irq(&xas); |
| 1944 | page_cache_sync_readahead(mapping, &file->f_ra, |
| 1945 | file, index, |
| 1946 | end - index); |
| 1947 | /* drain lru cache to help folio_isolate_lru() */ |
| 1948 | lru_add_drain(); |
| 1949 | folio = filemap_lock_folio(mapping, index); |
| 1950 | if (IS_ERR(folio)) { |
| 1951 | result = SCAN_FAIL; |
| 1952 | goto xa_unlocked; |
| 1953 | } |
| 1954 | } else if (folio_test_dirty(folio)) { |
| 1955 | /* |
| 1956 | * khugepaged only works on read-only fd, |
| 1957 | * so this page is dirty because it hasn't |
| 1958 | * been flushed since first write. There |
| 1959 | * won't be new dirty pages. |
| 1960 | * |
| 1961 | * Trigger async flush here and hope the |
| 1962 | * writeback is done when khugepaged |
| 1963 | * revisits this page. |
| 1964 | * |
| 1965 | * This is a one-off situation. We are not |
| 1966 | * forcing writeback in loop. |
| 1967 | */ |
| 1968 | xas_unlock_irq(&xas); |
| 1969 | filemap_flush(mapping); |
| 1970 | result = SCAN_FAIL; |
| 1971 | goto xa_unlocked; |
| 1972 | } else if (folio_test_writeback(folio)) { |
| 1973 | xas_unlock_irq(&xas); |
| 1974 | result = SCAN_FAIL; |
| 1975 | goto xa_unlocked; |
| 1976 | } else if (folio_trylock(folio)) { |
| 1977 | folio_get(folio); |
| 1978 | xas_unlock_irq(&xas); |
| 1979 | } else { |
| 1980 | result = SCAN_PAGE_LOCK; |
| 1981 | goto xa_locked; |
| 1982 | } |
| 1983 | } |
| 1984 | |
| 1985 | /* |
| 1986 | * The folio must be locked, so we can drop the i_pages lock |
| 1987 | * without racing with truncate. |
| 1988 | */ |
| 1989 | VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); |
| 1990 | |
| 1991 | /* make sure the folio is up to date */ |
| 1992 | if (unlikely(!folio_test_uptodate(folio))) { |
| 1993 | result = SCAN_FAIL; |
| 1994 | goto out_unlock; |
| 1995 | } |
| 1996 | |
| 1997 | /* |
| 1998 | * If file was truncated then extended, or hole-punched, before |
| 1999 | * we locked the first folio, then a THP might be there already. |
| 2000 | * This will be discovered on the first iteration. |
| 2001 | */ |
| 2002 | if (folio_order(folio) == HPAGE_PMD_ORDER && |
| 2003 | folio->index == start) { |
| 2004 | /* Maybe PMD-mapped */ |
| 2005 | result = SCAN_PTE_MAPPED_HUGEPAGE; |
| 2006 | goto out_unlock; |
| 2007 | } |
| 2008 | |
| 2009 | if (folio_mapping(folio) != mapping) { |
| 2010 | result = SCAN_TRUNCATED; |
| 2011 | goto out_unlock; |
| 2012 | } |
| 2013 | |
| 2014 | if (!is_shmem && (folio_test_dirty(folio) || |
| 2015 | folio_test_writeback(folio))) { |
| 2016 | /* |
| 2017 | * khugepaged only works on read-only fd, so this |
| 2018 | * folio is dirty because it hasn't been flushed |
| 2019 | * since first write. |
| 2020 | */ |
| 2021 | result = SCAN_FAIL; |
| 2022 | goto out_unlock; |
| 2023 | } |
| 2024 | |
| 2025 | if (!folio_isolate_lru(folio)) { |
| 2026 | result = SCAN_DEL_PAGE_LRU; |
| 2027 | goto out_unlock; |
| 2028 | } |
| 2029 | |
| 2030 | if (!filemap_release_folio(folio, GFP_KERNEL)) { |
| 2031 | result = SCAN_PAGE_HAS_PRIVATE; |
| 2032 | folio_putback_lru(folio); |
| 2033 | goto out_unlock; |
| 2034 | } |
| 2035 | |
| 2036 | if (folio_mapped(folio)) |
| 2037 | try_to_unmap(folio, |
| 2038 | TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); |
| 2039 | |
| 2040 | xas_lock_irq(&xas); |
| 2041 | |
| 2042 | VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); |
| 2043 | |
| 2044 | /* |
| 2045 | * We control 2 + nr_pages references to the folio: |
| 2046 | * - we hold a pin on it; |
| 2047 | * - nr_pages reference from page cache; |
| 2048 | * - one from lru_isolate_folio; |
| 2049 | * If those are the only references, then any new usage |
| 2050 | * of the folio will have to fetch it from the page |
| 2051 | * cache. That requires locking the folio to handle |
| 2052 | * truncate, so any new usage will be blocked until we |
| 2053 | * unlock folio after collapse/during rollback. |
| 2054 | */ |
| 2055 | if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { |
| 2056 | result = SCAN_PAGE_COUNT; |
| 2057 | xas_unlock_irq(&xas); |
| 2058 | folio_putback_lru(folio); |
| 2059 | goto out_unlock; |
| 2060 | } |
| 2061 | |
| 2062 | /* |
| 2063 | * Accumulate the folios that are being collapsed. |
| 2064 | */ |
| 2065 | list_add_tail(&folio->lru, &pagelist); |
| 2066 | index += folio_nr_pages(folio); |
| 2067 | continue; |
| 2068 | out_unlock: |
| 2069 | folio_unlock(folio); |
| 2070 | folio_put(folio); |
| 2071 | goto xa_unlocked; |
| 2072 | } |
| 2073 | |
| 2074 | if (!is_shmem) { |
| 2075 | filemap_nr_thps_inc(mapping); |
| 2076 | /* |
| 2077 | * Paired with the fence in do_dentry_open() -> get_write_access() |
| 2078 | * to ensure i_writecount is up to date and the update to nr_thps |
| 2079 | * is visible. Ensures the page cache will be truncated if the |
| 2080 | * file is opened writable. |
| 2081 | */ |
| 2082 | smp_mb(); |
| 2083 | if (inode_is_open_for_write(mapping->host)) { |
| 2084 | result = SCAN_FAIL; |
| 2085 | filemap_nr_thps_dec(mapping); |
| 2086 | } |
| 2087 | } |
| 2088 | |
| 2089 | xa_locked: |
| 2090 | xas_unlock_irq(&xas); |
| 2091 | xa_unlocked: |
| 2092 | |
| 2093 | /* |
| 2094 | * If collapse is successful, flush must be done now before copying. |
| 2095 | * If collapse is unsuccessful, does flush actually need to be done? |
| 2096 | * Do it anyway, to clear the state. |
| 2097 | */ |
| 2098 | try_to_unmap_flush(); |
| 2099 | |
| 2100 | if (result == SCAN_SUCCEED && nr_none && |
| 2101 | !shmem_charge(mapping->host, nr_none)) |
| 2102 | result = SCAN_FAIL; |
| 2103 | if (result != SCAN_SUCCEED) { |
| 2104 | nr_none = 0; |
| 2105 | goto rollback; |
| 2106 | } |
| 2107 | |
| 2108 | /* |
| 2109 | * The old folios are locked, so they won't change anymore. |
| 2110 | */ |
| 2111 | index = start; |
| 2112 | dst = folio_page(new_folio, 0); |
| 2113 | list_for_each_entry(folio, &pagelist, lru) { |
| 2114 | int i, nr_pages = folio_nr_pages(folio); |
| 2115 | |
| 2116 | while (index < folio->index) { |
| 2117 | clear_highpage(dst); |
| 2118 | index++; |
| 2119 | dst++; |
| 2120 | } |
| 2121 | |
| 2122 | for (i = 0; i < nr_pages; i++) { |
| 2123 | if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { |
| 2124 | result = SCAN_COPY_MC; |
| 2125 | goto rollback; |
| 2126 | } |
| 2127 | index++; |
| 2128 | dst++; |
| 2129 | } |
| 2130 | } |
| 2131 | while (index < end) { |
| 2132 | clear_highpage(dst); |
| 2133 | index++; |
| 2134 | dst++; |
| 2135 | } |
| 2136 | |
| 2137 | if (nr_none) { |
| 2138 | struct vm_area_struct *vma; |
| 2139 | int nr_none_check = 0; |
| 2140 | |
| 2141 | i_mmap_lock_read(mapping); |
| 2142 | xas_lock_irq(&xas); |
| 2143 | |
| 2144 | xas_set(&xas, start); |
| 2145 | for (index = start; index < end; index++) { |
| 2146 | if (!xas_next(&xas)) { |
| 2147 | xas_store(&xas, XA_RETRY_ENTRY); |
| 2148 | if (xas_error(&xas)) { |
| 2149 | result = SCAN_STORE_FAILED; |
| 2150 | goto immap_locked; |
| 2151 | } |
| 2152 | nr_none_check++; |
| 2153 | } |
| 2154 | } |
| 2155 | |
| 2156 | if (nr_none != nr_none_check) { |
| 2157 | result = SCAN_PAGE_FILLED; |
| 2158 | goto immap_locked; |
| 2159 | } |
| 2160 | |
| 2161 | /* |
| 2162 | * If userspace observed a missing page in a VMA with |
| 2163 | * a MODE_MISSING userfaultfd, then it might expect a |
| 2164 | * UFFD_EVENT_PAGEFAULT for that page. If so, we need to |
| 2165 | * roll back to avoid suppressing such an event. Because |
| 2166 | * wp/minor userfaultfds don't give userspace any |
| 2167 | * guarantees that the kernel won't fill a missing |
| 2168 | * page with a zero page, they don't matter here. |
| 2169 | * |
| 2170 | * Any userfaultfds registered after this point will |
| 2171 | * not be able to observe any missing pages due to the |
| 2172 | * previously inserted retry entries. |
| 2173 | */ |
| 2174 | vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { |
| 2175 | if (userfaultfd_missing(vma)) { |
| 2176 | result = SCAN_EXCEED_NONE_PTE; |
| 2177 | goto immap_locked; |
| 2178 | } |
| 2179 | } |
| 2180 | |
| 2181 | immap_locked: |
| 2182 | i_mmap_unlock_read(mapping); |
| 2183 | if (result != SCAN_SUCCEED) { |
| 2184 | xas_set(&xas, start); |
| 2185 | for (index = start; index < end; index++) { |
| 2186 | if (xas_next(&xas) == XA_RETRY_ENTRY) |
| 2187 | xas_store(&xas, NULL); |
| 2188 | } |
| 2189 | |
| 2190 | xas_unlock_irq(&xas); |
| 2191 | goto rollback; |
| 2192 | } |
| 2193 | } else { |
| 2194 | xas_lock_irq(&xas); |
| 2195 | } |
| 2196 | |
| 2197 | if (is_shmem) |
| 2198 | lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); |
| 2199 | else |
| 2200 | lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); |
| 2201 | |
| 2202 | if (nr_none) { |
| 2203 | lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); |
| 2204 | /* nr_none is always 0 for non-shmem. */ |
| 2205 | lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); |
| 2206 | } |
| 2207 | |
| 2208 | /* |
| 2209 | * Mark new_folio as uptodate before inserting it into the |
| 2210 | * page cache so that it isn't mistaken for a fallocated but |
| 2211 | * unwritten page. |
| 2212 | */ |
| 2213 | folio_mark_uptodate(new_folio); |
| 2214 | folio_ref_add(new_folio, HPAGE_PMD_NR - 1); |
| 2215 | |
| 2216 | if (is_shmem) |
| 2217 | folio_mark_dirty(new_folio); |
| 2218 | folio_add_lru(new_folio); |
| 2219 | |
| 2220 | /* Join all the small entries into a single multi-index entry. */ |
| 2221 | xas_set_order(&xas, start, HPAGE_PMD_ORDER); |
| 2222 | xas_store(&xas, new_folio); |
| 2223 | WARN_ON_ONCE(xas_error(&xas)); |
| 2224 | xas_unlock_irq(&xas); |
| 2225 | |
| 2226 | /* |
| 2227 | * Remove pte page tables, so we can re-fault the page as huge. |
| 2228 | * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). |
| 2229 | */ |
| 2230 | retract_page_tables(mapping, start); |
| 2231 | if (cc && !cc->is_khugepaged) |
| 2232 | result = SCAN_PTE_MAPPED_HUGEPAGE; |
| 2233 | folio_unlock(new_folio); |
| 2234 | |
| 2235 | /* |
| 2236 | * The collapse has succeeded, so free the old folios. |
| 2237 | */ |
| 2238 | list_for_each_entry_safe(folio, tmp, &pagelist, lru) { |
| 2239 | list_del(&folio->lru); |
| 2240 | folio->mapping = NULL; |
| 2241 | folio_clear_active(folio); |
| 2242 | folio_clear_unevictable(folio); |
| 2243 | folio_unlock(folio); |
| 2244 | folio_put_refs(folio, 2 + folio_nr_pages(folio)); |
| 2245 | } |
| 2246 | |
| 2247 | goto out; |
| 2248 | |
| 2249 | rollback: |
| 2250 | /* Something went wrong: roll back page cache changes */ |
| 2251 | if (nr_none) { |
| 2252 | xas_lock_irq(&xas); |
| 2253 | mapping->nrpages -= nr_none; |
| 2254 | xas_unlock_irq(&xas); |
| 2255 | shmem_uncharge(mapping->host, nr_none); |
| 2256 | } |
| 2257 | |
| 2258 | list_for_each_entry_safe(folio, tmp, &pagelist, lru) { |
| 2259 | list_del(&folio->lru); |
| 2260 | folio_unlock(folio); |
| 2261 | folio_putback_lru(folio); |
| 2262 | folio_put(folio); |
| 2263 | } |
| 2264 | /* |
| 2265 | * Undo the updates of filemap_nr_thps_inc for non-SHMEM |
| 2266 | * file only. This undo is not needed unless failure is |
| 2267 | * due to SCAN_COPY_MC. |
| 2268 | */ |
| 2269 | if (!is_shmem && result == SCAN_COPY_MC) { |
| 2270 | filemap_nr_thps_dec(mapping); |
| 2271 | /* |
| 2272 | * Paired with the fence in do_dentry_open() -> get_write_access() |
| 2273 | * to ensure the update to nr_thps is visible. |
| 2274 | */ |
| 2275 | smp_mb(); |
| 2276 | } |
| 2277 | |
| 2278 | new_folio->mapping = NULL; |
| 2279 | |
| 2280 | folio_unlock(new_folio); |
| 2281 | folio_put(new_folio); |
| 2282 | out: |
| 2283 | VM_BUG_ON(!list_empty(&pagelist)); |
| 2284 | trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); |
| 2285 | return result; |
| 2286 | } |
| 2287 | |
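| | /* |
| | * Scan HPAGE_PMD_NR page cache slots of @file starting at @start and |
| | * decide whether the range is worth collapsing: bail out on excessive |
| | * swap entries, pages off the LRU, unexpected references, or an |
| | * already-huge folio. On success, hand the range to collapse_file(). |
| | */ |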
| 2288 | static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, |
| 2289 | struct file *file, pgoff_t start, |
| 2290 | struct collapse_control *cc) |
| 2291 | { |
| 2292 | struct folio *folio = NULL; |
| 2293 | struct address_space *mapping = file->f_mapping; |
| 2294 | XA_STATE(xas, &mapping->i_pages, start); |
| 2295 | int present, swap; |
| 2296 | int node = NUMA_NO_NODE; |
| 2297 | int result = SCAN_SUCCEED; |
| 2298 | |
| 2299 | present = 0; |
| 2300 | swap = 0; |
| 2301 | memset(cc->node_load, 0, sizeof(cc->node_load)); |
| 2302 | nodes_clear(cc->alloc_nmask); |
| 2303 | rcu_read_lock(); |
| 2304 | xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { |
| 2305 | if (xas_retry(&xas, folio)) |
| 2306 | continue; |
| 2307 | |
| 2308 | if (xa_is_value(folio)) { |
| 2309 | swap += 1 << xas_get_order(&xas); |
| 2310 | if (cc->is_khugepaged && |
| 2311 | swap > khugepaged_max_ptes_swap) { |
| 2312 | result = SCAN_EXCEED_SWAP_PTE; |
| 2313 | count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); |
| 2314 | break; |
| 2315 | } |
| 2316 | continue; |
| 2317 | } |
| 2318 | |
| 2319 | if (!folio_try_get(folio)) { |
| 2320 | xas_reset(&xas); |
| 2321 | continue; |
| 2322 | } |
| 2323 | |
| 2324 | if (unlikely(folio != xas_reload(&xas))) { |
| 2325 | folio_put(folio); |
| 2326 | xas_reset(&xas); |
| 2327 | continue; |
| 2328 | } |
| 2329 | |
| 2330 | if (folio_order(folio) == HPAGE_PMD_ORDER && |
| 2331 | folio->index == start) { |
| 2332 | /* Maybe PMD-mapped */ |
| 2333 | result = SCAN_PTE_MAPPED_HUGEPAGE; |
| 2334 | /* |
| 2335 | * For SCAN_PTE_MAPPED_HUGEPAGE, further processing |
| 2336 | * by the caller won't touch the page cache, and so |
| 2337 | * it's safe to skip LRU and refcount checks before |
| 2338 | * returning. |
| 2339 | */ |
| 2340 | folio_put(folio); |
| 2341 | break; |
| 2342 | } |
| 2343 | |
| 2344 | node = folio_nid(folio); |
| 2345 | if (hpage_collapse_scan_abort(node, cc)) { |
| 2346 | result = SCAN_SCAN_ABORT; |
| 2347 | folio_put(folio); |
| 2348 | break; |
| 2349 | } |
| 2350 | cc->node_load[node]++; |
| 2351 | |
| 2352 | if (!folio_test_lru(folio)) { |
| 2353 | result = SCAN_PAGE_LRU; |
| 2354 | folio_put(folio); |
| 2355 | break; |
| 2356 | } |
| 2357 | |
| 2358 | if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { |
| 2359 | result = SCAN_PAGE_COUNT; |
| 2360 | folio_put(folio); |
| 2361 | break; |
| 2362 | } |
| 2363 | |
| 2364 | /* |
| 2365 | * We probably should check if the folio is referenced |
| 2366 | * here, but nobody would transfer pte_young() to |
| 2367 | * folio_test_referenced() for us. And rmap walk here |
| 2368 | * is just too costly... |
| 2369 | */ |
| 2370 | |
| 2371 | present += folio_nr_pages(folio); |
| 2372 | folio_put(folio); |
| 2373 | |
| 2374 | if (need_resched()) { |
| 2375 | xas_pause(&xas); |
| 2376 | cond_resched_rcu(); |
| 2377 | } |
| 2378 | } |
| 2379 | rcu_read_unlock(); |
| 2380 | |
| 2381 | if (result == SCAN_SUCCEED) { |
| 2382 | if (cc->is_khugepaged && |
| 2383 | present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { |
| 2384 | result = SCAN_EXCEED_NONE_PTE; |
| 2385 | count_vm_event(THP_SCAN_EXCEED_NONE_PTE); |
| 2386 | } else { |
| 2387 | result = collapse_file(mm, addr, file, start, cc); |
| 2388 | } |
| 2389 | } |
| 2390 | |
| 2391 | trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); |
| 2392 | return result; |
| 2393 | } |
| 2394 | |
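| | /* |
| | * Scan up to @pages ptes of the current mm_slot, resuming from |
| | * khugepaged_scan.address, and move on to the next mm when this one is |
| | * exhausted or exiting. Returns the progress made; *result holds the |
| | * outcome of the last collapse attempt. khugepaged_mm_lock is held on |
| | * entry and exit but dropped while scanning. |
| | */ |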
| 2395 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, |
| 2396 | struct collapse_control *cc) |
| 2397 | __releases(&khugepaged_mm_lock) |
| 2398 | __acquires(&khugepaged_mm_lock) |
| 2399 | { |
| 2400 | struct vma_iterator vmi; |
| 2401 | struct mm_slot *slot; |
| 2402 | struct mm_struct *mm; |
| 2403 | struct vm_area_struct *vma; |
| 2404 | int progress = 0; |
| 2405 | |
| 2406 | VM_BUG_ON(!pages); |
| 2407 | lockdep_assert_held(&khugepaged_mm_lock); |
| 2408 | *result = SCAN_FAIL; |
| 2409 | |
| 2410 | if (khugepaged_scan.mm_slot) { |
| 2411 | slot = khugepaged_scan.mm_slot; |
| 2412 | } else { |
| 2413 | slot = list_first_entry(&khugepaged_scan.mm_head, |
| 2414 | struct mm_slot, mm_node); |
| 2415 | khugepaged_scan.address = 0; |
| 2416 | khugepaged_scan.mm_slot = slot; |
| 2417 | } |
| 2418 | spin_unlock(&khugepaged_mm_lock); |
| 2419 | |
| 2420 | mm = slot->mm; |
| 2421 | /* |
| 2422 | * Don't wait for semaphore (to avoid long wait times). Just move to |
| 2423 | * the next mm on the list. |
| 2424 | */ |
| 2425 | vma = NULL; |
| 2426 | if (unlikely(!mmap_read_trylock(mm))) |
| 2427 | goto breakouterloop_mmap_lock; |
| 2428 | |
| 2429 | progress++; |
| 2430 | if (unlikely(hpage_collapse_test_exit_or_disable(mm))) |
| 2431 | goto breakouterloop; |
| 2432 | |
| 2433 | vma_iter_init(&vmi, mm, khugepaged_scan.address); |
| 2434 | for_each_vma(vmi, vma) { |
| 2435 | unsigned long hstart, hend; |
| 2436 | |
| 2437 | cond_resched(); |
| 2438 | if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { |
| 2439 | progress++; |
| 2440 | break; |
| 2441 | } |
| 2442 | if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { |
| 2443 | skip: |
| 2444 | progress++; |
| 2445 | continue; |
| 2446 | } |
| 2447 | hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); |
| 2448 | hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); |
| 2449 | if (khugepaged_scan.address > hend) |
| 2450 | goto skip; |
| 2451 | if (khugepaged_scan.address < hstart) |
| 2452 | khugepaged_scan.address = hstart; |
| 2453 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); |
| 2454 | |
| 2455 | while (khugepaged_scan.address < hend) { |
| 2456 | bool mmap_locked = true; |
| 2457 | |
| 2458 | cond_resched(); |
| 2459 | if (unlikely(hpage_collapse_test_exit_or_disable(mm))) |
| 2460 | goto breakouterloop; |
| 2461 | |
| 2462 | VM_BUG_ON(khugepaged_scan.address < hstart || |
| 2463 | khugepaged_scan.address + HPAGE_PMD_SIZE > |
| 2464 | hend); |
| 2465 | if (!vma_is_anonymous(vma)) { |
| 2466 | struct file *file = get_file(vma->vm_file); |
| 2467 | pgoff_t pgoff = linear_page_index(vma, |
| 2468 | khugepaged_scan.address); |
| 2469 | |
| 2470 | mmap_read_unlock(mm); |
| 2471 | mmap_locked = false; |
| 2472 | *result = hpage_collapse_scan_file(mm, |
| 2473 | khugepaged_scan.address, file, pgoff, cc); |
| 2474 | fput(file); |
| 2475 | if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { |
| 2476 | mmap_read_lock(mm); |
| 2477 | if (hpage_collapse_test_exit_or_disable(mm)) |
| 2478 | goto breakouterloop; |
| 2479 | *result = collapse_pte_mapped_thp(mm, |
| 2480 | khugepaged_scan.address, false); |
| 2481 | if (*result == SCAN_PMD_MAPPED) |
| 2482 | *result = SCAN_SUCCEED; |
| 2483 | mmap_read_unlock(mm); |
| 2484 | } |
| 2485 | } else { |
| 2486 | *result = hpage_collapse_scan_pmd(mm, vma, |
| 2487 | khugepaged_scan.address, &mmap_locked, cc); |
| 2488 | } |
| 2489 | |
| 2490 | if (*result == SCAN_SUCCEED) |
| 2491 | ++khugepaged_pages_collapsed; |
| 2492 | |
| 2493 | /* move to next address */ |
| 2494 | khugepaged_scan.address += HPAGE_PMD_SIZE; |
| 2495 | progress += HPAGE_PMD_NR; |
| 2496 | if (!mmap_locked) |
| 2497 | /* |
| 2498 | * We released mmap_lock so break loop. Note |
| 2499 | * that we drop mmap_lock before all hugepage |
| 2500 | * allocations, so if allocation fails, we are |
| 2501 | * guaranteed to break here and report the |
| 2502 | * correct result back to caller. |
| 2503 | */ |
| 2504 | goto breakouterloop_mmap_lock; |
| 2505 | if (progress >= pages) |
| 2506 | goto breakouterloop; |
| 2507 | } |
| 2508 | } |
| 2509 | breakouterloop: |
| 2510 | mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ |
| 2511 | breakouterloop_mmap_lock: |
| 2512 | |
| 2513 | spin_lock(&khugepaged_mm_lock); |
| 2514 | VM_BUG_ON(khugepaged_scan.mm_slot != slot); |
| 2515 | /* |
| 2516 | * Release the current mm_slot if this mm is about to die, or |
| 2517 | * if we scanned all vmas of this mm. |
| 2518 | */ |
| 2519 | if (hpage_collapse_test_exit(mm) || !vma) { |
| 2520 | /* |
| 2521 | * Make sure that if mm_users is reaching zero while |
| 2522 | * khugepaged runs here, khugepaged_exit will find |
| 2523 | * mm_slot not pointing to the exiting mm. |
| 2524 | */ |
| 2525 | if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { |
| 2526 | khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); |
| 2527 | khugepaged_scan.address = 0; |
| 2528 | } else { |
| 2529 | khugepaged_scan.mm_slot = NULL; |
| 2530 | khugepaged_full_scans++; |
| 2531 | } |
| 2532 | |
| 2533 | collect_mm_slot(slot); |
| 2534 | } |
| 2535 | |
| 2536 | return progress; |
| 2537 | } |
| 2538 | |
| 2539 | static int khugepaged_has_work(void) |
| 2540 | { |
| 2541 | return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); |
| 2542 | } |
| 2543 | |
| 2544 | static int khugepaged_wait_event(void) |
| 2545 | { |
| 2546 | return !list_empty(&khugepaged_scan.mm_head) || |
| 2547 | kthread_should_stop(); |
| 2548 | } |
| 2549 | |
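| | /* |
| | * One full scan pass: keep calling khugepaged_scan_mm_slot() until |
| | * khugepaged_pages_to_scan ptes have been covered, the mm list has |
| | * been passed through twice, the thread is asked to stop, or huge |
| | * page allocation fails twice in a row. |
| | */ |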
| 2550 | static void khugepaged_do_scan(struct collapse_control *cc) |
| 2551 | { |
| 2552 | unsigned int progress = 0, pass_through_head = 0; |
| 2553 | unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); |
| 2554 | bool wait = true; |
| 2555 | int result = SCAN_SUCCEED; |
| 2556 | |
| 2557 | lru_add_drain_all(); |
| 2558 | |
| 2559 | while (true) { |
| 2560 | cond_resched(); |
| 2561 | |
| 2562 | if (unlikely(kthread_should_stop())) |
| 2563 | break; |
| 2564 | |
| 2565 | spin_lock(&khugepaged_mm_lock); |
| 2566 | if (!khugepaged_scan.mm_slot) |
| 2567 | pass_through_head++; |
| 2568 | if (khugepaged_has_work() && |
| 2569 | pass_through_head < 2) |
| 2570 | progress += khugepaged_scan_mm_slot(pages - progress, |
| 2571 | &result, cc); |
| 2572 | else |
| 2573 | progress = pages; |
| 2574 | spin_unlock(&khugepaged_mm_lock); |
| 2575 | |
| 2576 | if (progress >= pages) |
| 2577 | break; |
| 2578 | |
| 2579 | if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { |
| 2580 | /* |
| 2581 | * If fail to allocate the first time, try to sleep for |
| 2582 | * a while. When hit again, cancel the scan. |
| 2583 | */ |
| 2584 | if (!wait) |
| 2585 | break; |
| 2586 | wait = false; |
| 2587 | khugepaged_alloc_sleep(); |
| 2588 | } |
| 2589 | } |
| 2590 | } |
| 2591 | |
| 2592 | static bool khugepaged_should_wakeup(void) |
| 2593 | { |
| 2594 | return kthread_should_stop() || |
| 2595 | time_after_eq(jiffies, khugepaged_sleep_expire); |
| 2596 | } |
| 2597 | |
| 2598 | static void khugepaged_wait_work(void) |
| 2599 | { |
| 2600 | if (khugepaged_has_work()) { |
| 2601 | const unsigned long scan_sleep_jiffies = |
| 2602 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs); |
| 2603 | |
| 2604 | if (!scan_sleep_jiffies) |
| 2605 | return; |
| 2606 | |
| 2607 | khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; |
| 2608 | wait_event_freezable_timeout(khugepaged_wait, |
| 2609 | khugepaged_should_wakeup(), |
| 2610 | scan_sleep_jiffies); |
| 2611 | return; |
| 2612 | } |
| 2613 | |
| 2614 | if (hugepage_pmd_enabled()) |
| 2615 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); |
| 2616 | } |
| 2617 | |
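| | /* Main loop of the khugepaged kernel thread. */ |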
| 2618 | static int khugepaged(void *none) |
| 2619 | { |
| 2620 | struct mm_slot *slot; |
| 2621 | |
| 2622 | set_freezable(); |
| 2623 | set_user_nice(current, MAX_NICE); |
| 2624 | |
| 2625 | while (!kthread_should_stop()) { |
| 2626 | khugepaged_do_scan(&khugepaged_collapse_control); |
| 2627 | khugepaged_wait_work(); |
| 2628 | } |
| 2629 | |
| 2630 | spin_lock(&khugepaged_mm_lock); |
| 2631 | slot = khugepaged_scan.mm_slot; |
| 2632 | khugepaged_scan.mm_slot = NULL; |
| 2633 | if (slot) |
| 2634 | collect_mm_slot(slot); |
| 2635 | spin_unlock(&khugepaged_mm_lock); |
| 2636 | return 0; |
| 2637 | } |
| 2638 | |
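| | /* |
| | * Raise min_free_kbytes so enough pageblocks stay free to assist huge |
| | * page allocation and fragmentation avoidance while THP is enabled; |
| | * fall back to the calculated default when it is not. |
| | */ |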
| 2639 | static void set_recommended_min_free_kbytes(void) |
| 2640 | { |
| 2641 | struct zone *zone; |
| 2642 | int nr_zones = 0; |
| 2643 | unsigned long recommended_min; |
| 2644 | |
| 2645 | if (!hugepage_pmd_enabled()) { |
| 2646 | calculate_min_free_kbytes(); |
| 2647 | goto update_wmarks; |
| 2648 | } |
| 2649 | |
| 2650 | for_each_populated_zone(zone) { |
| 2651 | /* |
| 2652 | * We don't need to worry about fragmentation of |
| 2653 | * ZONE_MOVABLE since it only has movable pages. |
| 2654 | */ |
| 2655 | if (zone_idx(zone) > gfp_zone(GFP_USER)) |
| 2656 | continue; |
| 2657 | |
| 2658 | nr_zones++; |
| 2659 | } |
| 2660 | |
| 2661 | /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ |
| 2662 | recommended_min = pageblock_nr_pages * nr_zones * 2; |
| 2663 | |
| 2664 | /* |
| 2665 | * Make sure that on average at least two pageblocks are almost free |
| 2666 | * of another type, one for a migratetype to fall back to and a |
| 2667 | * second to avoid subsequent fallbacks of other types. There are 3 |
| 2668 | * MIGRATE_TYPES we care about. |
| 2669 | */ |
| 2670 | recommended_min += pageblock_nr_pages * nr_zones * |
| 2671 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; |
| 2672 | |
| 2673 | /* don't ever allow to reserve more than 5% of the lowmem */ |
| 2674 | recommended_min = min(recommended_min, |
| 2675 | (unsigned long) nr_free_buffer_pages() / 20); |
| 2676 | recommended_min <<= (PAGE_SHIFT-10); |
| 2677 | |
| 2678 | if (recommended_min > min_free_kbytes) { |
| 2679 | if (user_min_free_kbytes >= 0) |
| 2680 | pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", |
| 2681 | min_free_kbytes, recommended_min); |
| 2682 | |
| 2683 | min_free_kbytes = recommended_min; |
| 2684 | } |
| 2685 | |
| 2686 | update_wmarks: |
| 2687 | setup_per_zone_wmarks(); |
| 2688 | } |
| 2689 | |
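| | /* Start or stop the khugepaged thread to match the current THP settings. */ |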
| 2690 | int start_stop_khugepaged(void) |
| 2691 | { |
| 2692 | int err = 0; |
| 2693 | |
| 2694 | mutex_lock(&khugepaged_mutex); |
| 2695 | if (hugepage_pmd_enabled()) { |
| 2696 | if (!khugepaged_thread) |
| 2697 | khugepaged_thread = kthread_run(khugepaged, NULL, |
| 2698 | "khugepaged"); |
| 2699 | if (IS_ERR(khugepaged_thread)) { |
| 2700 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
| 2701 | err = PTR_ERR(khugepaged_thread); |
| 2702 | khugepaged_thread = NULL; |
| 2703 | goto fail; |
| 2704 | } |
| 2705 | |
| 2706 | if (!list_empty(&khugepaged_scan.mm_head)) |
| 2707 | wake_up_interruptible(&khugepaged_wait); |
| 2708 | } else if (khugepaged_thread) { |
| 2709 | kthread_stop(khugepaged_thread); |
| 2710 | khugepaged_thread = NULL; |
| 2711 | } |
| 2712 | set_recommended_min_free_kbytes(); |
| 2713 | fail: |
| 2714 | mutex_unlock(&khugepaged_mutex); |
| 2715 | return err; |
| 2716 | } |
| 2717 | |
| 2718 | void khugepaged_min_free_kbytes_update(void) |
| 2719 | { |
| 2720 | mutex_lock(&khugepaged_mutex); |
| 2721 | if (hugepage_pmd_enabled() && khugepaged_thread) |
| 2722 | set_recommended_min_free_kbytes(); |
| 2723 | mutex_unlock(&khugepaged_mutex); |
| 2724 | } |
| 2725 | |
| 2726 | bool current_is_khugepaged(void) |
| 2727 | { |
| 2728 | return kthread_func(current) == khugepaged; |
| 2729 | } |
| 2730 | |
| 2731 | static int madvise_collapse_errno(enum scan_result r) |
| 2732 | { |
| 2733 | /* |
| 2734 | * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide |
| 2735 | * actionable feedback to caller, so they may take an appropriate |
| 2736 | * fallback measure depending on the nature of the failure. |
| 2737 | */ |
| 2738 | switch (r) { |
| 2739 | case SCAN_ALLOC_HUGE_PAGE_FAIL: |
| 2740 | return -ENOMEM; |
| 2741 | case SCAN_CGROUP_CHARGE_FAIL: |
| 2742 | case SCAN_EXCEED_NONE_PTE: |
| 2743 | return -EBUSY; |
| 2744 | /* Resource temporarily unavailable - trying again might succeed */ |
| 2745 | case SCAN_PAGE_COUNT: |
| 2746 | case SCAN_PAGE_LOCK: |
| 2747 | case SCAN_PAGE_LRU: |
| 2748 | case SCAN_DEL_PAGE_LRU: |
| 2749 | case SCAN_PAGE_FILLED: |
| 2750 | return -EAGAIN; |
| 2751 | /* |
| 2752 | * Other: Trying again likely not to succeed / error intrinsic to |
| 2753 | * specified memory range. khugepaged likely won't be able to collapse |
| 2754 | * either. |
| 2755 | */ |
| 2756 | default: |
| 2757 | return -EINVAL; |
| 2758 | } |
| 2759 | } |
| 2760 | |
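| | /* |
| | * Collapse the hugepage-aligned portion of [start, end) in @vma on |
| | * behalf of MADV_COLLAPSE. Returns 0 only if every aligned chunk ends |
| | * up PMD-mapped, otherwise an errno derived from the last failure via |
| | * madvise_collapse_errno(). *lock_dropped reports whether mmap_lock |
| | * was temporarily released. |
| | */ |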
| 2761 | int madvise_collapse(struct vm_area_struct *vma, unsigned long start, |
| 2762 | unsigned long end, bool *lock_dropped) |
| 2763 | { |
| 2764 | struct collapse_control *cc; |
| 2765 | struct mm_struct *mm = vma->vm_mm; |
| 2766 | unsigned long hstart, hend, addr; |
| 2767 | int thps = 0, last_fail = SCAN_FAIL; |
| 2768 | bool mmap_locked = true; |
| 2769 | |
| 2770 | BUG_ON(vma->vm_start > start); |
| 2771 | BUG_ON(vma->vm_end < end); |
| 2772 | |
| 2773 | if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) |
| 2774 | return -EINVAL; |
| 2775 | |
| 2776 | cc = kmalloc(sizeof(*cc), GFP_KERNEL); |
| 2777 | if (!cc) |
| 2778 | return -ENOMEM; |
| 2779 | cc->is_khugepaged = false; |
| 2780 | |
| 2781 | mmgrab(mm); |
| 2782 | lru_add_drain_all(); |
| 2783 | |
| 2784 | hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
| 2785 | hend = end & HPAGE_PMD_MASK; |
| 2786 | |
| 2787 | for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { |
| 2788 | int result = SCAN_FAIL; |
| 2789 | |
| 2790 | if (!mmap_locked) { |
| 2791 | cond_resched(); |
| 2792 | mmap_read_lock(mm); |
| 2793 | mmap_locked = true; |
| 2794 | result = hugepage_vma_revalidate(mm, addr, false, &vma, |
| 2795 | cc); |
| 2796 | if (result != SCAN_SUCCEED) { |
| 2797 | last_fail = result; |
| 2798 | goto out_nolock; |
| 2799 | } |
| 2800 | |
| 2801 | hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); |
| 2802 | } |
| 2803 | mmap_assert_locked(mm); |
| 2804 | if (!vma_is_anonymous(vma)) { |
| 2805 | struct file *file = get_file(vma->vm_file); |
| 2806 | pgoff_t pgoff = linear_page_index(vma, addr); |
| 2807 | |
| 2808 | mmap_read_unlock(mm); |
| 2809 | mmap_locked = false; |
| 2810 | result = hpage_collapse_scan_file(mm, addr, file, pgoff, |
| 2811 | cc); |
| 2812 | fput(file); |
| 2813 | } else { |
| 2814 | result = hpage_collapse_scan_pmd(mm, vma, addr, |
| 2815 | &mmap_locked, cc); |
| 2816 | } |
| 2817 | if (!mmap_locked) |
| 2818 | *lock_dropped = true; |
| 2819 | |
| 2820 | handle_result: |
| 2821 | switch (result) { |
| 2822 | case SCAN_SUCCEED: |
| 2823 | case SCAN_PMD_MAPPED: |
| 2824 | ++thps; |
| 2825 | break; |
| 2826 | case SCAN_PTE_MAPPED_HUGEPAGE: |
| 2827 | BUG_ON(mmap_locked); |
| 2828 | mmap_read_lock(mm); |
| 2829 | result = collapse_pte_mapped_thp(mm, addr, true); |
| 2830 | mmap_read_unlock(mm); |
| 2831 | goto handle_result; |
| 2832 | /* Whitelisted set of results where continuing OK */ |
| 2833 | case SCAN_NO_PTE_TABLE: |
| 2834 | case SCAN_PTE_NON_PRESENT: |
| 2835 | case SCAN_PTE_UFFD_WP: |
| 2836 | case SCAN_LACK_REFERENCED_PAGE: |
| 2837 | case SCAN_PAGE_NULL: |
| 2838 | case SCAN_PAGE_COUNT: |
| 2839 | case SCAN_PAGE_LOCK: |
| 2840 | case SCAN_PAGE_COMPOUND: |
| 2841 | case SCAN_PAGE_LRU: |
| 2842 | case SCAN_DEL_PAGE_LRU: |
| 2843 | last_fail = result; |
| 2844 | break; |
| 2845 | default: |
| 2846 | last_fail = result; |
| 2847 | /* Other error, exit */ |
| 2848 | goto out_maybelock; |
| 2849 | } |
| 2850 | } |
| 2851 | |
| 2852 | out_maybelock: |
| 2853 | /* Caller expects us to hold mmap_lock on return */ |
| 2854 | if (!mmap_locked) |
| 2855 | mmap_read_lock(mm); |
| 2856 | out_nolock: |
| 2857 | mmap_assert_locked(mm); |
| 2858 | mmdrop(mm); |
| 2859 | kfree(cc); |
| 2860 | |
| 2861 | return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 |
| 2862 | : madvise_collapse_errno(last_fail); |
| 2863 | } |
| 2864 | |