huge_memory.c source code [linux/mm/huge_memory.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2009 Red Hat, Inc.
4	*/
5
6	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8	#include <linux/mm.h>
9	#include <linux/sched.h>
10	#include <linux/sched/mm.h>
11	#include <linux/sched/numa_balancing.h>
12	#include <linux/highmem.h>
13	#include <linux/hugetlb.h>
14	#include <linux/mmu_notifier.h>
15	#include <linux/rmap.h>
16	#include <linux/swap.h>
17	#include <linux/shrinker.h>
18	#include <linux/mm_inline.h>
19	#include <linux/swapops.h>
20	#include <linux/backing-dev.h>
21	#include <linux/dax.h>
22	#include <linux/mm_types.h>
23	#include <linux/khugepaged.h>
24	#include <linux/freezer.h>
25	#include <linux/mman.h>
26	#include <linux/memremap.h>
27	#include <linux/pagemap.h>
28	#include <linux/debugfs.h>
29	#include <linux/migrate.h>
30	#include <linux/hashtable.h>
31	#include <linux/userfaultfd_k.h>
32	#include <linux/page_idle.h>
33	#include <linux/shmem_fs.h>
34	#include <linux/oom.h>
35	#include <linux/numa.h>
36	#include <linux/page_owner.h>
37	#include <linux/sched/sysctl.h>
38	#include <linux/memory-tiers.h>
39	#include <linux/compat.h>
40	#include <linux/pgalloc.h>
41	#include <linux/pgalloc_tag.h>
42	#include <linux/pagewalk.h>
43
44	#include <asm/tlb.h>
45	#include "internal.h"
46	#include "swap.h"
47
48	#define CREATE_TRACE_POINTS
49	#include <trace/events/thp.h>
50
51	/*
52	* By default, transparent hugepage support is disabled in order to avoid
53	* risking an increased memory footprint for applications that are not
54	* guaranteed to benefit from it. When transparent hugepage support is
55	* enabled, it is for all mappings, and khugepaged scans all mappings.
56	* Defrag is invoked by khugepaged hugepage allocations and by page faults
57	* for all hugepage allocations.
58	*/
59	unsigned long transparent_hugepage_flags __read_mostly =
60	#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
61	(`1`<<TRANSPARENT_HUGEPAGE_FLAG)\|
62	#endif
63	#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
64	(`1`<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)\|
65	#endif
66	(`1`<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)\|
67	(`1`<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)\|
68	(`1`<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
69
70	static struct shrinker *deferred_split_shrinker;
71	static unsigned long deferred_split_count(struct shrinker *shrink,
72	struct shrink_control *sc);
73	static unsigned long deferred_split_scan(struct shrinker *shrink,
74	struct shrink_control *sc);
75	static bool split_underused_thp = true;
76
77	static atomic_t huge_zero_refcount;
78	struct folio *huge_zero_folio __read_mostly;
79	unsigned long huge_zero_pfn __read_mostly = ~`0UL`;
80	unsigned long huge_anon_orders_always __read_mostly;
81	unsigned long huge_anon_orders_madvise __read_mostly;
82	unsigned long huge_anon_orders_inherit __read_mostly;
83	static bool anon_orders_configured __initdata;
84
85	static inline bool file_thp_enabled(struct vm_area_struct *vma)
86	{
87	struct inode *inode;
88
89	if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
90	return false;
91
92	if (!vma->vm_file)
93	return false;
94
95	inode = file_inode(f: vma->vm_file);
96
97	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
98	}
99
100	unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
101	vm_flags_t vm_flags,
102	enum tva_type type,
103	unsigned long orders)
104	{
105	const bool smaps = type == TVA_SMAPS;
106	const bool in_pf = type == TVA_PAGEFAULT;
107	const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
108	unsigned long supported_orders;
109
110	/ Check the intersection of requested and supported orders. /
111	if (vma_is_anonymous(vma))
112	supported_orders = THP_ORDERS_ALL_ANON;
113	else if (vma_is_special_huge(vma))
114	supported_orders = THP_ORDERS_ALL_SPECIAL;
115	else
116	supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
117
118	orders &= supported_orders;
119	if (!orders)
120	return `0`;
121
122	if (!vma->vm_mm) / vdso /
123	return `0`;
124
125	if (thp_disabled_by_hw() \|\| vma_thp_disabled(vma, vm_flags, forced_collapse))
126	return `0`;
127
128	/ khugepaged doesn't collapse DAX vma, but page fault is fine. /
129	if (vma_is_dax(vma))
130	return in_pf ? orders : `0`;
131
132	/*
133	* khugepaged special VMA and hugetlb VMA.
134	* Must be checked after dax since some dax mappings may have
135	* VM_MIXEDMAP set.
136	*/
137	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
138	return `0`;
139
140	/*
141	* Check alignment for file vma and size for both file and anon vma by
142	* filtering out the unsuitable orders.
143	*
144	* Skip the check for page fault. Huge fault does the check in fault
145	* handlers.
146	*/
147	if (!in_pf) {
148	int order = highest_order(orders);
149	unsigned long addr;
150
151	while (orders) {
152	addr = vma->vm_end - (PAGE_SIZE << order);
153	if (thp_vma_suitable_order(vma, addr, order))
154	break;
155	order = next_order(orders: &orders, prev: order);
156	}
157
158	if (!orders)
159	return `0`;
160	}
161
162	/*
163	* Enabled via shmem mount options or sysfs settings.
164	* Must be done before hugepage flags check since shmem has its
165	* own flags.
166	*/
167	if (!in_pf && shmem_file(file: vma->vm_file))
168	return orders & shmem_allowable_huge_orders(inode: file_inode(f: vma->vm_file),
169	vma, index: vma->vm_pgoff, write_end: `0`,
170	shmem_huge_force: forced_collapse);
171
172	if (!vma_is_anonymous(vma)) {
173	/*
174	* Enforce THP collapse requirements as necessary. Anonymous vmas
175	* were already handled in thp_vma_allowable_orders().
176	*/
177	if (!forced_collapse &&
178	(!hugepage_global_enabled() \|\| (!(vm_flags & VM_HUGEPAGE) &&
179	!hugepage_global_always())))
180	return `0`;
181
182	/*
183	* Trust that ->huge_fault() handlers know what they are doing
184	* in fault path.
185	*/
186	if (((in_pf \|\| smaps)) && vma->vm_ops->huge_fault)
187	return orders;
188	/ Only regular file is valid in collapse path /
189	if (((!in_pf \|\| smaps)) && file_thp_enabled(vma))
190	return orders;
191	return `0`;
192	}
193
194	if (vma_is_temporary_stack(vma))
195	return `0`;
196
197	/*
198	* THPeligible bit of smaps should show 1 for proper VMAs even
199	* though anon_vma is not initialized yet.
200	*
201	* Allow page fault since anon_vma may be not initialized until
202	* the first page fault.
203	*/
204	if (!vma->anon_vma)
205	return (smaps \|\| in_pf) ? orders : `0`;
206
207	return orders;
208	}
209
210	static bool get_huge_zero_folio(void)
211	{
212	struct folio *zero_folio;
213	retry:
214	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
215	return true;
216
217	zero_folio = folio_alloc((GFP_TRANSHUGE \| __GFP_ZERO \| __GFP_ZEROTAGS) &
218	~__GFP_MOVABLE,
219	HPAGE_PMD_ORDER);
220	if (!zero_folio) {
221	count_vm_event(item: THP_ZERO_PAGE_ALLOC_FAILED);
222	return false;
223	}
224	/ Ensure zero folio won't have large_rmappable flag set. /
225	folio_clear_large_rmappable(folio: zero_folio);
226	preempt_disable();
227	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
228	preempt_enable();
229	folio_put(folio: zero_folio);
230	goto retry;
231	}
232	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
233
234	/ We take additional reference here. It will be put back by shrinker /
235	atomic_set(v: &huge_zero_refcount, i: `2`);
236	preempt_enable();
237	count_vm_event(item: THP_ZERO_PAGE_ALLOC);
238	return true;
239	}
240
241	static void put_huge_zero_folio(void)
242	{
243	/*
244	* Counter should never go to zero here. Only shrinker can put
245	* last reference.
246	*/
247	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
248	}
249
250	struct folio mm_get_huge_zero_folio(struct* mm_struct *mm)
251	{
252	if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
253	return huge_zero_folio;
254
255	if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
256	return READ_ONCE(huge_zero_folio);
257
258	if (!get_huge_zero_folio())
259	return NULL;
260
261	if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
262	put_huge_zero_folio();
263
264	return READ_ONCE(huge_zero_folio);
265	}
266
267	void mm_put_huge_zero_folio(struct mm_struct *mm)
268	{
269	if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
270	return;
271
272	if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
273	put_huge_zero_folio();
274	}
275
276	static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
277	struct shrink_control *sc)
278	{
279	/ we can free zero page only if last reference remains /
280	return atomic_read(v: &huge_zero_refcount) == `1` ? HPAGE_PMD_NR : `0`;
281	}
282
283	static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
284	struct shrink_control *sc)
285	{
286	if (atomic_cmpxchg(v: &huge_zero_refcount, old: `1`, new: `0`) == `1`) {
287	struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
288	BUG_ON(zero_folio == NULL);
289	WRITE_ONCE(huge_zero_pfn, ~`0UL`);
290	folio_put(folio: zero_folio);
291	return HPAGE_PMD_NR;
292	}
293
294	return `0`;
295	}
296
297	static struct shrinker *huge_zero_folio_shrinker;
298
299	#ifdef CONFIG_SYSFS
300	static ssize_t enabled_show(struct kobject *kobj,
301	struct kobj_attribute attr, char* *buf)
302	{
303	const char *output;
304
305	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
306	output = "[always] madvise never";
307	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
308	&transparent_hugepage_flags))
309	output = "always [madvise] never";
310	else
311	output = "always madvise [never]";
312
313	return sysfs_emit(buf, fmt: "%s\n", output);
314	}
315
316	static ssize_t enabled_store(struct kobject *kobj,
317	struct kobj_attribute *attr,
318	const char *buf, size_t count)
319	{
320	ssize_t ret = count;
321
322	if (sysfs_streq(s1: buf, s2: "always")) {
323	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
324	set_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
325	} else if (sysfs_streq(s1: buf, s2: "madvise")) {
326	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
327	set_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
328	} else if (sysfs_streq(s1: buf, s2: "never")) {
329	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
330	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
331	} else
332	ret = -EINVAL;
333
334	if (ret > `0`) {
335	int err = start_stop_khugepaged();
336	if (err)
337	ret = err;
338	}
339	return ret;
340	}
341
342	static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
343
344	ssize_t single_hugepage_flag_show(struct kobject *kobj,
345	struct kobj_attribute attr, char* *buf,
346	enum transparent_hugepage_flag flag)
347	{
348	return sysfs_emit(buf, fmt: "%d\n",
349	!!test_bit(flag, &transparent_hugepage_flags));
350	}
351
352	ssize_t single_hugepage_flag_store(struct kobject *kobj,
353	struct kobj_attribute *attr,
354	const char *buf, size_t count,
355	enum transparent_hugepage_flag flag)
356	{
357	unsigned long value;
358	int ret;
359
360	ret = kstrtoul(s: buf, base: `10`, res: &value);
361	if (ret < `0`)
362	return ret;
363	if (value > `1`)
364	return -EINVAL;
365
366	if (value)
367	set_bit(nr: flag, addr: &transparent_hugepage_flags);
368	else
369	clear_bit(nr: flag, addr: &transparent_hugepage_flags);
370
371	return count;
372	}
373
374	static ssize_t defrag_show(struct kobject *kobj,
375	struct kobj_attribute attr, char* *buf)
376	{
377	const char *output;
378
379	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
380	&transparent_hugepage_flags))
381	output = "[always] defer defer+madvise madvise never";
382	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
383	&transparent_hugepage_flags))
384	output = "always [defer] defer+madvise madvise never";
385	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
386	&transparent_hugepage_flags))
387	output = "always defer [defer+madvise] madvise never";
388	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
389	&transparent_hugepage_flags))
390	output = "always defer defer+madvise [madvise] never";
391	else
392	output = "always defer defer+madvise madvise [never]";
393
394	return sysfs_emit(buf, fmt: "%s\n", output);
395	}
396
397	static ssize_t defrag_store(struct kobject *kobj,
398	struct kobj_attribute *attr,
399	const char *buf, size_t count)
400	{
401	if (sysfs_streq(s1: buf, s2: "always")) {
402	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
403	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
404	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
405	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
406	} else if (sysfs_streq(s1: buf, s2: "defer+madvise")) {
407	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
408	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
409	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
410	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
411	} else if (sysfs_streq(s1: buf, s2: "defer")) {
412	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
413	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
414	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
415	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
416	} else if (sysfs_streq(s1: buf, s2: "madvise")) {
417	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
418	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
419	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
420	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
421	} else if (sysfs_streq(s1: buf, s2: "never")) {
422	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
423	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
424	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
425	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
426	} else
427	return -EINVAL;
428
429	return count;
430	}
431	static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
432
433	static ssize_t use_zero_page_show(struct kobject *kobj,
434	struct kobj_attribute attr, char* *buf)
435	{
436	return single_hugepage_flag_show(kobj, attr, buf,
437	flag: TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
438	}
439	static ssize_t use_zero_page_store(struct kobject *kobj,
440	struct kobj_attribute attr, const* char *buf, size_t count)
441	{
442	return single_hugepage_flag_store(kobj, attr, buf, count,
443	flag: TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
444	}
445	static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
446
447	static ssize_t hpage_pmd_size_show(struct kobject *kobj,
448	struct kobj_attribute attr, char* *buf)
449	{
450	return sysfs_emit(buf, fmt: "%lu\n", HPAGE_PMD_SIZE);
451	}
452	static struct kobj_attribute hpage_pmd_size_attr =
453	__ATTR_RO(hpage_pmd_size);
454
455	static ssize_t split_underused_thp_show(struct kobject *kobj,
456	struct kobj_attribute attr, char* *buf)
457	{
458	return sysfs_emit(buf, fmt: "%d\n", split_underused_thp);
459	}
460
461	static ssize_t split_underused_thp_store(struct kobject *kobj,
462	struct kobj_attribute *attr,
463	const char *buf, size_t count)
464	{
465	int err = kstrtobool(s: buf, res: &split_underused_thp);
466
467	if (err < `0`)
468	return err;
469
470	return count;
471	}
472
473	static struct kobj_attribute split_underused_thp_attr = __ATTR(
474	shrink_underused, `0644`, split_underused_thp_show, split_underused_thp_store);
475
476	static struct attribute *hugepage_attr[] = {
477	&enabled_attr.attr,
478	&defrag_attr.attr,
479	&use_zero_page_attr.attr,
480	&hpage_pmd_size_attr.attr,
481	#ifdef CONFIG_SHMEM
482	&shmem_enabled_attr.attr,
483	#endif
484	&split_underused_thp_attr.attr,
485	NULL,
486	};
487
488	static const struct attribute_group hugepage_attr_group = {
489	.attrs = hugepage_attr,
490	};
491
/* Forward declarations for helpers defined later in this file. */
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
/* Taken by anon_enabled_store() around its updates of the order bitmaps. */
static DEFINE_SPINLOCK(huge_anon_orders_lock);
/* Every struct thpsize created by hugepage_init_sysfs() is linked here. */
static LIST_HEAD(thpsize_list);
496
497	static ssize_t anon_enabled_show(struct kobject *kobj,
498	struct kobj_attribute attr, char* *buf)
499	{
500	int order = to_thpsize(kobj)->order;
501	const char *output;
502
503	if (test_bit(order, &huge_anon_orders_always))
504	output = "[always] inherit madvise never";
505	else if (test_bit(order, &huge_anon_orders_inherit))
506	output = "always [inherit] madvise never";
507	else if (test_bit(order, &huge_anon_orders_madvise))
508	output = "always inherit [madvise] never";
509	else
510	output = "always inherit madvise [never]";
511
512	return sysfs_emit(buf, fmt: "%s\n", output);
513	}
514
515	static ssize_t anon_enabled_store(struct kobject *kobj,
516	struct kobj_attribute *attr,
517	const char *buf, size_t count)
518	{
519	int order = to_thpsize(kobj)->order;
520	ssize_t ret = count;
521
522	if (sysfs_streq(s1: buf, s2: "always")) {
523	spin_lock(lock: &huge_anon_orders_lock);
524	clear_bit(nr: order, addr: &huge_anon_orders_inherit);
525	clear_bit(nr: order, addr: &huge_anon_orders_madvise);
526	set_bit(nr: order, addr: &huge_anon_orders_always);
527	spin_unlock(lock: &huge_anon_orders_lock);
528	} else if (sysfs_streq(s1: buf, s2: "inherit")) {
529	spin_lock(lock: &huge_anon_orders_lock);
530	clear_bit(nr: order, addr: &huge_anon_orders_always);
531	clear_bit(nr: order, addr: &huge_anon_orders_madvise);
532	set_bit(nr: order, addr: &huge_anon_orders_inherit);
533	spin_unlock(lock: &huge_anon_orders_lock);
534	} else if (sysfs_streq(s1: buf, s2: "madvise")) {
535	spin_lock(lock: &huge_anon_orders_lock);
536	clear_bit(nr: order, addr: &huge_anon_orders_always);
537	clear_bit(nr: order, addr: &huge_anon_orders_inherit);
538	set_bit(nr: order, addr: &huge_anon_orders_madvise);
539	spin_unlock(lock: &huge_anon_orders_lock);
540	} else if (sysfs_streq(s1: buf, s2: "never")) {
541	spin_lock(lock: &huge_anon_orders_lock);
542	clear_bit(nr: order, addr: &huge_anon_orders_always);
543	clear_bit(nr: order, addr: &huge_anon_orders_inherit);
544	clear_bit(nr: order, addr: &huge_anon_orders_madvise);
545	spin_unlock(lock: &huge_anon_orders_lock);
546	} else
547	ret = -EINVAL;
548
549	if (ret > `0`) {
550	int err;
551
552	err = start_stop_khugepaged();
553	if (err)
554	ret = err;
555	}
556	return ret;
557	}
558
559	static struct kobj_attribute anon_enabled_attr =
560	__ATTR(enabled, `0644`, anon_enabled_show, anon_enabled_store);
561
562	static struct attribute *anon_ctrl_attrs[] = {
563	&anon_enabled_attr.attr,
564	NULL,
565	};
566
567	static const struct attribute_group anon_ctrl_attr_grp = {
568	.attrs = anon_ctrl_attrs,
569	};
570
571	static struct attribute *file_ctrl_attrs[] = {
572	#ifdef CONFIG_SHMEM
573	&thpsize_shmem_enabled_attr.attr,
574	#endif
575	NULL,
576	};
577
578	static const struct attribute_group file_ctrl_attr_grp = {
579	.attrs = file_ctrl_attrs,
580	};
581
582	static struct attribute *any_ctrl_attrs[] = {
583	NULL,
584	};
585
586	static const struct attribute_group any_ctrl_attr_grp = {
587	.attrs = any_ctrl_attrs,
588	};
589
590	static const struct kobj_type thpsize_ktype = {
591	.release = &thpsize_release,
592	.sysfs_ops = &kobj_sysfs_ops,
593	};
594
595	DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{`0`}}};
596
597	static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
598	{
599	unsigned long sum = `0`;
600	int cpu;
601
602	for_each_possible_cpu(cpu) {
603	struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
604
605	sum += this->stats[order][item];
606	}
607
608	return sum;
609	}
610
611	#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
612	static ssize_t _name##_show(struct kobject *kobj, \
613	struct kobj_attribute attr, char buf) \
614	{ \
615	int order = to_thpsize(kobj)->order; \
616	\
617	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
618	} \
619	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
620
621	DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
622	DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
623	DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
624	DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
625	DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
626	DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
627	DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
628	DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
629	DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
630	#ifdef CONFIG_SHMEM
631	DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
632	DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
633	DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
634	#endif
635	DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
636	DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
637	DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
638	DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
639	DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
640
641	static struct attribute *anon_stats_attrs[] = {
642	&anon_fault_alloc_attr.attr,
643	&anon_fault_fallback_attr.attr,
644	&anon_fault_fallback_charge_attr.attr,
645	#ifndef CONFIG_SHMEM
646	&zswpout_attr.attr,
647	&swpin_attr.attr,
648	&swpin_fallback_attr.attr,
649	&swpin_fallback_charge_attr.attr,
650	&swpout_attr.attr,
651	&swpout_fallback_attr.attr,
652	#endif
653	&split_deferred_attr.attr,
654	&nr_anon_attr.attr,
655	&nr_anon_partially_mapped_attr.attr,
656	NULL,
657	};
658
659	static struct attribute_group anon_stats_attr_grp = {
660	.name = "stats",
661	.attrs = anon_stats_attrs,
662	};
663
664	static struct attribute *file_stats_attrs[] = {
665	#ifdef CONFIG_SHMEM
666	&shmem_alloc_attr.attr,
667	&shmem_fallback_attr.attr,
668	&shmem_fallback_charge_attr.attr,
669	#endif
670	NULL,
671	};
672
673	static struct attribute_group file_stats_attr_grp = {
674	.name = "stats",
675	.attrs = file_stats_attrs,
676	};
677
678	static struct attribute *any_stats_attrs[] = {
679	#ifdef CONFIG_SHMEM
680	&zswpout_attr.attr,
681	&swpin_attr.attr,
682	&swpin_fallback_attr.attr,
683	&swpin_fallback_charge_attr.attr,
684	&swpout_attr.attr,
685	&swpout_fallback_attr.attr,
686	#endif
687	&split_attr.attr,
688	&split_failed_attr.attr,
689	NULL,
690	};
691
692	static struct attribute_group any_stats_attr_grp = {
693	.name = "stats",
694	.attrs = any_stats_attrs,
695	};
696
697	static int sysfs_add_group(struct kobject *kobj,
698	const struct attribute_group *grp)
699	{
700	int ret = -ENOENT;
701
702	/*
703	* If the group is named, try to merge first, assuming the subdirectory
704	* was already created. This avoids the warning emitted by
705	* sysfs_create_group() if the directory already exists.
706	*/
707	if (grp->name)
708	ret = sysfs_merge_group(kobj, grp);
709	if (ret)
710	ret = sysfs_create_group(kobj, grp);
711
712	return ret;
713	}
714
715	static struct thpsize thpsize_create(int* order, struct kobject *parent)
716	{
717	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
718	struct thpsize *thpsize;
719	int ret = -ENOMEM;
720
721	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
722	if (!thpsize)
723	goto err;
724
725	thpsize->order = order;
726
727	ret = kobject_init_and_add(kobj: &thpsize->kobj, ktype: &thpsize_ktype, parent,
728	fmt: "hugepages-%lukB", size);
729	if (ret) {
730	kfree(objp: thpsize);
731	goto err;
732	}
733
734
735	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &any_ctrl_attr_grp);
736	if (ret)
737	goto err_put;
738
739	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &any_stats_attr_grp);
740	if (ret)
741	goto err_put;
742
743	if (BIT(order) & THP_ORDERS_ALL_ANON) {
744	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &anon_ctrl_attr_grp);
745	if (ret)
746	goto err_put;
747
748	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &anon_stats_attr_grp);
749	if (ret)
750	goto err_put;
751	}
752
753	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
754	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &file_ctrl_attr_grp);
755	if (ret)
756	goto err_put;
757
758	ret = sysfs_add_group(kobj: &thpsize->kobj, grp: &file_stats_attr_grp);
759	if (ret)
760	goto err_put;
761	}
762
763	return thpsize;
764	err_put:
765	kobject_put(kobj: &thpsize->kobj);
766	err:
767	return ERR_PTR(error: ret);
768	}
769
/* kobject release callback: frees the thpsize object that embeds @kobj. */
static void thpsize_release(struct kobject *kobj)
{
	struct thpsize *thpsize = to_thpsize(kobj);

	kfree(thpsize);
}
774
775	static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
776	{
777	int err;
778	struct thpsize *thpsize;
779	unsigned long orders;
780	int order;
781
782	/*
783	* Default to setting PMD-sized THP to inherit the global setting and
784	* disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
785	* constant so we have to do this here.
786	*/
787	if (!anon_orders_configured)
788	huge_anon_orders_inherit = BIT(PMD_ORDER);
789
790	*hugepage_kobj = kobject_create_and_add(name: "transparent_hugepage", parent: mm_kobj);
791	if (unlikely(!*hugepage_kobj)) {
792	pr_err("failed to create transparent hugepage kobject\n");
793	return -ENOMEM;
794	}
795
796	err = sysfs_create_group(kobj: *hugepage_kobj, grp: &hugepage_attr_group);
797	if (err) {
798	pr_err("failed to register transparent hugepage group\n");
799	goto delete_obj;
800	}
801
802	err = sysfs_create_group(kobj: *hugepage_kobj, grp: &khugepaged_attr_group);
803	if (err) {
804	pr_err("failed to register transparent hugepage group\n");
805	goto remove_hp_group;
806	}
807
808	orders = THP_ORDERS_ALL_ANON \| THP_ORDERS_ALL_FILE_DEFAULT;
809	order = highest_order(orders);
810	while (orders) {
811	thpsize = thpsize_create(order, parent: *hugepage_kobj);
812	if (IS_ERR(ptr: thpsize)) {
813	pr_err("failed to create thpsize for order %d\n", order);
814	err = PTR_ERR(ptr: thpsize);
815	goto remove_all;
816	}
817	list_add(new: &thpsize->node, head: &thpsize_list);
818	order = next_order(orders: &orders, prev: order);
819	}
820
821	return `0`;
822
823	remove_all:
824	hugepage_exit_sysfs(hugepage_kobj: *hugepage_kobj);
825	return err;
826	remove_hp_group:
827	sysfs_remove_group(kobj: *hugepage_kobj, grp: &hugepage_attr_group);
828	delete_obj:
829	kobject_put(kobj: *hugepage_kobj);
830	return err;
831	}
832
833	static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
834	{
835	struct thpsize thpsize, tmp;
836
837	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
838	list_del(entry: &thpsize->node);
839	kobject_put(kobj: &thpsize->kobj);
840	}
841
842	sysfs_remove_group(kobj: hugepage_kobj, grp: &khugepaged_attr_group);
843	sysfs_remove_group(kobj: hugepage_kobj, grp: &hugepage_attr_group);
844	kobject_put(kobj: hugepage_kobj);
845	}
846	#else
/* Without CONFIG_SYSFS there is nothing to create: report success. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

/* Without CONFIG_SYSFS there is nothing to tear down. */
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
855	#endif /* CONFIG_SYSFS */
856
857	static int __init thp_shrinker_init(void)
858	{
859	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE \|
860	SHRINKER_MEMCG_AWARE \|
861	SHRINKER_NONSLAB,
862	fmt: "thp-deferred_split");
863	if (!deferred_split_shrinker)
864	return -ENOMEM;
865
866	deferred_split_shrinker->count_objects = deferred_split_count;
867	deferred_split_shrinker->scan_objects = deferred_split_scan;
868	shrinker_register(shrinker: deferred_split_shrinker);
869
870	if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
871	/*
872	* Bump the reference of the huge_zero_folio and do not
873	* initialize the shrinker.
874	*
875	* huge_zero_folio will always be NULL on failure. We assume
876	* that get_huge_zero_folio() will most likely not fail as
877	* thp_shrinker_init() is invoked early on during boot.
878	*/
879	if (!get_huge_zero_folio())
880	pr_warn("Allocating persistent huge zero folio failed\n");
881	return `0`;
882	}
883
884	huge_zero_folio_shrinker = shrinker_alloc(flags: `0`, fmt: "thp-zero");
885	if (!huge_zero_folio_shrinker) {
886	shrinker_free(shrinker: deferred_split_shrinker);
887	return -ENOMEM;
888	}
889
890	huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
891	huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
892	shrinker_register(shrinker: huge_zero_folio_shrinker);
893
894	return `0`;
895	}
896
897	static void __init thp_shrinker_exit(void)
898	{
899	shrinker_free(shrinker: huge_zero_folio_shrinker);
900	shrinker_free(shrinker: deferred_split_shrinker);
901	}
902
903	static int __init hugepage_init(void)
904	{
905	int err;
906	struct kobject *hugepage_kobj;
907
908	if (!has_transparent_hugepage()) {
909	transparent_hugepage_flags = `1` << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
910	return -EINVAL;
911	}
912
913	/*
914	* hugepages can't be allocated by the buddy allocator
915	*/
916	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
917
918	err = hugepage_init_sysfs(hugepage_kobj: &hugepage_kobj);
919	if (err)
920	goto err_sysfs;
921
922	err = khugepaged_init();
923	if (err)
924	goto err_slab;
925
926	err = thp_shrinker_init();
927	if (err)
928	goto err_shrinker;
929
930	/*
931	* By default disable transparent hugepages on smaller systems,
932	* where the extra memory used could hurt more than TLB overhead
933	* is likely to save. The admin can still enable it through /sys.
934	*/
935	if (totalram_pages() < MB_TO_PAGES(`512`)) {
936	transparent_hugepage_flags = `0`;
937	return `0`;
938	}
939
940	err = start_stop_khugepaged();
941	if (err)
942	goto err_khugepaged;
943
944	return `0`;
945	err_khugepaged:
946	thp_shrinker_exit();
947	err_shrinker:
948	khugepaged_destroy();
949	err_slab:
950	hugepage_exit_sysfs(hugepage_kobj);
951	err_sysfs:
952	return err;
953	}
954	subsys_initcall(hugepage_init);
955
956	static int __init setup_transparent_hugepage(char *str)
957	{
958	int ret = `0`;
959	if (!str)
960	goto out;
961	if (!strcmp(str, "always")) {
962	set_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
963	addr: &transparent_hugepage_flags);
964	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
965	addr: &transparent_hugepage_flags);
966	ret = `1`;
967	} else if (!strcmp(str, "madvise")) {
968	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
969	addr: &transparent_hugepage_flags);
970	set_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
971	addr: &transparent_hugepage_flags);
972	ret = `1`;
973	} else if (!strcmp(str, "never")) {
974	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
975	addr: &transparent_hugepage_flags);
976	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
977	addr: &transparent_hugepage_flags);
978	ret = `1`;
979	}
980	out:
981	if (!ret)
982	pr_warn("transparent_hugepage= cannot parse, ignored\n");
983	return ret;
984	}
985	__setup("transparent_hugepage=", setup_transparent_hugepage);
986
987	static char str_dup[PAGE_SIZE] __initdata;
988	static int __init setup_thp_anon(char *str)
989	{
990	char token, range, policy, subtoken;
991	unsigned long always, inherit, madvise;
992	char start_size, end_size;
993	int start, end, nr;
994	char *p;
995
996	if (!str \|\| strlen(str) + `1` > PAGE_SIZE)
997	goto err;
998	strscpy(str_dup, str);
999
1000	always = huge_anon_orders_always;
1001	madvise = huge_anon_orders_madvise;
1002	inherit = huge_anon_orders_inherit;
1003	p = str_dup;
1004	while ((token = strsep(&p, ";")) != NULL) {
1005	range = strsep(&token, ":");
1006	policy = token;
1007
1008	if (!policy)
1009	goto err;
1010
1011	while ((subtoken = strsep(&range, ",")) != NULL) {
1012	if (strchr(subtoken, `'-'`)) {
1013	start_size = strsep(&subtoken, "-");
1014	end_size = subtoken;
1015
1016	start = get_order_from_str(size_str: start_size, THP_ORDERS_ALL_ANON);
1017	end = get_order_from_str(size_str: end_size, THP_ORDERS_ALL_ANON);
1018	} else {
1019	start_size = end_size = subtoken;
1020	start = end = get_order_from_str(size_str: subtoken,
1021	THP_ORDERS_ALL_ANON);
1022	}
1023
1024	if (start == -EINVAL) {
1025	pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
1026	goto err;
1027	}
1028
1029	if (end == -EINVAL) {
1030	pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
1031	goto err;
1032	}
1033
1034	if (start < `0` \|\| end < `0` \|\| start > end)
1035	goto err;
1036
1037	nr = end - start + `1`;
1038	if (!strcmp(policy, "always")) {
1039	bitmap_set(map: &always, start, nbits: nr);
1040	bitmap_clear(map: &inherit, start, nbits: nr);
1041	bitmap_clear(map: &madvise, start, nbits: nr);
1042	} else if (!strcmp(policy, "madvise")) {
1043	bitmap_set(map: &madvise, start, nbits: nr);
1044	bitmap_clear(map: &inherit, start, nbits: nr);
1045	bitmap_clear(map: &always, start, nbits: nr);
1046	} else if (!strcmp(policy, "inherit")) {
1047	bitmap_set(map: &inherit, start, nbits: nr);
1048	bitmap_clear(map: &madvise, start, nbits: nr);
1049	bitmap_clear(map: &always, start, nbits: nr);
1050	} else if (!strcmp(policy, "never")) {
1051	bitmap_clear(map: &inherit, start, nbits: nr);
1052	bitmap_clear(map: &madvise, start, nbits: nr);
1053	bitmap_clear(map: &always, start, nbits: nr);
1054	} else {
1055	pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
1056	goto err;
1057	}
1058	}
1059	}
1060
1061	huge_anon_orders_always = always;
1062	huge_anon_orders_madvise = madvise;
1063	huge_anon_orders_inherit = inherit;
1064	anon_orders_configured = true;
1065	return `1`;
1066
1067	err:
1068	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
1069	return `0`;
1070	}
1071	__setup("thp_anon=", setup_thp_anon);
1072
1073	pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1074	{
1075	if (likely(vma->vm_flags & VM_WRITE))
1076	pmd = pmd_mkwrite(pmd, vma);
1077	return pmd;
1078	}
1079
1080	static struct deferred_split split_queue_node(int* nid)
1081	{
1082	struct pglist_data *pgdata = NODE_DATA(nid);
1083
1084	return &pgdata->deferred_split_queue;
1085	}
1086
#ifdef CONFIG_MEMCG
/*
 * Map a deferred split @queue back to the memcg that embeds it.
 *
 * Return: NULL when memcg is disabled or when @queue is the per-node queue
 * of @folio's node; otherwise the mem_cgroup containing @queue.
 */
static inline
struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
					   struct deferred_split *queue)
{
	if (mem_cgroup_disabled())
		return NULL;
	if (split_queue_node(folio_nid(folio)) == queue)
		return NULL;
	return container_of(queue, struct mem_cgroup, deferred_split_queue);
}

/* Return @memcg's deferred split queue, or node @nid's when @memcg is NULL. */
static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
{
	return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
}
#else
/* Without CONFIG_MEMCG no queue belongs to a memcg. */
static inline
struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
					   struct deferred_split *queue)
{
	return NULL;
}

/* Without CONFIG_MEMCG only the per-node queue exists; @memcg is ignored. */
static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
{
	return split_queue_node(nid);
}
#endif
1116
1117	static struct deferred_split split_queue_lock(int* nid, struct mem_cgroup *memcg)
1118	{
1119	struct deferred_split *queue;
1120
1121	retry:
1122	queue = memcg_split_queue(nid, memcg);
1123	spin_lock(lock: &queue->split_queue_lock);
1124	/*
1125	* There is a period between setting memcg to dying and reparenting
1126	* deferred split queue, and during this period the THPs in the deferred
1127	* split queue will be hidden from the shrinker side.
1128	*/
1129	if (unlikely(memcg_is_dying(memcg))) {
1130	spin_unlock(lock: &queue->split_queue_lock);
1131	memcg = parent_mem_cgroup(memcg);
1132	goto retry;
1133	}
1134
1135	return queue;
1136	}
1137
1138	static struct deferred_split *
1139	split_queue_lock_irqsave(int nid, struct mem_cgroup memcg, unsigned* long *flags)
1140	{
1141	struct deferred_split *queue;
1142
1143	retry:
1144	queue = memcg_split_queue(nid, memcg);
1145	spin_lock_irqsave(&queue->split_queue_lock, *flags);
1146	if (unlikely(memcg_is_dying(memcg))) {
1147	spin_unlock_irqrestore(lock: &queue->split_queue_lock, flags: *flags);
1148	memcg = parent_mem_cgroup(memcg);
1149	goto retry;
1150	}
1151
1152	return queue;
1153	}
1154
/* Lock and return the deferred split queue selected by @folio's node and memcg. */
static struct deferred_split *folio_split_queue_lock(struct folio *folio)
{
	return split_queue_lock(folio_nid(folio), folio_memcg(folio));
}
1159
/*
 * irqsave variant of folio_split_queue_lock(); the saved interrupt state is
 * stored in *@flags.
 */
static struct deferred_split *
folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
{
	return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
}
1165
1166	static inline void split_queue_unlock(struct deferred_split *queue)
1167	{
1168	spin_unlock(lock: &queue->split_queue_lock);
1169	}
1170
1171	static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
1172	unsigned long flags)
1173	{
1174	spin_unlock_irqrestore(lock: &queue->split_queue_lock, flags);
1175	}
1176
/*
 * Return true if @folio is a large folio that is either the huge zero folio
 * or has the large_rmappable flag set; false otherwise.
 */
static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
		folio_test_large_rmappable(folio);
}
1185
1186	static unsigned long __thp_get_unmapped_area(struct file *filp,
1187	unsigned long addr, unsigned long len,
1188	loff_t off, unsigned long flags, unsigned long size,
1189	vm_flags_t vm_flags)
1190	{
1191	loff_t off_end = off + len;
1192	loff_t off_align = round_up(off, size);
1193	unsigned long len_pad, ret, off_sub;
1194
1195	if (!IS_ENABLED(CONFIG_64BIT) \|\| in_compat_syscall())
1196	return `0`;
1197
1198	if (off_end <= off_align \|\| (off_end - off_align) < size)
1199	return `0`;
1200
1201	len_pad = len + size;
1202	if (len_pad < len \|\| (off + len_pad) < off)
1203	return `0`;
1204
1205	ret = mm_get_unmapped_area_vmflags(filp, addr, len: len_pad,
1206	pgoff: off >> PAGE_SHIFT, flags, vm_flags);
1207
1208	/*
1209	* The failure might be due to length padding. The caller will retry
1210	* without the padding.
1211	*/
1212	if (IS_ERR_VALUE(ret))
1213	return `0`;
1214
1215	/*
1216	* Do not try to align to THP boundary if allocation at the address
1217	* hint succeeds.
1218	*/
1219	if (ret == addr)
1220	return addr;
1221
1222	off_sub = (off - ret) & (size - `1`);
1223
1224	if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
1225	return ret + size;
1226
1227	ret += off_sub;
1228	return ret;
1229	}
1230
1231	unsigned long thp_get_unmapped_area_vmflags(struct file filp, unsigned* long addr,
1232	unsigned long len, unsigned long pgoff, unsigned long flags,
1233	vm_flags_t vm_flags)
1234	{
1235	unsigned long ret;
1236	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1237
1238	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
1239	if (ret)
1240	return ret;
1241
1242	return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags,
1243	vm_flags);
1244	}
1245
/* thp_get_unmapped_area_vmflags() with vm_flags fixed to 0. */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1252
1253	static struct folio vma_alloc_anon_folio_pmd(struct* vm_area_struct *vma,
1254	unsigned long addr)
1255	{
1256	gfp_t gfp = vma_thp_gfp_mask(vma);
1257	const int order = HPAGE_PMD_ORDER;
1258	struct folio *folio;
1259
1260	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
1261
1262	if (unlikely(!folio)) {
1263	count_vm_event(item: THP_FAULT_FALLBACK);
1264	count_mthp_stat(order, item: MTHP_STAT_ANON_FAULT_FALLBACK);
1265	return NULL;
1266	}
1267
1268	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1269	if (mem_cgroup_charge(folio, mm: vma->vm_mm, gfp)) {
1270	folio_put(folio);
1271	count_vm_event(item: THP_FAULT_FALLBACK);
1272	count_vm_event(item: THP_FAULT_FALLBACK_CHARGE);
1273	count_mthp_stat(order, item: MTHP_STAT_ANON_FAULT_FALLBACK);
1274	count_mthp_stat(order, item: MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
1275	return NULL;
1276	}
1277	folio_throttle_swaprate(folio, gfp);
1278
1279	/*
1280	* When a folio is not zeroed during allocation (__GFP_ZERO not used)
1281	* or user folios require special handling, folio_zero_user() is used to
1282	* make sure that the page corresponding to the faulting address will be
1283	* hot in the cache after zeroing.
1284	*/
1285	if (user_alloc_needs_zeroing())
1286	folio_zero_user(folio, addr_hint: addr);
1287	/*
1288	* The memory barrier inside __folio_mark_uptodate makes sure that
1289	* folio_zero_user writes become visible before the set_pmd_at()
1290	* write.
1291	*/
1292	__folio_mark_uptodate(folio);
1293	return folio;
1294	}
1295
1296	void map_anon_folio_pmd_nopf(struct folio folio, pmd_t pmd,
1297	struct vm_area_struct vma, unsigned* long haddr)
1298	{
1299	pmd_t entry;
1300
1301	entry = folio_mk_pmd(folio, pgprot: vma->vm_page_prot);
1302	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
1303	folio_add_new_anon_rmap(folio, vma, address: haddr, RMAP_EXCLUSIVE);
1304	folio_add_lru_vma(folio, vma);
1305	set_pmd_at(mm: vma->vm_mm, addr: haddr, pmdp: pmd, pmd: entry);
1306	update_mmu_cache_pmd(vma, addr: haddr, pmd);
1307	deferred_split_folio(folio, partially_mapped: false);
1308	}
1309
1310	static void map_anon_folio_pmd_pf(struct folio folio, pmd_t pmd,
1311	struct vm_area_struct vma, unsigned* long haddr)
1312	{
1313	map_anon_folio_pmd_nopf(folio, pmd, vma, haddr);
1314	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
1315	count_vm_event(item: THP_FAULT_ALLOC);
1316	count_mthp_stat(HPAGE_PMD_ORDER, item: MTHP_STAT_ANON_FAULT_ALLOC);
1317	count_memcg_event_mm(mm: vma->vm_mm, idx: THP_FAULT_ALLOC);
1318	}
1319
1320	static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1321	{
1322	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1323	struct vm_area_struct *vma = vmf->vma;
1324	struct folio *folio;
1325	pgtable_t pgtable;
1326	vm_fault_t ret = `0`;
1327
1328	folio = vma_alloc_anon_folio_pmd(vma, addr: vmf->address);
1329	if (unlikely(!folio))
1330	return VM_FAULT_FALLBACK;
1331
1332	pgtable = pte_alloc_one(vma->vm_mm);
1333	if (unlikely(!pgtable)) {
1334	ret = VM_FAULT_OOM;
1335	goto release;
1336	}
1337
1338	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
1339	if (unlikely(!pmd_none(*vmf->pmd))) {
1340	goto unlock_release;
1341	} else {
1342	ret = check_stable_address_space(mm: vma->vm_mm);
1343	if (ret)
1344	goto unlock_release;
1345
1346	/ Deliver the page fault to userland /
1347	if (userfaultfd_missing(vma)) {
1348	spin_unlock(lock: vmf->ptl);
1349	folio_put(folio);
1350	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1351	ret = handle_userfault(vmf, VM_UFFD_MISSING);
1352	VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1353	return ret;
1354	}
1355	pgtable_trans_huge_deposit(mm: vma->vm_mm, pmdp: vmf->pmd, pgtable);
1356	map_anon_folio_pmd_pf(folio, pmd: vmf->pmd, vma, haddr);
1357	mm_inc_nr_ptes(mm: vma->vm_mm);
1358	spin_unlock(lock: vmf->ptl);
1359	}
1360
1361	return `0`;
1362	unlock_release:
1363	spin_unlock(lock: vmf->ptl);
1364	release:
1365	if (pgtable)
1366	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1367	folio_put(folio);
1368	return ret;
1369
1370	}
1371
1372	vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
1373	{
1374	struct vm_area_struct *vma = vmf->vma;
1375	vm_fault_t ret = `0`;
1376	spinlock_t *ptl;
1377	softleaf_t entry;
1378	struct page *page;
1379	struct folio *folio;
1380
1381	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
1382	vma_end_read(vma);
1383	return VM_FAULT_RETRY;
1384	}
1385
1386	ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
1387	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) {
1388	spin_unlock(lock: ptl);
1389	return `0`;
1390	}
1391
1392	entry = softleaf_from_pmd(pmd: vmf->orig_pmd);
1393	page = softleaf_to_page(entry);
1394	folio = page_folio(page);
1395	vmf->page = page;
1396	vmf->pte = NULL;
1397	if (folio_trylock(folio)) {
1398	folio_get(folio);
1399	spin_unlock(lock: ptl);
1400	ret = page_pgmap(page)->ops->migrate_to_ram(vmf);
1401	folio_unlock(folio);
1402	folio_put(folio);
1403	} else {
1404	spin_unlock(lock: ptl);
1405	}
1406
1407	return ret;
1408	}
1409
1410	/*
1411	* always: directly stall for all thp allocations
1412	* defer: wake kswapd and fail if not immediately available
1413	* defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1414	* fail if not immediately available
1415	* madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1416	* available
1417	* never: never stall for any thp allocation
1418	*/
1419	gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1420	{
1421	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1422
1423	/ Always do synchronous compaction /
1424	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1425	return GFP_TRANSHUGE \| (vma_madvised ? `0` : __GFP_NORETRY);
1426
1427	/ Kick kcompactd and fail quickly /
1428	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1429	return GFP_TRANSHUGE_LIGHT \| __GFP_KSWAPD_RECLAIM;
1430
1431	/ Synchronous compaction if madvised, otherwise kick kcompactd /
1432	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1433	return GFP_TRANSHUGE_LIGHT \|
1434	(vma_madvised ? __GFP_DIRECT_RECLAIM :
1435	__GFP_KSWAPD_RECLAIM);
1436
1437	/ Only do synchronous compaction if madvised /
1438	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1439	return GFP_TRANSHUGE_LIGHT \|
1440	(vma_madvised ? __GFP_DIRECT_RECLAIM : `0`);
1441
1442	return GFP_TRANSHUGE_LIGHT;
1443	}
1444
1445	/ Caller must hold page table lock. /
1446	static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1447	struct vm_area_struct vma, unsigned* long haddr, pmd_t *pmd,
1448	struct folio *zero_folio)
1449	{
1450	pmd_t entry;
1451	entry = folio_mk_pmd(folio: zero_folio, pgprot: vma->vm_page_prot);
1452	entry = pmd_mkspecial(pmd: entry);
1453	pgtable_trans_huge_deposit(mm, pmdp: pmd, pgtable);
1454	set_pmd_at(mm, addr: haddr, pmdp: pmd, pmd: entry);
1455	mm_inc_nr_ptes(mm);
1456	}
1457
1458	vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1459	{
1460	struct vm_area_struct *vma = vmf->vma;
1461	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1462	vm_fault_t ret;
1463
1464	if (!thp_vma_suitable_order(vma, addr: haddr, PMD_ORDER))
1465	return VM_FAULT_FALLBACK;
1466	ret = vmf_anon_prepare(vmf);
1467	if (ret)
1468	return ret;
1469	khugepaged_enter_vma(vma, vm_flags: vma->vm_flags);
1470
1471	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1472	!mm_forbids_zeropage(vma->vm_mm) &&
1473	transparent_hugepage_use_zero_page()) {
1474	pgtable_t pgtable;
1475	struct folio *zero_folio;
1476	vm_fault_t ret;
1477
1478	pgtable = pte_alloc_one(vma->vm_mm);
1479	if (unlikely(!pgtable))
1480	return VM_FAULT_OOM;
1481	zero_folio = mm_get_huge_zero_folio(mm: vma->vm_mm);
1482	if (unlikely(!zero_folio)) {
1483	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1484	count_vm_event(item: THP_FAULT_FALLBACK);
1485	return VM_FAULT_FALLBACK;
1486	}
1487	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
1488	ret = `0`;
1489	if (pmd_none(pmd: *vmf->pmd)) {
1490	ret = check_stable_address_space(mm: vma->vm_mm);
1491	if (ret) {
1492	spin_unlock(lock: vmf->ptl);
1493	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1494	} else if (userfaultfd_missing(vma)) {
1495	spin_unlock(lock: vmf->ptl);
1496	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1497	ret = handle_userfault(vmf, VM_UFFD_MISSING);
1498	VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1499	} else {
1500	set_huge_zero_folio(pgtable, mm: vma->vm_mm, vma,
1501	haddr, pmd: vmf->pmd, zero_folio);
1502	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
1503	spin_unlock(lock: vmf->ptl);
1504	}
1505	} else {
1506	spin_unlock(lock: vmf->ptl);
1507	pte_free(mm: vma->vm_mm, pte_page: pgtable);
1508	}
1509	return ret;
1510	}
1511
1512	return __do_huge_pmd_anonymous_page(vmf);
1513	}
1514
/*
 * Argument for insert_pmd()/insert_pud(): either a folio or a raw pfn.
 * @is_folio selects which union member is valid.
 */
struct folio_or_pfn {
	union {
		struct folio *folio;
		unsigned long pfn;
	};
	bool is_folio;
};
1522
1523	static vm_fault_t insert_pmd(struct vm_area_struct vma, unsigned* long addr,
1524	pmd_t pmd, struct* folio_or_pfn fop, pgprot_t prot,
1525	bool write)
1526	{
1527	struct mm_struct *mm = vma->vm_mm;
1528	pgtable_t pgtable = NULL;
1529	spinlock_t *ptl;
1530	pmd_t entry;
1531
1532	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
1533	return VM_FAULT_SIGBUS;
1534
1535	if (arch_needs_pgtable_deposit()) {
1536	pgtable = pte_alloc_one(vma->vm_mm);
1537	if (!pgtable)
1538	return VM_FAULT_OOM;
1539	}
1540
1541	ptl = pmd_lock(mm, pmd);
1542	if (!pmd_none(pmd: *pmd)) {
1543	const unsigned long pfn = fop.is_folio ? folio_pfn(folio: fop.folio) :
1544	fop.pfn;
1545
1546	if (write) {
1547	if (pmd_pfn(pmd: *pmd) != pfn) {
1548	WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1549	goto out_unlock;
1550	}
1551	entry = pmd_mkyoung(pmd: *pmd);
1552	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
1553	if (pmdp_set_access_flags(vma, address: addr, pmdp: pmd, entry, dirty: `1`))
1554	update_mmu_cache_pmd(vma, addr, pmd);
1555	}
1556	goto out_unlock;
1557	}
1558
1559	if (fop.is_folio) {
1560	entry = folio_mk_pmd(folio: fop.folio, pgprot: vma->vm_page_prot);
1561
1562	if (is_huge_zero_folio(folio: fop.folio)) {
1563	entry = pmd_mkspecial(pmd: entry);
1564	} else {
1565	folio_get(folio: fop.folio);
1566	folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
1567	add_mm_counter(mm, member: mm_counter_file(folio: fop.folio), HPAGE_PMD_NR);
1568	}
1569	} else {
1570	entry = pmd_mkhuge(pmd: pfn_pmd(page_nr: fop.pfn, pgprot: prot));
1571	entry = pmd_mkspecial(pmd: entry);
1572	}
1573	if (write) {
1574	entry = pmd_mkyoung(pmd: pmd_mkdirty(pmd: entry));
1575	entry = maybe_pmd_mkwrite(pmd: entry, vma);
1576	}
1577
1578	if (pgtable) {
1579	pgtable_trans_huge_deposit(mm, pmdp: pmd, pgtable);
1580	mm_inc_nr_ptes(mm);
1581	pgtable = NULL;
1582	}
1583
1584	set_pmd_at(mm, addr, pmdp: pmd, pmd: entry);
1585	update_mmu_cache_pmd(vma, addr, pmd);
1586
1587	out_unlock:
1588	spin_unlock(lock: ptl);
1589	if (pgtable)
1590	pte_free(mm, pte_page: pgtable);
1591	return VM_FAULT_NOPAGE;
1592	}
1593
1594	/**
1595	* vmf_insert_pfn_pmd - insert a pmd size pfn
1596	* @vmf: Structure describing the fault
1597	* @pfn: pfn to insert
1598	* @write: whether it's a write fault
1599	*
1600	* Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
1601	*
1602	* Return: vm_fault_t value.
1603	*/
1604	vm_fault_t vmf_insert_pfn_pmd(struct vm_fault vmf, unsigned* long pfn,
1605	bool write)
1606	{
1607	unsigned long addr = vmf->address & PMD_MASK;
1608	struct vm_area_struct *vma = vmf->vma;
1609	pgprot_t pgprot = vma->vm_page_prot;
1610	struct folio_or_pfn fop = {
1611	.pfn = pfn,
1612	};
1613
1614	/*
1615	* If we had pmd_special, we could avoid all these restrictions,
1616	* but we need to be consistent with PTEs and architectures that
1617	* can't support a 'special' bit.
1618	*/
1619	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)));
1620	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
1621	(VM_PFNMAP\|VM_MIXEDMAP));
1622	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1623
1624	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
1625
1626	return insert_pmd(vma, addr, pmd: vmf->pmd, fop, prot: pgprot, write);
1627	}
1628	EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
1629
1630	vm_fault_t vmf_insert_folio_pmd(struct vm_fault vmf, struct* folio *folio,
1631	bool write)
1632	{
1633	struct vm_area_struct *vma = vmf->vma;
1634	unsigned long addr = vmf->address & PMD_MASK;
1635	struct folio_or_pfn fop = {
1636	.folio = folio,
1637	.is_folio = true,
1638	};
1639
1640	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
1641	return VM_FAULT_SIGBUS;
1642
1643	return insert_pmd(vma, addr, pmd: vmf->pmd, fop, prot: vma->vm_page_prot, write);
1644	}
1645	EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
1646
1647	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1648	static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1649	{
1650	if (likely(vma->vm_flags & VM_WRITE))
1651	pud = pud_mkwrite(pud);
1652	return pud;
1653	}
1654
1655	static vm_fault_t insert_pud(struct vm_area_struct vma, unsigned* long addr,
1656	pud_t pud, struct* folio_or_pfn fop, pgprot_t prot, bool write)
1657	{
1658	struct mm_struct *mm = vma->vm_mm;
1659	spinlock_t *ptl;
1660	pud_t entry;
1661
1662	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
1663	return VM_FAULT_SIGBUS;
1664
1665	ptl = pud_lock(mm, pud);
1666	if (!pud_none(pud: *pud)) {
1667	const unsigned long pfn = fop.is_folio ? folio_pfn(folio: fop.folio) :
1668	fop.pfn;
1669
1670	if (write) {
1671	if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
1672	goto out_unlock;
1673	entry = pud_mkyoung(pud: *pud);
1674	entry = maybe_pud_mkwrite(pud: pud_mkdirty(pud: entry), vma);
1675	if (pudp_set_access_flags(vma, address: addr, pudp: pud, entry, dirty: `1`))
1676	update_mmu_cache_pud(vma, addr, pud);
1677	}
1678	goto out_unlock;
1679	}
1680
1681	if (fop.is_folio) {
1682	entry = folio_mk_pud(folio: fop.folio, pgprot: vma->vm_page_prot);
1683
1684	folio_get(folio: fop.folio);
1685	folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
1686	add_mm_counter(mm, member: mm_counter_file(folio: fop.folio), HPAGE_PUD_NR);
1687	} else {
1688	entry = pud_mkhuge(pud: pfn_pud(page_nr: fop.pfn, pgprot: prot));
1689	entry = pud_mkspecial(pud: entry);
1690	}
1691	if (write) {
1692	entry = pud_mkyoung(pud: pud_mkdirty(pud: entry));
1693	entry = maybe_pud_mkwrite(pud: entry, vma);
1694	}
1695	set_pud_at(mm, addr, pudp: pud, pud: entry);
1696	update_mmu_cache_pud(vma, addr, pud);
1697	out_unlock:
1698	spin_unlock(lock: ptl);
1699	return VM_FAULT_NOPAGE;
1700	}
1701
1702	/**
1703	* vmf_insert_pfn_pud - insert a pud size pfn
1704	* @vmf: Structure describing the fault
1705	* @pfn: pfn to insert
1706	* @write: whether it's a write fault
1707	*
1708	* Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1709	*
1710	* Return: vm_fault_t value.
1711	*/
1712	vm_fault_t vmf_insert_pfn_pud(struct vm_fault vmf, unsigned* long pfn,
1713	bool write)
1714	{
1715	unsigned long addr = vmf->address & PUD_MASK;
1716	struct vm_area_struct *vma = vmf->vma;
1717	pgprot_t pgprot = vma->vm_page_prot;
1718	struct folio_or_pfn fop = {
1719	.pfn = pfn,
1720	};
1721
1722	/*
1723	* If we had pud_special, we could avoid all these restrictions,
1724	* but we need to be consistent with PTEs and architectures that
1725	* can't support a 'special' bit.
1726	*/
1727	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)));
1728	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
1729	(VM_PFNMAP\|VM_MIXEDMAP));
1730	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1731
1732	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
1733
1734	return insert_pud(vma, addr, pud: vmf->pud, fop, prot: pgprot, write);
1735	}
1736	EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1737
1738	/**
1739	* vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
1740	* @vmf: Structure describing the fault
1741	* @folio: folio to insert
1742	* @write: whether it's a write fault
1743	*
1744	* Return: vm_fault_t value.
1745	*/
1746	vm_fault_t vmf_insert_folio_pud(struct vm_fault vmf, struct* folio *folio,
1747	bool write)
1748	{
1749	struct vm_area_struct *vma = vmf->vma;
1750	unsigned long addr = vmf->address & PUD_MASK;
1751	struct folio_or_pfn fop = {
1752	.folio = folio,
1753	.is_folio = true,
1754	};
1755
1756	if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
1757	return VM_FAULT_SIGBUS;
1758
1759	return insert_pud(vma, addr, pud: vmf->pud, fop, prot: vma->vm_page_prot, write);
1760	}
1761	EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
1762	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1763
1764	/**
1765	* touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
1766	* @vma: The VMA covering @addr
1767	* @addr: The virtual address
1768	* @pmd: pmd pointer into the page table mapping @addr
1769	* @write: Whether it's a write access
1770	*
1771	* Return: whether the pmd entry is changed
1772	*/
1773	bool touch_pmd(struct vm_area_struct vma, unsigned* long addr,
1774	pmd_t *pmd, bool write)
1775	{
1776	pmd_t entry;
1777
1778	entry = pmd_mkyoung(pmd: *pmd);
1779	if (write)
1780	entry = pmd_mkdirty(pmd: entry);
1781	if (pmdp_set_access_flags(vma, address: addr & HPAGE_PMD_MASK,
1782	pmdp: pmd, entry, dirty: write)) {
1783	update_mmu_cache_pmd(vma, addr, pmd);
1784	return true;
1785	}
1786
1787	return false;
1788	}
1789
1790	static void copy_huge_non_present_pmd(
1791	struct mm_struct dst_mm, struct* mm_struct *src_mm,
1792	pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
1793	struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1794	pmd_t pmd, pgtable_t pgtable)
1795	{
1796	softleaf_t entry = softleaf_from_pmd(pmd);
1797	struct folio *src_folio;
1798
1799	VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd));
1800
1801	if (softleaf_is_migration_write(entry) \|\|
1802	softleaf_is_migration_read_exclusive(entry)) {
1803	entry = make_readable_migration_entry(offset: swp_offset(entry));
1804	pmd = swp_entry_to_pmd(entry);
1805	if (pmd_swp_soft_dirty(pmd: *src_pmd))
1806	pmd = pmd_swp_mksoft_dirty(pmd);
1807	if (pmd_swp_uffd_wp(pmd: *src_pmd))
1808	pmd = pmd_swp_mkuffd_wp(pmd);
1809	set_pmd_at(mm: src_mm, addr, pmdp: src_pmd, pmd);
1810	} else if (softleaf_is_device_private(entry)) {
1811	/*
1812	* For device private entries, since there are no
1813	* read exclusive entries, writable = !readable
1814	*/
1815	if (softleaf_is_device_private_write(entry)) {
1816	entry = make_readable_device_private_entry(offset: swp_offset(entry));
1817	pmd = swp_entry_to_pmd(entry);
1818
1819	if (pmd_swp_soft_dirty(pmd: *src_pmd))
1820	pmd = pmd_swp_mksoft_dirty(pmd);
1821	if (pmd_swp_uffd_wp(pmd: *src_pmd))
1822	pmd = pmd_swp_mkuffd_wp(pmd);
1823	set_pmd_at(mm: src_mm, addr, pmdp: src_pmd, pmd);
1824	}
1825
1826	src_folio = softleaf_to_folio(entry);
1827	VM_WARN_ON(!folio_test_large(src_folio));
1828
1829	folio_get(folio: src_folio);
1830	/*
1831	* folio_try_dup_anon_rmap_pmd does not fail for
1832	* device private entries.
1833	*/
1834	folio_try_dup_anon_rmap_pmd(folio: src_folio, page: &src_folio->page,
1835	dst_vma, src_vma);
1836	}
1837
1838	add_mm_counter(mm: dst_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
1839	mm_inc_nr_ptes(mm: dst_mm);
1840	pgtable_trans_huge_deposit(mm: dst_mm, pmdp: dst_pmd, pgtable);
1841	if (!userfaultfd_wp(vma: dst_vma))
1842	pmd = pmd_swp_clear_uffd_wp(pmd);
1843	set_pmd_at(mm: dst_mm, addr, pmdp: dst_pmd, pmd);
1844	}
1845
1846	int copy_huge_pmd(struct mm_struct dst_mm, struct* mm_struct *src_mm,
1847	pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
1848	struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1849	{
1850	spinlock_t dst_ptl, src_ptl;
1851	struct page *src_page;
1852	struct folio *src_folio;
1853	pmd_t pmd;
1854	pgtable_t pgtable = NULL;
1855	int ret = -ENOMEM;
1856
1857	pmd = pmdp_get_lockless(pmdp: src_pmd);
1858	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
1859	!is_huge_zero_pmd(pmd))) {
1860	dst_ptl = pmd_lock(mm: dst_mm, pmd: dst_pmd);
1861	src_ptl = pmd_lockptr(mm: src_mm, pmd: src_pmd);
1862	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1863	/*
1864	* No need to recheck the pmd, it can't change with write
1865	* mmap lock held here.
1866	*
1867	* Meanwhile, making sure it's not a CoW VMA with writable
1868	* mapping, otherwise it means either the anon page wrongly
1869	* applied special bit, or we made the PRIVATE mapping be
1870	* able to wrongly write to the backend MMIO.
1871	*/
1872	VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
1873	goto set_pmd;
1874	}
1875
1876	/ Skip if can be re-fill on fault /
1877	if (!vma_is_anonymous(vma: dst_vma))
1878	return `0`;
1879
1880	pgtable = pte_alloc_one(dst_mm);
1881	if (unlikely(!pgtable))
1882	goto out;
1883
1884	dst_ptl = pmd_lock(mm: dst_mm, pmd: dst_pmd);
1885	src_ptl = pmd_lockptr(mm: src_mm, pmd: src_pmd);
1886	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1887
1888	ret = -EAGAIN;
1889	pmd = *src_pmd;
1890
1891	if (unlikely(thp_migration_supported() &&
1892	pmd_is_valid_softleaf(pmd))) {
1893	copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
1894	dst_vma, src_vma, pmd, pgtable);
1895	ret = `0`;
1896	goto out_unlock;
1897	}
1898
1899	if (unlikely(!pmd_trans_huge(pmd))) {
1900	pte_free(mm: dst_mm, pte_page: pgtable);
1901	goto out_unlock;
1902	}
1903	/*
1904	* When page table lock is held, the huge zero pmd should not be
1905	* under splitting since we don't split the page itself, only pmd to
1906	* a page table.
1907	*/
1908	if (is_huge_zero_pmd(pmd)) {
1909	/*
1910	* mm_get_huge_zero_folio() will never allocate a new
1911	* folio here, since we already have a zero page to
1912	* copy. It just takes a reference.
1913	*/
1914	mm_get_huge_zero_folio(mm: dst_mm);
1915	goto out_zero_page;
1916	}
1917
1918	src_page = pmd_page(pmd);
1919	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1920	src_folio = page_folio(src_page);
1921
1922	folio_get(folio: src_folio);
1923	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
1924	/ Page maybe pinned: split and retry the fault on PTEs. /
1925	folio_put(folio: src_folio);
1926	pte_free(mm: dst_mm, pte_page: pgtable);
1927	spin_unlock(lock: src_ptl);
1928	spin_unlock(lock: dst_ptl);
1929	__split_huge_pmd(vma: src_vma, pmd: src_pmd, address: addr, freeze: false);
1930	return -EAGAIN;
1931	}
1932	add_mm_counter(mm: dst_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
1933	out_zero_page:
1934	mm_inc_nr_ptes(mm: dst_mm);
1935	pgtable_trans_huge_deposit(mm: dst_mm, pmdp: dst_pmd, pgtable);
1936	pmdp_set_wrprotect(mm: src_mm, addr, pmdp: src_pmd);
1937	if (!userfaultfd_wp(vma: dst_vma))
1938	pmd = pmd_clear_uffd_wp(pmd);
1939	pmd = pmd_wrprotect(pmd);
1940	set_pmd:
1941	pmd = pmd_mkold(pmd);
1942	set_pmd_at(mm: dst_mm, addr, pmdp: dst_pmd, pmd);
1943
1944	ret = `0`;
1945	out_unlock:
1946	spin_unlock(lock: src_ptl);
1947	spin_unlock(lock: dst_ptl);
1948	out:
1949	return ret;
1950	}
1951
1952	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1953	void touch_pud(struct vm_area_struct vma, unsigned* long addr,
1954	pud_t *pud, bool write)
1955	{
1956	pud_t _pud;
1957
1958	_pud = pud_mkyoung(pud: *pud);
1959	if (write)
1960	_pud = pud_mkdirty(pud: _pud);
1961	if (pudp_set_access_flags(vma, address: addr & HPAGE_PUD_MASK,
1962	pudp: pud, entry: _pud, dirty: write))
1963	update_mmu_cache_pud(vma, addr, pud);
1964	}
1965
1966	int copy_huge_pud(struct mm_struct dst_mm, struct* mm_struct *src_mm,
1967	pud_t dst_pud, pud_t src_pud, unsigned long addr,
1968	struct vm_area_struct *vma)
1969	{
1970	spinlock_t dst_ptl, src_ptl;
1971	pud_t pud;
1972	int ret;
1973
1974	dst_ptl = pud_lock(mm: dst_mm, pud: dst_pud);
1975	src_ptl = pud_lockptr(mm: src_mm, pud: src_pud);
1976	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1977
1978	ret = -EAGAIN;
1979	pud = *src_pud;
1980	if (unlikely(!pud_trans_huge(pud)))
1981	goto out_unlock;
1982
1983	/*
1984	* TODO: once we support anonymous pages, use
1985	* folio_try_dup_anon_rmap_*() and split if duplicating fails.
1986	*/
1987	if (is_cow_mapping(flags: vma->vm_flags) && pud_write(pud)) {
1988	pudp_set_wrprotect(mm: src_mm, address: addr, pudp: src_pud);
1989	pud = pud_wrprotect(pud);
1990	}
1991	pud = pud_mkold(pud);
1992	set_pud_at(mm: dst_mm, addr, pudp: dst_pud, pud);
1993
1994	ret = `0`;
1995	out_unlock:
1996	spin_unlock(lock: src_ptl);
1997	spin_unlock(lock: dst_ptl);
1998	return ret;
1999	}
2000
2001	void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
2002	{
2003	bool write = vmf->flags & FAULT_FLAG_WRITE;
2004
2005	vmf->ptl = pud_lock(mm: vmf->vma->vm_mm, pud: vmf->pud);
2006	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
2007	goto unlock;
2008
2009	touch_pud(vma: vmf->vma, addr: vmf->address, pud: vmf->pud, write);
2010	unlock:
2011	spin_unlock(lock: vmf->ptl);
2012	}
2013	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2014
2015	bool huge_pmd_set_accessed(struct vm_fault *vmf)
2016	{
2017	bool write = vmf->flags & FAULT_FLAG_WRITE;
2018
2019	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
2020	return false;
2021
2022	return touch_pmd(vma: vmf->vma, addr: vmf->address, pmd: vmf->pmd, write);
2023	}
2024
2025	static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
2026	{
2027	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2028	struct vm_area_struct *vma = vmf->vma;
2029	struct mmu_notifier_range range;
2030	struct folio *folio;
2031	vm_fault_t ret = `0`;
2032
2033	folio = vma_alloc_anon_folio_pmd(vma, addr: vmf->address);
2034	if (unlikely(!folio))
2035	return VM_FAULT_FALLBACK;
2036
2037	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm, start: haddr,
2038	end: haddr + HPAGE_PMD_SIZE);
2039	mmu_notifier_invalidate_range_start(range: &range);
2040	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
2041	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
2042	goto release;
2043	ret = check_stable_address_space(mm: vma->vm_mm);
2044	if (ret)
2045	goto release;
2046	(void)pmdp_huge_clear_flush(vma, address: haddr, pmdp: vmf->pmd);
2047	map_anon_folio_pmd_pf(folio, pmd: vmf->pmd, vma, haddr);
2048	goto unlock;
2049	release:
2050	folio_put(folio);
2051	unlock:
2052	spin_unlock(lock: vmf->ptl);
2053	mmu_notifier_invalidate_range_end(range: &range);
2054	return ret;
2055	}
2056
2057	vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
2058	{
2059	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
2060	struct vm_area_struct *vma = vmf->vma;
2061	struct folio *folio;
2062	struct page *page;
2063	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2064	pmd_t orig_pmd = vmf->orig_pmd;
2065
2066	vmf->ptl = pmd_lockptr(mm: vma->vm_mm, pmd: vmf->pmd);
2067	VM_BUG_ON_VMA(!vma->anon_vma, vma);
2068
2069	if (is_huge_zero_pmd(pmd: orig_pmd)) {
2070	vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
2071
2072	if (!(ret & VM_FAULT_FALLBACK))
2073	return ret;
2074
2075	/ Fallback to splitting PMD if THP cannot be allocated /
2076	goto fallback;
2077	}
2078
2079	spin_lock(lock: vmf->ptl);
2080
2081	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
2082	spin_unlock(lock: vmf->ptl);
2083	return `0`;
2084	}
2085
2086	page = pmd_page(orig_pmd);
2087	folio = page_folio(page);
2088	VM_BUG_ON_PAGE(!PageHead(page), page);
2089
2090	/ Early check when only holding the PT lock. /
2091	if (PageAnonExclusive(page))
2092	goto reuse;
2093
2094	if (!folio_trylock(folio)) {
2095	folio_get(folio);
2096	spin_unlock(lock: vmf->ptl);
2097	folio_lock(folio);
2098	spin_lock(lock: vmf->ptl);
2099	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
2100	spin_unlock(lock: vmf->ptl);
2101	folio_unlock(folio);
2102	folio_put(folio);
2103	return `0`;
2104	}
2105	folio_put(folio);
2106	}
2107
2108	/ Recheck after temporarily dropping the PT lock. /
2109	if (PageAnonExclusive(page)) {
2110	folio_unlock(folio);
2111	goto reuse;
2112	}
2113
2114	/*
2115	* See do_wp_page(): we can only reuse the folio exclusively if
2116	* there are no additional references. Note that we always drain
2117	* the LRU cache immediately after adding a THP.
2118	*/
2119	if (folio_ref_count(folio) >
2120	`1` + folio_test_swapcache(folio) * folio_nr_pages(folio))
2121	goto unlock_fallback;
2122	if (folio_test_swapcache(folio))
2123	folio_free_swap(folio);
2124	if (folio_ref_count(folio) == `1`) {
2125	pmd_t entry;
2126
2127	folio_move_anon_rmap(folio, vma);
2128	SetPageAnonExclusive(page);
2129	folio_unlock(folio);
2130	reuse:
2131	if (unlikely(unshare)) {
2132	spin_unlock(lock: vmf->ptl);
2133	return `0`;
2134	}
2135	entry = pmd_mkyoung(pmd: orig_pmd);
2136	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
2137	if (pmdp_set_access_flags(vma, address: haddr, pmdp: vmf->pmd, entry, dirty: `1`))
2138	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
2139	spin_unlock(lock: vmf->ptl);
2140	return `0`;
2141	}
2142
2143	unlock_fallback:
2144	folio_unlock(folio);
2145	spin_unlock(lock: vmf->ptl);
2146	fallback:
2147	__split_huge_pmd(vma, pmd: vmf->pmd, address: vmf->address, freeze: false);
2148	return VM_FAULT_FALLBACK;
2149	}
2150
2151	static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
2152	unsigned long addr, pmd_t pmd)
2153	{
2154	struct page *page;
2155
2156	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
2157	return false;
2158
2159	/ Don't touch entries that are not even readable (NUMA hinting). /
2160	if (pmd_protnone(pmd))
2161	return false;
2162
2163	/ Do we need write faults for softdirty tracking? /
2164	if (pmd_needs_soft_dirty_wp(vma, pmd))
2165	return false;
2166
2167	/ Do we need write faults for uffd-wp tracking? /
2168	if (userfaultfd_huge_pmd_wp(vma, pmd))
2169	return false;
2170
2171	if (!(vma->vm_flags & VM_SHARED)) {
2172	/ See can_change_pte_writable(). /
2173	page = vm_normal_page_pmd(vma, addr, pmd);
2174	return page && PageAnon(page) && PageAnonExclusive(page);
2175	}
2176
2177	/ See can_change_pte_writable(). /
2178	return pmd_dirty(pmd);
2179	}
2180
2181	/ NUMA hinting page fault entry point for trans huge pmds /
2182	vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
2183	{
2184	struct vm_area_struct *vma = vmf->vma;
2185	struct folio *folio;
2186	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2187	int nid = NUMA_NO_NODE;
2188	int target_nid, last_cpupid;
2189	pmd_t pmd, old_pmd;
2190	bool writable = false;
2191	int flags = `0`;
2192
2193	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
2194	old_pmd = pmdp_get(pmdp: vmf->pmd);
2195
2196	if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
2197	spin_unlock(lock: vmf->ptl);
2198	return `0`;
2199	}
2200
2201	pmd = pmd_modify(pmd: old_pmd, newprot: vma->vm_page_prot);
2202
2203	/*
2204	* Detect now whether the PMD could be writable; this information
2205	* is only valid while holding the PT lock.
2206	*/
2207	writable = pmd_write(pmd);
2208	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
2209	can_change_pmd_writable(vma, addr: vmf->address, pmd))
2210	writable = true;
2211
2212	folio = vm_normal_folio_pmd(vma, addr: haddr, pmd);
2213	if (!folio)
2214	goto out_map;
2215
2216	nid = folio_nid(folio);
2217
2218	target_nid = numa_migrate_check(folio, vmf, addr: haddr, flags: &flags, writable,
2219	last_cpupid: &last_cpupid);
2220	if (target_nid == NUMA_NO_NODE)
2221	goto out_map;
2222	if (migrate_misplaced_folio_prepare(folio, vma, node: target_nid)) {
2223	flags \|= TNF_MIGRATE_FAIL;
2224	goto out_map;
2225	}
2226	/ The folio is isolated and isolation code holds a folio reference. /
2227	spin_unlock(lock: vmf->ptl);
2228	writable = false;
2229
2230	if (!migrate_misplaced_folio(folio, node: target_nid)) {
2231	flags \|= TNF_MIGRATED;
2232	nid = target_nid;
2233	task_numa_fault(last_node: last_cpupid, node: nid, HPAGE_PMD_NR, flags);
2234	return `0`;
2235	}
2236
2237	flags \|= TNF_MIGRATE_FAIL;
2238	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
2239	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
2240	spin_unlock(lock: vmf->ptl);
2241	return `0`;
2242	}
2243	out_map:
2244	/ Restore the PMD /
2245	pmd = pmd_modify(pmd: pmdp_get(pmdp: vmf->pmd), newprot: vma->vm_page_prot);
2246	pmd = pmd_mkyoung(pmd);
2247	if (writable)
2248	pmd = pmd_mkwrite(pmd, vma);
2249	set_pmd_at(mm: vma->vm_mm, addr: haddr, pmdp: vmf->pmd, pmd);
2250	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
2251	spin_unlock(lock: vmf->ptl);
2252
2253	if (nid != NUMA_NO_NODE)
2254	task_numa_fault(last_node: last_cpupid, node: nid, HPAGE_PMD_NR, flags);
2255	return `0`;
2256	}
2257
2258	/*
2259	* Return true if we do MADV_FREE successfully on entire pmd page.
2260	* Otherwise, return false.
2261	*/
2262	bool madvise_free_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
2263	pmd_t pmd, unsigned* long addr, unsigned long next)
2264	{
2265	spinlock_t *ptl;
2266	pmd_t orig_pmd;
2267	struct folio *folio;
2268	struct mm_struct *mm = tlb->mm;
2269	bool ret = false;
2270
2271	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2272
2273	ptl = pmd_trans_huge_lock(pmd, vma);
2274	if (!ptl)
2275	goto out_unlocked;
2276
2277	orig_pmd = *pmd;
2278	if (is_huge_zero_pmd(pmd: orig_pmd))
2279	goto out;
2280
2281	if (unlikely(!pmd_present(orig_pmd))) {
2282	VM_BUG_ON(thp_migration_supported() &&
2283	!pmd_is_migration_entry(orig_pmd));
2284	goto out;
2285	}
2286
2287	folio = pmd_folio(orig_pmd);
2288	/*
2289	* If other processes are mapping this folio, we couldn't discard
2290	* the folio unless they all do MADV_FREE so let's skip the folio.
2291	*/
2292	if (folio_maybe_mapped_shared(folio))
2293	goto out;
2294
2295	if (!folio_trylock(folio))
2296	goto out;
2297
2298	/*
2299	* If user want to discard part-pages of THP, split it so MADV_FREE
2300	* will deactivate only them.
2301	*/
2302	if (next - addr != HPAGE_PMD_SIZE) {
2303	folio_get(folio);
2304	spin_unlock(lock: ptl);
2305	split_folio(folio);
2306	folio_unlock(folio);
2307	folio_put(folio);
2308	goto out_unlocked;
2309	}
2310
2311	if (folio_test_dirty(folio))
2312	folio_clear_dirty(folio);
2313	folio_unlock(folio);
2314
2315	if (pmd_young(pmd: orig_pmd) \|\| pmd_dirty(pmd: orig_pmd)) {
2316	pmdp_invalidate(vma, address: addr, pmdp: pmd);
2317	orig_pmd = pmd_mkold(pmd: orig_pmd);
2318	orig_pmd = pmd_mkclean(pmd: orig_pmd);
2319
2320	set_pmd_at(mm, addr, pmdp: pmd, pmd: orig_pmd);
2321	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2322	}
2323
2324	folio_mark_lazyfree(folio);
2325	ret = true;
2326	out:
2327	spin_unlock(lock: ptl);
2328	out_unlocked:
2329	return ret;
2330	}
2331
2332	static inline void zap_deposited_table(struct mm_struct mm, pmd_t pmd)
2333	{
2334	pgtable_t pgtable;
2335
2336	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
2337	pte_free(mm, pte_page: pgtable);
2338	mm_dec_nr_ptes(mm);
2339	}
2340
2341	int zap_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
2342	pmd_t pmd, unsigned* long addr)
2343	{
2344	pmd_t orig_pmd;
2345	spinlock_t *ptl;
2346
2347	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2348
2349	ptl = __pmd_trans_huge_lock(pmd, vma);
2350	if (!ptl)
2351	return `0`;
2352	/*
2353	* For architectures like ppc64 we look at deposited pgtable
2354	* when calling pmdp_huge_get_and_clear. So do the
2355	* pgtable_trans_huge_withdraw after finishing pmdp related
2356	* operations.
2357	*/
2358	orig_pmd = pmdp_huge_get_and_clear_full(vma, address: addr, pmdp: pmd,
2359	full: tlb->fullmm);
2360	arch_check_zapped_pmd(vma, pmd: orig_pmd);
2361	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2362	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2363	if (arch_needs_pgtable_deposit())
2364	zap_deposited_table(mm: tlb->mm, pmd);
2365	spin_unlock(lock: ptl);
2366	} else if (is_huge_zero_pmd(pmd: orig_pmd)) {
2367	if (!vma_is_dax(vma) \|\| arch_needs_pgtable_deposit())
2368	zap_deposited_table(mm: tlb->mm, pmd);
2369	spin_unlock(lock: ptl);
2370	} else {
2371	struct folio *folio = NULL;
2372	int flush_needed = `1`;
2373
2374	if (pmd_present(pmd: orig_pmd)) {
2375	struct page *page = pmd_page(orig_pmd);
2376
2377	folio = page_folio(page);
2378	folio_remove_rmap_pmd(folio, page, vma);
2379	WARN_ON_ONCE(folio_mapcount(folio) < `0`);
2380	VM_BUG_ON_PAGE(!PageHead(page), page);
2381	} else if (pmd_is_valid_softleaf(pmd: orig_pmd)) {
2382	const softleaf_t entry = softleaf_from_pmd(pmd: orig_pmd);
2383
2384	folio = softleaf_to_folio(entry);
2385	flush_needed = `0`;
2386
2387	if (!thp_migration_supported())
2388	WARN_ONCE(`1`, "Non present huge pmd without pmd migration enabled!");
2389	}
2390
2391	if (folio_test_anon(folio)) {
2392	zap_deposited_table(mm: tlb->mm, pmd);
2393	add_mm_counter(mm: tlb->mm, member: MM_ANONPAGES, value: -HPAGE_PMD_NR);
2394	} else {
2395	if (arch_needs_pgtable_deposit())
2396	zap_deposited_table(mm: tlb->mm, pmd);
2397	add_mm_counter(mm: tlb->mm, member: mm_counter_file(folio),
2398	value: -HPAGE_PMD_NR);
2399
2400	/*
2401	* Use flush_needed to indicate whether the PMD entry
2402	* is present, instead of checking pmd_present() again.
2403	*/
2404	if (flush_needed && pmd_young(pmd: orig_pmd) &&
2405	likely(vma_has_recency(vma)))
2406	folio_mark_accessed(folio);
2407	}
2408
2409	if (folio_is_device_private(folio)) {
2410	folio_remove_rmap_pmd(folio, &folio->page, vma);
2411	WARN_ON_ONCE(folio_mapcount(folio) < `0`);
2412	folio_put(folio);
2413	}
2414
2415	spin_unlock(lock: ptl);
2416	if (flush_needed)
2417	tlb_remove_page_size(tlb, page: &folio->page, HPAGE_PMD_SIZE);
2418	}
2419	return `1`;
2420	}
2421
2422	#ifndef pmd_move_must_withdraw
2423	static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2424	spinlock_t *old_pmd_ptl,
2425	struct vm_area_struct *vma)
2426	{
2427	/*
2428	* With split pmd lock we also need to move preallocated
2429	* PTE page table if new_pmd is on different PMD page table.
2430	*
2431	* We also don't deposit and withdraw tables for file pages.
2432	*/
2433	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2434	}
2435	#endif
2436
2437	static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2438	{
2439	if (pgtable_supports_soft_dirty()) {
2440	if (unlikely(pmd_is_migration_entry(pmd)))
2441	pmd = pmd_swp_mksoft_dirty(pmd);
2442	else if (pmd_present(pmd))
2443	pmd = pmd_mksoft_dirty(pmd);
2444	}
2445
2446	return pmd;
2447	}
2448
2449	static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
2450	{
2451	if (pmd_none(pmd))
2452	return pmd;
2453	if (pmd_present(pmd))
2454	pmd = pmd_clear_uffd_wp(pmd);
2455	else
2456	pmd = pmd_swp_clear_uffd_wp(pmd);
2457
2458	return pmd;
2459	}
2460
2461	bool move_huge_pmd(struct vm_area_struct vma, unsigned* long old_addr,
2462	unsigned long new_addr, pmd_t old_pmd, pmd_t new_pmd)
2463	{
2464	spinlock_t old_ptl, new_ptl;
2465	pmd_t pmd;
2466	struct mm_struct *mm = vma->vm_mm;
2467	bool force_flush = false;
2468
2469	/*
2470	* The destination pmd shouldn't be established, free_pgtables()
2471	* should have released it; but move_page_tables() might have already
2472	* inserted a page table, if racing against shmem/file collapse.
2473	*/
2474	if (!pmd_none(pmd: *new_pmd)) {
2475	VM_BUG_ON(pmd_trans_huge(*new_pmd));
2476	return false;
2477	}
2478
2479	/*
2480	* We don't have to worry about the ordering of src and dst
2481	* ptlocks because exclusive mmap_lock prevents deadlock.
2482	*/
2483	old_ptl = __pmd_trans_huge_lock(pmd: old_pmd, vma);
2484	if (old_ptl) {
2485	new_ptl = pmd_lockptr(mm, pmd: new_pmd);
2486	if (new_ptl != old_ptl)
2487	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2488	pmd = pmdp_huge_get_and_clear(mm, addr: old_addr, pmdp: old_pmd);
2489	if (pmd_present(pmd))
2490	force_flush = true;
2491	VM_BUG_ON(!pmd_none(*new_pmd));
2492
2493	if (pmd_move_must_withdraw(new_pmd_ptl: new_ptl, old_pmd_ptl: old_ptl, vma)) {
2494	pgtable_t pgtable;
2495	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: old_pmd);
2496	pgtable_trans_huge_deposit(mm, pmdp: new_pmd, pgtable);
2497	}
2498	pmd = move_soft_dirty_pmd(pmd);
2499	if (vma_has_uffd_without_event_remap(vma))
2500	pmd = clear_uffd_wp_pmd(pmd);
2501	set_pmd_at(mm, addr: new_addr, pmdp: new_pmd, pmd);
2502	if (force_flush)
2503	flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2504	if (new_ptl != old_ptl)
2505	spin_unlock(lock: new_ptl);
2506	spin_unlock(lock: old_ptl);
2507	return true;
2508	}
2509	return false;
2510	}
2511
2512	static void change_non_present_huge_pmd(struct mm_struct *mm,
2513	unsigned long addr, pmd_t *pmd, bool uffd_wp,
2514	bool uffd_wp_resolve)
2515	{
2516	softleaf_t entry = softleaf_from_pmd(pmd: *pmd);
2517	const struct folio *folio = softleaf_to_folio(entry);
2518	pmd_t newpmd;
2519
2520	VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
2521	if (softleaf_is_migration_write(entry)) {
2522	/*
2523	* A protection check is difficult so
2524	* just be safe and disable write
2525	*/
2526	if (folio_test_anon(folio))
2527	entry = make_readable_exclusive_migration_entry(offset: swp_offset(entry));
2528	else
2529	entry = make_readable_migration_entry(offset: swp_offset(entry));
2530	newpmd = swp_entry_to_pmd(entry);
2531	if (pmd_swp_soft_dirty(pmd: *pmd))
2532	newpmd = pmd_swp_mksoft_dirty(pmd: newpmd);
2533	} else if (softleaf_is_device_private_write(entry)) {
2534	entry = make_readable_device_private_entry(offset: swp_offset(entry));
2535	newpmd = swp_entry_to_pmd(entry);
2536	} else {
2537	newpmd = *pmd;
2538	}
2539
2540	if (uffd_wp)
2541	newpmd = pmd_swp_mkuffd_wp(pmd: newpmd);
2542	else if (uffd_wp_resolve)
2543	newpmd = pmd_swp_clear_uffd_wp(pmd: newpmd);
2544	if (!pmd_same(pmd_a: *pmd, pmd_b: newpmd))
2545	set_pmd_at(mm, addr, pmdp: pmd, pmd: newpmd);
2546	}
2547
2548	/*
2549	* Returns
2550	* - 0 if PMD could not be locked
2551	* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2552	* or if prot_numa but THP migration is not supported
2553	* - HPAGE_PMD_NR if protections changed and TLB flush necessary
2554	*/
2555	int change_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
2556	pmd_t pmd, unsigned* long addr, pgprot_t newprot,
2557	unsigned long cp_flags)
2558	{
2559	struct mm_struct *mm = vma->vm_mm;
2560	spinlock_t *ptl;
2561	pmd_t oldpmd, entry;
2562	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2563	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2564	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
2565	int ret = `1`;
2566
2567	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2568
2569	if (prot_numa && !thp_migration_supported())
2570	return `1`;
2571
2572	ptl = __pmd_trans_huge_lock(pmd, vma);
2573	if (!ptl)
2574	return `0`;
2575
2576	if (thp_migration_supported() && pmd_is_valid_softleaf(pmd: *pmd)) {
2577	change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
2578	uffd_wp_resolve);
2579	goto unlock;
2580	}
2581
2582	if (prot_numa) {
2583
2584	/*
2585	* Avoid trapping faults against the zero page. The read-only
2586	* data is likely to be read-cached on the local CPU and
2587	* local/remote hits to the zero page are not interesting.
2588	*/
2589	if (is_huge_zero_pmd(pmd: *pmd))
2590	goto unlock;
2591
2592	if (pmd_protnone(pmd: *pmd))
2593	goto unlock;
2594
2595	if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
2596	is_private_single_threaded: vma_is_single_threaded_private(vma)))
2597	goto unlock;
2598	}
2599	/*
2600	* In case prot_numa, we are under mmap_read_lock(mm). It's critical
2601	* to not clear pmd intermittently to avoid race with MADV_DONTNEED
2602	* which is also under mmap_read_lock(mm):
2603	*
2604	* CPU0: CPU1:
2605	* change_huge_pmd(prot_numa=1)
2606	* pmdp_huge_get_and_clear_notify()
2607	* madvise_dontneed()
2608	* zap_pmd_range()
2609	* pmd_trans_huge(*pmd) == 0 (without ptl)
2610	* // skip the pmd
2611	* set_pmd_at();
2612	* // pmd is re-established
2613	*
2614	* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
2615	* which may break userspace.
2616	*
2617	* pmdp_invalidate_ad() is required to make sure we don't miss
2618	* dirty/young flags set by hardware.
2619	*/
2620	oldpmd = pmdp_invalidate_ad(vma, address: addr, pmdp: pmd);
2621
2622	entry = pmd_modify(pmd: oldpmd, newprot);
2623	if (uffd_wp)
2624	entry = pmd_mkuffd_wp(pmd: entry);
2625	else if (uffd_wp_resolve)
2626	/*
2627	* Leave the write bit to be handled by PF interrupt
2628	* handler, then things like COW could be properly
2629	* handled.
2630	*/
2631	entry = pmd_clear_uffd_wp(pmd: entry);
2632
2633	/ See change_pte_range(). /
2634	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(pmd: entry) &&
2635	can_change_pmd_writable(vma, addr, pmd: entry))
2636	entry = pmd_mkwrite(pmd: entry, vma);
2637
2638	ret = HPAGE_PMD_NR;
2639	set_pmd_at(mm, addr, pmdp: pmd, pmd: entry);
2640
2641	if (huge_pmd_needs_flush(oldpmd, newpmd: entry))
2642	tlb_flush_pmd_range(tlb, address: addr, HPAGE_PMD_SIZE);
2643	unlock:
2644	spin_unlock(lock: ptl);
2645	return ret;
2646	}
2647
2648	/*
2649	* Returns:
2650	*
2651	* - 0: if pud leaf changed from under us
2652	* - 1: if pud can be skipped
2653	* - HPAGE_PUD_NR: if pud was successfully processed
2654	*/
2655	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2656	int change_huge_pud(struct mmu_gather tlb, struct* vm_area_struct *vma,
2657	pud_t pudp, unsigned* long addr, pgprot_t newprot,
2658	unsigned long cp_flags)
2659	{
2660	struct mm_struct *mm = vma->vm_mm;
2661	pud_t oldpud, entry;
2662	spinlock_t *ptl;
2663
2664	tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
2665
2666	/ NUMA balancing doesn't apply to dax /
2667	if (cp_flags & MM_CP_PROT_NUMA)
2668	return `1`;
2669
2670	/*
2671	* Huge entries on userfault-wp only works with anonymous, while we
2672	* don't have anonymous PUDs yet.
2673	*/
2674	if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
2675	return `1`;
2676
2677	ptl = __pud_trans_huge_lock(pud: pudp, vma);
2678	if (!ptl)
2679	return `0`;
2680
2681	/*
2682	* Can't clear PUD or it can race with concurrent zapping. See
2683	* change_huge_pmd().
2684	*/
2685	oldpud = pudp_invalidate(vma, address: addr, pudp);
2686	entry = pud_modify(pud: oldpud, newprot);
2687	set_pud_at(mm, addr, pudp, pud: entry);
2688	tlb_flush_pud_range(tlb, address: addr, HPAGE_PUD_SIZE);
2689
2690	spin_unlock(lock: ptl);
2691	return HPAGE_PUD_NR;
2692	}
2693	#endif
2694
2695	#ifdef CONFIG_USERFAULTFD
2696	/*
2697	* The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
2698	* the caller, but it must return after releasing the page_table_lock.
2699	* Just move the page from src_pmd to dst_pmd if possible.
2700	* Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2701	* repeated by the caller, or other errors in case of failure.
2702	*/
2703	int move_pages_huge_pmd(struct mm_struct mm, pmd_t dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2704	struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
2705	unsigned long dst_addr, unsigned long src_addr)
2706	{
2707	pmd_t _dst_pmd, src_pmdval;
2708	struct page *src_page;
2709	struct folio *src_folio;
2710	spinlock_t src_ptl, dst_ptl;
2711	pgtable_t src_pgtable;
2712	struct mmu_notifier_range range;
2713	int err = `0`;
2714
2715	src_pmdval = *src_pmd;
2716	src_ptl = pmd_lockptr(mm, pmd: src_pmd);
2717
2718	lockdep_assert_held(src_ptl);
2719	vma_assert_locked(vma: src_vma);
2720	vma_assert_locked(vma: dst_vma);
2721
2722	/ Sanity checks before the operation /
2723	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) \|\| WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) \|\|
2724	WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2725	spin_unlock(lock: src_ptl);
2726	return -EINVAL;
2727	}
2728
2729	if (!pmd_trans_huge(pmd: src_pmdval)) {
2730	spin_unlock(lock: src_ptl);
2731	if (pmd_is_migration_entry(pmd: src_pmdval)) {
2732	pmd_migration_entry_wait(mm, pmd: &src_pmdval);
2733	return -EAGAIN;
2734	}
2735	return -ENOENT;
2736	}
2737
2738	src_page = pmd_page(src_pmdval);
2739
2740	if (!is_huge_zero_pmd(pmd: src_pmdval)) {
2741	if (unlikely(!PageAnonExclusive(src_page))) {
2742	spin_unlock(lock: src_ptl);
2743	return -EBUSY;
2744	}
2745
2746	src_folio = page_folio(src_page);
2747	folio_get(folio: src_folio);
2748	} else
2749	src_folio = NULL;
2750
2751	spin_unlock(lock: src_ptl);
2752
2753	flush_cache_range(vma: src_vma, start: src_addr, end: src_addr + HPAGE_PMD_SIZE);
2754	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm, start: src_addr,
2755	end: src_addr + HPAGE_PMD_SIZE);
2756	mmu_notifier_invalidate_range_start(range: &range);
2757
2758	if (src_folio)
2759	folio_lock(folio: src_folio);
2760
2761	dst_ptl = pmd_lockptr(mm, pmd: dst_pmd);
2762	double_pt_lock(ptl1: src_ptl, ptl2: dst_ptl);
2763	if (unlikely(!pmd_same(*src_pmd, src_pmdval) \|\|
2764	!pmd_same(*dst_pmd, dst_pmdval))) {
2765	err = -EAGAIN;
2766	goto unlock_ptls;
2767	}
2768	if (src_folio) {
2769	if (folio_maybe_dma_pinned(folio: src_folio) \|\|
2770	!PageAnonExclusive(page: &src_folio->page)) {
2771	err = -EBUSY;
2772	goto unlock_ptls;
2773	}
2774
2775	if (WARN_ON_ONCE(!folio_test_head(src_folio)) \|\|
2776	WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2777	err = -EBUSY;
2778	goto unlock_ptls;
2779	}
2780
2781	src_pmdval = pmdp_huge_clear_flush(vma: src_vma, address: src_addr, pmdp: src_pmd);
2782	/ Folio got pinned from under us. Put it back and fail the move. /
2783	if (folio_maybe_dma_pinned(folio: src_folio)) {
2784	set_pmd_at(mm, addr: src_addr, pmdp: src_pmd, pmd: src_pmdval);
2785	err = -EBUSY;
2786	goto unlock_ptls;
2787	}
2788
2789	folio_move_anon_rmap(src_folio, dst_vma);
2790	src_folio->index = linear_page_index(vma: dst_vma, address: dst_addr);
2791
2792	_dst_pmd = folio_mk_pmd(folio: src_folio, pgprot: dst_vma->vm_page_prot);
2793	/ Follow mremap() behavior and treat the entry dirty after the move /
2794	_dst_pmd = pmd_mkwrite(pmd: pmd_mkdirty(pmd: _dst_pmd), vma: dst_vma);
2795	} else {
2796	src_pmdval = pmdp_huge_clear_flush(vma: src_vma, address: src_addr, pmdp: src_pmd);
2797	_dst_pmd = folio_mk_pmd(folio: src_folio, pgprot: dst_vma->vm_page_prot);
2798	}
2799	set_pmd_at(mm, addr: dst_addr, pmdp: dst_pmd, pmd: _dst_pmd);
2800
2801	src_pgtable = pgtable_trans_huge_withdraw(mm, pmdp: src_pmd);
2802	pgtable_trans_huge_deposit(mm, pmdp: dst_pmd, pgtable: src_pgtable);
2803	unlock_ptls:
2804	double_pt_unlock(ptl1: src_ptl, ptl2: dst_ptl);
2805	/ unblock rmap walks /
2806	if (src_folio)
2807	folio_unlock(folio: src_folio);
2808	mmu_notifier_invalidate_range_end(range: &range);
2809	if (src_folio)
2810	folio_put(folio: src_folio);
2811	return err;
2812	}
2813	#endif /* CONFIG_USERFAULTFD */
2814
2815	/*
2816	* Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2817	*
2818	* Note that if it returns page table lock pointer, this routine returns without
2819	* unlocking page table lock. So callers must unlock it.
2820	*/
2821	spinlock_t __pmd_trans_huge_lock(pmd_t pmd, struct vm_area_struct *vma)
2822	{
2823	spinlock_t *ptl;
2824
2825	ptl = pmd_lock(mm: vma->vm_mm, pmd);
2826	if (likely(pmd_is_huge(*pmd)))
2827	return ptl;
2828	spin_unlock(lock: ptl);
2829	return NULL;
2830	}
2831
2832	/*
2833	* Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2834	*
2835	* Note that if it returns page table lock pointer, this routine returns without
2836	* unlocking page table lock. So callers must unlock it.
2837	*/
2838	spinlock_t __pud_trans_huge_lock(pud_t pud, struct vm_area_struct *vma)
2839	{
2840	spinlock_t *ptl;
2841
2842	ptl = pud_lock(mm: vma->vm_mm, pud);
2843	if (likely(pud_trans_huge(*pud)))
2844	return ptl;
2845	spin_unlock(lock: ptl);
2846	return NULL;
2847	}
2848
2849	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2850	int zap_huge_pud(struct mmu_gather tlb, struct* vm_area_struct *vma,
2851	pud_t pud, unsigned* long addr)
2852	{
2853	spinlock_t *ptl;
2854	pud_t orig_pud;
2855
2856	ptl = __pud_trans_huge_lock(pud, vma);
2857	if (!ptl)
2858	return `0`;
2859
2860	orig_pud = pudp_huge_get_and_clear_full(vma, address: addr, pudp: pud, full: tlb->fullmm);
2861	arch_check_zapped_pud(vma, pud: orig_pud);
2862	tlb_remove_pud_tlb_entry(tlb, pud, addr);
2863	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2864	spin_unlock(lock: ptl);
2865	/ No zero page support yet /
2866	} else {
2867	struct page *page = NULL;
2868	struct folio *folio;
2869
2870	/ No support for anonymous PUD pages or migration yet /
2871	VM_WARN_ON_ONCE(vma_is_anonymous(vma) \|\|
2872	!pud_present(orig_pud));
2873
2874	page = pud_page(orig_pud);
2875	folio = page_folio(page);
2876	folio_remove_rmap_pud(folio, page, vma);
2877	add_mm_counter(mm: tlb->mm, member: mm_counter_file(folio), value: -HPAGE_PUD_NR);
2878
2879	spin_unlock(lock: ptl);
2880	tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
2881	}
2882	return `1`;
2883	}
2884
2885	static void __split_huge_pud_locked(struct vm_area_struct vma, pud_t pud,
2886	unsigned long haddr)
2887	{
2888	struct folio *folio;
2889	struct page *page;
2890	pud_t old_pud;
2891
2892	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2893	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2894	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2895	VM_BUG_ON(!pud_trans_huge(*pud));
2896
2897	count_vm_event(item: THP_SPLIT_PUD);
2898
2899	old_pud = pudp_huge_clear_flush(vma, address: haddr, pudp: pud);
2900
2901	if (!vma_is_dax(vma))
2902	return;
2903
2904	page = pud_page(old_pud);
2905	folio = page_folio(page);
2906
2907	if (!folio_test_dirty(folio) && pud_dirty(pud: old_pud))
2908	folio_mark_dirty(folio);
2909	if (!folio_test_referenced(folio) && pud_young(pud: old_pud))
2910	folio_set_referenced(folio);
2911	folio_remove_rmap_pud(folio, page, vma);
2912	folio_put(folio);
2913	add_mm_counter(mm: vma->vm_mm, member: mm_counter_file(folio),
2914	value: -HPAGE_PUD_NR);
2915	}
2916
2917	void __split_huge_pud(struct vm_area_struct vma, pud_t pud,
2918	unsigned long address)
2919	{
2920	spinlock_t *ptl;
2921	struct mmu_notifier_range range;
2922
2923	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2924	start: address & HPAGE_PUD_MASK,
2925	end: (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2926	mmu_notifier_invalidate_range_start(range: &range);
2927	ptl = pud_lock(mm: vma->vm_mm, pud);
2928	if (unlikely(!pud_trans_huge(*pud)))
2929	goto out;
2930	__split_huge_pud_locked(vma, pud, haddr: range.start);
2931
2932	out:
2933	spin_unlock(lock: ptl);
2934	mmu_notifier_invalidate_range_end(range: &range);
2935	}
2936	#else
2937	void __split_huge_pud(struct vm_area_struct vma, pud_t pud,
2938	unsigned long address)
2939	{
2940	}
2941	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2942
2943	static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2944	unsigned long haddr, pmd_t *pmd)
2945	{
2946	struct mm_struct *mm = vma->vm_mm;
2947	pgtable_t pgtable;
2948	pmd_t _pmd, old_pmd;
2949	unsigned long addr;
2950	pte_t *pte;
2951	int i;
2952
2953	/*
2954	* Leave pmd empty until pte is filled note that it is fine to delay
2955	* notification until mmu_notifier_invalidate_range_end() as we are
2956	* replacing a zero pmd write protected page with a zero pte write
2957	* protected page.
2958	*
2959	* See Documentation/mm/mmu_notifier.rst
2960	*/
2961	old_pmd = pmdp_huge_clear_flush(vma, address: haddr, pmdp: pmd);
2962
2963	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
2964	pmd_populate(mm, pmd: &_pmd, pte: pgtable);
2965
2966	pte = pte_offset_map(pmd: &_pmd, addr: haddr);
2967	VM_BUG_ON(!pte);
2968	for (i = `0`, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2969	pte_t entry;
2970
2971	entry = pfn_pte(page_nr: my_zero_pfn(addr), pgprot: vma->vm_page_prot);
2972	entry = pte_mkspecial(pte: entry);
2973	if (pmd_uffd_wp(pmd: old_pmd))
2974	entry = pte_mkuffd_wp(pte: entry);
2975	VM_BUG_ON(!pte_none(ptep_get(pte)));
2976	set_pte_at(mm, addr, pte, entry);
2977	pte++;
2978	}
2979	pte_unmap(pte: pte - `1`);
2980	smp_wmb(); / make pte visible before pmd /
2981	pmd_populate(mm, pmd, pte: pgtable);
2982	}
2983
2984	static void __split_huge_pmd_locked(struct vm_area_struct vma, pmd_t pmd,
2985	unsigned long haddr, bool freeze)
2986	{
2987	struct mm_struct *mm = vma->vm_mm;
2988	struct folio *folio;
2989	struct page *page;
2990	pgtable_t pgtable;
2991	pmd_t old_pmd, _pmd;
2992	bool soft_dirty, uffd_wp = false, young = false, write = false;
2993	bool anon_exclusive = false, dirty = false;
2994	unsigned long addr;
2995	pte_t *pte;
2996	int i;
2997
2998	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2999	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
3000	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
3001
3002	VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd) && !pmd_trans_huge(pmd));
3003
3004	count_vm_event(item: THP_SPLIT_PMD);
3005
3006	if (!vma_is_anonymous(vma)) {
3007	old_pmd = pmdp_huge_clear_flush(vma, address: haddr, pmdp: pmd);
3008	/*
3009	* We are going to unmap this huge page. So
3010	* just go ahead and zap it
3011	*/
3012	if (arch_needs_pgtable_deposit())
3013	zap_deposited_table(mm, pmd);
3014	if (!vma_is_dax(vma) && vma_is_special_huge(vma))
3015	return;
3016	if (unlikely(pmd_is_migration_entry(old_pmd))) {
3017	const softleaf_t old_entry = softleaf_from_pmd(pmd: old_pmd);
3018
3019	folio = softleaf_to_folio(entry: old_entry);
3020	} else if (is_huge_zero_pmd(pmd: old_pmd)) {
3021	return;
3022	} else {
3023	page = pmd_page(old_pmd);
3024	folio = page_folio(page);
3025	if (!folio_test_dirty(folio) && pmd_dirty(pmd: old_pmd))
3026	folio_mark_dirty(folio);
3027	if (!folio_test_referenced(folio) && pmd_young(pmd: old_pmd))
3028	folio_set_referenced(folio);
3029	folio_remove_rmap_pmd(folio, page, vma);
3030	folio_put(folio);
3031	}
3032	add_mm_counter(mm, member: mm_counter_file(folio), value: -HPAGE_PMD_NR);
3033	return;
3034	}
3035
3036	if (is_huge_zero_pmd(pmd: *pmd)) {
3037	/*
3038	* FIXME: Do we want to invalidate secondary mmu by calling
3039	* mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
3040	* inside __split_huge_pmd() ?
3041	*
3042	* We are going from a zero huge page write protected to zero
3043	* small page also write protected so it does not seems useful
3044	* to invalidate secondary mmu at this time.
3045	*/
3046	return __split_huge_zero_page_pmd(vma, haddr, pmd);
3047	}
3048
3049	if (pmd_is_migration_entry(pmd: *pmd)) {
3050	softleaf_t entry;
3051
3052	old_pmd = *pmd;
3053	entry = softleaf_from_pmd(pmd: old_pmd);
3054	page = softleaf_to_page(entry);
3055	folio = page_folio(page);
3056
3057	soft_dirty = pmd_swp_soft_dirty(pmd: old_pmd);
3058	uffd_wp = pmd_swp_uffd_wp(pmd: old_pmd);
3059
3060	write = softleaf_is_migration_write(entry);
3061	if (PageAnon(page))
3062	anon_exclusive = softleaf_is_migration_read_exclusive(entry);
3063	young = softleaf_is_migration_young(entry);
3064	dirty = softleaf_is_migration_dirty(entry);
3065	} else if (pmd_is_device_private_entry(pmd: *pmd)) {
3066	softleaf_t entry;
3067
3068	old_pmd = *pmd;
3069	entry = softleaf_from_pmd(pmd: old_pmd);
3070	page = softleaf_to_page(entry);
3071	folio = page_folio(page);
3072
3073	soft_dirty = pmd_swp_soft_dirty(pmd: old_pmd);
3074	uffd_wp = pmd_swp_uffd_wp(pmd: old_pmd);
3075
3076	write = softleaf_is_device_private_write(entry);
3077	anon_exclusive = PageAnonExclusive(page);
3078
3079	/*
3080	* Device private THP should be treated the same as regular
3081	* folios w.r.t anon exclusive handling. See the comments for
3082	* folio handling and anon_exclusive below.
3083	*/
3084	if (freeze && anon_exclusive &&
3085	folio_try_share_anon_rmap_pmd(folio, page))
3086	freeze = false;
3087	if (!freeze) {
3088	rmap_t rmap_flags = RMAP_NONE;
3089
3090	folio_ref_add(folio, HPAGE_PMD_NR - `1`);
3091	if (anon_exclusive)
3092	rmap_flags \|= RMAP_EXCLUSIVE;
3093
3094	folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
3095	vma, address: haddr, flags: rmap_flags);
3096	}
3097	} else {
3098	/*
3099	* Up to this point the pmd is present and huge and userland has
3100	* the whole access to the hugepage during the split (which
3101	* happens in place). If we overwrite the pmd with the not-huge
3102	* version pointing to the pte here (which of course we could if
3103	* all CPUs were bug free), userland could trigger a small page
3104	* size TLB miss on the small sized TLB while the hugepage TLB
3105	* entry is still established in the huge TLB. Some CPU doesn't
3106	* like that. See
3107	* http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
3108	* 383 on page 105. Intel should be safe but is also warns that
3109	* it's only safe if the permission and cache attributes of the
3110	* two entries loaded in the two TLB is identical (which should
3111	* be the case here). But it is generally safer to never allow
3112	* small and huge TLB entries for the same virtual address to be
3113	* loaded simultaneously. So instead of doing "pmd_populate();
3114	* flush_pmd_tlb_range();" we first mark the current pmd
3115	* notpresent (atomically because here the pmd_trans_huge must
3116	* remain set at all times on the pmd until the split is
3117	* complete for this pmd), then we flush the SMP TLB and finally
3118	* we write the non-huge version of the pmd entry with
3119	* pmd_populate.
3120	*/
3121	old_pmd = pmdp_invalidate(vma, address: haddr, pmdp: pmd);
3122	page = pmd_page(old_pmd);
3123	folio = page_folio(page);
3124	if (pmd_dirty(pmd: old_pmd)) {
3125	dirty = true;
3126	folio_set_dirty(folio);
3127	}
3128	write = pmd_write(pmd: old_pmd);
3129	young = pmd_young(pmd: old_pmd);
3130	soft_dirty = pmd_soft_dirty(pmd: old_pmd);
3131	uffd_wp = pmd_uffd_wp(pmd: old_pmd);
3132
3133	VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
3134	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3135
3136	/*
3137	* Without "freeze", we'll simply split the PMD, propagating the
3138	* PageAnonExclusive() flag for each PTE by setting it for
3139	* each subpage -- no need to (temporarily) clear.
3140	*
3141	* With "freeze" we want to replace mapped pages by
3142	* migration entries right away. This is only possible if we
3143	* managed to clear PageAnonExclusive() -- see
3144	* set_pmd_migration_entry().
3145	*
3146	* In case we cannot clear PageAnonExclusive(), split the PMD
3147	* only and let try_to_migrate_one() fail later.
3148	*
3149	* See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
3150	*/
3151	anon_exclusive = PageAnonExclusive(page);
3152	if (freeze && anon_exclusive &&
3153	folio_try_share_anon_rmap_pmd(folio, page))
3154	freeze = false;
3155	if (!freeze) {
3156	rmap_t rmap_flags = RMAP_NONE;
3157
3158	folio_ref_add(folio, HPAGE_PMD_NR - `1`);
3159	if (anon_exclusive)
3160	rmap_flags \|= RMAP_EXCLUSIVE;
3161	folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
3162	vma, address: haddr, flags: rmap_flags);
3163	}
3164	}
3165
3166	/*
3167	* Withdraw the table only after we mark the pmd entry invalid.
3168	* This's critical for some architectures (Power).
3169	*/
3170	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
3171	pmd_populate(mm, pmd: &_pmd, pte: pgtable);
3172
3173	pte = pte_offset_map(pmd: &_pmd, addr: haddr);
3174	VM_BUG_ON(!pte);
3175
3176	/*
3177	* Note that NUMA hinting access restrictions are not transferred to
3178	* avoid any possibility of altering permissions across VMAs.
3179	*/
3180	if (freeze \|\| pmd_is_migration_entry(pmd: old_pmd)) {
3181	pte_t entry;
3182	swp_entry_t swp_entry;
3183
3184	for (i = `0`, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
3185	if (write)
3186	swp_entry = make_writable_migration_entry(
3187	page_to_pfn(page + i));
3188	else if (anon_exclusive)
3189	swp_entry = make_readable_exclusive_migration_entry(
3190	page_to_pfn(page + i));
3191	else
3192	swp_entry = make_readable_migration_entry(
3193	page_to_pfn(page + i));
3194	if (young)
3195	swp_entry = make_migration_entry_young(entry: swp_entry);
3196	if (dirty)
3197	swp_entry = make_migration_entry_dirty(entry: swp_entry);
3198	entry = swp_entry_to_pte(entry: swp_entry);
3199	if (soft_dirty)
3200	entry = pte_swp_mksoft_dirty(pte: entry);
3201	if (uffd_wp)
3202	entry = pte_swp_mkuffd_wp(pte: entry);
3203	VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3204	set_pte_at(mm, addr, pte + i, entry);
3205	}
3206	} else if (pmd_is_device_private_entry(pmd: old_pmd)) {
3207	pte_t entry;
3208	swp_entry_t swp_entry;
3209
3210	for (i = `0`, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
3211	/*
3212	* anon_exclusive was already propagated to the relevant
3213	* pages corresponding to the pte entries when freeze
3214	* is false.
3215	*/
3216	if (write)
3217	swp_entry = make_writable_device_private_entry(
3218	page_to_pfn(page + i));
3219	else
3220	swp_entry = make_readable_device_private_entry(
3221	page_to_pfn(page + i));
3222	/*
3223	* Young and dirty bits are not progated via swp_entry
3224	*/
3225	entry = swp_entry_to_pte(entry: swp_entry);
3226	if (soft_dirty)
3227	entry = pte_swp_mksoft_dirty(pte: entry);
3228	if (uffd_wp)
3229	entry = pte_swp_mkuffd_wp(pte: entry);
3230	VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3231	set_pte_at(mm, addr, pte + i, entry);
3232	}
3233	} else {
3234	pte_t entry;
3235
3236	entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
3237	if (write)
3238	entry = pte_mkwrite(pte: entry, vma);
3239	if (!young)
3240	entry = pte_mkold(pte: entry);
3241	/ NOTE: this may set soft-dirty too on some archs /
3242	if (dirty)
3243	entry = pte_mkdirty(pte: entry);
3244	if (soft_dirty)
3245	entry = pte_mksoft_dirty(pte: entry);
3246	if (uffd_wp)
3247	entry = pte_mkuffd_wp(pte: entry);
3248
3249	for (i = `0`; i < HPAGE_PMD_NR; i++)
3250	VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3251
3252	set_ptes(mm, addr: haddr, ptep: pte, pte: entry, HPAGE_PMD_NR);
3253	}
3254	pte_unmap(pte);
3255
3256	if (!pmd_is_migration_entry(pmd: *pmd))
3257	folio_remove_rmap_pmd(folio, page, vma);
3258	if (freeze)
3259	put_page(page);
3260
3261	smp_wmb(); / make pte visible before pmd /
3262	pmd_populate(mm, pmd, pte: pgtable);
3263	}
3264
3265	void split_huge_pmd_locked(struct vm_area_struct vma, unsigned* long address,
3266	pmd_t *pmd, bool freeze)
3267	{
3268	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
3269	if (pmd_trans_huge(pmd: pmd) \|\| pmd_is_valid_softleaf(pmd: pmd))
3270	__split_huge_pmd_locked(vma, pmd, haddr: address, freeze);
3271	}
3272
3273	void __split_huge_pmd(struct vm_area_struct vma, pmd_t pmd,
3274	unsigned long address, bool freeze)
3275	{
3276	spinlock_t *ptl;
3277	struct mmu_notifier_range range;
3278
3279	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
3280	start: address & HPAGE_PMD_MASK,
3281	end: (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
3282	mmu_notifier_invalidate_range_start(range: &range);
3283	ptl = pmd_lock(mm: vma->vm_mm, pmd);
3284	split_huge_pmd_locked(vma, address: range.start, pmd, freeze);
3285	spin_unlock(lock: ptl);
3286	mmu_notifier_invalidate_range_end(range: &range);
3287	}
3288
3289	void split_huge_pmd_address(struct vm_area_struct vma, unsigned* long address,
3290	bool freeze)
3291	{
3292	pmd_t *pmd = mm_find_pmd(mm: vma->vm_mm, address);
3293
3294	if (!pmd)
3295	return;
3296
3297	__split_huge_pmd(vma, pmd, address, freeze);
3298	}
3299
3300	static inline void split_huge_pmd_if_needed(struct vm_area_struct vma, unsigned* long address)
3301	{
3302	/*
3303	* If the new address isn't hpage aligned and it could previously
3304	* contain an hugepage: check if we need to split an huge pmd.
3305	*/
3306	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
3307	range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
3308	ALIGN(address, HPAGE_PMD_SIZE)))
3309	split_huge_pmd_address(vma, address, freeze: false);
3310	}
3311
/*
 * vma_adjust_trans_huge() - split huge PMDs that straddle new VMA boundaries
 * @vma: the VMA being adjusted
 * @start: new start address of @vma
 * @end: new end address of @vma
 * @next: the following VMA whose start moves to @end, or NULL
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
			   unsigned long start,
			   unsigned long end,
			   struct vm_area_struct *next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/* If we're incrementing next->vm_start, we might need to split it. */
	if (next)
		split_huge_pmd_if_needed(next, end);
}
3327
3328	static void unmap_folio(struct folio *folio)
3329	{
3330	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED \| TTU_SYNC \|
3331	TTU_BATCH_FLUSH;
3332
3333	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3334
3335	if (folio_test_pmd_mappable(folio))
3336	ttu_flags \|= TTU_SPLIT_HUGE_PMD;
3337
3338	/*
3339	* Anon pages need migration entries to preserve them, but file
3340	* pages can simply be left unmapped, then faulted back on demand.
3341	* If that is ever changed (perhaps for mlock), update remap_page().
3342	*/
3343	if (folio_test_anon(folio))
3344	try_to_migrate(folio, flags: ttu_flags);
3345	else
3346	try_to_unmap(folio, flags: ttu_flags \| TTU_IGNORE_MLOCK);
3347
3348	try_to_unmap_flush();
3349	}
3350
3351	static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
3352	unsigned long addr, pmd_t *pmdp,
3353	struct folio *folio)
3354	{
3355	struct mm_struct *mm = vma->vm_mm;
3356	int ref_count, map_count;
3357	pmd_t orig_pmd = *pmdp;
3358
3359	if (pmd_dirty(pmd: orig_pmd))
3360	folio_set_dirty(folio);
3361	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3362	folio_set_swapbacked(folio);
3363	return false;
3364	}
3365
3366	orig_pmd = pmdp_huge_clear_flush(vma, address: addr, pmdp);
3367
3368	/*
3369	* Syncing against concurrent GUP-fast:
3370	* - clear PMD; barrier; read refcount
3371	* - inc refcount; barrier; read PMD
3372	*/
3373	smp_mb();
3374
3375	ref_count = folio_ref_count(folio);
3376	map_count = folio_mapcount(folio);
3377
3378	/*
3379	* Order reads for folio refcount and dirty flag
3380	* (see comments in __remove_mapping()).
3381	*/
3382	smp_rmb();
3383
3384	/*
3385	* If the folio or its PMD is redirtied at this point, or if there
3386	* are unexpected references, we will give up to discard this folio
3387	* and remap it.
3388	*
3389	* The only folio refs must be one from isolation plus the rmap(s).
3390	*/
3391	if (pmd_dirty(pmd: orig_pmd))
3392	folio_set_dirty(folio);
3393	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3394	folio_set_swapbacked(folio);
3395	set_pmd_at(mm, addr, pmdp, pmd: orig_pmd);
3396	return false;
3397	}
3398
3399	if (ref_count != map_count + `1`) {
3400	set_pmd_at(mm, addr, pmdp, pmd: orig_pmd);
3401	return false;
3402	}
3403
3404	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
3405	zap_deposited_table(mm, pmd: pmdp);
3406	add_mm_counter(mm, member: MM_ANONPAGES, value: -HPAGE_PMD_NR);
3407	if (vma->vm_flags & VM_LOCKED)
3408	mlock_drain_local();
3409	folio_put(folio);
3410
3411	return true;
3412	}
3413
3414	bool unmap_huge_pmd_locked(struct vm_area_struct vma, unsigned* long addr,
3415	pmd_t pmdp, struct* folio *folio)
3416	{
3417	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
3418	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3419	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3420	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
3421	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
3422
3423	return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
3424	}
3425
3426	static void remap_page(struct folio folio, unsigned* long nr, int flags)
3427	{
3428	int i = `0`;
3429
3430	/ If unmap_folio() uses try_to_migrate() on file, remove this check /
3431	if (!folio_test_anon(folio))
3432	return;
3433	for (;;) {
3434	remove_migration_ptes(src: folio, dst: folio, flags: RMP_LOCKED \| flags);
3435	i += folio_nr_pages(folio);
3436	if (i >= nr)
3437	break;
3438	folio = folio_next(folio);
3439	}
3440	}
3441
3442	static void lru_add_split_folio(struct folio folio, struct* folio *new_folio,
3443	struct lruvec lruvec, struct* list_head *list)
3444	{
3445	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
3446	lockdep_assert_held(&lruvec->lru_lock);
3447
3448	if (folio_is_device_private(folio))
3449	return;
3450
3451	if (list) {
3452	/ page reclaim is reclaiming a huge page /
3453	VM_WARN_ON(folio_test_lru(folio));
3454	folio_get(folio: new_folio);
3455	list_add_tail(new: &new_folio->lru, head: list);
3456	} else {
3457	/ head is still on lru (and we have it frozen) /
3458	VM_WARN_ON(!folio_test_lru(folio));
3459	if (folio_test_unevictable(folio))
3460	new_folio->mlock_count = `0`;
3461	else
3462	list_add_tail(new: &new_folio->lru, head: &folio->lru);
3463	folio_set_lru(folio: new_folio);
3464	}
3465	}
3466
3467	static bool page_range_has_hwpoisoned(struct page page, long* nr_pages)
3468	{
3469	for (; nr_pages; page++, nr_pages--)
3470	if (PageHWPoison(page))
3471	return true;
3472	return false;
3473	}
3474
3475	/*
3476	* It splits @folio into @new_order folios and copies the @folio metadata to
3477	* all the resulting folios.
3478	*/
3479	static void __split_folio_to_order(struct folio folio, int* old_order,
3480	int new_order)
3481	{
3482	/ Scan poisoned pages when split a poisoned folio to large folios /
3483	const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
3484	long new_nr_pages = `1` << new_order;
3485	long nr_pages = `1` << old_order;
3486	long i;
3487
3488	folio_clear_has_hwpoisoned(folio);
3489
3490	/ Check first new_nr_pages since the loop below skips them /
3491	if (handle_hwpoison &&
3492	page_range_has_hwpoisoned(folio_page(folio, `0`), nr_pages: new_nr_pages))
3493	folio_set_has_hwpoisoned(folio);
3494	/*
3495	* Skip the first new_nr_pages, since the new folio from them have all
3496	* the flags from the original folio.
3497	*/
3498	for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
3499	struct page *new_head = &folio->page + i;
3500	/*
3501	* Careful: new_folio is not a "real" folio before we cleared PageTail.
3502	* Don't pass it around before clear_compound_head().
3503	*/
3504	struct folio new_folio = (struct* folio *)new_head;
3505
3506	VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -`1`, new_head);
3507
3508	/*
3509	* Clone page flags before unfreezing refcount.
3510	*
3511	* After successful get_page_unless_zero() might follow flags change,
3512	* for example lock_page() which set PG_waiters.
3513	*
3514	* Note that for mapped sub-pages of an anonymous THP,
3515	* PG_anon_exclusive has been cleared in unmap_folio() and is stored in
3516	* the migration entry instead from where remap_page() will restore it.
3517	* We can still have PG_anon_exclusive set on effectively unmapped and
3518	* unreferenced sub-pages of an anonymous THP: we can simply drop
3519	* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
3520	*/
3521	new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
3522	new_folio->flags.f \|= (folio->flags.f &
3523	((`1L` << PG_referenced) \|
3524	(`1L` << PG_swapbacked) \|
3525	(`1L` << PG_swapcache) \|
3526	(`1L` << PG_mlocked) \|
3527	(`1L` << PG_uptodate) \|
3528	(`1L` << PG_active) \|
3529	(`1L` << PG_workingset) \|
3530	(`1L` << PG_locked) \|
3531	(`1L` << PG_unevictable) \|
3532	#ifdef CONFIG_ARCH_USES_PG_ARCH_2
3533	(`1L` << PG_arch_2) \|
3534	#endif
3535	#ifdef CONFIG_ARCH_USES_PG_ARCH_3
3536	(`1L` << PG_arch_3) \|
3537	#endif
3538	(`1L` << PG_dirty) \|
3539	LRU_GEN_MASK \| LRU_REFS_MASK));
3540
3541	if (handle_hwpoison &&
3542	page_range_has_hwpoisoned(page: new_head, nr_pages: new_nr_pages))
3543	folio_set_has_hwpoisoned(folio: new_folio);
3544
3545	new_folio->mapping = folio->mapping;
3546	new_folio->index = folio->index + i;
3547
3548	if (folio_test_swapcache(folio))
3549	new_folio->swap.val = folio->swap.val + i;
3550
3551	/ Page flags must be visible before we make the page non-compound. /
3552	smp_wmb();
3553
3554	/*
3555	* Clear PageTail before unfreezing page refcount.
3556	*
3557	* After successful get_page_unless_zero() might follow put_page()
3558	* which needs correct compound_head().
3559	*/
3560	clear_compound_head(page: new_head);
3561	if (new_order) {
3562	prep_compound_page(page: new_head, order: new_order);
3563	folio_set_large_rmappable(folio: new_folio);
3564	}
3565
3566	if (folio_test_young(folio))
3567	folio_set_young(folio: new_folio);
3568	if (folio_test_idle(folio))
3569	folio_set_idle(folio: new_folio);
3570	#ifdef CONFIG_MEMCG
3571	new_folio->memcg_data = folio->memcg_data;
3572	#endif
3573
3574	folio_xchg_last_cpupid(folio: new_folio, cpupid: folio_last_cpupid(folio));
3575	}
3576
3577	if (new_order)
3578	folio_set_order(folio, order: new_order);
3579	else
3580	ClearPageCompound(page: &folio->page);
3581	}
3582
3583	/**
3584	* __split_unmapped_folio() - splits an unmapped @folio to lower order folios in
3585	* two ways: uniform split or non-uniform split.
3586	* @folio: the to-be-split folio
3587	* @new_order: the smallest order of the after split folios (since buddy
3588	* allocator like split generates folios with orders from @folio's
3589	* order - 1 to new_order).
3590	* @split_at: in buddy allocator like split, the folio containing @split_at
3591	* will be split until its order becomes @new_order.
3592	* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
3593	* @mapping: @folio->mapping
3594	* @split_type: if the split is uniform or not (buddy allocator like split)
3595	*
3596	*
3597	* 1. uniform split: the given @folio into multiple @new_order small folios,
3598	* where all small folios have the same order. This is done when
3599	* split_type is SPLIT_TYPE_UNIFORM.
3600	* 2. buddy allocator like (non-uniform) split: the given @folio is split into
3601	* half and one of the half (containing the given page) is split into half
3602	* until the given @folio's order becomes @new_order. This is done when
3603	* split_type is SPLIT_TYPE_NON_UNIFORM.
3604	*
3605	* The high level flow for these two methods are:
3606	*
3607	* 1. uniform split: @xas is split with no expectation of failure and a single
3608	* __split_folio_to_order() is called to split the @folio into @new_order
3609	* along with stats update.
3610	* 2. non-uniform split: folio_order - @new_order calls to
3611	* __split_folio_to_order() are expected to be made in a for loop to split
3612	* the @folio to one lower order at a time. The folio containing @split_at
3613	* is split in each iteration. @xas is split into half in each iteration and
3614	* can fail. A failed @xas split leaves split folios as is without merging
3615	* them back.
3616	*
3617	* After splitting, the caller's folio reference will be transferred to the
3618	* folio containing @split_at. The caller needs to unlock and/or free
3619	* after-split folios if necessary.
3620	*
3621	* Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
3622	* split but not to @new_order, the caller needs to check)
3623	*/
3624	static int __split_unmapped_folio(struct folio folio, int* new_order,
3625	struct page split_at, struct* xa_state *xas,
3626	struct address_space mapping, enum* split_type split_type)
3627	{
3628	const bool is_anon = folio_test_anon(folio);
3629	int old_order = folio_order(folio);
3630	int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - `1`;
3631	int split_order;
3632
3633	/*
3634	* split to new_order one order at a time. For uniform split,
3635	* folio is split to new_order directly.
3636	*/
3637	for (split_order = start_order;
3638	split_order >= new_order;
3639	split_order--) {
3640	int nr_new_folios = `1UL` << (old_order - split_order);
3641
3642	/ order-1 anonymous folio is not supported /
3643	if (is_anon && split_order == `1`)
3644	continue;
3645
3646	if (mapping) {
3647	/*
3648	* uniform split has xas_split_alloc() called before
3649	* irq is disabled to allocate enough memory, whereas
3650	* non-uniform split can handle ENOMEM.
3651	*/
3652	if (split_type == SPLIT_TYPE_UNIFORM)
3653	xas_split(xas, entry: folio, order: old_order);
3654	else {
3655	xas_set_order(xas, index: folio->index, order: split_order);
3656	xas_try_split(xas, entry: folio, order: old_order);
3657	if (xas_error(xas))
3658	return xas_error(xas);
3659	}
3660	}
3661
3662	folio_split_memcg_refs(folio, old_order, new_order: split_order);
3663	split_page_owner(page: &folio->page, old_order, new_order: split_order);
3664	pgalloc_tag_split(folio, old_order, new_order: split_order);
3665	__split_folio_to_order(folio, old_order, new_order: split_order);
3666
3667	if (is_anon) {
3668	mod_mthp_stat(order: old_order, item: MTHP_STAT_NR_ANON, delta: -`1`);
3669	mod_mthp_stat(order: split_order, item: MTHP_STAT_NR_ANON, delta: nr_new_folios);
3670	}
3671	/*
3672	* If uniform split, the process is complete.
3673	* If non-uniform, continue splitting the folio at @split_at
3674	* as long as the next @split_order is >= @new_order.
3675	*/
3676	folio = page_folio(split_at);
3677	old_order = split_order;
3678	}
3679
3680	return `0`;
3681	}
3682
3683	/**
3684	* folio_check_splittable() - check if a folio can be split to a given order
3685	* @folio: folio to be split
3686	* @new_order: the smallest order of the after split folios (since buddy
3687	* allocator like split generates folios with orders from @folio's
3688	* order - 1 to new_order).
3689	* @split_type: uniform or non-uniform split
3690	*
3691	* folio_check_splittable() checks if @folio can be split to @new_order using
3692	* @split_type method. The truncated folio check must come first.
3693	*
3694	* Context: folio must be locked.
3695	*
3696	* Return: 0 - @folio can be split to @new_order, otherwise an error number is
3697	* returned.
3698	*/
3699	int folio_check_splittable(struct folio folio, unsigned* int new_order,
3700	enum split_type split_type)
3701	{
3702	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3703	/*
3704	* Folios that just got truncated cannot get split. Signal to the
3705	* caller that there was a race.
3706	*
3707	* TODO: this will also currently refuse folios without a mapping in the
3708	* swapcache (shmem or to-be-anon folios).
3709	*/
3710	if (!folio->mapping && !folio_test_anon(folio))
3711	return -EBUSY;
3712
3713	if (folio_test_anon(folio)) {
3714	/ order-1 is not supported for anonymous THP. /
3715	if (new_order == `1`)
3716	return -EINVAL;
3717	} else if (split_type == SPLIT_TYPE_NON_UNIFORM \|\| new_order) {
3718	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3719	!mapping_large_folio_support(mapping: folio->mapping)) {
3720	/*
3721	* We can always split a folio down to a single page
3722	* (new_order == 0) uniformly.
3723	*
3724	* For any other scenario
3725	* a) uniform split targeting a large folio
3726	* (new_order > 0)
3727	* b) any non-uniform split
3728	* we must confirm that the file system supports large
3729	* folios.
3730	*
3731	* Note that we might still have THPs in such
3732	* mappings, which is created from khugepaged when
3733	* CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that
3734	* case, the mapping does not actually support large
3735	* folios properly.
3736	*/
3737	return -EINVAL;
3738	}
3739	}
3740
3741	/*
3742	* swapcache folio could only be split to order 0
3743	*
3744	* non-uniform split creates after-split folios with orders from
3745	* folio_order(folio) - 1 to new_order, making it not suitable for any
3746	* swapcache folio split. Only uniform split to order-0 can be used
3747	* here.
3748	*/
3749	if ((split_type == SPLIT_TYPE_NON_UNIFORM \|\| new_order) && folio_test_swapcache(folio)) {
3750	return -EINVAL;
3751	}
3752
3753	if (is_huge_zero_folio(folio))
3754	return -EINVAL;
3755
3756	if (folio_test_writeback(folio))
3757	return -EBUSY;
3758
3759	return `0`;
3760	}
3761
/*
 * Number of folio references from the pagecache or the swapcache: zero for
 * an anonymous folio outside the swapcache, one per page otherwise.
 */
static unsigned int folio_cache_ref_count(const struct folio *folio)
{
	if (folio_test_anon(folio) && !folio_test_swapcache(folio))
		return 0;
	return folio_nr_pages(folio);
}
3769
3770	static int __folio_freeze_and_split_unmapped(struct folio folio, unsigned* int new_order,
3771	struct page split_at, struct* xa_state *xas,
3772	struct address_space *mapping, bool do_lru,
3773	struct list_head list, enum* split_type split_type,
3774	pgoff_t end, int *nr_shmem_dropped)
3775	{
3776	struct folio *end_folio = folio_next(folio);
3777	struct folio new_folio, next;
3778	int old_order = folio_order(folio);
3779	int ret = `0`;
3780	struct deferred_split *ds_queue;
3781
3782	VM_WARN_ON_ONCE(!mapping && end);
3783	/ Prevent deferred_split_scan() touching ->_refcount /
3784	ds_queue = folio_split_queue_lock(folio);
3785	if (folio_ref_freeze(folio, count: folio_cache_ref_count(folio) + `1`)) {
3786	struct swap_cluster_info *ci = NULL;
3787	struct lruvec *lruvec;
3788
3789	if (old_order > `1`) {
3790	if (!list_empty(head: &folio->_deferred_list)) {
3791	ds_queue->split_queue_len--;
3792	/*
3793	* Reinitialize page_deferred_list after removing the
3794	* page from the split_queue, otherwise a subsequent
3795	* split will see list corruption when checking the
3796	* page_deferred_list.
3797	*/
3798	list_del_init(entry: &folio->_deferred_list);
3799	}
3800	if (folio_test_partially_mapped(folio)) {
3801	folio_clear_partially_mapped(folio);
3802	mod_mthp_stat(order: old_order,
3803	item: MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, delta: -`1`);
3804	}
3805	}
3806	split_queue_unlock(queue: ds_queue);
3807	if (mapping) {
3808	int nr = folio_nr_pages(folio);
3809
3810	if (folio_test_pmd_mappable(folio) &&
3811	new_order < HPAGE_PMD_ORDER) {
3812	if (folio_test_swapbacked(folio)) {
3813	lruvec_stat_mod_folio(folio,
3814	idx: NR_SHMEM_THPS, val: -nr);
3815	} else {
3816	lruvec_stat_mod_folio(folio,
3817	idx: NR_FILE_THPS, val: -nr);
3818	filemap_nr_thps_dec(mapping);
3819	}
3820	}
3821	}
3822
3823	if (folio_test_swapcache(folio)) {
3824	if (mapping) {
3825	VM_WARN_ON_ONCE_FOLIO(mapping, folio);
3826	return -EINVAL;
3827	}
3828
3829	ci = swap_cluster_get_and_lock(folio);
3830	}
3831
3832	/ lock lru list/PageCompound, ref frozen by page_ref_freeze /
3833	if (do_lru)
3834	lruvec = folio_lruvec_lock(folio);
3835
3836	ret = __split_unmapped_folio(folio, new_order, split_at, xas,
3837	mapping, split_type);
3838
3839	/*
3840	* Unfreeze after-split folios and put them back to the right
3841	* list. @folio should be kept frozon until page cache
3842	* entries are updated with all the other after-split folios
3843	* to prevent others seeing stale page cache entries.
3844	* As a result, new_folio starts from the next folio of
3845	* @folio.
3846	*/
3847	for (new_folio = folio_next(folio); new_folio != end_folio;
3848	new_folio = next) {
3849	unsigned long nr_pages = folio_nr_pages(folio: new_folio);
3850
3851	next = folio_next(folio: new_folio);
3852
3853	zone_device_private_split_cb(original_folio: folio, new_folio);
3854
3855	folio_ref_unfreeze(folio: new_folio,
3856	count: folio_cache_ref_count(folio: new_folio) + `1`);
3857
3858	if (do_lru)
3859	lru_add_split_folio(folio, new_folio, lruvec, list);
3860
3861	/*
3862	* Anonymous folio with swap cache.
3863	* NOTE: shmem in swap cache is not supported yet.
3864	*/
3865	if (ci) {
3866	__swap_cache_replace_folio(ci, old: folio, new: new_folio);
3867	continue;
3868	}
3869
3870	/ Anonymous folio without swap cache /
3871	if (!mapping)
3872	continue;
3873
3874	/ Add the new folio to the page cache. /
3875	if (new_folio->index < end) {
3876	__xa_store(&mapping->i_pages, index: new_folio->index,
3877	entry: new_folio, `0`);
3878	continue;
3879	}
3880
3881	VM_WARN_ON_ONCE(!nr_shmem_dropped);
3882	/ Drop folio beyond EOF: ->index >= end /
3883	if (shmem_mapping(mapping) && nr_shmem_dropped)
3884	*nr_shmem_dropped += nr_pages;
3885	else if (folio_test_clear_dirty(folio: new_folio))
3886	folio_account_cleaned(
3887	folio: new_folio, wb: inode_to_wb(inode: mapping->host));
3888	__filemap_remove_folio(folio: new_folio, NULL);
3889	folio_put_refs(folio: new_folio, refs: nr_pages);
3890	}
3891
3892	zone_device_private_split_cb(original_folio: folio, NULL);
3893	/*
3894	* Unfreeze @folio only after all page cache entries, which
3895	* used to point to it, have been updated with new folios.
3896	* Otherwise, a parallel folio_try_get() can grab @folio
3897	* and its caller can see stale page cache entries.
3898	*/
3899	folio_ref_unfreeze(folio, count: folio_cache_ref_count(folio) + `1`);
3900
3901	if (do_lru)
3902	unlock_page_lruvec(lruvec);
3903
3904	if (ci)
3905	swap_cluster_unlock(ci);
3906	} else {
3907	split_queue_unlock(queue: ds_queue);
3908	return -EAGAIN;
3909	}
3910
3911	return ret;
3912	}
3913
3914	/**
3915	* __folio_split() - split a folio at @split_at to a @new_order folio
3916	* @folio: folio to split
3917	* @new_order: the order of the new folio
3918	* @split_at: a page within the new folio
3919	* @lock_at: a page within @folio to be left locked to caller
3920	* @list: after-split folios will be put on it if non NULL
3921	* @split_type: perform uniform split or not (non-uniform split)
3922	*
3923	* It calls __split_unmapped_folio() to perform uniform and non-uniform split.
3924	* It is in charge of checking whether the split is supported or not and
3925	* preparing @folio for __split_unmapped_folio().
3926	*
3927	* After splitting, the after-split folio containing @lock_at remains locked
3928	* and others are unlocked:
3929	* 1. for uniform split, @lock_at points to one of @folio's subpages;
3930	* 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
3931	*
3932	* Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
3933	* split but not to @new_order, the caller needs to check)
3934	*/
3935	static int __folio_split(struct folio folio, unsigned* int new_order,
3936	struct page split_at, struct* page *lock_at,
3937	struct list_head list, enum* split_type split_type)
3938	{
3939	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
3940	struct folio *end_folio = folio_next(folio);
3941	bool is_anon = folio_test_anon(folio);
3942	struct address_space *mapping = NULL;
3943	struct anon_vma *anon_vma = NULL;
3944	int old_order = folio_order(folio);
3945	struct folio new_folio, next;
3946	int nr_shmem_dropped = `0`;
3947	int remap_flags = `0`;
3948	int ret;
3949	pgoff_t end = `0`;
3950
3951	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
3952	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
3953
3954	if (folio != page_folio(split_at) \|\| folio != page_folio(lock_at)) {
3955	ret = -EINVAL;
3956	goto out;
3957	}
3958
3959	if (new_order >= old_order) {
3960	ret = -EINVAL;
3961	goto out;
3962	}
3963
3964	ret = folio_check_splittable(folio, new_order, split_type);
3965	if (ret) {
3966	VM_WARN_ONCE(ret == -EINVAL, "Tried to split an unsplittable folio");
3967	goto out;
3968	}
3969
3970	if (is_anon) {
3971	/*
3972	* The caller does not necessarily hold an mmap_lock that would
3973	* prevent the anon_vma disappearing so we first we take a
3974	* reference to it and then lock the anon_vma for write. This
3975	* is similar to folio_lock_anon_vma_read except the write lock
3976	* is taken to serialise against parallel split or collapse
3977	* operations.
3978	*/
3979	anon_vma = folio_get_anon_vma(folio);
3980	if (!anon_vma) {
3981	ret = -EBUSY;
3982	goto out;
3983	}
3984	anon_vma_lock_write(anon_vma);
3985	mapping = NULL;
3986	} else {
3987	unsigned int min_order;
3988	gfp_t gfp;
3989
3990	mapping = folio->mapping;
3991	min_order = mapping_min_folio_order(mapping: folio->mapping);
3992	if (new_order < min_order) {
3993	ret = -EINVAL;
3994	goto out;
3995	}
3996
3997	gfp = current_gfp_context(flags: mapping_gfp_mask(mapping) &
3998	GFP_RECLAIM_MASK);
3999
4000	if (!filemap_release_folio(folio, gfp)) {
4001	ret = -EBUSY;
4002	goto out;
4003	}
4004
4005	if (split_type == SPLIT_TYPE_UNIFORM) {
4006	xas_set_order(xas: &xas, index: folio->index, order: new_order);
4007	xas_split_alloc(&xas, entry: folio, order: old_order, gfp);
4008	if (xas_error(xas: &xas)) {
4009	ret = xas_error(xas: &xas);
4010	goto out;
4011	}
4012	}
4013
4014	anon_vma = NULL;
4015	i_mmap_lock_read(mapping);
4016
4017	/*
4018	*__split_unmapped_folio() may need to trim off pages beyond
4019	* EOF: but on 32-bit, i_size_read() takes an irq-unsafe
4020	* seqlock, which cannot be nested inside the page tree lock.
4021	* So note end now: i_size itself may be changed at any moment,
4022	* but folio lock is good enough to serialize the trimming.
4023	*/
4024	end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
4025	if (shmem_mapping(mapping))
4026	end = shmem_fallocend(inode: mapping->host, eof: end);
4027	}
4028
4029	/*
4030	* Racy check if we can split the page, before unmap_folio() will
4031	* split PMDs
4032	*/
4033	if (folio_expected_ref_count(folio) != folio_ref_count(folio) - `1`) {
4034	ret = -EAGAIN;
4035	goto out_unlock;
4036	}
4037
4038	unmap_folio(folio);
4039
4040	/ block interrupt reentry in xa_lock and spinlock /
4041	local_irq_disable();
4042	if (mapping) {
4043	/*
4044	* Check if the folio is present in page cache.
4045	* We assume all tail are present too, if folio is there.
4046	*/
4047	xas_lock(&xas);
4048	xas_reset(xas: &xas);
4049	if (xas_load(&xas) != folio) {
4050	ret = -EAGAIN;
4051	goto fail;
4052	}
4053	}
4054
4055	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, xas: &xas, mapping,
4056	do_lru: true, list, split_type, end, nr_shmem_dropped: &nr_shmem_dropped);
4057	fail:
4058	if (mapping)
4059	xas_unlock(&xas);
4060
4061	local_irq_enable();
4062
4063	if (nr_shmem_dropped)
4064	shmem_uncharge(inode: mapping->host, pages: nr_shmem_dropped);
4065
4066	if (!ret && is_anon && !folio_is_device_private(folio))
4067	remap_flags = RMP_USE_SHARED_ZEROPAGE;
4068
4069	remap_page(folio, nr: `1` << old_order, flags: remap_flags);
4070
4071	/*
4072	* Unlock all after-split folios except the one containing
4073	* @lock_at page. If @folio is not split, it will be kept locked.
4074	*/
4075	for (new_folio = folio; new_folio != end_folio; new_folio = next) {
4076	next = folio_next(folio: new_folio);
4077	if (new_folio == page_folio(lock_at))
4078	continue;
4079
4080	folio_unlock(folio: new_folio);
4081	/*
4082	* Subpages may be freed if there wasn't any mapping
4083	* like if add_to_swap() is running on a lru page that
4084	* had its mapping zapped. And freeing these pages
4085	* requires taking the lru_lock so we do the put_page
4086	* of the tail pages after the split is complete.
4087	*/
4088	free_folio_and_swap_cache(folio: new_folio);
4089	}
4090
4091	out_unlock:
4092	if (anon_vma) {
4093	anon_vma_unlock_write(anon_vma);
4094	put_anon_vma(anon_vma);
4095	}
4096	if (mapping)
4097	i_mmap_unlock_read(mapping);
4098	out:
4099	xas_destroy(&xas);
4100	if (old_order == HPAGE_PMD_ORDER)
4101	count_vm_event(item: !ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
4102	count_mthp_stat(order: old_order, item: !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
4103	return ret;
4104	}
4105
4106	/**
4107	* folio_split_unmapped() - split a large anon folio that is already unmapped
4108	* @folio: folio to split
4109	* @new_order: the order of folios after split
4110	*
4111	* This function is a helper for splitting folios that have already been
4112	* unmapped. The use case is that the device or the CPU can refuse to migrate
4113	* THP pages in the middle of migration, due to allocation issues on either
4114	* side.
4115	*
4116	* anon_vma_lock is not required to be held, mmap_read_lock() or
4117	* mmap_write_lock() should be held. @folio is expected to be locked by the
4118	* caller. device-private and non device-private folios are supported along
4119	* with folios that are in the swapcache. @folio should also be unmapped and
4120	* isolated from LRU (if applicable)
4121	*
4122	* Upon return, the folio is not remapped, split folios are not added to LRU,
4123	* free_folio_and_swap_cache() is not called, and new folios remain locked.
4124	*
4125	* Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to
4126	* insufficient reference count or extra pins).
4127	*/
4128	int folio_split_unmapped(struct folio folio, unsigned* int new_order)
4129	{
4130	int ret = `0`;
4131
4132	VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
4133	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
4134	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
4135	VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
4136
4137	if (folio_expected_ref_count(folio) != folio_ref_count(folio) - `1`)
4138	return -EAGAIN;
4139
4140	local_irq_disable();
4141	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at: &folio->page, NULL,
4142	NULL, do_lru: false, NULL, split_type: SPLIT_TYPE_UNIFORM,
4143	end: `0`, NULL);
4144	local_irq_enable();
4145	return ret;
4146	}
4147
4148	/*
4149	* This function splits a large folio into smaller folios of order @new_order.
4150	* @page can point to any page of the large folio to split. The split operation
4151	* does not change the position of @page.
4152	*
4153	* Prerequisites:
4154	*
4155	* 1) The caller must hold a reference on the @page's owning folio, also known
4156	* as the large folio.
4157	*
4158	* 2) The large folio must be locked.
4159	*
4160	* 3) The folio must not be pinned. Any unexpected folio references, including
4161	* GUP pins, will result in the folio not getting split; instead, the caller
4162	* will receive an -EAGAIN.
4163	*
4164	* 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
4165	* supported for non-file-backed folios, because folio->_deferred_list, which
4166	* is used by partially mapped folios, is stored in subpage 2, but an order-1
4167	* folio only has subpages 0 and 1. File-backed order-1 folios are supported,
4168	* since they do not use _deferred_list.
4169	*
4170	* After splitting, the caller's folio reference will be transferred to @page,
4171	* resulting in a raised refcount of @page after this call. The other pages may
4172	* be freed if they are not mapped.
4173	*
4174	* If @list is null, tail pages will be added to LRU list, otherwise, to @list.
4175	*
4176	* Pages in @new_order will inherit the mapping, flags, and so on from the
4177	* huge page.
4178	*
4179	* Returns 0 if the huge page was split successfully.
4180	*
4181	* Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
4182	* the folio was concurrently removed from the page cache.
4183	*
4184	* Returns -EBUSY when trying to split the huge zeropage, if the folio is
4185	* under writeback, if fs-specific folio metadata cannot currently be
4186	* released, or if some unexpected race happened (e.g., anon VMA disappeared,
4187	* truncation).
4188	*
4189	* Callers should ensure that the order respects the address space mapping
4190	* min-order if one is set for non-anonymous folios.
4191	*
4192	* Returns -EINVAL when trying to split to an order that is incompatible
4193	* with the folio. Splitting to order 0 is compatible with all folios.
4194	*/
4195	int __split_huge_page_to_list_to_order(struct page page, struct* list_head *list,
4196	unsigned int new_order)
4197	{
4198	struct folio *folio = page_folio(page);
4199
4200	return __folio_split(folio, new_order, split_at: &folio->page, lock_at: page, list,
4201	split_type: SPLIT_TYPE_UNIFORM);
4202	}
4203
4204	/**
4205	* folio_split() - split a folio at @split_at to a @new_order folio
4206	* @folio: folio to split
4207	* @new_order: the order of the new folio
4208	* @split_at: a page within the new folio
4209	* @list: after-split folios are added to @list if not null, otherwise to LRU
4210	* list
4211	*
4212	* It has the same prerequisites and returns as
4213	* split_huge_page_to_list_to_order().
4214	*
4215	* Split a folio at @split_at to a new_order folio, leave the
4216	* remaining subpages of the original folio as large as possible. For example,
4217	* in the case of splitting an order-9 folio at its third order-3 subpages to
4218	* an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
4219	* After the split, there will be a group of folios with different orders and
4220	* the new folio containing @split_at is marked in bracket:
4221	* [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
4222	*
4223	* After split, folio is left locked for caller.
4224	*
4225	* Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
4226	* split but not to @new_order, the caller needs to check)
4227	*/
4228	int folio_split(struct folio folio, unsigned* int new_order,
4229	struct page split_at, struct* list_head *list)
4230	{
4231	return __folio_split(folio, new_order, split_at, lock_at: &folio->page, list,
4232	split_type: SPLIT_TYPE_NON_UNIFORM);
4233	}
4234
4235	/**
4236	* min_order_for_split() - get the minimum order @folio can be split to
4237	* @folio: folio to split
4238	*
4239	* min_order_for_split() tells the minimum order @folio can be split to.
4240	* If a file-backed folio is truncated, 0 will be returned. Any subsequent
4241	* split attempt should get -EBUSY from split checking code.
4242	*
4243	* Return: @folio's minimum order for split
4244	*/
4245	unsigned int min_order_for_split(struct folio *folio)
4246	{
4247	if (folio_test_anon(folio))
4248	return `0`;
4249
4250	/*
4251	* If the folio got truncated, we don't know the previous mapping and
4252	* consequently the old min order. But it doesn't matter, as any split
4253	* attempt will immediately fail with -EBUSY as the folio cannot get
4254	* split until freed.
4255	*/
4256	if (!folio->mapping)
4257	return `0`;
4258
4259	return mapping_min_folio_order(mapping: folio->mapping);
4260	}
4261
4262	int split_folio_to_list(struct folio folio, struct* list_head *list)
4263	{
4264	return split_huge_page_to_list_to_order(page: &folio->page, list, new_order: `0`);
4265	}
4266
4267	/*
4268	* __folio_unqueue_deferred_split() is not to be called directly:
4269	* the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
4270	* limits its calls to those folios which may have a _deferred_list for
4271	* queueing THP splits, and that list is (racily observed to be) non-empty.
4272	*
4273	* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
4274	* zero: because even when split_queue_lock is held, a non-empty _deferred_list
4275	* might be in use on deferred_split_scan()'s unlocked on-stack list.
4276	*
4277	* If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
4278	* therefore important to unqueue deferred split before changing folio memcg.
4279	*/
4280	bool __folio_unqueue_deferred_split(struct folio *folio)
4281	{
4282	struct deferred_split *ds_queue;
4283	unsigned long flags;
4284	bool unqueued = false;
4285
4286	WARN_ON_ONCE(folio_ref_count(folio));
4287	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
4288
4289	ds_queue = folio_split_queue_lock_irqsave(folio, flags: &flags);
4290	if (!list_empty(head: &folio->_deferred_list)) {
4291	ds_queue->split_queue_len--;
4292	if (folio_test_partially_mapped(folio)) {
4293	folio_clear_partially_mapped(folio);
4294	mod_mthp_stat(order: folio_order(folio),
4295	item: MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, delta: -`1`);
4296	}
4297	list_del_init(entry: &folio->_deferred_list);
4298	unqueued = true;
4299	}
4300	split_queue_unlock_irqrestore(queue: ds_queue, flags);
4301
4302	return unqueued; / useful for debug warnings /
4303	}
4304
4305	/ partially_mapped=false won't clear PG_partially_mapped folio flag /
4306	void deferred_split_folio(struct folio *folio, bool partially_mapped)
4307	{
4308	struct deferred_split *ds_queue;
4309	unsigned long flags;
4310
4311	/*
4312	* Order 1 folios have no space for a deferred list, but we also
4313	* won't waste much memory by not adding them to the deferred list.
4314	*/
4315	if (folio_order(folio) <= `1`)
4316	return;
4317
4318	if (!partially_mapped && !split_underused_thp)
4319	return;
4320
4321	/*
4322	* Exclude swapcache: originally to avoid a corrupt deferred split
4323	* queue. Nowadays that is fully prevented by memcg1_swapout();
4324	* but if page reclaim is already handling the same folio, it is
4325	* unnecessary to handle it again in the shrinker, so excluding
4326	* swapcache here may still be a useful optimization.
4327	*/
4328	if (folio_test_swapcache(folio))
4329	return;
4330
4331	ds_queue = folio_split_queue_lock_irqsave(folio, flags: &flags);
4332	if (partially_mapped) {
4333	if (!folio_test_partially_mapped(folio)) {
4334	folio_set_partially_mapped(folio);
4335	if (folio_test_pmd_mappable(folio))
4336	count_vm_event(item: THP_DEFERRED_SPLIT_PAGE);
4337	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_SPLIT_DEFERRED);
4338	mod_mthp_stat(order: folio_order(folio), item: MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, delta: `1`);
4339
4340	}
4341	} else {
4342	/ partially mapped folios cannot become non-partially mapped /
4343	VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
4344	}
4345	if (list_empty(head: &folio->_deferred_list)) {
4346	struct mem_cgroup *memcg;
4347
4348	memcg = folio_split_queue_memcg(folio, queue: ds_queue);
4349	list_add_tail(new: &folio->_deferred_list, head: &ds_queue->split_queue);
4350	ds_queue->split_queue_len++;
4351	if (memcg)
4352	set_shrinker_bit(memcg, nid: folio_nid(folio),
4353	shrinker_id: shrinker_id(shrinker: deferred_split_shrinker));
4354	}
4355	split_queue_unlock_irqrestore(queue: ds_queue, flags);
4356	}
4357
4358	static unsigned long deferred_split_count(struct shrinker *shrink,
4359	struct shrink_control *sc)
4360	{
4361	struct pglist_data *pgdata = NODE_DATA(sc->nid);
4362	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4363
4364	#ifdef CONFIG_MEMCG
4365	if (sc->memcg)
4366	ds_queue = &sc->memcg->deferred_split_queue;
4367	#endif
4368	return READ_ONCE(ds_queue->split_queue_len);
4369	}
4370
4371	static bool thp_underused(struct folio *folio)
4372	{
4373	int num_zero_pages = `0`, num_filled_pages = `0`;
4374	int i;
4375
4376	if (khugepaged_max_ptes_none == HPAGE_PMD_NR - `1`)
4377	return false;
4378
4379	if (folio_contain_hwpoisoned_page(folio))
4380	return false;
4381
4382	for (i = `0`; i < folio_nr_pages(folio); i++) {
4383	if (pages_identical(folio_page(folio, i), ZERO_PAGE(`0`))) {
4384	if (++num_zero_pages > khugepaged_max_ptes_none)
4385	return true;
4386	} else {
4387	/*
4388	* Another path for early exit once the number
4389	* of non-zero filled pages exceeds threshold.
4390	*/
4391	if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none)
4392	return false;
4393	}
4394	}
4395	return false;
4396	}
4397
4398	static unsigned long deferred_split_scan(struct shrinker *shrink,
4399	struct shrink_control *sc)
4400	{
4401	struct deferred_split *ds_queue;
4402	unsigned long flags;
4403	struct folio folio, next;
4404	int split = `0`, i;
4405	struct folio_batch fbatch;
4406
4407	folio_batch_init(fbatch: &fbatch);
4408
4409	retry:
4410	ds_queue = split_queue_lock_irqsave(nid: sc->nid, memcg: sc->memcg, flags: &flags);
4411	/ Take pin on all head pages to avoid freeing them under us /
4412	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
4413	_deferred_list) {
4414	if (folio_try_get(folio)) {
4415	folio_batch_add(fbatch: &fbatch, folio);
4416	} else if (folio_test_partially_mapped(folio)) {
4417	/ We lost race with folio_put() /
4418	folio_clear_partially_mapped(folio);
4419	mod_mthp_stat(order: folio_order(folio),
4420	item: MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, delta: -`1`);
4421	}
4422	list_del_init(entry: &folio->_deferred_list);
4423	ds_queue->split_queue_len--;
4424	if (!--sc->nr_to_scan)
4425	break;
4426	if (!folio_batch_space(fbatch: &fbatch))
4427	break;
4428	}
4429	split_queue_unlock_irqrestore(queue: ds_queue, flags);
4430
4431	for (i = `0`; i < folio_batch_count(fbatch: &fbatch); i++) {
4432	bool did_split = false;
4433	bool underused = false;
4434	struct deferred_split *fqueue;
4435
4436	folio = fbatch.folios[i];
4437	if (!folio_test_partially_mapped(folio)) {
4438	/*
4439	* See try_to_map_unused_to_zeropage(): we cannot
4440	* optimize zero-filled pages after splitting an
4441	* mlocked folio.
4442	*/
4443	if (folio_test_mlocked(folio))
4444	goto next;
4445	underused = thp_underused(folio);
4446	if (!underused)
4447	goto next;
4448	}
4449	if (!folio_trylock(folio))
4450	goto next;
4451	if (!split_folio(folio)) {
4452	did_split = true;
4453	if (underused)
4454	count_vm_event(item: THP_UNDERUSED_SPLIT_PAGE);
4455	split++;
4456	}
4457	folio_unlock(folio);
4458	next:
4459	if (did_split \|\| !folio_test_partially_mapped(folio))
4460	continue;
4461	/*
4462	* Only add back to the queue if folio is partially mapped.
4463	* If thp_underused returns false, or if split_folio fails
4464	* in the case it was underused, then consider it used and
4465	* don't add it back to split_queue.
4466	*/
4467	fqueue = folio_split_queue_lock_irqsave(folio, flags: &flags);
4468	if (list_empty(head: &folio->_deferred_list)) {
4469	list_add_tail(new: &folio->_deferred_list, head: &fqueue->split_queue);
4470	fqueue->split_queue_len++;
4471	}
4472	split_queue_unlock_irqrestore(queue: fqueue, flags);
4473	}
4474	folios_put(folios: &fbatch);
4475
4476	if (sc->nr_to_scan && !list_empty(head: &ds_queue->split_queue)) {
4477	cond_resched();
4478	goto retry;
4479	}
4480
4481	/*
4482	* Stop shrinker if we didn't split any page, but the queue is empty.
4483	* This can happen if pages were freed under us.
4484	*/
4485	if (!split && list_empty(head: &ds_queue->split_queue))
4486	return SHRINK_STOP;
4487	return split;
4488	}
4489
4490	#ifdef CONFIG_MEMCG
4491	void reparent_deferred_split_queue(struct mem_cgroup *memcg)
4492	{
4493	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4494	struct deferred_split *ds_queue = &memcg->deferred_split_queue;
4495	struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
4496	int nid;
4497
4498	spin_lock_irq(lock: &ds_queue->split_queue_lock);
4499	spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
4500
4501	if (!ds_queue->split_queue_len)
4502	goto unlock;
4503
4504	list_splice_tail_init(list: &ds_queue->split_queue, head: &parent_ds_queue->split_queue);
4505	parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
4506	ds_queue->split_queue_len = `0`;
4507
4508	for_each_node(nid)
4509	set_shrinker_bit(memcg: parent, nid, shrinker_id: shrinker_id(shrinker: deferred_split_shrinker));
4510
4511	unlock:
4512	spin_unlock(lock: &parent_ds_queue->split_queue_lock);
4513	spin_unlock_irq(lock: &ds_queue->split_queue_lock);
4514	}
4515	#endif
4516
4517	#ifdef CONFIG_DEBUG_FS
4518	static void split_huge_pages_all(void)
4519	{
4520	struct zone *zone;
4521	struct page *page;
4522	struct folio *folio;
4523	unsigned long pfn, max_zone_pfn;
4524	unsigned long total = `0`, split = `0`;
4525
4526	pr_debug("Split all THPs\n");
4527	for_each_zone(zone) {
4528	if (!managed_zone(zone))
4529	continue;
4530	max_zone_pfn = zone_end_pfn(zone);
4531	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
4532	int nr_pages;
4533
4534	page = pfn_to_online_page(pfn);
4535	if (!page \|\| PageTail(page))
4536	continue;
4537	folio = page_folio(page);
4538	if (!folio_try_get(folio))
4539	continue;
4540
4541	if (unlikely(page_folio(page) != folio))
4542	goto next;
4543
4544	if (zone != folio_zone(folio))
4545	goto next;
4546
4547	if (!folio_test_large(folio)
4548	\|\| folio_test_hugetlb(folio)
4549	\|\| !folio_test_lru(folio))
4550	goto next;
4551
4552	total++;
4553	folio_lock(folio);
4554	nr_pages = folio_nr_pages(folio);
4555	if (!split_folio(folio))
4556	split++;
4557	pfn += nr_pages - `1`;
4558	folio_unlock(folio);
4559	next:
4560	folio_put(folio);
4561	cond_resched();
4562	}
4563	}
4564
4565	pr_debug("%lu of %lu THP split\n", split, total);
4566	}
4567
4568	static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
4569	{
4570	return vma_is_special_huge(vma) \|\| (vma->vm_flags & VM_IO) \|\|
4571	is_vm_hugetlb_page(vma);
4572	}
4573
4574	static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
4575	unsigned long vaddr_end, unsigned int new_order,
4576	long in_folio_offset)
4577	{
4578	int ret = `0`;
4579	struct task_struct *task;
4580	struct mm_struct *mm;
4581	unsigned long total = `0`, split = `0`;
4582	unsigned long addr;
4583
4584	vaddr_start &= PAGE_MASK;
4585	vaddr_end &= PAGE_MASK;
4586
4587	task = find_get_task_by_vpid(nr: pid);
4588	if (!task) {
4589	ret = -ESRCH;
4590	goto out;
4591	}
4592
4593	/ Find the mm_struct /
4594	mm = get_task_mm(task);
4595	put_task_struct(t: task);
4596
4597	if (!mm) {
4598	ret = -EINVAL;
4599	goto out;
4600	}
4601
4602	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
4603	pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
4604
4605	mmap_read_lock(mm);
4606	/*
4607	* always increase addr by PAGE_SIZE, since we could have a PTE page
4608	* table filled with PTE-mapped THPs, each of which is distinct.
4609	*/
4610	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
4611	struct vm_area_struct *vma = vma_lookup(mm, addr);
4612	struct folio_walk fw;
4613	struct folio *folio;
4614	struct address_space *mapping;
4615	unsigned int target_order = new_order;
4616
4617	if (!vma)
4618	break;
4619
4620	/ skip special VMA and hugetlb VMA /
4621	if (vma_not_suitable_for_thp_split(vma)) {
4622	addr = vma->vm_end;
4623	continue;
4624	}
4625
4626	folio = folio_walk_start(fw: &fw, vma, addr, flags: `0`);
4627	if (!folio)
4628	continue;
4629
4630	if (!is_transparent_hugepage(folio))
4631	goto next;
4632
4633	if (!folio_test_anon(folio)) {
4634	mapping = folio->mapping;
4635	target_order = max(new_order,
4636	mapping_min_folio_order(mapping));
4637	}
4638
4639	if (target_order >= folio_order(folio))
4640	goto next;
4641
4642	total++;
4643	/*
4644	* For folios with private, split_huge_page_to_list_to_order()
4645	* will try to drop it before split and then check if the folio
4646	* can be split or not. So skip the check here.
4647	*/
4648	if (!folio_test_private(folio) &&
4649	folio_expected_ref_count(folio) != folio_ref_count(folio))
4650	goto next;
4651
4652	if (!folio_trylock(folio))
4653	goto next;
4654	folio_get(folio);
4655	folio_walk_end(&fw, vma);
4656
4657	if (!folio_test_anon(folio) && folio->mapping != mapping)
4658	goto unlock;
4659
4660	if (in_folio_offset < `0` \|\|
4661	in_folio_offset >= folio_nr_pages(folio)) {
4662	if (!split_folio_to_order(folio, new_order: target_order))
4663	split++;
4664	} else {
4665	struct page *split_at = folio_page(folio,
4666	in_folio_offset);
4667	if (!folio_split(folio, new_order: target_order, split_at, NULL))
4668	split++;
4669	}
4670
4671	unlock:
4672
4673	folio_unlock(folio);
4674	folio_put(folio);
4675
4676	cond_resched();
4677	continue;
4678	next:
4679	folio_walk_end(&fw, vma);
4680	cond_resched();
4681	}
4682	mmap_read_unlock(mm);
4683	mmput(mm);
4684
4685	pr_debug("%lu of %lu THP split\n", split, total);
4686
4687	out:
4688	return ret;
4689	}
4690
4691	static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
4692	pgoff_t off_end, unsigned int new_order,
4693	long in_folio_offset)
4694	{
4695	struct filename *file;
4696	struct file *candidate;
4697	struct address_space *mapping;
4698	int ret = -EINVAL;
4699	pgoff_t index;
4700	int nr_pages = `1`;
4701	unsigned long total = `0`, split = `0`;
4702	unsigned int min_order;
4703	unsigned int target_order;
4704
4705	file = getname_kernel(file_path);
4706	if (IS_ERR(ptr: file))
4707	return ret;
4708
4709	candidate = file_open_name(file, O_RDONLY, `0`);
4710	if (IS_ERR(ptr: candidate))
4711	goto out;
4712
4713	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
4714	file_path, off_start, off_end, new_order, in_folio_offset);
4715
4716	mapping = candidate->f_mapping;
4717	min_order = mapping_min_folio_order(mapping);
4718	target_order = max(new_order, min_order);
4719
4720	for (index = off_start; index < off_end; index += nr_pages) {
4721	struct folio *folio = filemap_get_folio(mapping, index);
4722
4723	nr_pages = `1`;
4724	if (IS_ERR(ptr: folio))
4725	continue;
4726
4727	if (!folio_test_large(folio))
4728	goto next;
4729
4730	total++;
4731	nr_pages = folio_nr_pages(folio);
4732
4733	if (target_order >= folio_order(folio))
4734	goto next;
4735
4736	if (!folio_trylock(folio))
4737	goto next;
4738
4739	if (folio->mapping != mapping)
4740	goto unlock;
4741
4742	if (in_folio_offset < `0` \|\| in_folio_offset >= nr_pages) {
4743	if (!split_folio_to_order(folio, new_order: target_order))
4744	split++;
4745	} else {
4746	struct page *split_at = folio_page(folio,
4747	in_folio_offset);
4748	if (!folio_split(folio, new_order: target_order, split_at, NULL))
4749	split++;
4750	}
4751
4752	unlock:
4753	folio_unlock(folio);
4754	next:
4755	folio_put(folio);
4756	cond_resched();
4757	}
4758
4759	filp_close(candidate, NULL);
4760	ret = `0`;
4761
4762	pr_debug("%lu of %lu file-backed THP split\n", split, total);
4763	out:
4764	putname(name: file);
4765	return ret;
4766	}
4767
4768	#define MAX_INPUT_BUF_SZ 255
4769
4770	static ssize_t split_huge_pages_write(struct file file, const* char __user *buf,
4771	size_t count, loff_t *ppops)
4772	{
4773	static DEFINE_MUTEX(split_debug_mutex);
4774	ssize_t ret;
4775	/*
4776	* hold pid, start_vaddr, end_vaddr, new_order or
4777	* file_path, off_start, off_end, new_order
4778	*/
4779	char input_buf[MAX_INPUT_BUF_SZ];
4780	int pid;
4781	unsigned long vaddr_start, vaddr_end;
4782	unsigned int new_order = `0`;
4783	long in_folio_offset = -`1`;
4784
4785	ret = mutex_lock_interruptible(&split_debug_mutex);
4786	if (ret)
4787	return ret;
4788
4789	ret = -EFAULT;
4790
4791	memset(input_buf, `0`, MAX_INPUT_BUF_SZ);
4792	if (copy_from_user(to: input_buf, from: buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
4793	goto out;
4794
4795	input_buf[MAX_INPUT_BUF_SZ - `1`] = `'\0'`;
4796
4797	if (input_buf[`0`] == `'/'`) {
4798	char *tok;
4799	char *tok_buf = input_buf;
4800	char file_path[MAX_INPUT_BUF_SZ];
4801	pgoff_t off_start = `0`, off_end = `0`;
4802	size_t input_len = strlen(input_buf);
4803
4804	tok = strsep(&tok_buf, ",");
4805	if (tok && tok_buf) {
4806	strscpy(file_path, tok);
4807	} else {
4808	ret = -EINVAL;
4809	goto out;
4810	}
4811
4812	ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
4813	&new_order, &in_folio_offset);
4814	if (ret != `2` && ret != `3` && ret != `4`) {
4815	ret = -EINVAL;
4816	goto out;
4817	}
4818	ret = split_huge_pages_in_file(file_path, off_start, off_end,
4819	new_order, in_folio_offset);
4820	if (!ret)
4821	ret = input_len;
4822
4823	goto out;
4824	}
4825
4826	ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
4827	&vaddr_end, &new_order, &in_folio_offset);
4828	if (ret == `1` && pid == `1`) {
4829	split_huge_pages_all();
4830	ret = strlen(input_buf);
4831	goto out;
4832	} else if (ret != `3` && ret != `4` && ret != `5`) {
4833	ret = -EINVAL;
4834	goto out;
4835	}
4836
4837	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
4838	in_folio_offset);
4839	if (!ret)
4840	ret = strlen(input_buf);
4841	out:
4842	mutex_unlock(lock: &split_debug_mutex);
4843	return ret;
4844
4845	}
4846
4847	static const struct file_operations split_huge_pages_fops = {
4848	.owner = THIS_MODULE,
4849	.write = split_huge_pages_write,
4850	};
4851
4852	static int __init split_huge_pages_debugfs(void)
4853	{
4854	debugfs_create_file("split_huge_pages", `0200`, NULL, NULL,
4855	&split_huge_pages_fops);
4856	return `0`;
4857	}
4858	late_initcall(split_huge_pages_debugfs);
4859	#endif
4860
4861	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4862	int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4863	struct page *page)
4864	{
4865	struct folio *folio = page_folio(page);
4866	struct vm_area_struct *vma = pvmw->vma;
4867	struct mm_struct *mm = vma->vm_mm;
4868	unsigned long address = pvmw->address;
4869	bool anon_exclusive;
4870	pmd_t pmdval;
4871	swp_entry_t entry;
4872	pmd_t pmdswp;
4873
4874	if (!(pvmw->pmd && !pvmw->pte))
4875	return `0`;
4876
4877	flush_cache_range(vma, start: address, end: address + HPAGE_PMD_SIZE);
4878	if (unlikely(!pmd_present(*pvmw->pmd)))
4879	pmdval = pmdp_huge_get_and_clear(mm: vma->vm_mm, addr: address, pmdp: pvmw->pmd);
4880	else
4881	pmdval = pmdp_invalidate(vma, address, pmdp: pvmw->pmd);
4882
4883	/ See folio_try_share_anon_rmap_pmd(): invalidate PMD first. /
4884	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
4885	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
4886	set_pmd_at(mm, addr: address, pmdp: pvmw->pmd, pmd: pmdval);
4887	return -EBUSY;
4888	}
4889
4890	if (pmd_dirty(pmd: pmdval))
4891	folio_mark_dirty(folio);
4892	if (pmd_write(pmd: pmdval))
4893	entry = make_writable_migration_entry(page_to_pfn(page));
4894	else if (anon_exclusive)
4895	entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
4896	else
4897	entry = make_readable_migration_entry(page_to_pfn(page));
4898	if (pmd_young(pmd: pmdval))
4899	entry = make_migration_entry_young(entry);
4900	if (pmd_dirty(pmd: pmdval))
4901	entry = make_migration_entry_dirty(entry);
4902	pmdswp = swp_entry_to_pmd(entry);
4903	if (pmd_soft_dirty(pmd: pmdval))
4904	pmdswp = pmd_swp_mksoft_dirty(pmd: pmdswp);
4905	if (pmd_uffd_wp(pmd: pmdval))
4906	pmdswp = pmd_swp_mkuffd_wp(pmd: pmdswp);
4907	set_pmd_at(mm, addr: address, pmdp: pvmw->pmd, pmd: pmdswp);
4908	folio_remove_rmap_pmd(folio, page, vma);
4909	folio_put(folio);
4910	trace_set_migration_pmd(addr: address, pmd: pmd_val(pmd: pmdswp));
4911
4912	return `0`;
4913	}
4914
4915	void remove_migration_pmd(struct page_vma_mapped_walk pvmw, struct* page *new)
4916	{
4917	struct folio *folio = page_folio(new);
4918	struct vm_area_struct *vma = pvmw->vma;
4919	struct mm_struct *mm = vma->vm_mm;
4920	unsigned long address = pvmw->address;
4921	unsigned long haddr = address & HPAGE_PMD_MASK;
4922	pmd_t pmde;
4923	softleaf_t entry;
4924
4925	if (!(pvmw->pmd && !pvmw->pte))
4926	return;
4927
4928	entry = softleaf_from_pmd(pmd: *pvmw->pmd);
4929	folio_get(folio);
4930	pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
4931
4932	if (pmd_swp_soft_dirty(pmd: *pvmw->pmd))
4933	pmde = pmd_mksoft_dirty(pmd: pmde);
4934	if (softleaf_is_migration_write(entry))
4935	pmde = pmd_mkwrite(pmd: pmde, vma);
4936	if (pmd_swp_uffd_wp(pmd: *pvmw->pmd))
4937	pmde = pmd_mkuffd_wp(pmd: pmde);
4938	if (!softleaf_is_migration_young(entry))
4939	pmde = pmd_mkold(pmd: pmde);
4940	/ NOTE: this may contain setting soft-dirty on some archs /
4941	if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
4942	pmde = pmd_mkdirty(pmd: pmde);
4943
4944	if (folio_is_device_private(folio)) {
4945	swp_entry_t entry;
4946
4947	if (pmd_write(pmd: pmde))
4948	entry = make_writable_device_private_entry(
4949	page_to_pfn(new));
4950	else
4951	entry = make_readable_device_private_entry(
4952	page_to_pfn(new));
4953	pmde = swp_entry_to_pmd(entry);
4954
4955	if (pmd_swp_soft_dirty(pmd: *pvmw->pmd))
4956	pmde = pmd_swp_mksoft_dirty(pmd: pmde);
4957	if (pmd_swp_uffd_wp(pmd: *pvmw->pmd))
4958	pmde = pmd_swp_mkuffd_wp(pmd: pmde);
4959	}
4960
4961	if (folio_test_anon(folio)) {
4962	rmap_t rmap_flags = RMAP_NONE;
4963
4964	if (!softleaf_is_migration_read(entry))
4965	rmap_flags \|= RMAP_EXCLUSIVE;
4966
4967	folio_add_anon_rmap_pmd(folio, new, vma, address: haddr, flags: rmap_flags);
4968	} else {
4969	folio_add_file_rmap_pmd(folio, new, vma);
4970	}
4971	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
4972	set_pmd_at(mm, addr: haddr, pmdp: pvmw->pmd, pmd: pmde);
4973
4974	/ No need to invalidate - it was non-present before /
4975	update_mmu_cache_pmd(vma, addr: address, pmd: pvmw->pmd);
4976	trace_remove_migration_pmd(addr: address, pmd: pmd_val(pmd: pmde));
4977	}
4978	#endif
4979

source code of linux/mm/huge_memory.c