| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * fs/dax.c - Direct Access filesystem code |
| 4 | * Copyright (c) 2013-2014 Intel Corporation |
| 5 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> |
| 6 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> |
| 7 | */ |
| 8 | |
| 9 | #include <linux/atomic.h> |
| 10 | #include <linux/blkdev.h> |
| 11 | #include <linux/buffer_head.h> |
| 12 | #include <linux/dax.h> |
| 13 | #include <linux/fs.h> |
| 14 | #include <linux/highmem.h> |
| 15 | #include <linux/memcontrol.h> |
| 16 | #include <linux/mm.h> |
| 17 | #include <linux/mutex.h> |
| 18 | #include <linux/pagevec.h> |
| 19 | #include <linux/sched.h> |
| 20 | #include <linux/sched/signal.h> |
| 21 | #include <linux/uio.h> |
| 22 | #include <linux/vmstat.h> |
| 23 | #include <linux/sizes.h> |
| 24 | #include <linux/mmu_notifier.h> |
| 25 | #include <linux/iomap.h> |
| 26 | #include <linux/rmap.h> |
| 27 | #include <linux/pgalloc.h> |
| 28 | |
| 29 | #define CREATE_TRACE_POINTS |
| 30 | #include <trace/events/fs_dax.h> |
| 31 | |
| 32 | /* We choose 4096 entries - same as per-zone page wait tables */ |
| 33 | #define DAX_WAIT_TABLE_BITS 12 |
| 34 | #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) |
| 35 | |
| 36 | /* The 'colour' (ie low bits) within a PMD of a page offset. */ |
| 37 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) |
| 38 | #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) |
| 39 | |
| 40 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; |
| 41 | |
| 42 | static int __init init_dax_wait_table(void) |
| 43 | { |
| 44 | int i; |
| 45 | |
| 46 | for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) |
| 47 | init_waitqueue_head(wait_table + i); |
| 48 | return 0; |
| 49 | } |
| 50 | fs_initcall(init_dax_wait_table); |
| 51 | |
| 52 | /* |
| 53 | * DAX pagecache entries use XArray value entries so they can't be mistaken |
| 54 | * for pages. We use one bit for locking, one bit for the entry size (PMD) |
| 55 | * and two more to tell us if the entry is a zero page or an empty entry that |
| 56 | * is just used for locking. In total four special bits. |
| 57 | * |
| 58 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE |
| 59 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem |
| 60 | * block allocation. |
| 61 | */ |
| 62 | #define DAX_SHIFT (4) |
| 63 | #define DAX_LOCKED (1UL << 0) |
| 64 | #define DAX_PMD (1UL << 1) |
| 65 | #define DAX_ZERO_PAGE (1UL << 2) |
| 66 | #define DAX_EMPTY (1UL << 3) |
| 67 | |
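/*
 * Illustrative example of the encoding handled by the helpers below: a locked
 * PMD entry for pfn 0x1234 would be stored as the XArray value
 * (0x1234 << DAX_SHIFT) | DAX_PMD | DAX_LOCKED.
 */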
| 68 | static unsigned long dax_to_pfn(void *entry) |
| 69 | { |
| 70 | return xa_to_value(entry) >> DAX_SHIFT; |
| 71 | } |
| 72 | |
| 73 | static struct folio *dax_to_folio(void *entry) |
| 74 | { |
| 75 | return page_folio(pfn_to_page(dax_to_pfn(entry))); |
| 76 | } |
| 77 | |
| 78 | static void *dax_make_entry(unsigned long pfn, unsigned long flags) |
| 79 | { |
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
| 81 | } |
| 82 | |
| 83 | static bool dax_is_locked(void *entry) |
| 84 | { |
| 85 | return xa_to_value(entry) & DAX_LOCKED; |
| 86 | } |
| 87 | |
| 88 | static unsigned int dax_entry_order(void *entry) |
| 89 | { |
| 90 | if (xa_to_value(entry) & DAX_PMD) |
| 91 | return PMD_ORDER; |
| 92 | return 0; |
| 93 | } |
| 94 | |
| 95 | static unsigned long dax_is_pmd_entry(void *entry) |
| 96 | { |
| 97 | return xa_to_value(entry) & DAX_PMD; |
| 98 | } |
| 99 | |
| 100 | static bool dax_is_pte_entry(void *entry) |
| 101 | { |
| 102 | return !(xa_to_value(entry) & DAX_PMD); |
| 103 | } |
| 104 | |
| 105 | static int dax_is_zero_entry(void *entry) |
| 106 | { |
| 107 | return xa_to_value(entry) & DAX_ZERO_PAGE; |
| 108 | } |
| 109 | |
| 110 | static int dax_is_empty_entry(void *entry) |
| 111 | { |
| 112 | return xa_to_value(entry) & DAX_EMPTY; |
| 113 | } |
| 114 | |
| 115 | /* |
| 116 | * true if the entry that was found is of a smaller order than the entry |
| 117 | * we were looking for |
| 118 | */ |
| 119 | static bool dax_is_conflict(void *entry) |
| 120 | { |
| 121 | return entry == XA_RETRY_ENTRY; |
| 122 | } |
| 123 | |
| 124 | /* |
| 125 | * DAX page cache entry locking |
| 126 | */ |
| 127 | struct exceptional_entry_key { |
| 128 | struct xarray *xa; |
| 129 | pgoff_t entry_start; |
| 130 | }; |
| 131 | |
| 132 | struct wait_exceptional_entry_queue { |
| 133 | wait_queue_entry_t wait; |
| 134 | struct exceptional_entry_key key; |
| 135 | }; |
| 136 | |
| 137 | /** |
| 138 | * enum dax_wake_mode: waitqueue wakeup behaviour |
| 139 | * @WAKE_ALL: wake all waiters in the waitqueue |
| 140 | * @WAKE_NEXT: wake only the first waiter in the waitqueue |
| 141 | */ |
| 142 | enum dax_wake_mode { |
| 143 | WAKE_ALL, |
| 144 | WAKE_NEXT, |
| 145 | }; |
| 146 | |
| 147 | static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, |
| 148 | void *entry, struct exceptional_entry_key *key) |
| 149 | { |
| 150 | unsigned long hash; |
| 151 | unsigned long index = xas->xa_index; |
| 152 | |
| 153 | /* |
| 154 | * If 'entry' is a PMD, align the 'index' that we use for the wait |
| 155 | * queue to the start of that PMD. This ensures that all offsets in |
| 156 | * the range covered by the PMD map to the same bit lock. |
| 157 | */ |
| 158 | if (dax_is_pmd_entry(entry)) |
| 159 | index &= ~PG_PMD_COLOUR; |
| 160 | key->xa = xas->xa; |
| 161 | key->entry_start = index; |
| 162 | |
| 163 | hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); |
| 164 | return wait_table + hash; |
| 165 | } |
| 166 | |
| 167 | static int wake_exceptional_entry_func(wait_queue_entry_t *wait, |
| 168 | unsigned int mode, int sync, void *keyp) |
| 169 | { |
| 170 | struct exceptional_entry_key *key = keyp; |
| 171 | struct wait_exceptional_entry_queue *ewait = |
| 172 | container_of(wait, struct wait_exceptional_entry_queue, wait); |
| 173 | |
| 174 | if (key->xa != ewait->key.xa || |
| 175 | key->entry_start != ewait->key.entry_start) |
| 176 | return 0; |
	return autoremove_wake_function(wait, mode, sync, NULL);
| 178 | } |
| 179 | |
| 180 | /* |
| 181 | * @entry may no longer be the entry at the index in the mapping. |
| 182 | * The important information it's conveying is whether the entry at |
| 183 | * this index used to be a PMD entry. |
| 184 | */ |
| 185 | static void dax_wake_entry(struct xa_state *xas, void *entry, |
| 186 | enum dax_wake_mode mode) |
| 187 | { |
| 188 | struct exceptional_entry_key key; |
| 189 | wait_queue_head_t *wq; |
| 190 | |
	wq = dax_entry_waitqueue(xas, entry, &key);
| 192 | |
| 193 | /* |
| 194 | * Checking for locked entry and prepare_to_wait_exclusive() happens |
| 195 | * under the i_pages lock, ditto for entry handling in our callers. |
| 196 | * So at this point all tasks that could have seen our entry locked |
| 197 | * must be in the waitqueue and the following check will see them. |
| 198 | */ |
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
| 201 | } |
| 202 | |
| 203 | /* |
| 204 | * Look up entry in page cache, wait for it to become unlocked if it |
| 205 | * is a DAX entry and return it. The caller must subsequently call |
| 206 | * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() |
| 207 | * if it did. The entry returned may have a larger order than @order. |
| 208 | * If @order is larger than the order of the entry found in i_pages, this |
| 209 | * function returns a dax_is_conflict entry. |
| 210 | * |
| 211 | * Must be called with the i_pages lock held. |
| 212 | */ |
| 213 | static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order) |
| 214 | { |
| 215 | void *entry; |
| 216 | struct wait_exceptional_entry_queue ewait; |
| 217 | wait_queue_head_t *wq; |
| 218 | |
| 219 | init_wait(&ewait.wait); |
| 220 | ewait.wait.func = wake_exceptional_entry_func; |
| 221 | |
| 222 | for (;;) { |
| 223 | entry = xas_find_conflict(xas); |
| 224 | if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
| 225 | return entry; |
| 226 | if (dax_entry_order(entry) < order) |
| 227 | return XA_RETRY_ENTRY; |
| 228 | if (!dax_is_locked(entry)) |
| 229 | return entry; |
| 230 | |
		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
| 238 | xas_lock_irq(xas); |
| 239 | } |
| 240 | } |
| 241 | |
| 242 | /* |
| 243 | * Wait for the given entry to become unlocked. Caller must hold the i_pages |
| 244 | * lock and call either put_unlocked_entry() if it did not lock the entry or |
| 245 | * dax_unlock_entry() if it did. Returns an unlocked entry if still present. |
| 246 | */ |
| 247 | static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) |
| 248 | { |
| 249 | struct wait_exceptional_entry_queue ewait; |
| 250 | wait_queue_head_t *wq; |
| 251 | |
| 252 | init_wait(&ewait.wait); |
| 253 | ewait.wait.func = wake_exceptional_entry_func; |
| 254 | |
| 255 | while (unlikely(dax_is_locked(entry))) { |
		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_reset(xas);
		xas_unlock_irq(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
| 263 | xas_lock_irq(xas); |
| 264 | entry = xas_load(xas); |
| 265 | } |
| 266 | |
| 267 | if (xa_is_internal(entry)) |
| 268 | return NULL; |
| 269 | |
| 270 | return entry; |
| 271 | } |
| 272 | |
| 273 | /* |
| 274 | * The only thing keeping the address space around is the i_pages lock |
| 275 | * (it's cycled in clear_inode() after removing the entries from i_pages) |
| 276 | * After we call xas_unlock_irq(), we cannot touch xas->xa. |
| 277 | */ |
| 278 | static void wait_entry_unlocked(struct xa_state *xas, void *entry) |
| 279 | { |
| 280 | struct wait_exceptional_entry_queue ewait; |
| 281 | wait_queue_head_t *wq; |
| 282 | |
| 283 | init_wait(&ewait.wait); |
| 284 | ewait.wait.func = wake_exceptional_entry_func; |
| 285 | |
	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
| 287 | /* |
| 288 | * Unlike get_next_unlocked_entry() there is no guarantee that this |
| 289 | * path ever successfully retrieves an unlocked entry before an |
| 290 | * inode dies. Perform a non-exclusive wait in case this path |
| 291 | * never successfully performs its own wake up. |
| 292 | */ |
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
| 297 | } |
| 298 | |
| 299 | static void put_unlocked_entry(struct xa_state *xas, void *entry, |
| 300 | enum dax_wake_mode mode) |
| 301 | { |
| 302 | if (entry && !dax_is_conflict(entry)) |
| 303 | dax_wake_entry(xas, entry, mode); |
| 304 | } |
| 305 | |
| 306 | /* |
| 307 | * We used the xa_state to get the entry, but then we locked the entry and |
| 308 | * dropped the xa_lock, so we know the xa_state is stale and must be reset |
| 309 | * before use. |
| 310 | */ |
| 311 | static void dax_unlock_entry(struct xa_state *xas, void *entry) |
| 312 | { |
| 313 | void *old; |
| 314 | |
| 315 | BUG_ON(dax_is_locked(entry)); |
| 316 | xas_reset(xas); |
| 317 | xas_lock_irq(xas); |
| 318 | old = xas_store(xas, entry); |
| 319 | xas_unlock_irq(xas); |
| 320 | BUG_ON(!dax_is_locked(old)); |
	dax_wake_entry(xas, entry, WAKE_NEXT);
| 322 | } |
| 323 | |
| 324 | /* |
| 325 | * Return: The entry stored at this location before it was locked. |
| 326 | */ |
| 327 | static void *dax_lock_entry(struct xa_state *xas, void *entry) |
| 328 | { |
| 329 | unsigned long v = xa_to_value(entry); |
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
| 331 | } |
| 332 | |
| 333 | static unsigned long dax_entry_size(void *entry) |
| 334 | { |
| 335 | if (dax_is_zero_entry(entry)) |
| 336 | return 0; |
| 337 | else if (dax_is_empty_entry(entry)) |
| 338 | return 0; |
| 339 | else if (dax_is_pmd_entry(entry)) |
| 340 | return PMD_SIZE; |
| 341 | else |
| 342 | return PAGE_SIZE; |
| 343 | } |
| 344 | |
| 345 | /* |
| 346 | * A DAX folio is considered shared if it has no mapping set and ->share (which |
| 347 | * shares the ->index field) is non-zero. Note this may return false even if the |
| 348 | * page is shared between multiple files but has not yet actually been mapped |
| 349 | * into multiple address spaces. |
| 350 | */ |
| 351 | static inline bool dax_folio_is_shared(struct folio *folio) |
| 352 | { |
| 353 | return !folio->mapping && folio->share; |
| 354 | } |
| 355 | |
| 356 | /* |
| 357 | * When it is called by dax_insert_entry(), the shared flag will indicate |
| 358 | * whether this entry is shared by multiple files. If the page has not |
| 359 | * previously been associated with any mappings the ->mapping and ->index |
| 360 | * fields will be set. If it has already been associated with a mapping |
| 361 | * the mapping will be cleared and the share count set. It's then up to |
| 362 | * reverse map users like memory_failure() to call back into the filesystem to |
| 363 | * recover ->mapping and ->index information. For example by implementing |
| 364 | * dax_holder_operations. |
| 365 | */ |
| 366 | static void dax_folio_make_shared(struct folio *folio) |
| 367 | { |
| 368 | /* |
| 369 | * folio is not currently shared so mark it as shared by clearing |
| 370 | * folio->mapping. |
| 371 | */ |
| 372 | folio->mapping = NULL; |
| 373 | |
| 374 | /* |
| 375 | * folio has previously been mapped into one address space so set the |
| 376 | * share count. |
| 377 | */ |
| 378 | folio->share = 1; |
| 379 | } |
| 380 | |
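/*
 * Drop one mapping's claim on a DAX folio. For a shared folio this just
 * decrements the share count; once the last claim is gone (or the folio was
 * never shared) the compound folio is split back into independent order-0
 * pages so it can later be re-initialised for a different entry size.
 */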
| 381 | static inline unsigned long dax_folio_put(struct folio *folio) |
| 382 | { |
| 383 | unsigned long ref; |
| 384 | int order, i; |
| 385 | |
| 386 | if (!dax_folio_is_shared(folio)) |
| 387 | ref = 0; |
| 388 | else |
| 389 | ref = --folio->share; |
| 390 | |
| 391 | if (ref) |
| 392 | return ref; |
| 393 | |
| 394 | folio->mapping = NULL; |
| 395 | order = folio_order(folio); |
| 396 | if (!order) |
| 397 | return 0; |
| 398 | folio_reset_order(folio); |
| 399 | |
| 400 | for (i = 0; i < (1UL << order); i++) { |
		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
| 402 | struct page *page = folio_page(folio, i); |
| 403 | struct folio *new_folio = (struct folio *)page; |
| 404 | |
| 405 | ClearPageHead(page); |
| 406 | clear_compound_head(page); |
| 407 | |
| 408 | new_folio->mapping = NULL; |
| 409 | /* |
| 410 | * Reset pgmap which was over-written by |
| 411 | * prep_compound_page(). |
| 412 | */ |
| 413 | new_folio->pgmap = pgmap; |
| 414 | new_folio->share = 0; |
| 415 | WARN_ON_ONCE(folio_ref_count(new_folio)); |
| 416 | } |
| 417 | |
| 418 | return ref; |
| 419 | } |
| 420 | |
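/*
 * (Re)initialise the struct pages backing @entry, building a compound folio
 * for PMD-sized entries. The pages are expected to have been returned to
 * order-0 by dax_folio_put() beforehand.
 */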
| 421 | static void dax_folio_init(void *entry) |
| 422 | { |
| 423 | struct folio *folio = dax_to_folio(entry); |
| 424 | int order = dax_entry_order(entry); |
| 425 | |
| 426 | /* |
| 427 | * Folio should have been split back to order-0 pages in |
| 428 | * dax_folio_put() when they were removed from their |
| 429 | * final mapping. |
| 430 | */ |
| 431 | WARN_ON_ONCE(folio_order(folio)); |
| 432 | |
| 433 | if (order > 0) { |
		prep_compound_page(&folio->page, order);
		if (order > 1)
			INIT_LIST_HEAD(&folio->_deferred_list);
| 437 | WARN_ON_ONCE(folio_ref_count(folio)); |
| 438 | } |
| 439 | } |
| 440 | |
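/*
 * Associate the folio backing @entry with @mapping at the page offset derived
 * from @vma and @address, or bump the share count if the folio is (becoming)
 * shared between multiple files.
 */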
| 441 | static void dax_associate_entry(void *entry, struct address_space *mapping, |
| 442 | struct vm_area_struct *vma, |
| 443 | unsigned long address, bool shared) |
| 444 | { |
| 445 | unsigned long size = dax_entry_size(entry), index; |
| 446 | struct folio *folio = dax_to_folio(entry); |
| 447 | |
| 448 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) |
| 449 | return; |
| 450 | |
	index = linear_page_index(vma, address & ~(size - 1));
| 452 | if (shared && (folio->mapping || dax_folio_is_shared(folio))) { |
| 453 | if (folio->mapping) |
| 454 | dax_folio_make_shared(folio); |
| 455 | |
| 456 | WARN_ON_ONCE(!folio->share); |
| 457 | WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); |
| 458 | folio->share++; |
| 459 | } else { |
| 460 | WARN_ON_ONCE(folio->mapping); |
| 461 | dax_folio_init(entry); |
| 462 | folio = dax_to_folio(entry); |
| 463 | folio->mapping = mapping; |
| 464 | folio->index = index; |
| 465 | } |
| 466 | } |
| 467 | |
| 468 | static void dax_disassociate_entry(void *entry, struct address_space *mapping, |
| 469 | bool trunc) |
| 470 | { |
| 471 | struct folio *folio = dax_to_folio(entry); |
| 472 | |
| 473 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) |
| 474 | return; |
| 475 | |
| 476 | dax_folio_put(folio); |
| 477 | } |
| 478 | |
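/*
 * Return the page backing @entry if it has references beyond its page table
 * mappings (e.g. from get_user_pages()), otherwise NULL.
 */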
| 479 | static struct page *dax_busy_page(void *entry) |
| 480 | { |
| 481 | struct folio *folio = dax_to_folio(entry); |
| 482 | |
| 483 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) |
| 484 | return NULL; |
| 485 | |
| 486 | if (folio_ref_count(folio) - folio_mapcount(folio)) |
| 487 | return &folio->page; |
| 488 | else |
| 489 | return NULL; |
| 490 | } |
| 491 | |
| 492 | /** |
| 493 | * dax_lock_folio - Lock the DAX entry corresponding to a folio |
| 494 | * @folio: The folio whose entry we want to lock |
| 495 | * |
| 496 | * Context: Process context. |
| 497 | * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could |
| 498 | * not be locked. |
| 499 | */ |
| 500 | dax_entry_t dax_lock_folio(struct folio *folio) |
| 501 | { |
| 502 | XA_STATE(xas, NULL, 0); |
| 503 | void *entry; |
| 504 | |
| 505 | /* Ensure folio->mapping isn't freed while we look at it */ |
| 506 | rcu_read_lock(); |
| 507 | for (;;) { |
| 508 | struct address_space *mapping = READ_ONCE(folio->mapping); |
| 509 | |
| 510 | entry = NULL; |
| 511 | if (!mapping || !dax_mapping(mapping)) |
| 512 | break; |
| 513 | |
| 514 | /* |
| 515 | * In the device-dax case there's no need to lock, a |
| 516 | * struct dev_pagemap pin is sufficient to keep the |
| 517 | * inode alive, and we assume we have dev_pagemap pin |
| 518 | * otherwise we would not have a valid pfn_to_page() |
| 519 | * translation. |
| 520 | */ |
| 521 | entry = (void *)~0UL; |
| 522 | if (S_ISCHR(mapping->host->i_mode)) |
| 523 | break; |
| 524 | |
| 525 | xas.xa = &mapping->i_pages; |
| 526 | xas_lock_irq(&xas); |
| 527 | if (mapping != folio->mapping) { |
| 528 | xas_unlock_irq(&xas); |
| 529 | continue; |
| 530 | } |
		xas_set(&xas, folio->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
| 540 | xas_unlock_irq(&xas); |
| 541 | break; |
| 542 | } |
| 543 | rcu_read_unlock(); |
| 544 | return (dax_entry_t)entry; |
| 545 | } |
| 546 | |
| 547 | void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) |
| 548 | { |
| 549 | struct address_space *mapping = folio->mapping; |
| 550 | XA_STATE(xas, &mapping->i_pages, folio->index); |
| 551 | |
| 552 | if (S_ISCHR(mapping->host->i_mode)) |
| 553 | return; |
| 554 | |
	dax_unlock_entry(&xas, (void *)cookie);
| 556 | } |
| 557 | |
| 558 | /* |
| 559 | * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping |
| 560 | * @mapping: the file's mapping whose entry we want to lock |
| 561 | * @index: the offset within this file |
| 562 | * @page: output the dax page corresponding to this dax entry |
| 563 | * |
| 564 | * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry |
| 565 | * could not be locked. |
| 566 | */ |
| 567 | dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, |
| 568 | struct page **page) |
| 569 | { |
| 570 | XA_STATE(xas, NULL, 0); |
| 571 | void *entry; |
| 572 | |
| 573 | rcu_read_lock(); |
| 574 | for (;;) { |
| 575 | entry = NULL; |
| 576 | if (!dax_mapping(mapping)) |
| 577 | break; |
| 578 | |
| 579 | xas.xa = &mapping->i_pages; |
| 580 | xas_lock_irq(&xas); |
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
| 586 | rcu_read_lock(); |
| 587 | continue; |
| 588 | } |
| 589 | if (!entry || |
| 590 | dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
			/*
			 * Because we are looking up the entry via the file's
			 * mapping and index, it may not have been inserted
			 * yet, or it may be a zero/empty entry. Neither case
			 * is an error, so return a special value and do not
			 * output @page.
			 */
| 598 | entry = (void *)~0UL; |
| 599 | } else { |
| 600 | *page = pfn_to_page(dax_to_pfn(entry)); |
			dax_lock_entry(&xas, entry);
| 602 | } |
| 603 | xas_unlock_irq(&xas); |
| 604 | break; |
| 605 | } |
| 606 | rcu_read_unlock(); |
| 607 | return (dax_entry_t)entry; |
| 608 | } |
| 609 | |
| 610 | void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, |
| 611 | dax_entry_t cookie) |
| 612 | { |
| 613 | XA_STATE(xas, &mapping->i_pages, index); |
| 614 | |
| 615 | if (cookie == ~0UL) |
| 616 | return; |
| 617 | |
	dax_unlock_entry(&xas, (void *)cookie);
| 619 | } |
| 620 | |
| 621 | /* |
| 622 | * Find page cache entry at given index. If it is a DAX entry, return it |
| 623 | * with the entry locked. If the page cache doesn't contain an entry at |
| 624 | * that index, add a locked empty entry. |
| 625 | * |
| 626 | * When requesting an entry with size DAX_PMD, grab_mapping_entry() will |
| 627 | * either return that locked entry or will return VM_FAULT_FALLBACK. |
| 628 | * This will happen if there are any PTE entries within the PMD range |
| 629 | * that we are requesting. |
| 630 | * |
| 631 | * We always favor PTE entries over PMD entries. There isn't a flow where we |
| 632 | * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD |
| 633 | * insertion will fail if it finds any PTE entries already in the tree, and a |
| 634 | * PTE insertion will cause an existing PMD entry to be unmapped and |
| 635 | * downgraded to PTE entries. This happens for both PMD zero pages as |
| 636 | * well as PMD empty entries. |
| 637 | * |
| 638 | * The exception to this downgrade path is for PMD entries that have |
| 639 | * real storage backing them. We will leave these real PMD entries in |
| 640 | * the tree, and PTE writes will simply dirty the entire PMD entry. |
| 641 | * |
| 642 | * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For |
| 643 | * persistent memory the benefit is doubtful. We can add that later if we can |
| 644 | * show it helps. |
| 645 | * |
| 646 | * On error, this function does not return an ERR_PTR. Instead it returns |
| 647 | * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values |
| 648 | * overlap with xarray value entries. |
| 649 | */ |
| 650 | static void *grab_mapping_entry(struct xa_state *xas, |
| 651 | struct address_space *mapping, unsigned int order) |
| 652 | { |
| 653 | unsigned long index = xas->xa_index; |
| 654 | bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ |
| 655 | void *entry; |
| 656 | |
| 657 | retry: |
| 658 | pmd_downgrade = false; |
| 659 | xas_lock_irq(xas); |
| 660 | entry = get_next_unlocked_entry(xas, order); |
| 661 | |
| 662 | if (entry) { |
| 663 | if (dax_is_conflict(entry)) |
| 664 | goto fallback; |
| 665 | if (!xa_is_value(entry)) { |
			xas_set_err(xas, -EIO);
| 667 | goto out_unlock; |
| 668 | } |
| 669 | |
| 670 | if (order == 0) { |
| 671 | if (dax_is_pmd_entry(entry) && |
| 672 | (dax_is_zero_entry(entry) || |
| 673 | dax_is_empty_entry(entry))) { |
| 674 | pmd_downgrade = true; |
| 675 | } |
| 676 | } |
| 677 | } |
| 678 | |
| 679 | if (pmd_downgrade) { |
| 680 | /* |
| 681 | * Make sure 'entry' remains valid while we drop |
| 682 | * the i_pages lock. |
| 683 | */ |
| 684 | dax_lock_entry(xas, entry); |
| 685 | |
| 686 | /* |
| 687 | * Besides huge zero pages the only other thing that gets |
| 688 | * downgraded are empty entries which don't need to be |
| 689 | * unmapped. |
| 690 | */ |
| 691 | if (dax_is_zero_entry(entry)) { |
| 692 | xas_unlock_irq(xas); |
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
| 696 | xas_reset(xas); |
| 697 | xas_lock_irq(xas); |
| 698 | } |
| 699 | |
		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
| 703 | mapping->nrpages -= PG_PMD_NR; |
| 704 | entry = NULL; |
| 705 | xas_set(xas, index); |
| 706 | } |
| 707 | |
| 708 | if (entry) { |
| 709 | dax_lock_entry(xas, entry); |
| 710 | } else { |
| 711 | unsigned long flags = DAX_EMPTY; |
| 712 | |
| 713 | if (order > 0) |
| 714 | flags |= DAX_PMD; |
		entry = dax_make_entry(0, flags);
| 716 | dax_lock_entry(xas, entry); |
| 717 | if (xas_error(xas)) |
| 718 | goto out_unlock; |
| 719 | mapping->nrpages += 1UL << order; |
| 720 | } |
| 721 | |
| 722 | out_unlock: |
| 723 | xas_unlock_irq(xas); |
| 724 | if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) |
| 725 | goto retry; |
| 726 | if (xas->xa_node == XA_ERROR(-ENOMEM)) |
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
| 734 | } |
| 735 | |
| 736 | /** |
| 737 | * dax_layout_busy_page_range - find first pinned page in @mapping |
| 738 | * @mapping: address space to scan for a page with ref count > 1 |
| 739 | * @start: Starting offset. Page containing 'start' is included. |
| 740 | * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, |
| 741 | * pages from 'start' till the end of file are included. |
| 742 | * |
| 743 | * DAX requires ZONE_DEVICE mapped pages. These pages are never |
| 744 | * 'onlined' to the page allocator so they are considered idle when |
| 745 | * page->count == 1. A filesystem uses this interface to determine if |
| 746 | * any page in the mapping is busy, i.e. for DMA, or other |
| 747 | * get_user_pages() usages. |
| 748 | * |
| 749 | * It is expected that the filesystem is holding locks to block the |
| 750 | * establishment of new mappings in this address_space. I.e. it expects |
| 751 | * to be able to run unmap_mapping_range() and subsequently not race |
| 752 | * mapping_mapped() becoming true. |
| 753 | */ |
| 754 | struct page *dax_layout_busy_page_range(struct address_space *mapping, |
| 755 | loff_t start, loff_t end) |
| 756 | { |
| 757 | void *entry; |
| 758 | unsigned int scanned = 0; |
| 759 | struct page *page = NULL; |
| 760 | pgoff_t start_idx = start >> PAGE_SHIFT; |
| 761 | pgoff_t end_idx; |
| 762 | XA_STATE(xas, &mapping->i_pages, start_idx); |
| 763 | |
| 764 | if (!dax_mapping(mapping)) |
| 765 | return NULL; |
| 766 | |
| 767 | /* If end == LLONG_MAX, all pages from start to till end of file */ |
| 768 | if (end == LLONG_MAX) |
| 769 | end_idx = ULONG_MAX; |
| 770 | else |
| 771 | end_idx = end >> PAGE_SHIFT; |
| 772 | /* |
| 773 | * If we race get_user_pages_fast() here either we'll see the |
| 774 | * elevated page count in the iteration and wait, or |
| 775 | * get_user_pages_fast() will see that the page it took a reference |
| 776 | * against is no longer mapped in the page tables and bail to the |
| 777 | * get_user_pages() slow path. The slow path is protected by |
| 778 | * pte_lock() and pmd_lock(). New references are not taken without |
| 779 | * holding those locks, and unmap_mapping_pages() will not zero the |
| 780 | * pte or pmd without holding the respective lock, so we are |
| 781 | * guaranteed to either see new references or prevent new |
| 782 | * references from being established. |
| 783 | */ |
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
| 785 | |
| 786 | xas_lock_irq(&xas); |
| 787 | xas_for_each(&xas, entry, end_idx) { |
| 788 | if (WARN_ON_ONCE(!xa_is_value(entry))) |
| 789 | continue; |
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
| 794 | if (page) |
| 795 | break; |
| 796 | if (++scanned % XA_CHECK_SCHED) |
| 797 | continue; |
| 798 | |
| 799 | xas_pause(&xas); |
| 800 | xas_unlock_irq(&xas); |
| 801 | cond_resched(); |
| 802 | xas_lock_irq(&xas); |
| 803 | } |
| 804 | xas_unlock_irq(&xas); |
| 805 | return page; |
| 806 | } |
| 807 | EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); |
| 808 | |
| 809 | struct page *dax_layout_busy_page(struct address_space *mapping) |
| 810 | { |
| 811 | return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); |
| 812 | } |
| 813 | EXPORT_SYMBOL_GPL(dax_layout_busy_page); |
| 814 | |
| 815 | static int __dax_invalidate_entry(struct address_space *mapping, |
| 816 | pgoff_t index, bool trunc) |
| 817 | { |
| 818 | XA_STATE(xas, &mapping->i_pages, index); |
| 819 | int ret = 0; |
| 820 | void *entry; |
| 821 | |
| 822 | xas_lock_irq(&xas); |
	entry = get_next_unlocked_entry(&xas, 0);
| 824 | if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
| 825 | goto out; |
| 826 | if (!trunc && |
| 827 | (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || |
| 828 | xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) |
| 829 | goto out; |
| 830 | dax_disassociate_entry(entry, mapping, trunc); |
| 831 | xas_store(&xas, NULL); |
| 832 | mapping->nrpages -= 1UL << dax_entry_order(entry); |
| 833 | ret = 1; |
| 834 | out: |
	put_unlocked_entry(&xas, entry, WAKE_ALL);
| 836 | xas_unlock_irq(&xas); |
| 837 | return ret; |
| 838 | } |
| 839 | |
| 840 | static int __dax_clear_dirty_range(struct address_space *mapping, |
| 841 | pgoff_t start, pgoff_t end) |
| 842 | { |
| 843 | XA_STATE(xas, &mapping->i_pages, start); |
| 844 | unsigned int scanned = 0; |
| 845 | void *entry; |
| 846 | |
| 847 | xas_lock_irq(&xas); |
| 848 | xas_for_each(&xas, entry, end) { |
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
| 855 | |
| 856 | if (++scanned % XA_CHECK_SCHED) |
| 857 | continue; |
| 858 | |
| 859 | xas_pause(&xas); |
| 860 | xas_unlock_irq(&xas); |
| 861 | cond_resched(); |
| 862 | xas_lock_irq(&xas); |
| 863 | } |
| 864 | xas_unlock_irq(&xas); |
| 865 | |
| 866 | return 0; |
| 867 | } |
| 868 | |
| 869 | /* |
| 870 | * Delete DAX entry at @index from @mapping. Wait for it |
| 871 | * to be unlocked before deleting it. |
| 872 | */ |
| 873 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) |
| 874 | { |
	int ret = __dax_invalidate_entry(mapping, index, true);
| 876 | |
| 877 | /* |
| 878 | * This gets called from truncate / punch_hole path. As such, the caller |
| 879 | * must hold locks protecting against concurrent modifications of the |
| 880 | * page cache (usually fs-private i_mmap_sem for writing). Since the |
| 881 | * caller has seen a DAX entry for this index, we better find it |
| 882 | * at that index as well... |
| 883 | */ |
| 884 | WARN_ON_ONCE(!ret); |
| 885 | return ret; |
| 886 | } |
| 887 | |
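/*
 * Delete all DAX entries covering the byte range [start, end] of @mapping,
 * waiting for each entry to be unlocked before removing it.
 */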
| 888 | void dax_delete_mapping_range(struct address_space *mapping, |
| 889 | loff_t start, loff_t end) |
| 890 | { |
| 891 | void *entry; |
| 892 | pgoff_t start_idx = start >> PAGE_SHIFT; |
| 893 | pgoff_t end_idx; |
| 894 | XA_STATE(xas, &mapping->i_pages, start_idx); |
| 895 | |
| 896 | /* If end == LLONG_MAX, all pages from start to till end of file */ |
| 897 | if (end == LLONG_MAX) |
| 898 | end_idx = ULONG_MAX; |
| 899 | else |
| 900 | end_idx = end >> PAGE_SHIFT; |
| 901 | |
| 902 | xas_lock_irq(&xas); |
| 903 | xas_for_each(&xas, entry, end_idx) { |
| 904 | if (!xa_is_value(entry)) |
| 905 | continue; |
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;
		dax_disassociate_entry(entry, mapping, true);
		xas_store(&xas, NULL);
		mapping->nrpages -= 1UL << dax_entry_order(entry);
		put_unlocked_entry(&xas, entry, WAKE_ALL);
| 913 | } |
| 914 | xas_unlock_irq(&xas); |
| 915 | } |
| 916 | EXPORT_SYMBOL_GPL(dax_delete_mapping_range); |
| 917 | |
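/*
 * Wait for all references to @page to be dropped. @cb is run in place of a
 * plain schedule() so that the filesystem can, for example, drop and
 * re-acquire its locks while sleeping.
 */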
| 918 | static int wait_page_idle(struct page *page, |
| 919 | void (cb)(struct inode *), |
| 920 | struct inode *inode) |
| 921 | { |
| 922 | return ___wait_var_event(page, dax_page_is_idle(page), |
| 923 | TASK_INTERRUPTIBLE, 0, 0, cb(inode)); |
| 924 | } |
| 925 | |
| 926 | static void wait_page_idle_uninterruptible(struct page *page, |
| 927 | struct inode *inode) |
| 928 | { |
| 929 | ___wait_var_event(page, dax_page_is_idle(page), |
| 930 | TASK_UNINTERRUPTIBLE, 0, 0, schedule()); |
| 931 | } |
| 932 | |
| 933 | /* |
| 934 | * Unmaps the inode and waits for any DMA to complete prior to deleting the |
| 935 | * DAX mapping entries for the range. |
| 936 | * |
 * For NOWAIT behavior, pass @cb as NULL to early-exit on the first busy page
 * found.
| 939 | */ |
| 940 | int dax_break_layout(struct inode *inode, loff_t start, loff_t end, |
| 941 | void (cb)(struct inode *)) |
| 942 | { |
| 943 | struct page *page; |
| 944 | int error = 0; |
| 945 | |
	if (!dax_mapping(inode->i_mapping))
| 947 | return 0; |
| 948 | |
| 949 | do { |
| 950 | page = dax_layout_busy_page_range(inode->i_mapping, start, end); |
| 951 | if (!page) |
| 952 | break; |
| 953 | if (!cb) { |
| 954 | error = -ERESTARTSYS; |
| 955 | break; |
| 956 | } |
| 957 | |
| 958 | error = wait_page_idle(page, cb, inode); |
| 959 | } while (error == 0); |
| 960 | |
| 961 | if (!page) |
| 962 | dax_delete_mapping_range(inode->i_mapping, start, end); |
| 963 | |
| 964 | return error; |
| 965 | } |
| 966 | EXPORT_SYMBOL_GPL(dax_break_layout); |
| 967 | |
| 968 | void dax_break_layout_final(struct inode *inode) |
| 969 | { |
| 970 | struct page *page; |
| 971 | |
	if (!dax_mapping(inode->i_mapping))
| 973 | return; |
| 974 | |
| 975 | do { |
| 976 | page = dax_layout_busy_page_range(inode->i_mapping, 0, |
| 977 | LLONG_MAX); |
| 978 | if (!page) |
| 979 | break; |
| 980 | |
| 981 | wait_page_idle_uninterruptible(page, inode); |
| 982 | } while (true); |
| 983 | |
| 984 | if (!page) |
| 985 | dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); |
| 986 | } |
| 987 | EXPORT_SYMBOL_GPL(dax_break_layout_final); |
| 988 | |
| 989 | /* |
| 990 | * Invalidate DAX entry if it is clean. |
| 991 | */ |
| 992 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, |
| 993 | pgoff_t index) |
| 994 | { |
	return __dax_invalidate_entry(mapping, index, false);
| 996 | } |
| 997 | |
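/* Convert a file position within @iomap into a pgoff in the DAX device. */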
| 998 | static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) |
| 999 | { |
| 1000 | return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); |
| 1001 | } |
| 1002 | |
| 1003 | static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) |
| 1004 | { |
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
| 1006 | void *vto, *kaddr; |
| 1007 | long rc; |
| 1008 | int id; |
| 1009 | |
| 1010 | id = dax_read_lock(); |
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
			       &kaddr, NULL);
| 1013 | if (rc < 0) { |
| 1014 | dax_read_unlock(id); |
| 1015 | return rc; |
| 1016 | } |
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
| 1019 | kunmap_atomic(vto); |
| 1020 | dax_read_unlock(id); |
| 1021 | return 0; |
| 1022 | } |
| 1023 | |
| 1024 | /* |
| 1025 | * MAP_SYNC on a dax mapping guarantees dirty metadata is |
| 1026 | * flushed on write-faults (non-cow), but not read-faults. |
| 1027 | */ |
| 1028 | static bool dax_fault_is_synchronous(const struct iomap_iter *iter, |
| 1029 | struct vm_area_struct *vma) |
| 1030 | { |
| 1031 | return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && |
| 1032 | (iter->iomap.flags & IOMAP_F_DIRTY); |
| 1033 | } |
| 1034 | |
| 1035 | /* |
| 1036 | * By this point grab_mapping_entry() has ensured that we have a locked entry |
| 1037 | * of the appropriate size so we don't have to worry about downgrading PMDs to |
| 1038 | * PTEs. If we happen to be trying to insert a PTE and there is a PMD |
| 1039 | * already in the tree, we will skip the insertion and just dirty the PMD as |
| 1040 | * appropriate. |
| 1041 | */ |
| 1042 | static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, |
| 1043 | const struct iomap_iter *iter, void *entry, unsigned long pfn, |
| 1044 | unsigned long flags) |
| 1045 | { |
| 1046 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 1047 | void *new_entry = dax_make_entry(pfn, flags); |
| 1048 | bool write = iter->flags & IOMAP_WRITE; |
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
| 1050 | bool shared = iter->iomap.flags & IOMAP_F_SHARED; |
| 1051 | |
| 1052 | if (dirty) |
| 1053 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| 1054 | |
| 1055 | if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { |
| 1056 | unsigned long index = xas->xa_index; |
| 1057 | /* we are replacing a zero page with block mapping */ |
| 1058 | if (dax_is_pmd_entry(entry)) |
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
| 1063 | } |
| 1064 | |
| 1065 | xas_reset(xas); |
| 1066 | xas_lock_irq(xas); |
| 1067 | if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
| 1068 | void *old; |
| 1069 | |
		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma,
				vmf->address, shared);
| 1073 | |
| 1074 | /* |
| 1075 | * Only swap our new entry into the page cache if the current |
| 1076 | * entry is a zero page or an empty entry. If a normal PTE or |
| 1077 | * PMD entry is already in the cache, we leave it alone. This |
| 1078 | * means that if we are trying to insert a PTE and the |
| 1079 | * existing entry is a PMD, we will just leave the PMD in the |
| 1080 | * tree and dirty it if necessary. |
| 1081 | */ |
		old = dax_lock_entry(xas, new_entry);
| 1083 | WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | |
| 1084 | DAX_LOCKED)); |
| 1085 | entry = new_entry; |
| 1086 | } else { |
| 1087 | xas_load(xas); /* Walk the xa_state */ |
| 1088 | } |
| 1089 | |
| 1090 | if (dirty) |
| 1091 | xas_set_mark(xas, PAGECACHE_TAG_DIRTY); |
| 1092 | |
| 1093 | if (write && shared) |
| 1094 | xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); |
| 1095 | |
| 1096 | xas_unlock_irq(xas); |
| 1097 | return entry; |
| 1098 | } |
| 1099 | |
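/*
 * Flush a single dirty DAX entry: write-protect all userspace mappings of the
 * range, flush the CPU caches for the backing pfns, and finally clear the
 * dirty tag. The entry is kept locked while this happens to serialise against
 * page faults.
 */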
| 1100 | static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, |
| 1101 | struct address_space *mapping, void *entry) |
| 1102 | { |
| 1103 | unsigned long pfn, index, count, end; |
| 1104 | long ret = 0; |
| 1105 | struct vm_area_struct *vma; |
| 1106 | |
| 1107 | /* |
| 1108 | * A page got tagged dirty in DAX mapping? Something is seriously |
| 1109 | * wrong. |
| 1110 | */ |
| 1111 | if (WARN_ON(!xa_is_value(entry))) |
| 1112 | return -EIO; |
| 1113 | |
| 1114 | if (unlikely(dax_is_locked(entry))) { |
| 1115 | void *old_entry = entry; |
| 1116 | |
		entry = get_next_unlocked_entry(xas, 0);
| 1118 | |
| 1119 | /* Entry got punched out / reallocated? */ |
| 1120 | if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
| 1121 | goto put_unlocked; |
| 1122 | /* |
| 1123 | * Entry got reallocated elsewhere? No need to writeback. |
| 1124 | * We have to compare pfns as we must not bail out due to |
| 1125 | * difference in lockbit or entry type. |
| 1126 | */ |
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
| 1128 | goto put_unlocked; |
| 1129 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
| 1130 | dax_is_zero_entry(entry))) { |
| 1131 | ret = -EIO; |
| 1132 | goto put_unlocked; |
| 1133 | } |
| 1134 | |
| 1135 | /* Another fsync thread may have already done this entry */ |
| 1136 | if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) |
| 1137 | goto put_unlocked; |
| 1138 | } |
| 1139 | |
| 1140 | /* Lock the entry to serialize with page faults */ |
| 1141 | dax_lock_entry(xas, entry); |
| 1142 | |
| 1143 | /* |
| 1144 | * We can clear the tag now but we have to be careful so that concurrent |
| 1145 | * dax_writeback_one() calls for the same index cannot finish before we |
| 1146 | * actually flush the caches. This is achieved as the calls will look |
| 1147 | * at the entry only under the i_pages lock and once they do that |
| 1148 | * they will see the entry locked and wait for it to unlock. |
| 1149 | */ |
| 1150 | xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); |
| 1151 | xas_unlock_irq(xas); |
| 1152 | |
| 1153 | /* |
| 1154 | * If dax_writeback_mapping_range() was given a wbc->range_start |
| 1155 | * in the middle of a PMD, the 'index' we use needs to be |
| 1156 | * aligned to the start of the PMD. |
| 1157 | * This allows us to flush for PMD_SIZE and not have to worry about |
| 1158 | * partial PMD writebacks. |
| 1159 | */ |
| 1160 | pfn = dax_to_pfn(entry); |
| 1161 | count = 1UL << dax_entry_order(entry); |
| 1162 | index = xas->xa_index & ~(count - 1); |
| 1163 | end = index + count - 1; |
| 1164 | |
| 1165 | /* Walk all mappings of a given index of a file and writeprotect them */ |
| 1166 | i_mmap_lock_read(mapping); |
| 1167 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { |
		pfn_mkclean_range(pfn, count, index, vma);
| 1169 | cond_resched(); |
| 1170 | } |
| 1171 | i_mmap_unlock_read(mapping); |
| 1172 | |
	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
| 1174 | /* |
| 1175 | * After we have flushed the cache, we can clear the dirty tag. There |
| 1176 | * cannot be new dirty data in the pfn after the flush has completed as |
| 1177 | * the pfn mappings are writeprotected and fault waits for mapping |
| 1178 | * entry lock. |
| 1179 | */ |
| 1180 | xas_reset(xas); |
| 1181 | xas_lock_irq(xas); |
| 1182 | xas_store(xas, entry); |
| 1183 | xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); |
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
| 1187 | return ret; |
| 1188 | |
| 1189 | put_unlocked: |
	put_unlocked_entry(xas, entry, WAKE_NEXT);
| 1191 | return ret; |
| 1192 | } |
| 1193 | |
| 1194 | /* |
| 1195 | * Flush the mapping to the persistent domain within the byte range of [start, |
| 1196 | * end]. This is required by data integrity operations to ensure file data is |
| 1197 | * on persistent storage prior to completion of the operation. |
| 1198 | */ |
| 1199 | int dax_writeback_mapping_range(struct address_space *mapping, |
| 1200 | struct dax_device *dax_dev, struct writeback_control *wbc) |
| 1201 | { |
| 1202 | XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); |
| 1203 | struct inode *inode = mapping->host; |
| 1204 | pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; |
| 1205 | void *entry; |
| 1206 | int ret = 0; |
| 1207 | unsigned int scanned = 0; |
| 1208 | |
| 1209 | if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) |
| 1210 | return -EIO; |
| 1211 | |
| 1212 | if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) |
| 1213 | return 0; |
| 1214 | |
	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);
| 1218 | |
| 1219 | xas_lock_irq(&xas); |
| 1220 | xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { |
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
| 1224 | break; |
| 1225 | } |
| 1226 | if (++scanned % XA_CHECK_SCHED) |
| 1227 | continue; |
| 1228 | |
| 1229 | xas_pause(&xas); |
| 1230 | xas_unlock_irq(&xas); |
| 1231 | cond_resched(); |
| 1232 | xas_lock_irq(&xas); |
| 1233 | } |
| 1234 | xas_unlock_irq(&xas); |
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
| 1236 | return ret; |
| 1237 | } |
| 1238 | EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); |
| 1239 | |
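/*
 * Translate @pos/@size within @iomap via dax_direct_access(), optionally
 * returning the kernel address and/or pfn. Fails with -EINVAL if the mapped
 * extent is shorter than @size or the pfn is not aligned to @size.
 */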
| 1240 | static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, |
| 1241 | size_t size, void **kaddr, unsigned long *pfnp) |
| 1242 | { |
| 1243 | pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); |
| 1244 | int id, rc = 0; |
| 1245 | long length; |
| 1246 | |
| 1247 | id = dax_read_lock(); |
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
| 1250 | if (length < 0) { |
| 1251 | rc = length; |
| 1252 | goto out; |
| 1253 | } |
| 1254 | if (!pfnp) |
| 1255 | goto out_check_addr; |
| 1256 | rc = -EINVAL; |
| 1257 | if (PFN_PHYS(length) < size) |
| 1258 | goto out; |
| 1259 | if (*pfnp & (PHYS_PFN(size)-1)) |
| 1260 | goto out; |
| 1261 | |
| 1262 | rc = 0; |
| 1263 | |
| 1264 | out_check_addr: |
| 1265 | if (!kaddr) |
| 1266 | goto out; |
| 1267 | if (!*kaddr) |
| 1268 | rc = -EFAULT; |
| 1269 | out: |
| 1270 | dax_read_unlock(id); |
| 1271 | return rc; |
| 1272 | } |
| 1273 | |
| 1274 | /** |
| 1275 | * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page |
| 1276 | * by copying the data before and after the range to be written. |
| 1277 | * @pos: address to do copy from. |
| 1278 | * @length: size of copy operation. |
| 1279 | * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) |
| 1280 | * @srcmap: iomap srcmap |
| 1281 | * @daddr: destination address to copy to. |
| 1282 | * |
 * This can be called from two places: during a DAX write fault (page
 * aligned), to copy @length bytes of data to @daddr; or during a normal DAX
 * write operation, where dax_iomap_iter() might call it to copy the unaligned
 * start or end of the range. In the latter case, the copy of the aligned
 * ranges is taken care of by dax_iomap_iter() itself.
| 1288 | * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the |
| 1289 | * area to make sure no old data remains. |
| 1290 | */ |
| 1291 | static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, |
| 1292 | const struct iomap *srcmap, void *daddr) |
| 1293 | { |
| 1294 | loff_t head_off = pos & (align_size - 1); |
| 1295 | size_t size = ALIGN(head_off + length, align_size); |
| 1296 | loff_t end = pos + length; |
| 1297 | loff_t pg_end = round_up(end, align_size); |
| 1298 | /* copy_all is usually in page fault case */ |
| 1299 | bool copy_all = head_off == 0 && end == pg_end; |
| 1300 | /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ |
| 1301 | bool zero_edge = srcmap->flags & IOMAP_F_SHARED || |
| 1302 | srcmap->type == IOMAP_UNWRITTEN; |
| 1303 | void *saddr = NULL; |
| 1304 | int ret = 0; |
| 1305 | |
| 1306 | if (!zero_edge) { |
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
| 1310 | } |
| 1311 | |
| 1312 | if (copy_all) { |
| 1313 | if (zero_edge) |
| 1314 | memset(daddr, 0, size); |
| 1315 | else |
			ret = copy_mc_to_kernel(daddr, saddr, length);
| 1317 | goto out; |
| 1318 | } |
| 1319 | |
| 1320 | /* Copy the head part of the range */ |
| 1321 | if (head_off) { |
| 1322 | if (zero_edge) |
| 1323 | memset(daddr, 0, head_off); |
| 1324 | else { |
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
| 1326 | if (ret) |
| 1327 | return -EIO; |
| 1328 | } |
| 1329 | } |
| 1330 | |
| 1331 | /* Copy the tail part of the range */ |
| 1332 | if (end < pg_end) { |
| 1333 | loff_t tail_off = head_off + length; |
| 1334 | loff_t tail_len = pg_end - end; |
| 1335 | |
| 1336 | if (zero_edge) |
| 1337 | memset(daddr + tail_off, 0, tail_len); |
| 1338 | else { |
			ret = copy_mc_to_kernel(daddr + tail_off,
					saddr + tail_off, tail_len);
| 1341 | if (ret) |
| 1342 | return -EIO; |
| 1343 | } |
| 1344 | } |
| 1345 | out: |
| 1346 | if (zero_edge) |
		dax_flush(srcmap->dax_dev, daddr, size);
| 1348 | return ret ? -EIO : 0; |
| 1349 | } |
| 1350 | |
| 1351 | /* |
| 1352 | * The user has performed a load from a hole in the file. Allocating a new |
| 1353 | * page in the file would cause excessive storage usage for workloads with |
| 1354 | * sparse files. Instead we insert a read-only mapping of the 4k zero page. |
| 1355 | * If this page is ever written to we will re-fault and change the mapping to |
| 1356 | * point to real DAX storage instead. |
| 1357 | */ |
| 1358 | static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, |
| 1359 | const struct iomap_iter *iter, void **entry) |
| 1360 | { |
| 1361 | struct inode *inode = iter->inode; |
| 1362 | unsigned long vaddr = vmf->address; |
	unsigned long pfn = my_zero_pfn(vaddr);
| 1364 | vm_fault_t ret; |
| 1365 | |
	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
	trace_dax_load_hole(inode, vmf, ret);
| 1370 | return ret; |
| 1371 | } |
| 1372 | |
| 1373 | #ifdef CONFIG_FS_DAX_PMD |
| 1374 | static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, |
| 1375 | const struct iomap_iter *iter, void **entry) |
| 1376 | { |
| 1377 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 1378 | struct inode *inode = mapping->host; |
| 1379 | struct folio *zero_folio; |
| 1380 | vm_fault_t ret; |
| 1381 | |
	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

	if (unlikely(!zero_folio)) {
		trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
		return VM_FAULT_FALLBACK;
	}

	*entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
				  DAX_PMD | DAX_ZERO_PAGE);

	ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
	if (ret == VM_FAULT_NOPAGE)
		trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
| 1395 | return ret; |
| 1396 | } |
| 1397 | #else |
| 1398 | static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, |
| 1399 | const struct iomap_iter *iter, void **entry) |
| 1400 | { |
| 1401 | return VM_FAULT_FALLBACK; |
| 1402 | } |
| 1403 | #endif /* CONFIG_FS_DAX_PMD */ |
| 1404 | |
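/*
 * Break block sharing for the current iomap iteration by copying the source
 * extent into the newly allocated destination blocks, expanding the copy to
 * full page boundaries.
 */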
| 1405 | static int dax_unshare_iter(struct iomap_iter *iter) |
| 1406 | { |
| 1407 | struct iomap *iomap = &iter->iomap; |
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
| 1409 | loff_t copy_pos = iter->pos; |
| 1410 | u64 copy_len = iomap_length(iter); |
| 1411 | u32 mod; |
| 1412 | int id = 0; |
| 1413 | s64 ret; |
| 1414 | void *daddr = NULL, *saddr = NULL; |
| 1415 | |
| 1416 | if (!iomap_want_unshare_iter(iter)) |
| 1417 | return iomap_iter_advance_full(iter); |
| 1418 | |
| 1419 | /* |
| 1420 | * Extend the file range to be aligned to fsblock/pagesize, because |
| 1421 | * we need to copy entire blocks, not just the byte range specified. |
| 1422 | * Invalidate the mapping because we're about to CoW. |
| 1423 | */ |
| 1424 | mod = offset_in_page(copy_pos); |
| 1425 | if (mod) { |
| 1426 | copy_len += mod; |
| 1427 | copy_pos -= mod; |
| 1428 | } |
| 1429 | |
| 1430 | mod = offset_in_page(copy_pos + copy_len); |
| 1431 | if (mod) |
| 1432 | copy_len += PAGE_SIZE - mod; |
| 1433 | |
	invalidate_inode_pages2_range(iter->inode->i_mapping,
				      copy_pos >> PAGE_SHIFT,
				      (copy_pos + copy_len - 1) >> PAGE_SHIFT);
| 1437 | |
| 1438 | id = dax_read_lock(); |
	ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
| 1440 | if (ret < 0) |
| 1441 | goto out_unlock; |
| 1442 | |
	ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
| 1444 | if (ret < 0) |
| 1445 | goto out_unlock; |
| 1446 | |
	if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
| 1448 | ret = -EIO; |
| 1449 | |
| 1450 | out_unlock: |
| 1451 | dax_read_unlock(id); |
| 1452 | if (ret < 0) |
		return dax_mem2blk_err(ret);
| 1454 | return iomap_iter_advance_full(iter); |
| 1455 | } |
| 1456 | |
| 1457 | int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, |
| 1458 | const struct iomap_ops *ops) |
| 1459 | { |
| 1460 | struct iomap_iter iter = { |
| 1461 | .inode = inode, |
| 1462 | .pos = pos, |
| 1463 | .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, |
| 1464 | }; |
| 1465 | loff_t size = i_size_read(inode); |
| 1466 | int ret; |
| 1467 | |
| 1468 | if (pos < 0 || pos >= size) |
| 1469 | return 0; |
| 1470 | |
| 1471 | iter.len = min(len, size - pos); |
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = dax_unshare_iter(&iter);
| 1474 | return ret; |
| 1475 | } |
| 1476 | EXPORT_SYMBOL_GPL(dax_file_unshare); |
| 1477 | |
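/*
 * Zero a sub-page range through the DAX kernel mapping. For shared (CoW)
 * extents the surrounding bytes are copied over from the source extent as
 * well; otherwise only the zeroed range is flushed.
 */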
| 1478 | static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) |
| 1479 | { |
| 1480 | const struct iomap *iomap = &iter->iomap; |
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
| 1482 | unsigned offset = offset_in_page(pos); |
| 1483 | pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); |
| 1484 | void *kaddr; |
| 1485 | long ret; |
| 1486 | |
	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);
| 1491 | |
| 1492 | memset(kaddr + offset, 0, size); |
| 1493 | if (iomap->flags & IOMAP_F_SHARED) |
		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					    kaddr);
	else
		dax_flush(iomap->dax_dev, kaddr + offset, size);
| 1498 | return ret; |
| 1499 | } |
| 1500 | |
| 1501 | static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) |
| 1502 | { |
| 1503 | const struct iomap *iomap = &iter->iomap; |
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
| 1505 | u64 length = iomap_length(iter); |
| 1506 | int ret; |
| 1507 | |
| 1508 | /* already zeroed? we're done. */ |
| 1509 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
		return iomap_iter_advance(iter, length);
| 1511 | |
| 1512 | /* |
| 1513 | * invalidate the pages whose sharing state is to be changed |
| 1514 | * because of CoW. |
| 1515 | */ |
| 1516 | if (iomap->flags & IOMAP_F_SHARED) |
		invalidate_inode_pages2_range(iter->inode->i_mapping,
				iter->pos >> PAGE_SHIFT,
				(iter->pos + length - 1) >> PAGE_SHIFT);
| 1520 | |
| 1521 | do { |
| 1522 | loff_t pos = iter->pos; |
| 1523 | unsigned offset = offset_in_page(pos); |
| 1524 | pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); |
| 1525 | int id; |
| 1526 | |
| 1527 | length = min_t(u64, PAGE_SIZE - offset, length); |
| 1528 | |
| 1529 | id = dax_read_lock(); |
| 1530 | if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) |
			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			ret = dax_memzero(iter, pos, length);
| 1534 | dax_read_unlock(id); |
| 1535 | |
| 1536 | if (ret < 0) |
| 1537 | return ret; |
| 1538 | |
| 1539 | ret = iomap_iter_advance(iter, length); |
| 1540 | if (ret) |
| 1541 | return ret; |
| 1542 | } while ((length = iomap_length(iter)) > 0); |
| 1543 | |
| 1544 | if (did_zero) |
| 1545 | *did_zero = true; |
| 1546 | return ret; |
| 1547 | } |
| 1548 | |
| 1549 | int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, |
| 1550 | const struct iomap_ops *ops) |
| 1551 | { |
| 1552 | struct iomap_iter iter = { |
| 1553 | .inode = inode, |
| 1554 | .pos = pos, |
| 1555 | .len = len, |
| 1556 | .flags = IOMAP_DAX | IOMAP_ZERO, |
| 1557 | }; |
| 1558 | int ret; |
| 1559 | |
| 1560 | while ((ret = iomap_iter(&iter, ops)) > 0) |
| 1561 | iter.status = dax_zero_iter(&iter, did_zero); |
| 1562 | return ret; |
| 1563 | } |
| 1564 | EXPORT_SYMBOL_GPL(dax_zero_range); |
| 1565 | |
| 1566 | int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, |
| 1567 | const struct iomap_ops *ops) |
| 1568 | { |
| 1569 | unsigned int blocksize = i_blocksize(inode); |
| 1570 | unsigned int off = pos & (blocksize - 1); |
| 1571 | |
| 1572 | /* Block boundary? Nothing to do */ |
| 1573 | if (!off) |
| 1574 | return 0; |
| 1575 | return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); |
| 1576 | } |
| 1577 | EXPORT_SYMBOL_GPL(dax_truncate_page); |
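| | /* |
| | * Example (sketch): filesystems typically call this from their truncate or |
| | * setattr path to zero the tail of the partial block at the new EOF. |
| | * "foo_iomap_ops" is an illustrative name, not a symbol defined here: |
| | * |
| | *	error = dax_truncate_page(inode, newsize, &did_zero, &foo_iomap_ops); |
| | */ |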
| 1578 | |
| 1579 | static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) |
| 1580 | { |
| 1581 | const struct iomap *iomap = &iomi->iomap; |
| 1582 | const struct iomap *srcmap = iomap_iter_srcmap(iomi); |
| 1583 | loff_t length = iomap_length(iomi); |
| 1584 | loff_t pos = iomi->pos; |
| 1585 | struct dax_device *dax_dev = iomap->dax_dev; |
| 1586 | loff_t end = pos + length, done = 0; |
| 1587 | bool write = iov_iter_rw(iter) == WRITE; |
| 1588 | bool cow = write && iomap->flags & IOMAP_F_SHARED; |
| 1589 | ssize_t ret = 0; |
| 1590 | size_t xfer; |
| 1591 | int id; |
| 1592 | |
| 1593 | if (!write) { |
| 1594 | end = min(end, i_size_read(iomi->inode)); |
| 1595 | if (pos >= end) |
| 1596 | return 0; |
| 1597 | |
| 1598 | if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { |
| 1599 | done = iov_iter_zero(min(length, end - pos), iter); |
| 1600 | return iomap_iter_advance(iomi, done); |
| 1601 | } |
| 1602 | } |
| 1603 | |
| 1604 | /* |
| 1605 | * In DAX mode, enforce either pure overwrites of written extents, or |
| 1606 | * writes to unwritten extents as part of a copy-on-write operation. |
| 1607 | */ |
| 1608 | if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && |
| 1609 | !(iomap->flags & IOMAP_F_SHARED))) |
| 1610 | return -EIO; |
| 1611 | |
| 1612 | /* |
| 1613 | * A write can allocate a block for an area which has a hole page mapped |
| 1614 | * into the page tables. We have to tear down these mappings so that data |
| 1615 | * written by write(2) is visible in mmap. |
| 1616 | */ |
| 1617 | if (iomap->flags & IOMAP_F_NEW || cow) { |
| 1618 | /* |
| 1619 | * The filesystem allows CoW on non-shared extents. The source |
| 1620 | * extents may have been mmapped and marked dirty before. To be |
| 1621 | * able to invalidate their dax entries, we need to clear the |
| 1622 | * dirty mark in advance. |
| 1623 | */ |
| 1624 | if (cow) |
| 1625 | __dax_clear_dirty_range(iomi->inode->i_mapping, |
| 1626 | pos >> PAGE_SHIFT, |
| 1627 | (end - 1) >> PAGE_SHIFT); |
| 1628 | invalidate_inode_pages2_range(iomi->inode->i_mapping, |
| 1629 | pos >> PAGE_SHIFT, |
| 1630 | (end - 1) >> PAGE_SHIFT); |
| 1631 | } |
| 1632 | |
| 1633 | id = dax_read_lock(); |
| 1634 | while ((pos = iomi->pos) < end) { |
| 1635 | unsigned offset = pos & (PAGE_SIZE - 1); |
| 1636 | const size_t size = ALIGN(length + offset, PAGE_SIZE); |
| 1637 | pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); |
| 1638 | ssize_t map_len; |
| 1639 | bool recovery = false; |
| 1640 | void *kaddr; |
| 1641 | |
| 1642 | if (fatal_signal_pending(current)) { |
| 1643 | ret = -EINTR; |
| 1644 | break; |
| 1645 | } |
| 1646 | |
| 1647 | map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), |
| 1648 | DAX_ACCESS, &kaddr, NULL); |
| 1649 | if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { |
| 1650 | map_len = dax_direct_access(dax_dev, pgoff, |
| 1651 | PHYS_PFN(size), DAX_RECOVERY_WRITE, |
| 1652 | &kaddr, NULL); |
| 1653 | if (map_len > 0) |
| 1654 | recovery = true; |
| 1655 | } |
| 1656 | if (map_len < 0) { |
| 1657 | ret = dax_mem2blk_err(map_len); |
| 1658 | break; |
| 1659 | } |
| 1660 | |
| 1661 | if (cow) { |
| 1662 | ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, |
| 1663 | srcmap, kaddr); |
| 1664 | if (ret) |
| 1665 | break; |
| 1666 | } |
| 1667 | |
| 1668 | map_len = PFN_PHYS(map_len); |
| 1669 | kaddr += offset; |
| 1670 | map_len -= offset; |
| 1671 | if (map_len > end - pos) |
| 1672 | map_len = end - pos; |
| 1673 | |
| 1674 | if (recovery) |
| 1675 | xfer = dax_recovery_write(dax_dev, pgoff, kaddr, |
| 1676 | map_len, iter); |
| 1677 | else if (write) |
| 1678 | xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, |
| 1679 | map_len, iter); |
| 1680 | else |
| 1681 | xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, |
| 1682 | map_len, iter); |
| 1683 | |
| 1684 | ret = iomap_iter_advance(iomi, xfer); |
| 1685 | if (!ret && xfer == 0) |
| 1686 | ret = -EFAULT; |
| 1687 | if (xfer < map_len) |
| 1688 | break; |
| 1689 | length = iomap_length(iomi); |
| 1690 | } |
| 1691 | dax_read_unlock(id); |
| 1692 | |
| 1693 | return ret; |
| 1694 | } |
| 1695 | |
| 1696 | /** |
| 1697 | * dax_iomap_rw - Perform I/O to a DAX file |
| 1698 | * @iocb: The control block for this I/O |
| 1699 | * @iter: The addresses to do I/O from or to |
| 1700 | * @ops: iomap ops passed from the file system |
| 1701 | * |
| 1702 | * This function performs read and write operations to directly mapped |
| 1703 | * persistent memory. The caller needs to take care of read/write exclusion |
| 1704 | * and evicting any page cache pages in the region under I/O. |
| 1705 | */ |
| 1706 | ssize_t |
| 1707 | dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, |
| 1708 | const struct iomap_ops *ops) |
| 1709 | { |
| 1710 | struct iomap_iter iomi = { |
| 1711 | .inode = iocb->ki_filp->f_mapping->host, |
| 1712 | .pos = iocb->ki_pos, |
| 1713 | .len = iov_iter_count(iter), |
| 1714 | .flags = IOMAP_DAX, |
| 1715 | }; |
| 1716 | loff_t done = 0; |
| 1717 | int ret; |
| 1718 | |
| 1719 | if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) |
| 1720 | return -EIO; |
| 1721 | |
| 1722 | if (!iomi.len) |
| 1723 | return 0; |
| 1724 | |
| 1725 | if (iov_iter_rw(iter) == WRITE) { |
| 1726 | lockdep_assert_held_write(&iomi.inode->i_rwsem); |
| 1727 | iomi.flags |= IOMAP_WRITE; |
| 1728 | } else if (!sb_rdonly(iomi.inode->i_sb)) { |
| 1729 | lockdep_assert_held(&iomi.inode->i_rwsem); |
| 1730 | } |
| 1731 | |
| 1732 | if (iocb->ki_flags & IOCB_NOWAIT) |
| 1733 | iomi.flags |= IOMAP_NOWAIT; |
| 1734 | |
| 1735 | while ((ret = iomap_iter(&iomi, ops)) > 0) |
| 1736 | iomi.status = dax_iomap_iter(&iomi, iter); |
| 1737 | |
| 1738 | done = iomi.pos - iocb->ki_pos; |
| 1739 | iocb->ki_pos = iomi.pos; |
| 1740 | return done ? done : ret; |
| 1741 | } |
| 1742 | EXPORT_SYMBOL_GPL(dax_iomap_rw); |
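| | /* |
| | * Example (sketch): a filesystem's DAX read path might wrap dax_iomap_rw() |
| | * as below. The locking shown and "foo_iomap_ops" are illustrative |
| | * assumptions; each filesystem supplies its own iomap_ops and exclusion. |
| | * |
| | *	static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) |
| | *	{ |
| | *		struct inode *inode = file_inode(iocb->ki_filp); |
| | *		ssize_t ret; |
| | * |
| | *		inode_lock_shared(inode); |
| | *		ret = dax_iomap_rw(iocb, to, &foo_iomap_ops); |
| | *		inode_unlock_shared(inode); |
| | * |
| | *		file_accessed(iocb->ki_filp); |
| | *		return ret; |
| | *	} |
| | */ |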
| 1743 | |
| 1744 | static vm_fault_t dax_fault_return(int error) |
| 1745 | { |
| 1746 | if (error == 0) |
| 1747 | return VM_FAULT_NOPAGE; |
| 1748 | return vmf_error(error); |
| 1749 | } |
| 1750 | |
| 1751 | /* |
| 1752 | * When handling a synchronous page fault and the inode needs an fsync, we |
| 1753 | * can insert the PTE/PMD into the page tables only after that fsync has |
| 1754 | * happened. Skip the insertion for now and return the pfn so that the caller |
| 1755 | * can insert it after the fsync is done. |
| 1756 | */ |
| 1757 | static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp, |
| 1758 | unsigned long pfn) |
| 1759 | { |
| 1760 | if (WARN_ON_ONCE(!pfnp)) |
| 1761 | return VM_FAULT_SIGBUS; |
| 1762 | *pfnp = pfn; |
| 1763 | return VM_FAULT_NEEDDSYNC; |
| 1764 | } |
| 1765 | |
| 1766 | static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, |
| 1767 | const struct iomap_iter *iter) |
| 1768 | { |
| 1769 | vm_fault_t ret; |
| 1770 | int error = 0; |
| 1771 | |
| 1772 | switch (iter->iomap.type) { |
| 1773 | case IOMAP_HOLE: |
| 1774 | case IOMAP_UNWRITTEN: |
| 1775 | clear_user_highpage(vmf->cow_page, vmf->address); |
| 1776 | break; |
| 1777 | case IOMAP_MAPPED: |
| 1778 | error = copy_cow_page_dax(vmf, iter); |
| 1779 | break; |
| 1780 | default: |
| 1781 | WARN_ON_ONCE(1); |
| 1782 | error = -EIO; |
| 1783 | break; |
| 1784 | } |
| 1785 | |
| 1786 | if (error) |
| 1787 | return dax_fault_return(error); |
| 1788 | |
| 1789 | __SetPageUptodate(vmf->cow_page); |
| 1790 | ret = finish_fault(vmf); |
| 1791 | if (!ret) |
| 1792 | return VM_FAULT_DONE_COW; |
| 1793 | return ret; |
| 1794 | } |
| 1795 | |
| 1796 | /** |
| 1797 | * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. |
| 1798 | * @vmf: vm fault instance |
| 1799 | * @iter: iomap iter |
| 1800 | * @pfnp: pfn to be returned |
| 1801 | * @xas: the dax mapping tree of a file |
| 1802 | * @entry: an unlocked dax entry to be inserted |
| 1803 | * @pmd: distinguish whether it is a pmd fault |
| 1804 | */ |
| 1805 | static vm_fault_t dax_fault_iter(struct vm_fault *vmf, |
| 1806 | const struct iomap_iter *iter, unsigned long *pfnp, |
| 1807 | struct xa_state *xas, void **entry, bool pmd) |
| 1808 | { |
| 1809 | const struct iomap *iomap = &iter->iomap; |
| 1810 | const struct iomap *srcmap = iomap_iter_srcmap(iter); |
| 1811 | size_t size = pmd ? PMD_SIZE : PAGE_SIZE; |
| 1812 | loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; |
| 1813 | bool write = iter->flags & IOMAP_WRITE; |
| 1814 | unsigned long entry_flags = pmd ? DAX_PMD : 0; |
| 1815 | struct folio *folio; |
| 1816 | int ret, err = 0; |
| 1817 | unsigned long pfn; |
| 1818 | void *kaddr; |
| 1819 | |
| 1820 | if (!pmd && vmf->cow_page) |
| 1821 | return dax_fault_cow_page(vmf, iter); |
| 1822 | |
| 1823 | /* If we are reading from an unwritten extent or a hole, return a hole. */ |
| 1824 | if (!write && |
| 1825 | (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { |
| 1826 | if (!pmd) |
| 1827 | return dax_load_hole(xas, vmf, iter, entry); |
| 1828 | return dax_pmd_load_hole(xas, vmf, iter, entry); |
| 1829 | } |
| 1830 | |
| 1831 | if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { |
| 1832 | WARN_ON_ONCE(1); |
| 1833 | return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; |
| 1834 | } |
| 1835 | |
| 1836 | err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); |
| 1837 | if (err) |
| 1838 | return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); |
| 1839 | |
| 1840 | *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); |
| 1841 | |
| 1842 | if (write && iomap->flags & IOMAP_F_SHARED) { |
| 1843 | err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); |
| 1844 | if (err) |
| 1845 | return dax_fault_return(err); |
| 1846 | } |
| 1847 | |
| 1848 | folio = dax_to_folio(*entry); |
| 1849 | if (dax_fault_is_synchronous(iter, vmf->vma)) |
| 1850 | return dax_fault_synchronous_pfnp(pfnp, pfn); |
| 1851 | |
| 1852 | folio_ref_inc(folio); |
| 1853 | if (pmd) |
| 1854 | ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); |
| 1855 | else |
| 1856 | ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write); |
| 1857 | folio_put(folio); |
| 1858 | |
| 1859 | return ret; |
| 1860 | } |
| 1861 | |
| 1862 | static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, |
| 1863 | int *iomap_errp, const struct iomap_ops *ops) |
| 1864 | { |
| 1865 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 1866 | XA_STATE(xas, &mapping->i_pages, vmf->pgoff); |
| 1867 | struct iomap_iter iter = { |
| 1868 | .inode = mapping->host, |
| 1869 | .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, |
| 1870 | .len = PAGE_SIZE, |
| 1871 | .flags = IOMAP_DAX | IOMAP_FAULT, |
| 1872 | }; |
| 1873 | vm_fault_t ret = 0; |
| 1874 | void *entry; |
| 1875 | int error; |
| 1876 | |
| 1877 | trace_dax_pte_fault(iter.inode, vmf, ret); |
| 1878 | /* |
| 1879 | * Check whether offset isn't beyond end of file now. Caller is supposed |
| 1880 | * to hold locks serializing us with truncate / punch hole so this is |
| 1881 | * a reliable test. |
| 1882 | */ |
| 1883 | if (iter.pos >= i_size_read(iter.inode)) { |
| 1884 | ret = VM_FAULT_SIGBUS; |
| 1885 | goto out; |
| 1886 | } |
| 1887 | |
| 1888 | if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) |
| 1889 | iter.flags |= IOMAP_WRITE; |
| 1890 | |
| 1891 | entry = grab_mapping_entry(&xas, mapping, 0); |
| 1892 | if (xa_is_internal(entry)) { |
| 1893 | ret = xa_to_internal(entry); |
| 1894 | goto out; |
| 1895 | } |
| 1896 | |
| 1897 | /* |
| 1898 | * It is possible, particularly with mixed reads & writes to private |
| 1899 | * mappings, that we have raced with a PMD fault that overlaps with |
| 1900 | * the PTE we need to set up. If so just return and the fault will be |
| 1901 | * retried. |
| 1902 | */ |
| 1903 | if (pmd_trans_huge(*vmf->pmd)) { |
| 1904 | ret = VM_FAULT_NOPAGE; |
| 1905 | goto unlock_entry; |
| 1906 | } |
| 1907 | |
| 1908 | while ((error = iomap_iter(&iter, ops)) > 0) { |
| 1909 | if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { |
| 1910 | iter.status = -EIO; /* fs corruption? */ |
| 1911 | continue; |
| 1912 | } |
| 1913 | |
| 1914 | ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); |
| 1915 | if (ret != VM_FAULT_SIGBUS && |
| 1916 | (iter.iomap.flags & IOMAP_F_NEW)) { |
| 1917 | count_vm_event(PGMAJFAULT); |
| 1918 | count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); |
| 1919 | ret |= VM_FAULT_MAJOR; |
| 1920 | } |
| 1921 | |
| 1922 | if (!(ret & VM_FAULT_ERROR)) |
| 1923 | iter.status = iomap_iter_advance(&iter, PAGE_SIZE); |
| 1924 | } |
| 1925 | |
| 1926 | if (iomap_errp) |
| 1927 | *iomap_errp = error; |
| 1928 | if (!ret && error) |
| 1929 | ret = dax_fault_return(error); |
| 1930 | |
| 1931 | unlock_entry: |
| 1932 | dax_unlock_entry(&xas, entry); |
| 1933 | out: |
| 1934 | trace_dax_pte_fault_done(iter.inode, vmf, ret); |
| 1935 | return ret; |
| 1936 | } |
| 1937 | |
| 1938 | #ifdef CONFIG_FS_DAX_PMD |
| 1939 | static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, |
| 1940 | pgoff_t max_pgoff) |
| 1941 | { |
| 1942 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
| 1943 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
| 1944 | |
| 1945 | /* |
| 1946 | * Make sure that the faulting address's PMD offset (color) matches |
| 1947 | * the PMD offset from the start of the file. This is necessary so |
| 1948 | * that a PMD range in the page table overlaps exactly with a PMD |
| 1949 | * range in the page cache. |
| 1950 | */ |
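| | /* |
| | * For example, with 4KiB pages and 2MiB PMDs, PG_PMD_COLOUR is 511: a |
| | * page at file offset (pgoff) 513 has colour 1 and can only be mapped by |
| | * a PMD whose faulting address satisfies |
| | * ((address >> PAGE_SHIFT) & 511) == 1; otherwise we fall back to PTEs. |
| | */ |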
| 1951 | if ((vmf->pgoff & PG_PMD_COLOUR) != |
| 1952 | ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) |
| 1953 | return true; |
| 1954 | |
| 1955 | /* Fall back to PTEs if we're going to COW */ |
| 1956 | if (write && !(vmf->vma->vm_flags & VM_SHARED)) |
| 1957 | return true; |
| 1958 | |
| 1959 | /* If the PMD would extend outside the VMA */ |
| 1960 | if (pmd_addr < vmf->vma->vm_start) |
| 1961 | return true; |
| 1962 | if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) |
| 1963 | return true; |
| 1964 | |
| 1965 | /* If the PMD would extend beyond the file size */ |
| 1966 | if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) |
| 1967 | return true; |
| 1968 | |
| 1969 | return false; |
| 1970 | } |
| 1971 | |
| 1972 | static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, |
| 1973 | const struct iomap_ops *ops) |
| 1974 | { |
| 1975 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 1976 | XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); |
| 1977 | struct iomap_iter iter = { |
| 1978 | .inode = mapping->host, |
| 1979 | .len = PMD_SIZE, |
| 1980 | .flags = IOMAP_DAX | IOMAP_FAULT, |
| 1981 | }; |
| 1982 | vm_fault_t ret = VM_FAULT_FALLBACK; |
| 1983 | pgoff_t max_pgoff; |
| 1984 | void *entry; |
| 1985 | |
| 1986 | if (vmf->flags & FAULT_FLAG_WRITE) |
| 1987 | iter.flags |= IOMAP_WRITE; |
| 1988 | |
| 1989 | /* |
| 1990 | * Check whether offset isn't beyond end of file now. Caller is |
| 1991 | * supposed to hold locks serializing us with truncate / punch hole so |
| 1992 | * this is a reliable test. |
| 1993 | */ |
| 1994 | max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); |
| 1995 | |
| 1996 | trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); |
| 1997 | |
| 1998 | if (xas.xa_index >= max_pgoff) { |
| 1999 | ret = VM_FAULT_SIGBUS; |
| 2000 | goto out; |
| 2001 | } |
| 2002 | |
| 2003 | if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) |
| 2004 | goto fallback; |
| 2005 | |
| 2006 | /* |
| 2007 | * grab_mapping_entry() will make sure we get an empty PMD entry, |
| 2008 | * a zero PMD entry or a DAX PMD. If it can't (because a PTE |
| 2009 | * entry is already in the array, for instance), it will return |
| 2010 | * VM_FAULT_FALLBACK. |
| 2011 | */ |
| 2012 | entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); |
| 2013 | if (xa_is_internal(entry)) { |
| 2014 | ret = xa_to_internal(entry); |
| 2015 | goto fallback; |
| 2016 | } |
| 2017 | |
| 2018 | /* |
| 2019 | * It is possible, particularly with mixed reads & writes to private |
| 2020 | * mappings, that we have raced with a PTE fault that overlaps with |
| 2021 | * the PMD we need to set up. If so just return and the fault will be |
| 2022 | * retried. |
| 2023 | */ |
| 2024 | if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) { |
| 2025 | ret = 0; |
| 2026 | goto unlock_entry; |
| 2027 | } |
| 2028 | |
| 2029 | iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; |
| 2030 | while (iomap_iter(&iter, ops) > 0) { |
| 2031 | if (iomap_length(&iter) < PMD_SIZE) |
| 2032 | continue; /* actually breaks out of the loop */ |
| 2033 | |
| 2034 | ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); |
| 2035 | if (ret != VM_FAULT_FALLBACK) |
| 2036 | iter.status = iomap_iter_advance(&iter, PMD_SIZE); |
| 2037 | } |
| 2038 | |
| 2039 | unlock_entry: |
| 2040 | dax_unlock_entry(&xas, entry); |
| 2041 | fallback: |
| 2042 | if (ret == VM_FAULT_FALLBACK) { |
| 2043 | split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); |
| 2044 | count_vm_event(THP_FAULT_FALLBACK); |
| 2045 | } |
| 2046 | out: |
| 2047 | trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); |
| 2048 | return ret; |
| 2049 | } |
| 2050 | #else |
| 2051 | static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, |
| 2052 | const struct iomap_ops *ops) |
| 2053 | { |
| 2054 | return VM_FAULT_FALLBACK; |
| 2055 | } |
| 2056 | #endif /* CONFIG_FS_DAX_PMD */ |
| 2057 | |
| 2058 | /** |
| 2059 | * dax_iomap_fault - handle a page fault on a DAX file |
| 2060 | * @vmf: The description of the fault |
| 2061 | * @order: Order of the page to fault in |
| 2062 | * @pfnp: PFN to insert for synchronous faults if fsync is required |
| 2063 | * @iomap_errp: Storage for detailed error code in case of error |
| 2064 | * @ops: Iomap ops passed from the file system |
| 2065 | * |
| 2066 | * When a page fault occurs, filesystems may call this helper in |
| 2067 | * their fault handler for DAX files. dax_iomap_fault() assumes the caller |
| 2068 | * has done all the necessary locking for page fault to proceed |
| 2069 | * successfully. |
| 2070 | */ |
| 2071 | vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, |
| 2072 | unsigned long *pfnp, int *iomap_errp, |
| 2073 | const struct iomap_ops *ops) |
| 2074 | { |
| 2075 | if (order == 0) |
| 2076 | return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); |
| 2077 | else if (order == PMD_ORDER) |
| 2078 | return dax_iomap_pmd_fault(vmf, pfnp, ops); |
| 2079 | else |
| 2080 | return VM_FAULT_FALLBACK; |
| 2081 | } |
| 2082 | EXPORT_SYMBOL_GPL(dax_iomap_fault); |
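| | /* |
| | * Example (sketch): a filesystem's ->huge_fault handler typically wraps this |
| | * helper, taking its own fault/invalidate lock around the call. The lock and |
| | * "foo_iomap_ops" below are illustrative assumptions: |
| | * |
| | *	static vm_fault_t foo_dax_huge_fault(struct vm_fault *vmf, unsigned int order) |
| | *	{ |
| | *		struct inode *inode = file_inode(vmf->vma->vm_file); |
| | *		unsigned long pfn; |
| | *		vm_fault_t ret; |
| | * |
| | *		filemap_invalidate_lock_shared(inode->i_mapping); |
| | *		ret = dax_iomap_fault(vmf, order, &pfn, NULL, &foo_iomap_ops); |
| | *		filemap_invalidate_unlock_shared(inode->i_mapping); |
| | * |
| | *		return ret; |
| | *	} |
| | */ |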
| 2083 | |
| 2084 | /* |
| 2085 | * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables |
| 2086 | * @vmf: The description of the fault |
| 2087 | * @pfn: PFN to insert |
| 2088 | * @order: Order of entry to insert. |
| 2089 | * |
| 2090 | * This function inserts a writeable PTE or PMD entry into the page tables |
| 2091 | * for an mmapped DAX file. It also marks the page cache entry as dirty. |
| 2092 | */ |
| 2093 | static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, |
| 2094 | unsigned long pfn, unsigned int order) |
| 2095 | { |
| 2096 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 2097 | XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); |
| 2098 | struct folio *folio; |
| 2099 | void *entry; |
| 2100 | vm_fault_t ret; |
| 2101 | |
| 2102 | xas_lock_irq(&xas); |
| 2103 | entry = get_next_unlocked_entry(&xas, order); |
| 2104 | /* Did we race with someone splitting entry or so? */ |
| 2105 | if (!entry || dax_is_conflict(entry) || |
| 2106 | (order == 0 && !dax_is_pte_entry(entry))) { |
| 2107 | put_unlocked_entry(&xas, entry, WAKE_NEXT); |
| 2108 | xas_unlock_irq(&xas); |
| 2109 | trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, |
| 2110 | VM_FAULT_NOPAGE); |
| 2111 | return VM_FAULT_NOPAGE; |
| 2112 | } |
| 2113 | xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); |
| 2114 | dax_lock_entry(&xas, entry); |
| 2115 | xas_unlock_irq(&xas); |
| 2116 | folio = pfn_folio(pfn); |
| 2117 | folio_ref_inc(folio); |
| 2118 | if (order == 0) |
| 2119 | ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); |
| 2120 | #ifdef CONFIG_FS_DAX_PMD |
| 2121 | else if (order == PMD_ORDER) |
| 2122 | ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); |
| 2123 | #endif |
| 2124 | else |
| 2125 | ret = VM_FAULT_FALLBACK; |
| 2126 | folio_put(folio); |
| 2127 | dax_unlock_entry(&xas, entry); |
| 2128 | trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); |
| 2129 | return ret; |
| 2130 | } |
| 2131 | |
| 2132 | /** |
| 2133 | * dax_finish_sync_fault - finish synchronous page fault |
| 2134 | * @vmf: The description of the fault |
| 2135 | * @order: Order of entry to be inserted |
| 2136 | * @pfn: PFN to insert |
| 2137 | * |
| 2138 | * This function ensures that the file range touched by the page fault is |
| 2139 | * stored persistently on the media and handles insertion of the appropriate |
| 2140 | * page table entry. |
| 2141 | */ |
| 2142 | vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, |
| 2143 | unsigned long pfn) |
| 2144 | { |
| 2145 | int err; |
| 2146 | loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; |
| 2147 | size_t len = PAGE_SIZE << order; |
| 2148 | |
| 2149 | err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); |
| 2150 | if (err) |
| 2151 | return VM_FAULT_SIGBUS; |
| 2152 | return dax_insert_pfn_mkwrite(vmf, pfn, order); |
| 2153 | } |
| 2154 | EXPORT_SYMBOL_GPL(dax_finish_sync_fault); |
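| | /* |
| | * Example (sketch): in a caller's fault handler, a synchronous write fault is |
| | * completed roughly like this ("foo_iomap_ops" is an illustrative name): |
| | * |
| | *	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &foo_iomap_ops); |
| | *	if ((vmf->flags & FAULT_FLAG_WRITE) && (ret & VM_FAULT_NEEDDSYNC)) |
| | *		ret = dax_finish_sync_fault(vmf, order, pfn); |
| | */ |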
| 2155 | |
| 2156 | static int dax_range_compare_iter(struct iomap_iter *it_src, |
| 2157 | struct iomap_iter *it_dest, u64 len, bool *same) |
| 2158 | { |
| 2159 | const struct iomap *smap = &it_src->iomap; |
| 2160 | const struct iomap *dmap = &it_dest->iomap; |
| 2161 | loff_t pos1 = it_src->pos, pos2 = it_dest->pos; |
| 2162 | void *saddr, *daddr; |
| 2163 | int id, ret; |
| 2164 | |
| 2165 | len = min(len, min(smap->length, dmap->length)); |
| 2166 | |
| 2167 | if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { |
| 2168 | *same = true; |
| 2169 | goto advance; |
| 2170 | } |
| 2171 | |
| 2172 | if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { |
| 2173 | *same = false; |
| 2174 | return 0; |
| 2175 | } |
| 2176 | |
| 2177 | id = dax_read_lock(); |
| 2178 | ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), |
| 2179 | &saddr, NULL); |
| 2180 | if (ret < 0) |
| 2181 | goto out_unlock; |
| 2182 | |
| 2183 | ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), |
| 2184 | &daddr, NULL); |
| 2185 | if (ret < 0) |
| 2186 | goto out_unlock; |
| 2187 | |
| 2188 | *same = !memcmp(saddr, daddr, len); |
| 2189 | if (!*same) |
| 2190 | len = 0; |
| 2191 | dax_read_unlock(id); |
| 2192 | |
| 2193 | advance: |
| 2194 | ret = iomap_iter_advance(it_src, len); |
| 2195 | if (!ret) |
| 2196 | ret = iomap_iter_advance(it_dest, len); |
| 2197 | return ret; |
| 2198 | |
| 2199 | out_unlock: |
| 2200 | dax_read_unlock(id); |
| 2201 | return -EIO; |
| 2202 | } |
| 2203 | |
| 2204 | int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, |
| 2205 | struct inode *dst, loff_t dstoff, loff_t len, bool *same, |
| 2206 | const struct iomap_ops *ops) |
| 2207 | { |
| 2208 | struct iomap_iter src_iter = { |
| 2209 | .inode = src, |
| 2210 | .pos = srcoff, |
| 2211 | .len = len, |
| 2212 | .flags = IOMAP_DAX, |
| 2213 | }; |
| 2214 | struct iomap_iter dst_iter = { |
| 2215 | .inode = dst, |
| 2216 | .pos = dstoff, |
| 2217 | .len = len, |
| 2218 | .flags = IOMAP_DAX, |
| 2219 | }; |
| 2220 | int ret, status; |
| 2221 | |
| 2222 | while ((ret = iomap_iter(&src_iter, ops)) > 0 && |
| 2223 | (ret = iomap_iter(&dst_iter, ops)) > 0) { |
| 2224 | status = dax_range_compare_iter(&src_iter, &dst_iter, |
| 2225 | min(src_iter.len, dst_iter.len), same); |
| 2226 | if (status < 0) |
| 2227 | return ret; |
| 2228 | src_iter.status = dst_iter.status = status; |
| 2229 | } |
| 2230 | return ret; |
| 2231 | } |
| 2232 | |
| 2233 | int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, |
| 2234 | struct file *file_out, loff_t pos_out, |
| 2235 | loff_t *len, unsigned int remap_flags, |
| 2236 | const struct iomap_ops *ops) |
| 2237 | { |
| 2238 | return __generic_remap_file_range_prep(file_in, pos_in, file_out, |
| 2239 | pos_out, len, remap_flags, ops); |
| 2240 | } |
| 2241 | EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); |
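| | /* |
| | * Example (sketch): a reflink-capable filesystem's remap preparation can use |
| | * this instead of generic_remap_file_range_prep() when the inodes are DAX |
| | * ("foo_iomap_ops" is an illustrative name): |
| | * |
| | *	if (IS_DAX(inode_in)) |
| | *		ret = dax_remap_file_range_prep(file_in, pos_in, file_out, |
| | *				pos_out, &len, remap_flags, &foo_iomap_ops); |
| | */ |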
| 2242 | |