send.c source code [linux/fs/btrfs/send.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2012 Alexander Block. All rights reserved.
4	*/
5
6	#include <linux/bsearch.h>
7	#include <linux/falloc.h>
8	#include <linux/fs.h>
9	#include <linux/file.h>
10	#include <linux/sort.h>
11	#include <linux/mount.h>
12	#include <linux/xattr.h>
13	#include <linux/posix_acl_xattr.h>
14	#include <linux/radix-tree.h>
15	#include <linux/vmalloc.h>
16	#include <linux/string.h>
17	#include <linux/compat.h>
18	#include <linux/crc32c.h>
19	#include <linux/fsverity.h>
20	#include "send.h"
21	#include "ctree.h"
22	#include "backref.h"
23	#include "locking.h"
24	#include "disk-io.h"
25	#include "btrfs_inode.h"
26	#include "transaction.h"
27	#include "compression.h"
28	#include "print-tree.h"
29	#include "accessors.h"
30	#include "dir-item.h"
31	#include "file-item.h"
32	#include "ioctl.h"
33	#include "verity.h"
34	#include "lru_cache.h"
35
36	/*
37	* Maximum number of references an extent can have in order for us to attempt to
38	* issue clone operations instead of write operations. This currently exists to
39	* avoid hitting limitations of the backreference walking code (taking a lot of
40	* time and using too much memory for extents with large number of references).
41	*/
42	#define SEND_MAX_EXTENT_REFS 1024
43
44	/*
45	* A fs_path is a helper to dynamically build path names with unknown size.
46	* It reallocates the internal buffer on demand.
47	* It allows fast adding of path elements on the right side (normal path) and
48	* fast adding to the left side (reversed path). A reversed path can also be
49	* unreversed if needed.
50	*
51	* The definition of struct fs_path relies on -fms-extensions to allow
52	* including a tagged struct as an anonymous member.
53	*/
54	struct __fs_path {
55	char *start;
56	char *end;
57
58	char *buf;
59	unsigned short buf_len:`15`;
60	unsigned short reversed:`1`;
61	};
62	static_assert(sizeof(struct __fs_path) < `256`);
63	struct fs_path {
64	struct __fs_path;
65	/*
66	* Average path length does not exceed 200 bytes, we'll have
67	* better packing in the slab and higher chance to satisfy
68	* an allocation later during send.
69	*/
70	char inline_buf[`256` - sizeof(struct __fs_path)];
71	};
72	#define FS_PATH_INLINE_SIZE \
73	sizeof_field(struct fs_path, inline_buf)
74
75
76	/ reused for each extent /
77	struct clone_root {
78	struct btrfs_root *root;
79	u64 ino;
80	u64 offset;
81	u64 num_bytes;
82	bool found_ref;
83	};
84
85	#define SEND_MAX_NAME_CACHE_SIZE 256
86
87	/*
88	* Limit the root_ids array of struct backref_cache_entry to 17 elements.
89	* This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
90	* can be satisfied from the kmalloc-192 slab, without wasting any space.
91	* The most common case is to have a single root for cloning, which corresponds
92	* to the send root. Having the user specify more than 16 clone roots is not
93	* common, and in such rare cases we simply don't use caching if the number of
94	* cloning roots that lead down to a leaf is more than 17.
95	*/
96	#define SEND_MAX_BACKREF_CACHE_ROOTS 17
97
98	/*
99	* Max number of entries in the cache.
100	* With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
101	* maple tree's internal nodes, is 24K.
102	*/
103	#define SEND_MAX_BACKREF_CACHE_SIZE 128
104
105	/*
106	* A backref cache entry maps a leaf to a list of IDs of roots from which the
107	* leaf is accessible and we can use for clone operations.
108	* With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
109	* x86_64).
110	*/
111	struct backref_cache_entry {
112	struct btrfs_lru_cache_entry entry;
113	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
114	/ Number of valid elements in the root_ids array. /
115	int num_roots;
116	};
117
118	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
119	static_assert(offsetof(struct backref_cache_entry, entry) == `0`);
120
121	/*
122	* Max number of entries in the cache that stores directories that were already
123	* created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
124	* at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
125	* the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
126	*/
127	#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64
128
/*
 * Max number of entries in the cache used for directory utimes (the
 * dir_utimes_cache member of struct send_ctx). Like the cache of created
 * directories, it uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
135	#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64
136
137	struct send_ctx {
138	struct file *send_filp;
139	loff_t send_off;
140	char *send_buf;
141	u32 send_size;
142	u32 send_max_size;
143	/*
144	* Whether BTRFS_SEND_A_DATA attribute was already added to current
145	* command (since protocol v2, data must be the last attribute).
146	*/
147	bool put_data;
148	struct page **send_buf_pages;
149	u64 flags; / 'flags' member of btrfs_ioctl_send_args is u64 /
150	/ Protocol version compatibility requested /
151	u32 proto;
152
153	struct btrfs_root *send_root;
154	struct btrfs_root *parent_root;
155	struct clone_root *clone_roots;
156	int clone_roots_cnt;
157
158	/ current state of the compare_tree call /
159	struct btrfs_path *left_path;
160	struct btrfs_path *right_path;
161	struct btrfs_key *cmp_key;
162
163	/*
164	* Keep track of the generation of the last transaction that was used
165	* for relocating a block group. This is periodically checked in order
166	* to detect if a relocation happened since the last check, so that we
167	* don't operate on stale extent buffers for nodes (level >= 1) or on
168	* stale disk_bytenr values of file extent items.
169	*/
170	u64 last_reloc_trans;
171
172	/*
173	* infos of the currently processed inode. In case of deleted inodes,
174	* these are the values from the deleted inode.
175	*/
176	u64 cur_ino;
177	u64 cur_inode_gen;
178	u64 cur_inode_size;
179	u64 cur_inode_mode;
180	u64 cur_inode_rdev;
181	u64 cur_inode_last_extent;
182	u64 cur_inode_next_write_offset;
183	bool cur_inode_new;
184	bool cur_inode_new_gen;
185	bool cur_inode_deleted;
186	bool ignore_cur_inode;
187	bool cur_inode_needs_verity;
188	void *verity_descriptor;
189
190	u64 send_progress;
191
192	struct list_head new_refs;
193	struct list_head deleted_refs;
194
195	struct btrfs_lru_cache name_cache;
196
197	/*
198	* The inode we are currently processing. It's not NULL only when we
199	* need to issue write commands for data extents from this inode.
200	*/
201	struct inode *cur_inode;
202	struct file_ra_state ra;
203	u64 page_cache_clear_start;
204	bool clean_page_cache;
205
206	/*
207	* We process inodes by their increasing order, so if before an
208	* incremental send we reverse the parent/child relationship of
209	* directories such that a directory with a lower inode number was
210	* the parent of a directory with a higher inode number, and the one
211	* becoming the new parent got renamed too, we can't rename/move the
212	* directory with lower inode number when we finish processing it - we
213	* must process the directory with higher inode number first, then
214	* rename/move it and then rename/move the directory with lower inode
215	* number. Example follows.
216	*
217	* Tree state when the first send was performed:
218	*
219	* .
220	* \|-- a (ino 257)
221	* \|-- b (ino 258)
222	* \|
223	* \|
224	* \|-- c (ino 259)
225	* \| \|-- d (ino 260)
226	* \|
227	* \|-- c2 (ino 261)
228	*
229	* Tree state when the second (incremental) send is performed:
230	*
231	* .
232	* \|-- a (ino 257)
233	* \|-- b (ino 258)
234	* \|-- c2 (ino 261)
235	* \|-- d2 (ino 260)
236	* \|-- cc (ino 259)
237	*
238	* The sequence of steps that lead to the second state was:
239	*
240	* mv /a/b/c/d /a/b/c2/d2
241	* mv /a/b/c /a/b/c2/d2/cc
242	*
243	* "c" has lower inode number, but we can't move it (2nd mv operation)
244	* before we move "d", which has higher inode number.
245	*
246	* So we just memorize which move/rename operations must be performed
247	* later when their respective parent is processed and moved/renamed.
248	*/
249
250	/ Indexed by parent directory inode number. /
251	struct rb_root pending_dir_moves;
252
253	/*
254	* Reverse index, indexed by the inode number of a directory that
255	* is waiting for the move/rename of its immediate parent before its
256	* own move/rename can be performed.
257	*/
258	struct rb_root waiting_dir_moves;
259
260	/*
261	* A directory that is going to be rm'ed might have a child directory
262	* which is in the pending directory moves index above. In this case,
263	* the directory can only be removed after the move/rename of its child
264	* is performed. Example:
265	*
266	* Parent snapshot:
267	*
268	* . (ino 256)
269	* \|-- a/ (ino 257)
270	* \|-- b/ (ino 258)
271	* \|-- c/ (ino 259)
272	* \| \|-- x/ (ino 260)
273	* \|
274	* \|-- y/ (ino 261)
275	*
276	* Send snapshot:
277	*
278	* . (ino 256)
279	* \|-- a/ (ino 257)
280	* \|-- b/ (ino 258)
281	* \|-- YY/ (ino 261)
282	* \|-- x/ (ino 260)
283	*
284	* Sequence of steps that lead to the send snapshot:
285	* rm -f /a/b/c/foo.txt
286	* mv /a/b/y /a/b/YY
287	* mv /a/b/c/x /a/b/YY
288	* rmdir /a/b/c
289	*
290	* When the child is processed, its move/rename is delayed until its
291	* parent is processed (as explained above), but all other operations
292	* like update utimes, chown, chgrp, etc, are performed and the paths
293	* that it uses for those operations must use the orphanized name of
294	* its parent (the directory we're going to rm later), so we need to
295	* memorize that name.
296	*
297	* Indexed by the inode number of the directory to be deleted.
298	*/
299	struct rb_root orphan_dirs;
300
301	struct rb_root rbtree_new_refs;
302	struct rb_root rbtree_deleted_refs;
303
304	struct btrfs_lru_cache backref_cache;
305	u64 backref_cache_last_reloc_trans;
306
307	struct btrfs_lru_cache dir_created_cache;
308	struct btrfs_lru_cache dir_utimes_cache;
309
310	struct fs_path cur_inode_path;
311	};
312
/*
 * One delayed directory move/rename.
 * NOTE(review): presumably linked through 'node' into the pending_dir_moves
 * tree of struct send_ctx, which is indexed by parent_ino - confirm against
 * the code that inserts these entries.
 */
struct pending_dir_move {
	struct rb_node node;
	struct list_head list;
	u64 parent_ino;
	u64 ino;
	u64 gen;
	struct list_head update_refs;
};
321
/*
 * Entry for a directory whose own move/rename is being delayed.
 * NOTE(review): presumably linked through 'node' into the waiting_dir_moves
 * tree of struct send_ctx, keyed by 'ino' - confirm against the insert code.
 */
struct waiting_dir_move {
	struct rb_node node;
	u64 ino;
	/*
	 * There might be some directory that could not be removed because it
	 * was waiting for this directory inode to be moved first. Therefore
	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
	 */
	u64 rmdir_ino;
	u64 rmdir_gen;
	bool orphanized;
};
334
/*
 * Per-directory record for a directory whose removal is delayed.
 * NOTE(review): presumably linked through 'node' into the orphan_dirs tree of
 * struct send_ctx, which is indexed by the inode number of the directory to
 * be deleted - confirm against the insert code.
 */
struct orphan_dir_info {
	struct rb_node node;
	u64 ino;
	u64 gen;
	u64 last_dir_index_offset;
	u64 dir_high_seq_ino;
};
342
343	struct name_cache_entry {
344	/*
345	* The key in the entry is an inode number, and the generation matches
346	* the inode's generation.
347	*/
348	struct btrfs_lru_cache_entry entry;
349	u64 parent_ino;
350	u64 parent_gen;
351	int ret;
352	int need_later_update;
353	/ Name length without NUL terminator. /
354	int name_len;
355	/ Not NUL terminated. /
356	char name[] __counted_by(name_len) __nonstring;
357	};
358
359	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
360	static_assert(offsetof(struct name_cache_entry, entry) == `0`);
361
362	#define ADVANCE 1
363	#define ADVANCE_ONLY_NEXT -1
364
/*
 * Kinds of difference reported for an item when comparing two trees.
 * inconsistent_snapshot_error() prints them as "new", "deleted", "updated"
 * and "unchanged" respectively.
 */
enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,
	BTRFS_COMPARE_TREE_DELETED,
	BTRFS_COMPARE_TREE_CHANGED,
	BTRFS_COMPARE_TREE_SAME,
};
371
372	__cold
373	static void inconsistent_snapshot_error(struct send_ctx *sctx,
374	enum btrfs_compare_tree_result result,
375	const char *what)
376	{
377	const char *result_string;
378
379	switch (result) {
380	case BTRFS_COMPARE_TREE_NEW:
381	result_string = "new";
382	break;
383	case BTRFS_COMPARE_TREE_DELETED:
384	result_string = "deleted";
385	break;
386	case BTRFS_COMPARE_TREE_CHANGED:
387	result_string = "updated";
388	break;
389	case BTRFS_COMPARE_TREE_SAME:
390	DEBUG_WARN("no change between trees");
391	result_string = "unchanged";
392	break;
393	default:
394	DEBUG_WARN("unexpected comparison result %d", result);
395	result_string = "unexpected";
396	}
397
398	btrfs_err(sctx->send_root->fs_info,
399	"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
400	result_string, what, sctx->cmp_key->objectid,
401	btrfs_root_id(sctx->send_root),
402	(sctx->parent_root ? btrfs_root_id(sctx->parent_root) : `0`));
403	}
404
405	__maybe_unused
406	static bool proto_cmd_ok(const struct send_ctx sctx, int* cmd)
407	{
408	switch (sctx->proto) {
409	case `1`: return cmd <= BTRFS_SEND_C_MAX_V1;
410	case `2`: return cmd <= BTRFS_SEND_C_MAX_V2;
411	case `3`: return cmd <= BTRFS_SEND_C_MAX_V3;
412	default: return false;
413	}
414	}
415
416	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
417
418	static struct waiting_dir_move *
419	get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
420
421	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
422
423	static int need_send_hole(struct send_ctx *sctx)
424	{
425	return (sctx->parent_root && !sctx->cur_inode_new &&
426	!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
427	S_ISREG(sctx->cur_inode_mode));
428	}
429
430	static void fs_path_reset(struct fs_path *p)
431	{
432	if (p->reversed)
433	p->start = p->buf + p->buf_len - `1`;
434	else
435	p->start = p->buf;
436
437	p->end = p->start;
438	*p->start = `0`;
439	}
440
441	static void init_path(struct fs_path *p)
442	{
443	p->reversed = `0`;
444	p->buf = p->inline_buf;
445	p->buf_len = FS_PATH_INLINE_SIZE;
446	fs_path_reset(p);
447	}
448
449	static struct fs_path fs_path_alloc(void*)
450	{
451	struct fs_path *p;
452
453	p = kmalloc(sizeof(*p), GFP_KERNEL);
454	if (!p)
455	return NULL;
456	init_path(p);
457	return p;
458	}
459
460	static struct fs_path fs_path_alloc_reversed(void*)
461	{
462	struct fs_path *p;
463
464	p = fs_path_alloc();
465	if (!p)
466	return NULL;
467	p->reversed = `1`;
468	fs_path_reset(p);
469	return p;
470	}
471
472	static void fs_path_free(struct fs_path *p)
473	{
474	if (!p)
475	return;
476	if (p->buf != p->inline_buf)
477	kfree(objp: p->buf);
478	kfree(objp: p);
479	}
480
481	static inline int fs_path_len(const struct fs_path *p)
482	{
483	return p->end - p->start;
484	}
485
486	static int fs_path_ensure_buf(struct fs_path p, int* len)
487	{
488	char *tmp_buf;
489	int path_len;
490	int old_buf_len;
491
492	len++;
493
494	if (p->buf_len >= len)
495	return `0`;
496
497	if (WARN_ON(len > PATH_MAX))
498	return -ENAMETOOLONG;
499
500	path_len = fs_path_len(p);
501	old_buf_len = p->buf_len;
502
503	/*
504	* Allocate to the next largest kmalloc bucket size, to let
505	* the fast path happen most of the time.
506	*/
507	len = kmalloc_size_roundup(size: len);
508	/*
509	* First time the inline_buf does not suffice
510	*/
511	if (p->buf == p->inline_buf) {
512	tmp_buf = kmalloc(len, GFP_KERNEL);
513	if (tmp_buf)
514	memcpy(tmp_buf, p->buf, old_buf_len);
515	} else {
516	tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
517	}
518	if (!tmp_buf)
519	return -ENOMEM;
520	p->buf = tmp_buf;
521	p->buf_len = len;
522
523	if (p->reversed) {
524	tmp_buf = p->buf + old_buf_len - path_len - `1`;
525	p->end = p->buf + p->buf_len - `1`;
526	p->start = p->end - path_len;
527	memmove(p->start, tmp_buf, path_len + `1`);
528	} else {
529	p->start = p->buf;
530	p->end = p->start + path_len;
531	}
532	return `0`;
533	}
534
535	static int fs_path_prepare_for_add(struct fs_path p, int* name_len,
536	char **prepared)
537	{
538	int ret;
539	int new_len;
540
541	new_len = fs_path_len(p) + name_len;
542	if (p->start != p->end)
543	new_len++;
544	ret = fs_path_ensure_buf(p, len: new_len);
545	if (ret < `0`)
546	return ret;
547
548	if (p->reversed) {
549	if (p->start != p->end)
550	*--p->start = `'/'`;
551	p->start -= name_len;
552	*prepared = p->start;
553	} else {
554	if (p->start != p->end)
555	*p->end++ = `'/'`;
556	*prepared = p->end;
557	p->end += name_len;
558	*p->end = `0`;
559	}
560
561	return `0`;
562	}
563
/*
 * Add the @name_len bytes at @name as a new element of @p.
 * Returns 0 on success or a negative errno.
 */
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		return ret;
	memcpy(prepared, name, name_len);

	return 0;
}
576
577	static inline int fs_path_add_path(struct fs_path p, const* struct fs_path *p2)
578	{
579	return fs_path_add(p, name: p2->start, name_len: fs_path_len(p: p2));
580	}
581
/*
 * Add a new element to @p whose @len bytes are read from extent buffer @eb
 * starting at offset @off. Returns 0 on success or a negative errno.
 */
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		return ret;

	read_extent_buffer(eb, prepared, off, len);

	return 0;
}
597
598	static int fs_path_copy(struct fs_path p, struct* fs_path *from)
599	{
600	p->reversed = from->reversed;
601	fs_path_reset(p);
602
603	return fs_path_add_path(p, p2: from);
604	}
605
606	static void fs_path_unreverse(struct fs_path *p)
607	{
608	char *tmp;
609	int len;
610
611	if (!p->reversed)
612	return;
613
614	tmp = p->start;
615	len = fs_path_len(p);
616	p->start = p->buf;
617	p->end = p->start + len;
618	memmove(p->start, tmp, len + `1`);
619	p->reversed = `0`;
620	}
621
622	static inline bool is_current_inode_path(const struct send_ctx *sctx,
623	const struct fs_path *path)
624	{
625	const struct fs_path *cur = &sctx->cur_inode_path;
626
627	return (strncmp(path->start, cur->start, fs_path_len(p: cur)) == `0`);
628	}
629
630	static struct btrfs_path alloc_path_for_send(void*)
631	{
632	struct btrfs_path *path;
633
634	path = btrfs_alloc_path();
635	if (!path)
636	return NULL;
637	path->search_commit_root = true;
638	path->skip_locking = true;
639	path->need_commit_sem = true;
640	return path;
641	}
642
643	static int write_buf(struct file filp, const* void buf, u32 len, loff_t off)
644	{
645	int ret;
646	u32 pos = `0`;
647
648	while (pos < len) {
649	ret = kernel_write(filp, buf + pos, len - pos, off);
650	if (ret < `0`)
651	return ret;
652	if (unlikely(ret == `0`))
653	return -EIO;
654	pos += ret;
655	}
656
657	return `0`;
658	}
659
660	static int tlv_put(struct send_ctx sctx, u16 attr, const* void data, int* len)
661	{
662	struct btrfs_tlv_header *hdr;
663	int total_len = sizeof(*hdr) + len;
664	int left = sctx->send_max_size - sctx->send_size;
665
666	if (WARN_ON_ONCE(sctx->put_data))
667	return -EINVAL;
668
669	if (unlikely(left < total_len))
670	return -EOVERFLOW;
671
672	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
673	put_unaligned_le16(val: attr, p: &hdr->tlv_type);
674	put_unaligned_le16(val: len, p: &hdr->tlv_len);
675	memcpy(hdr + `1`, data, len);
676	sctx->send_size += total_len;
677
678	return `0`;
679	}
680
681	#define TLV_PUT_DEFINE_INT(bits) \
682	static int tlv_put_u##bits(struct send_ctx *sctx, \
683	u##bits attr, u##bits value) \
684	{ \
685	__le##bits __tmp = cpu_to_le##bits(value); \
686	return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
687	}
688
689	TLV_PUT_DEFINE_INT(`8`)
690	TLV_PUT_DEFINE_INT(`32`)
691	TLV_PUT_DEFINE_INT(`64`)
692
693	static int tlv_put_string(struct send_ctx *sctx, u16 attr,
694	const char str, int* len)
695	{
696	if (len == -`1`)
697	len = strlen(str);
698	return tlv_put(sctx, attr, data: str, len);
699	}
700
701	static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
702	const u8 *uuid)
703	{
704	return tlv_put(sctx, attr, data: uuid, BTRFS_UUID_SIZE);
705	}
706
707	static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
708	struct extent_buffer *eb,
709	struct btrfs_timespec *ts)
710	{
711	struct btrfs_timespec bts;
712	read_extent_buffer(eb, dst: &bts, start: (unsigned long)ts, len: sizeof(bts));
713	return tlv_put(sctx, attr, data: &bts, len: sizeof(bts));
714	}
715
716
/*
 * Convenience wrappers around the tlv_put*() helpers. They all require the
 * calling function to have a local 'int ret' and a 'tlv_put_failure' label,
 * which is jumped to when adding the attribute fails.
 */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
/*
 * NOTE(review): expands to tlv_put_u16(), which is not among the visible
 * TLV_PUT_DEFINE_INT() instantiations - confirm before using this macro.
 */
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
				     fs_path_len((p))); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
760
761	static int send_header(struct send_ctx *sctx)
762	{
763	struct btrfs_stream_header hdr;
764
765	strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
766	hdr.version = cpu_to_le32(sctx->proto);
767	return write_buf(filp: sctx->send_filp, buf: &hdr, len: sizeof(hdr),
768	off: &sctx->send_off);
769	}
770
771	/*
772	* For each command/item we want to send to userspace, we call this function.
773	*/
774	static int begin_cmd(struct send_ctx sctx, int* cmd)
775	{
776	struct btrfs_cmd_header *hdr;
777
778	if (WARN_ON(!sctx->send_buf))
779	return -EINVAL;
780
781	if (unlikely(sctx->send_size != `0`)) {
782	btrfs_err(sctx->send_root->fs_info,
783	"send: command header buffer not empty cmd %d offset %llu",
784	cmd, sctx->send_off);
785	return -EINVAL;
786	}
787
788	sctx->send_size += sizeof(*hdr);
789	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
790	put_unaligned_le16(val: cmd, p: &hdr->cmd);
791
792	return `0`;
793	}
794
795	static int send_cmd(struct send_ctx *sctx)
796	{
797	int ret;
798	struct btrfs_cmd_header *hdr;
799	u32 crc;
800
801	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
802	put_unaligned_le32(val: sctx->send_size - sizeof(*hdr), p: &hdr->len);
803	put_unaligned_le32(val: `0`, p: &hdr->crc);
804
805	crc = crc32c(crc: `0`, p: (unsigned char *)sctx->send_buf, len: sctx->send_size);
806	put_unaligned_le32(val: crc, p: &hdr->crc);
807
808	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
809	off: &sctx->send_off);
810
811	sctx->send_size = `0`;
812	sctx->put_data = false;
813
814	return ret;
815	}
816
817	/*
818	* Sends a move instruction to user space
819	*/
820	static int send_rename(struct send_ctx *sctx,
821	struct fs_path from, struct* fs_path *to)
822	{
823	int ret;
824
825	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RENAME);
826	if (ret < `0`)
827	return ret;
828
829	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
830	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
831
832	ret = send_cmd(sctx);
833
834	tlv_put_failure:
835	return ret;
836	}
837
838	/*
839	* Sends a link instruction to user space
840	*/
841	static int send_link(struct send_ctx *sctx,
842	struct fs_path path, struct* fs_path *lnk)
843	{
844	int ret;
845
846	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_LINK);
847	if (ret < `0`)
848	return ret;
849
850	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
851	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
852
853	ret = send_cmd(sctx);
854
855	tlv_put_failure:
856	return ret;
857	}
858
859	/*
860	* Sends an unlink instruction to user space
861	*/
862	static int send_unlink(struct send_ctx sctx, struct* fs_path *path)
863	{
864	int ret;
865
866	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UNLINK);
867	if (ret < `0`)
868	return ret;
869
870	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
871
872	ret = send_cmd(sctx);
873
874	tlv_put_failure:
875	return ret;
876	}
877
878	/*
879	* Sends a rmdir instruction to user space
880	*/
881	static int send_rmdir(struct send_ctx sctx, struct* fs_path *path)
882	{
883	int ret;
884
885	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RMDIR);
886	if (ret < `0`)
887	return ret;
888
889	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
890
891	ret = send_cmd(sctx);
892
893	tlv_put_failure:
894	return ret;
895	}
896
/*
 * Fields of an inode item as read by get_inode_info(). All values are stored
 * as u64 here regardless of their on-disk width.
 */
struct btrfs_inode_info {
	u64 size;
	u64 gen;
	u64 mode;
	u64 uid;
	u64 gid;
	u64 rdev;
	/* Raw value of the inode item's flags field. */
	u64 fileattr;
	u64 nlink;
};
907
908	/*
909	* Helper function to retrieve some fields from an inode item.
910	*/
911	static int get_inode_info(struct btrfs_root *root, u64 ino,
912	struct btrfs_inode_info *info)
913	{
914	int ret;
915	BTRFS_PATH_AUTO_FREE(path);
916	struct btrfs_inode_item *ii;
917	struct btrfs_key key;
918
919	path = alloc_path_for_send();
920	if (!path)
921	return -ENOMEM;
922
923	key.objectid = ino;
924	key.type = BTRFS_INODE_ITEM_KEY;
925	key.offset = `0`;
926	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
927	if (ret) {
928	if (ret > `0`)
929	ret = -ENOENT;
930	return ret;
931	}
932
933	if (!info)
934	return `0`;
935
936	ii = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
937	struct btrfs_inode_item);
938	info->size = btrfs_inode_size(eb: path->nodes[`0`], s: ii);
939	info->gen = btrfs_inode_generation(eb: path->nodes[`0`], s: ii);
940	info->mode = btrfs_inode_mode(eb: path->nodes[`0`], s: ii);
941	info->uid = btrfs_inode_uid(eb: path->nodes[`0`], s: ii);
942	info->gid = btrfs_inode_gid(eb: path->nodes[`0`], s: ii);
943	info->rdev = btrfs_inode_rdev(eb: path->nodes[`0`], s: ii);
944	info->nlink = btrfs_inode_nlink(eb: path->nodes[`0`], s: ii);
945	/*
946	* Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
947	* otherwise logically split to 32/32 parts.
948	*/
949	info->fileattr = btrfs_inode_flags(eb: path->nodes[`0`], s: ii);
950
951	return `0`;
952	}
953
954	static int get_inode_gen(struct btrfs_root root, u64 ino, u64 gen)
955	{
956	int ret;
957	struct btrfs_inode_info info = { `0` };
958
959	ASSERT(gen);
960
961	ret = get_inode_info(root, ino, info: &info);
962	*gen = info.gen;
963	return ret;
964	}
965
966	typedef int (iterate_inode_ref_t)(u64 dir, struct* fs_path p, void* *ctx);
967
968	/*
969	* Helper function to iterate the entries in ONE btrfs_inode_ref or
970	* btrfs_inode_extref.
971	* The iterate callback may return a non zero value to stop iteration. This can
972	* be a negative value for error codes or 1 to simply stop it.
973	*
974	* path must point to the INODE_REF or INODE_EXTREF when called.
975	*/
976	static int iterate_inode_ref(struct btrfs_root root, struct* btrfs_path *path,
977	struct btrfs_key *found_key, bool resolve,
978	iterate_inode_ref_t iterate, void *ctx)
979	{
980	struct extent_buffer *eb = path->nodes[`0`];
981	struct btrfs_inode_ref *iref;
982	struct btrfs_inode_extref *extref;
983	BTRFS_PATH_AUTO_FREE(tmp_path);
984	struct fs_path *p;
985	u32 cur = `0`;
986	u32 total;
987	int slot = path->slots[`0`];
988	u32 name_len;
989	char *start;
990	int ret = `0`;
991	u64 dir;
992	unsigned long name_off;
993	unsigned long elem_size;
994	unsigned long ptr;
995
996	p = fs_path_alloc_reversed();
997	if (!p)
998	return -ENOMEM;
999
1000	tmp_path = alloc_path_for_send();
1001	if (!tmp_path) {
1002	fs_path_free(p);
1003	return -ENOMEM;
1004	}
1005
1006
1007	if (found_key->type == BTRFS_INODE_REF_KEY) {
1008	ptr = (unsigned long)btrfs_item_ptr(eb, slot,
1009	struct btrfs_inode_ref);
1010	total = btrfs_item_size(eb, slot);
1011	elem_size = sizeof(*iref);
1012	} else {
1013	ptr = btrfs_item_ptr_offset(eb, slot);
1014	total = btrfs_item_size(eb, slot);
1015	elem_size = sizeof(*extref);
1016	}
1017
1018	while (cur < total) {
1019	fs_path_reset(p);
1020
1021	if (found_key->type == BTRFS_INODE_REF_KEY) {
1022	iref = (struct btrfs_inode_ref *)(ptr + cur);
1023	name_len = btrfs_inode_ref_name_len(eb, s: iref);
1024	name_off = (unsigned long)(iref + `1`);
1025	dir = found_key->offset;
1026	} else {
1027	extref = (struct btrfs_inode_extref *)(ptr + cur);
1028	name_len = btrfs_inode_extref_name_len(eb, s: extref);
1029	name_off = (unsigned long)&extref->name;
1030	dir = btrfs_inode_extref_parent(eb, s: extref);
1031	}
1032
1033	if (resolve) {
1034	start = btrfs_ref_to_path(fs_root: root, path: tmp_path, name_len,
1035	name_off, eb_in: eb, parent: dir,
1036	dest: p->buf, size: p->buf_len);
1037	if (IS_ERR(ptr: start)) {
1038	ret = PTR_ERR(ptr: start);
1039	goto out;
1040	}
1041	if (start < p->buf) {
1042	/ overflow , try again with larger buffer /
1043	ret = fs_path_ensure_buf(p,
1044	len: p->buf_len + p->buf - start);
1045	if (ret < `0`)
1046	goto out;
1047	start = btrfs_ref_to_path(fs_root: root, path: tmp_path,
1048	name_len, name_off,
1049	eb_in: eb, parent: dir,
1050	dest: p->buf, size: p->buf_len);
1051	if (IS_ERR(ptr: start)) {
1052	ret = PTR_ERR(ptr: start);
1053	goto out;
1054	}
1055	if (unlikely(start < p->buf)) {
1056	btrfs_err(root->fs_info,
1057	"send: path ref buffer underflow for key " BTRFS_KEY_FMT,
1058	BTRFS_KEY_FMT_VALUE(found_key));
1059	ret = -EINVAL;
1060	goto out;
1061	}
1062	}
1063	p->start = start;
1064	} else {
1065	ret = fs_path_add_from_extent_buffer(p, eb, off: name_off,
1066	len: name_len);
1067	if (ret < `0`)
1068	goto out;
1069	}
1070
1071	cur += elem_size + name_len;
1072	ret = iterate(dir, p, ctx);
1073	if (ret)
1074	goto out;
1075	}
1076
1077	out:
1078	fs_path_free(p);
1079	return ret;
1080	}
1081
/*
 * Callback of iterate_dir_item(): index of the entry within the item, its
 * key, its name and data (neither NUL terminated) and the user context.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  void *ctx);
1086
1087	/*
1088	* Helper function to iterate the entries in ONE btrfs_dir_item.
1089	* The iterate callback may return a non zero value to stop iteration. This can
1090	* be a negative value for error codes or 1 to simply stop it.
1091	*
1092	* path must point to the dir item when called.
1093	*/
1094	static int iterate_dir_item(struct btrfs_root root, struct* btrfs_path *path,
1095	iterate_dir_item_t iterate, void *ctx)
1096	{
1097	int ret = `0`;
1098	struct extent_buffer *eb;
1099	struct btrfs_dir_item *di;
1100	struct btrfs_key di_key;
1101	char *buf = NULL;
1102	int buf_len;
1103	u32 name_len;
1104	u32 data_len;
1105	u32 cur;
1106	u32 len;
1107	u32 total;
1108	int slot;
1109	int num;
1110
1111	/*
1112	* Start with a small buffer (1 page). If later we end up needing more
1113	* space, which can happen for xattrs on a fs with a leaf size greater
1114	* than the page size, attempt to increase the buffer. Typically xattr
1115	* values are small.
1116	*/
1117	buf_len = PATH_MAX;
1118	buf = kmalloc(buf_len, GFP_KERNEL);
1119	if (!buf) {
1120	ret = -ENOMEM;
1121	goto out;
1122	}
1123
1124	eb = path->nodes[`0`];
1125	slot = path->slots[`0`];
1126	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1127	cur = `0`;
1128	len = `0`;
1129	total = btrfs_item_size(eb, slot);
1130
1131	num = `0`;
1132	while (cur < total) {
1133	name_len = btrfs_dir_name_len(eb, s: di);
1134	data_len = btrfs_dir_data_len(eb, s: di);
1135	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
1136
1137	if (btrfs_dir_ftype(eb, item: di) == BTRFS_FT_XATTR) {
1138	if (unlikely(name_len > XATTR_NAME_MAX)) {
1139	ret = -ENAMETOOLONG;
1140	goto out;
1141	}
1142	if (unlikely(name_len + data_len >
1143	BTRFS_MAX_XATTR_SIZE(root->fs_info))) {
1144	ret = -E2BIG;
1145	goto out;
1146	}
1147	} else {
1148	/*
1149	* Path too long
1150	*/
1151	if (unlikely(name_len + data_len > PATH_MAX)) {
1152	ret = -ENAMETOOLONG;
1153	goto out;
1154	}
1155	}
1156
1157	if (name_len + data_len > buf_len) {
1158	buf_len = name_len + data_len;
1159	if (is_vmalloc_addr(x: buf)) {
1160	vfree(addr: buf);
1161	buf = NULL;
1162	} else {
1163	char *tmp = krealloc(buf, buf_len,
1164	GFP_KERNEL \| __GFP_NOWARN);
1165
1166	if (!tmp)
1167	kfree(objp: buf);
1168	buf = tmp;
1169	}
1170	if (!buf) {
1171	buf = kvmalloc(buf_len, GFP_KERNEL);
1172	if (!buf) {
1173	ret = -ENOMEM;
1174	goto out;
1175	}
1176	}
1177	}
1178
1179	read_extent_buffer(eb, dst: buf, start: (unsigned long)(di + `1`),
1180	len: name_len + data_len);
1181
1182	len = sizeof(*di) + name_len + data_len;
1183	di = (struct btrfs_dir_item )((char* *)di + len);
1184	cur += len;
1185
1186	ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1187	data_len, ctx);
1188	if (ret < `0`)
1189	goto out;
1190	if (ret) {
1191	ret = `0`;
1192	goto out;
1193	}
1194
1195	num++;
1196	}
1197
1198	out:
1199	kvfree(addr: buf);
1200	return ret;
1201	}
1202
1203	static int __copy_first_ref(u64 dir, struct fs_path p, void* *ctx)
1204	{
1205	int ret;
1206	struct fs_path *pt = ctx;
1207
1208	ret = fs_path_copy(p: pt, from: p);
1209	if (ret < `0`)
1210	return ret;
1211
1212	/ we want the first only /
1213	return `1`;
1214	}
1215
1216	/*
1217	* Retrieve the first path of an inode. If an inode has more then one
1218	* ref/hardlink, this is ignored.
1219	*/
1220	static int get_inode_path(struct btrfs_root *root,
1221	u64 ino, struct fs_path *path)
1222	{
1223	int ret;
1224	struct btrfs_key key, found_key;
1225	BTRFS_PATH_AUTO_FREE(p);
1226
1227	p = alloc_path_for_send();
1228	if (!p)
1229	return -ENOMEM;
1230
1231	fs_path_reset(p: path);
1232
1233	key.objectid = ino;
1234	key.type = BTRFS_INODE_REF_KEY;
1235	key.offset = `0`;
1236
1237	ret = btrfs_search_slot_for_read(root, key: &key, p, find_higher: `1`, return_any: `0`);
1238	if (ret < `0`)
1239	return ret;
1240	if (ret)
1241	return `1`;
1242
1243	btrfs_item_key_to_cpu(eb: p->nodes[`0`], cpu_key: &found_key, nr: p->slots[`0`]);
1244	if (found_key.objectid != ino \|\|
1245	(found_key.type != BTRFS_INODE_REF_KEY &&
1246	found_key.type != BTRFS_INODE_EXTREF_KEY))
1247	return -ENOENT;
1248
1249	ret = iterate_inode_ref(root, path: p, found_key: &found_key, resolve: true, iterate: __copy_first_ref, ctx: path);
1250	if (ret < `0`)
1251	return ret;
1252	return `0`;
1253	}
1254
1255	struct backref_ctx {
1256	struct send_ctx *sctx;
1257
1258	/ number of total found references /
1259	u64 found;
1260
1261	/*
1262	* used for clones found in send_root. clones found behind cur_objectid
1263	* and cur_offset are not considered as allowed clones.
1264	*/
1265	u64 cur_objectid;
1266	u64 cur_offset;
1267
1268	/ may be truncated in case it's the last extent in a file /
1269	u64 extent_len;
1270
1271	/ The bytenr the file extent item we are processing refers to. /
1272	u64 bytenr;
1273	/ The owner (root id) of the data backref for the current extent. /
1274	u64 backref_owner;
1275	/ The offset of the data backref for the current extent. /
1276	u64 backref_offset;
1277	};
1278
1279	static int __clone_root_cmp_bsearch(const void key, const* void *elt)
1280	{
1281	u64 root = (u64)(uintptr_t)key;
1282	const struct clone_root *cr = elt;
1283
1284	if (root < btrfs_root_id(root: cr->root))
1285	return -`1`;
1286	if (root > btrfs_root_id(root: cr->root))
1287	return `1`;
1288	return `0`;
1289	}
1290
1291	static int __clone_root_cmp_sort(const void e1, const* void *e2)
1292	{
1293	const struct clone_root *cr1 = e1;
1294	const struct clone_root *cr2 = e2;
1295
1296	if (btrfs_root_id(root: cr1->root) < btrfs_root_id(root: cr2->root))
1297	return -`1`;
1298	if (btrfs_root_id(root: cr1->root) > btrfs_root_id(root: cr2->root))
1299	return `1`;
1300	return `0`;
1301	}
1302
1303	/*
1304	* Called for every backref that is found for the current extent.
1305	* Results are collected in sctx->clone_roots->ino/offset.
1306	*/
1307	static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
1308	void *ctx_)
1309	{
1310	struct backref_ctx *bctx = ctx_;
1311	struct clone_root *clone_root;
1312
1313	/ First check if the root is in the list of accepted clone sources /
1314	clone_root = bsearch(key: (void *)(uintptr_t)root_id, base: bctx->sctx->clone_roots,
1315	num: bctx->sctx->clone_roots_cnt,
1316	size: sizeof(struct clone_root),
1317	cmp: __clone_root_cmp_bsearch);
1318	if (!clone_root)
1319	return `0`;
1320
1321	/ This is our own reference, bail out as we can't clone from it. /
1322	if (clone_root->root == bctx->sctx->send_root &&
1323	ino == bctx->cur_objectid &&
1324	offset == bctx->cur_offset)
1325	return `0`;
1326
1327	/*
1328	* Make sure we don't consider clones from send_root that are
1329	* behind the current inode/offset.
1330	*/
1331	if (clone_root->root == bctx->sctx->send_root) {
1332	/*
1333	* If the source inode was not yet processed we can't issue a
1334	* clone operation, as the source extent does not exist yet at
1335	* the destination of the stream.
1336	*/
1337	if (ino > bctx->cur_objectid)
1338	return `0`;
1339	/*
1340	* We clone from the inode currently being sent as long as the
1341	* source extent is already processed, otherwise we could try
1342	* to clone from an extent that does not exist yet at the
1343	* destination of the stream.
1344	*/
1345	if (ino == bctx->cur_objectid &&
1346	offset + bctx->extent_len >
1347	bctx->sctx->cur_inode_next_write_offset)
1348	return `0`;
1349	}
1350
1351	bctx->found++;
1352	clone_root->found_ref = true;
1353
1354	/*
1355	* If the given backref refers to a file extent item with a larger
1356	* number of bytes than what we found before, use the new one so that
1357	* we clone more optimally and end up doing less writes and getting
1358	* less exclusive, non-shared extents at the destination.
1359	*/
1360	if (num_bytes > clone_root->num_bytes) {
1361	clone_root->ino = ino;
1362	clone_root->offset = offset;
1363	clone_root->num_bytes = num_bytes;
1364
1365	/*
1366	* Found a perfect candidate, so there's no need to continue
1367	* backref walking.
1368	*/
1369	if (num_bytes >= bctx->extent_len)
1370	return BTRFS_ITERATE_EXTENT_INODES_STOP;
1371	}
1372
1373	return `0`;
1374	}
1375
1376	static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
1377	const u64 *root_ids_ret, int* *root_count_ret)
1378	{
1379	struct backref_ctx *bctx = ctx;
1380	struct send_ctx *sctx = bctx->sctx;
1381	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1382	const u64 key = leaf_bytenr >> fs_info->nodesize_bits;
1383	struct btrfs_lru_cache_entry *raw_entry;
1384	struct backref_cache_entry *entry;
1385
1386	if (sctx->backref_cache.size == `0`)
1387	return false;
1388
1389	/*
1390	* If relocation happened since we first filled the cache, then we must
1391	* empty the cache and can not use it, because even though we operate on
1392	* read-only roots, their leaves and nodes may have been reallocated and
1393	* now be used for different nodes/leaves of the same tree or some other
1394	* tree.
1395	*
1396	* We are called from iterate_extent_inodes() while either holding a
1397	* transaction handle or holding fs_info->commit_root_sem, so no need
1398	* to take any lock here.
1399	*/
1400	if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
1401	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
1402	return false;
1403	}
1404
1405	raw_entry = btrfs_lru_cache_lookup(cache: &sctx->backref_cache, key, gen: `0`);
1406	if (!raw_entry)
1407	return false;
1408
1409	entry = container_of(raw_entry, struct backref_cache_entry, entry);
1410	*root_ids_ret = entry->root_ids;
1411	*root_count_ret = entry->num_roots;
1412
1413	return true;
1414	}
1415
1416	static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
1417	void *ctx)
1418	{
1419	struct backref_ctx *bctx = ctx;
1420	struct send_ctx *sctx = bctx->sctx;
1421	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1422	struct backref_cache_entry *new_entry;
1423	struct ulist_iterator uiter;
1424	struct ulist_node *node;
1425	int ret;
1426
1427	/*
1428	* We're called while holding a transaction handle or while holding
1429	* fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1430	* NOFS allocation.
1431	*/
1432	new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
1433	/ No worries, cache is optional. /
1434	if (!new_entry)
1435	return;
1436
1437	new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits;
1438	new_entry->entry.gen = `0`;
1439	new_entry->num_roots = `0`;
1440	ULIST_ITER_INIT(&uiter);
1441	while ((node = ulist_next(ulist: root_ids, uiter: &uiter)) != NULL) {
1442	const u64 root_id = node->val;
1443	struct clone_root *root;
1444
1445	root = bsearch(key: (void *)(uintptr_t)root_id, base: sctx->clone_roots,
1446	num: sctx->clone_roots_cnt, size: sizeof(struct clone_root),
1447	cmp: __clone_root_cmp_bsearch);
1448	if (!root)
1449	continue;
1450
1451	/ Too many roots, just exit, no worries as caching is optional. /
1452	if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
1453	kfree(objp: new_entry);
1454	return;
1455	}
1456
1457	new_entry->root_ids[new_entry->num_roots] = root_id;
1458	new_entry->num_roots++;
1459	}
1460
1461	/*
1462	* We may have not added any roots to the new cache entry, which means
1463	* none of the roots is part of the list of roots from which we are
1464	* allowed to clone. Cache the new entry as it's still useful to avoid
1465	* backref walking to determine which roots have a path to the leaf.
1466	*
1467	* Also use GFP_NOFS because we're called while holding a transaction
1468	* handle or while holding fs_info->commit_root_sem.
1469	*/
1470	ret = btrfs_lru_cache_store(cache: &sctx->backref_cache, new_entry: &new_entry->entry,
1471	GFP_NOFS);
1472	ASSERT(ret == `0` \|\| ret == -ENOMEM);
1473	if (ret) {
1474	/ Caching is optional, no worries. /
1475	kfree(objp: new_entry);
1476	return;
1477	}
1478
1479	/*
1480	* We are called from iterate_extent_inodes() while either holding a
1481	* transaction handle or holding fs_info->commit_root_sem, so no need
1482	* to take any lock here.
1483	*/
1484	if (sctx->backref_cache.size == `1`)
1485	sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
1486	}
1487
1488	static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
1489	const struct extent_buffer leaf, void* *ctx)
1490	{
1491	const u64 refs = btrfs_extent_refs(eb: leaf, s: ei);
1492	const struct backref_ctx *bctx = ctx;
1493	const struct send_ctx *sctx = bctx->sctx;
1494
1495	if (bytenr == bctx->bytenr) {
1496	const u64 flags = btrfs_extent_flags(eb: leaf, s: ei);
1497
1498	if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
1499	return -EUCLEAN;
1500
1501	/*
1502	* If we have only one reference and only the send root as a
1503	* clone source - meaning no clone roots were given in the
1504	* struct btrfs_ioctl_send_args passed to the send ioctl - then
1505	* it's our reference and there's no point in doing backref
1506	* walking which is expensive, so exit early.
1507	*/
1508	if (refs == `1` && sctx->clone_roots_cnt == `1`)
1509	return -ENOENT;
1510	}
1511
1512	/*
1513	* Backreference walking (iterate_extent_inodes() below) is currently
1514	* too expensive when an extent has a large number of references, both
1515	* in time spent and used memory. So for now just fallback to write
1516	* operations instead of clone operations when an extent has more than
1517	* a certain amount of references.
1518	*/
1519	if (refs > SEND_MAX_EXTENT_REFS)
1520	return -ENOENT;
1521
1522	return `0`;
1523	}
1524
1525	static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
1526	{
1527	const struct backref_ctx *bctx = ctx;
1528
1529	if (ino == bctx->cur_objectid &&
1530	root == bctx->backref_owner &&
1531	offset == bctx->backref_offset)
1532	return true;
1533
1534	return false;
1535	}
1536
1537	/*
1538	* Given an inode, offset and extent item, it finds a good clone for a clone
1539	* instruction. Returns -ENOENT when none could be found. The function makes
1540	* sure that the returned clone is usable at the point where sending is at the
1541	* moment. This means, that no clones are accepted which lie behind the current
1542	* inode+offset.
1543	*
1544	* path must point to the extent item when called.
1545	*/
1546	static int find_extent_clone(struct send_ctx *sctx,
1547	struct btrfs_path *path,
1548	u64 ino, u64 data_offset,
1549	u64 ino_size,
1550	struct clone_root **found)
1551	{
1552	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1553	int ret;
1554	int extent_type;
1555	u64 disk_byte;
1556	u64 num_bytes;
1557	struct btrfs_file_extent_item *fi;
1558	struct extent_buffer *eb = path->nodes[`0`];
1559	struct backref_ctx backref_ctx = { `0` };
1560	struct btrfs_backref_walk_ctx backref_walk_ctx = { `0` };
1561	struct clone_root *cur_clone_root;
1562	int compressed;
1563	u32 i;
1564
1565	/*
1566	* With fallocate we can get prealloc extents beyond the inode's i_size,
1567	* so we don't do anything here because clone operations can not clone
1568	* to a range beyond i_size without increasing the i_size of the
1569	* destination inode.
1570	*/
1571	if (data_offset >= ino_size)
1572	return `0`;
1573
1574	fi = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_file_extent_item);
1575	extent_type = btrfs_file_extent_type(eb, s: fi);
1576	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1577	return -ENOENT;
1578
1579	disk_byte = btrfs_file_extent_disk_bytenr(eb, s: fi);
1580	if (disk_byte == `0`)
1581	return -ENOENT;
1582
1583	compressed = btrfs_file_extent_compression(eb, s: fi);
1584	num_bytes = btrfs_file_extent_num_bytes(eb, s: fi);
1585
1586	/*
1587	* Setup the clone roots.
1588	*/
1589	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1590	cur_clone_root = sctx->clone_roots + i;
1591	cur_clone_root->ino = (u64)-`1`;
1592	cur_clone_root->offset = `0`;
1593	cur_clone_root->num_bytes = `0`;
1594	cur_clone_root->found_ref = false;
1595	}
1596
1597	backref_ctx.sctx = sctx;
1598	backref_ctx.cur_objectid = ino;
1599	backref_ctx.cur_offset = data_offset;
1600	backref_ctx.bytenr = disk_byte;
1601	/*
1602	* Use the header owner and not the send root's id, because in case of a
1603	* snapshot we can have shared subtrees.
1604	*/
1605	backref_ctx.backref_owner = btrfs_header_owner(eb);
1606	backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, s: fi);
1607
1608	/*
1609	* The last extent of a file may be too large due to page alignment.
1610	* We need to adjust extent_len in this case so that the checks in
1611	* iterate_backrefs() work.
1612	*/
1613	if (data_offset + num_bytes >= ino_size)
1614	backref_ctx.extent_len = ino_size - data_offset;
1615	else
1616	backref_ctx.extent_len = num_bytes;
1617
1618	/*
1619	* Now collect all backrefs.
1620	*/
1621	backref_walk_ctx.bytenr = disk_byte;
1622	if (compressed == BTRFS_COMPRESS_NONE)
1623	backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, s: fi);
1624	backref_walk_ctx.fs_info = fs_info;
1625	backref_walk_ctx.cache_lookup = lookup_backref_cache;
1626	backref_walk_ctx.cache_store = store_backref_cache;
1627	backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
1628	backref_walk_ctx.check_extent_item = check_extent_item;
1629	backref_walk_ctx.user_ctx = &backref_ctx;
1630
1631	/*
1632	* If have a single clone root, then it's the send root and we can tell
1633	* the backref walking code to skip our own backref and not resolve it,
1634	* since we can not use it for cloning - the source and destination
1635	* ranges can't overlap and in case the leaf is shared through a subtree
1636	* due to snapshots, we can't use those other roots since they are not
1637	* in the list of clone roots.
1638	*/
1639	if (sctx->clone_roots_cnt == `1`)
1640	backref_walk_ctx.skip_data_ref = skip_self_data_ref;
1641
1642	ret = iterate_extent_inodes(ctx: &backref_walk_ctx, search_commit_root: true, iterate: iterate_backrefs,
1643	user_ctx: &backref_ctx);
1644	if (ret < `0`)
1645	return ret;
1646
1647	down_read(sem: &fs_info->commit_root_sem);
1648	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
1649	/*
1650	* A transaction commit for a transaction in which block group
1651	* relocation was done just happened.
1652	* The disk_bytenr of the file extent item we processed is
1653	* possibly stale, referring to the extent's location before
1654	* relocation. So act as if we haven't found any clone sources
1655	* and fallback to write commands, which will read the correct
1656	* data from the new extent location. Otherwise we will fail
1657	* below because we haven't found our own back reference or we
1658	* could be getting incorrect sources in case the old extent
1659	* was already reallocated after the relocation.
1660	*/
1661	up_read(sem: &fs_info->commit_root_sem);
1662	return -ENOENT;
1663	}
1664	up_read(sem: &fs_info->commit_root_sem);
1665
1666	if (!backref_ctx.found)
1667	return -ENOENT;
1668
1669	cur_clone_root = NULL;
1670	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1671	struct clone_root *clone_root = &sctx->clone_roots[i];
1672
1673	if (!clone_root->found_ref)
1674	continue;
1675
1676	/*
1677	* Choose the root from which we can clone more bytes, to
1678	* minimize write operations and therefore have more extent
1679	* sharing at the destination (the same as in the source).
1680	*/
1681	if (!cur_clone_root \|\|
1682	clone_root->num_bytes > cur_clone_root->num_bytes) {
1683	cur_clone_root = clone_root;
1684
1685	/*
1686	* We found an optimal clone candidate (any inode from
1687	* any root is fine), so we're done.
1688	*/
1689	if (clone_root->num_bytes >= backref_ctx.extent_len)
1690	break;
1691	}
1692	}
1693
1694	if (cur_clone_root) {
1695	*found = cur_clone_root;
1696	ret = `0`;
1697	} else {
1698	ret = -ENOENT;
1699	}
1700
1701	return ret;
1702	}
1703
1704	static int read_symlink(struct btrfs_root *root,
1705	u64 ino,
1706	struct fs_path *dest)
1707	{
1708	int ret;
1709	BTRFS_PATH_AUTO_FREE(path);
1710	struct btrfs_key key;
1711	struct btrfs_file_extent_item *ei;
1712	u8 type;
1713	u8 compression;
1714	unsigned long off;
1715	int len;
1716
1717	path = alloc_path_for_send();
1718	if (!path)
1719	return -ENOMEM;
1720
1721	key.objectid = ino;
1722	key.type = BTRFS_EXTENT_DATA_KEY;
1723	key.offset = `0`;
1724	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
1725	if (ret < `0`)
1726	return ret;
1727	if (unlikely(ret)) {
1728	/*
1729	* An empty symlink inode. Can happen in rare error paths when
1730	* creating a symlink (transaction committed before the inode
1731	* eviction handler removed the symlink inode items and a crash
1732	* happened in between or the subvol was snapshotted in between).
1733	* Print an informative message to dmesg/syslog so that the user
1734	* can delete the symlink.
1735	*/
1736	btrfs_err(root->fs_info,
1737	"Found empty symlink inode %llu at root %llu",
1738	ino, btrfs_root_id(root));
1739	return -EIO;
1740	}
1741
1742	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
1743	struct btrfs_file_extent_item);
1744	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
1745	if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
1746	ret = -EUCLEAN;
1747	btrfs_crit(root->fs_info,
1748	"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1749	ino, btrfs_root_id(root), type);
1750	return ret;
1751	}
1752	compression = btrfs_file_extent_compression(eb: path->nodes[`0`], s: ei);
1753	if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
1754	ret = -EUCLEAN;
1755	btrfs_crit(root->fs_info,
1756	"send: found symlink extent with compression, ino %llu root %llu compression type %d",
1757	ino, btrfs_root_id(root), compression);
1758	return ret;
1759	}
1760
1761	off = btrfs_file_extent_inline_start(e: ei);
1762	len = btrfs_file_extent_ram_bytes(eb: path->nodes[`0`], s: ei);
1763
1764	return fs_path_add_from_extent_buffer(p: dest, eb: path->nodes[`0`], off, len);
1765	}
1766
1767	/*
1768	* Helper function to generate a file name that is unique in the root of
1769	* send_root and parent_root. This is used to generate names for orphan inodes.
1770	*/
1771	static int gen_unique_name(struct send_ctx *sctx,
1772	u64 ino, u64 gen,
1773	struct fs_path *dest)
1774	{
1775	BTRFS_PATH_AUTO_FREE(path);
1776	struct btrfs_dir_item *di;
1777	char tmp[`64`];
1778	int len;
1779	u64 idx = `0`;
1780
1781	path = alloc_path_for_send();
1782	if (!path)
1783	return -ENOMEM;
1784
1785	while (`1`) {
1786	struct fscrypt_str tmp_name;
1787
1788	len = snprintf(buf: tmp, size: sizeof(tmp), fmt: "o%llu-%llu-%llu",
1789	ino, gen, idx);
1790	ASSERT(len < sizeof(tmp));
1791	tmp_name.name = tmp;
1792	tmp_name.len = len;
1793
1794	di = btrfs_lookup_dir_item(NULL, root: sctx->send_root,
1795	path, BTRFS_FIRST_FREE_OBJECTID,
1796	name: &tmp_name, mod: `0`);
1797	btrfs_release_path(p: path);
1798	if (IS_ERR(ptr: di))
1799	return PTR_ERR(ptr: di);
1800
1801	if (di) {
1802	/ not unique, try again /
1803	idx++;
1804	continue;
1805	}
1806
1807	if (!sctx->parent_root) {
1808	/ unique /
1809	break;
1810	}
1811
1812	di = btrfs_lookup_dir_item(NULL, root: sctx->parent_root,
1813	path, BTRFS_FIRST_FREE_OBJECTID,
1814	name: &tmp_name, mod: `0`);
1815	btrfs_release_path(p: path);
1816	if (IS_ERR(ptr: di))
1817	return PTR_ERR(ptr: di);
1818
1819	if (di) {
1820	/ not unique, try again /
1821	idx++;
1822	continue;
1823	}
1824	/ unique /
1825	break;
1826	}
1827
1828	return fs_path_add(p: dest, name: tmp, name_len: len);
1829	}
1830
/*
 * State of an inode (number + generation) with respect to the send root and
 * the parent root, as computed by get_cur_inode_state(). The did/will
 * variants tell whether the inode number is below sctx->send_progress or not.
 */
enum inode_state {
	/* Found with the given generation in both roots. */
	inode_state_no_change = 0,
	/* Found only in the send root, inode number not below send_progress. */
	inode_state_will_create,
	/* Found only in the send root, inode number below send_progress. */
	inode_state_did_create,
	/* Found only in the parent root, inode number not below send_progress. */
	inode_state_will_delete,
	/* Found only in the parent root, inode number below send_progress. */
	inode_state_did_delete,
};
1838
1839	static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
1840	u64 send_gen, u64 parent_gen)
1841	{
1842	int ret;
1843	int left_ret;
1844	int right_ret;
1845	u64 left_gen;
1846	u64 right_gen = `0`;
1847	struct btrfs_inode_info info;
1848
1849	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
1850	if (ret < `0` && ret != -ENOENT)
1851	return ret;
1852	left_ret = (info.nlink == `0`) ? -ENOENT : ret;
1853	left_gen = info.gen;
1854	if (send_gen)
1855	*send_gen = ((left_ret == -ENOENT) ? `0` : info.gen);
1856
1857	if (!sctx->parent_root) {
1858	right_ret = -ENOENT;
1859	} else {
1860	ret = get_inode_info(root: sctx->parent_root, ino, info: &info);
1861	if (ret < `0` && ret != -ENOENT)
1862	return ret;
1863	right_ret = (info.nlink == `0`) ? -ENOENT : ret;
1864	right_gen = info.gen;
1865	if (parent_gen)
1866	*parent_gen = ((right_ret == -ENOENT) ? `0` : info.gen);
1867	}
1868
1869	if (!left_ret && !right_ret) {
1870	if (left_gen == gen && right_gen == gen) {
1871	ret = inode_state_no_change;
1872	} else if (left_gen == gen) {
1873	if (ino < sctx->send_progress)
1874	ret = inode_state_did_create;
1875	else
1876	ret = inode_state_will_create;
1877	} else if (right_gen == gen) {
1878	if (ino < sctx->send_progress)
1879	ret = inode_state_did_delete;
1880	else
1881	ret = inode_state_will_delete;
1882	} else {
1883	ret = -ENOENT;
1884	}
1885	} else if (!left_ret) {
1886	if (left_gen == gen) {
1887	if (ino < sctx->send_progress)
1888	ret = inode_state_did_create;
1889	else
1890	ret = inode_state_will_create;
1891	} else {
1892	ret = -ENOENT;
1893	}
1894	} else if (!right_ret) {
1895	if (right_gen == gen) {
1896	if (ino < sctx->send_progress)
1897	ret = inode_state_did_delete;
1898	else
1899	ret = inode_state_will_delete;
1900	} else {
1901	ret = -ENOENT;
1902	}
1903	} else {
1904	ret = -ENOENT;
1905	}
1906
1907	return ret;
1908	}
1909
1910	static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
1911	u64 send_gen, u64 parent_gen)
1912	{
1913	int ret;
1914
1915	if (ino == BTRFS_FIRST_FREE_OBJECTID)
1916	return `1`;
1917
1918	ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
1919	if (ret < `0`)
1920	return ret;
1921
1922	if (ret == inode_state_no_change \|\|
1923	ret == inode_state_did_create \|\|
1924	ret == inode_state_will_delete)
1925	return `1`;
1926
1927	return `0`;
1928	}
1929
1930	/*
1931	* Helper function to lookup a dir item in a dir.
1932	*/
1933	static int lookup_dir_item_inode(struct btrfs_root *root,
1934	u64 dir, const char name, int* name_len,
1935	u64 *found_inode)
1936	{
1937	int ret = `0`;
1938	struct btrfs_dir_item *di;
1939	struct btrfs_key key;
1940	BTRFS_PATH_AUTO_FREE(path);
1941	struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
1942
1943	path = alloc_path_for_send();
1944	if (!path)
1945	return -ENOMEM;
1946
1947	di = btrfs_lookup_dir_item(NULL, root, path, dir, name: &name_str, mod: `0`);
1948	if (IS_ERR_OR_NULL(ptr: di))
1949	return di ? PTR_ERR(ptr: di) : -ENOENT;
1950
1951	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &key);
1952	if (key.type == BTRFS_ROOT_ITEM_KEY)
1953	return -ENOENT;
1954
1955	*found_inode = key.objectid;
1956
1957	return ret;
1958	}
1959
1960	/*
1961	* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1962	* generation of the parent dir and the name of the dir entry.
1963	*/
1964	static int get_first_ref(struct btrfs_root *root, u64 ino,
1965	u64 dir, u64 dir_gen, struct fs_path *name)
1966	{
1967	int ret;
1968	struct btrfs_key key;
1969	struct btrfs_key found_key;
1970	BTRFS_PATH_AUTO_FREE(path);
1971	int len;
1972	u64 parent_dir;
1973
1974	path = alloc_path_for_send();
1975	if (!path)
1976	return -ENOMEM;
1977
1978	key.objectid = ino;
1979	key.type = BTRFS_INODE_REF_KEY;
1980	key.offset = `0`;
1981
1982	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `1`, return_any: `0`);
1983	if (ret < `0`)
1984	return ret;
1985	if (!ret)
1986	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
1987	nr: path->slots[`0`]);
1988	if (ret \|\| found_key.objectid != ino \|\|
1989	(found_key.type != BTRFS_INODE_REF_KEY &&
1990	found_key.type != BTRFS_INODE_EXTREF_KEY))
1991	return -ENOENT;
1992
1993	if (found_key.type == BTRFS_INODE_REF_KEY) {
1994	struct btrfs_inode_ref *iref;
1995	iref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
1996	struct btrfs_inode_ref);
1997	len = btrfs_inode_ref_name_len(eb: path->nodes[`0`], s: iref);
1998	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
1999	off: (unsigned long)(iref + `1`),
2000	len);
2001	parent_dir = found_key.offset;
2002	} else {
2003	struct btrfs_inode_extref *extref;
2004	extref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2005	struct btrfs_inode_extref);
2006	len = btrfs_inode_extref_name_len(eb: path->nodes[`0`], s: extref);
2007	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
2008	off: (unsigned long)&extref->name, len);
2009	parent_dir = btrfs_inode_extref_parent(eb: path->nodes[`0`], s: extref);
2010	}
2011	if (ret < `0`)
2012	return ret;
2013	btrfs_release_path(p: path);
2014
2015	if (dir_gen) {
2016	ret = get_inode_gen(root, ino: parent_dir, gen: dir_gen);
2017	if (ret < `0`)
2018	return ret;
2019	}
2020
2021	*dir = parent_dir;
2022
2023	return ret;
2024	}
2025
2026	static int is_first_ref(struct btrfs_root *root,
2027	u64 ino, u64 dir,
2028	const char name, int* name_len)
2029	{
2030	int ret;
2031	struct fs_path *tmp_name;
2032	u64 tmp_dir;
2033
2034	tmp_name = fs_path_alloc();
2035	if (!tmp_name)
2036	return -ENOMEM;
2037
2038	ret = get_first_ref(root, ino, dir: &tmp_dir, NULL, name: tmp_name);
2039	if (ret < `0`)
2040	goto out;
2041
2042	if (dir != tmp_dir \|\| name_len != fs_path_len(p: tmp_name)) {
2043	ret = `0`;
2044	goto out;
2045	}
2046
2047	ret = !memcmp(p: tmp_name->start, q: name, size: name_len);
2048
2049	out:
2050	fs_path_free(p: tmp_name);
2051	return ret;
2052	}
2053
2054	/*
2055	* Used by process_recorded_refs to determine if a new ref would overwrite an
2056	* already existing ref. In case it detects an overwrite, it returns the
2057	* inode/gen in who_ino/who_gen.
2058	* When an overwrite is detected, process_recorded_refs does proper orphanizing
2059	* to make sure later references to the overwritten inode are possible.
2060	* Orphanizing is however only required for the first ref of an inode.
2061	* process_recorded_refs does an additional is_first_ref check to see if
2062	* orphanizing is really required.
2063	*/
2064	static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2065	const char name, int* name_len,
2066	u64 who_ino, u64 who_gen, u64 *who_mode)
2067	{
2068	int ret;
2069	u64 parent_root_dir_gen;
2070	u64 other_inode = `0`;
2071	struct btrfs_inode_info info;
2072
2073	if (!sctx->parent_root)
2074	return `0`;
2075
2076	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, NULL, parent_gen: &parent_root_dir_gen);
2077	if (ret <= `0`)
2078	return `0`;
2079
2080	/*
2081	* If we have a parent root we need to verify that the parent dir was
2082	* not deleted and then re-created, if it was then we have no overwrite
2083	* and we can just unlink this entry.
2084	*
2085	* @parent_root_dir_gen was set to 0 if the inode does not exist in the
2086	* parent root.
2087	*/
2088	if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
2089	parent_root_dir_gen != dir_gen)
2090	return `0`;
2091
2092	ret = lookup_dir_item_inode(root: sctx->parent_root, dir, name, name_len,
2093	found_inode: &other_inode);
2094	if (ret == -ENOENT)
2095	return `0`;
2096	else if (ret < `0`)
2097	return ret;
2098
2099	/*
2100	* Check if the overwritten ref was already processed. If yes, the ref
2101	* was already unlinked/moved, so we can safely assume that we will not
2102	* overwrite anything at this point in time.
2103	*/
2104	if (other_inode > sctx->send_progress \|\|
2105	is_waiting_for_move(sctx, ino: other_inode)) {
2106	ret = get_inode_info(root: sctx->parent_root, ino: other_inode, info: &info);
2107	if (ret < `0`)
2108	return ret;
2109
2110	*who_ino = other_inode;
2111	*who_gen = info.gen;
2112	*who_mode = info.mode;
2113	return `1`;
2114	}
2115
2116	return `0`;
2117	}
2118
2119	/*
2120	* Checks if the ref was overwritten by an already processed inode. This is
2121	* used by __get_cur_name_and_parent to find out if the ref was orphanized and
2122	* thus the orphan name needs be used.
2123	* process_recorded_refs also uses it to avoid unlinking of refs that were
2124	* overwritten.
2125	*/
2126	static int did_overwrite_ref(struct send_ctx *sctx,
2127	u64 dir, u64 dir_gen,
2128	u64 ino, u64 ino_gen,
2129	const char name, int* name_len)
2130	{
2131	int ret;
2132	u64 ow_inode;
2133	u64 ow_gen = `0`;
2134	u64 send_root_dir_gen;
2135
2136	if (!sctx->parent_root)
2137	return `0`;
2138
2139	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, send_gen: &send_root_dir_gen, NULL);
2140	if (ret <= `0`)
2141	return ret;
2142
2143	/*
2144	* @send_root_dir_gen was set to 0 if the inode does not exist in the
2145	* send root.
2146	*/
2147	if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
2148	return `0`;
2149
2150	/ check if the ref was overwritten by another ref /
2151	ret = lookup_dir_item_inode(root: sctx->send_root, dir, name, name_len,
2152	found_inode: &ow_inode);
2153	if (ret == -ENOENT) {
2154	/ was never and will never be overwritten /
2155	return `0`;
2156	} else if (ret < `0`) {
2157	return ret;
2158	}
2159
2160	if (ow_inode == ino) {
2161	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2162	if (ret < `0`)
2163	return ret;
2164
2165	/ It's the same inode, so no overwrite happened. /
2166	if (ow_gen == ino_gen)
2167	return `0`;
2168	}
2169
2170	/*
2171	* We know that it is or will be overwritten. Check this now.
2172	* The current inode being processed might have been the one that caused
2173	* inode 'ino' to be orphanized, therefore check if ow_inode matches
2174	* the current inode being processed.
2175	*/
2176	if (ow_inode < sctx->send_progress)
2177	return `1`;
2178
2179	if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
2180	if (ow_gen == `0`) {
2181	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2182	if (ret < `0`)
2183	return ret;
2184	}
2185	if (ow_gen == sctx->cur_inode_gen)
2186	return `1`;
2187	}
2188
2189	return `0`;
2190	}
2191
2192	/*
2193	* Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2194	* that got overwritten. This is used by process_recorded_refs to determine
2195	* if it has to use the path as returned by get_cur_path or the orphan name.
2196	*/
2197	static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2198	{
2199	int ret = `0`;
2200	struct fs_path *name = NULL;
2201	u64 dir;
2202	u64 dir_gen;
2203
2204	if (!sctx->parent_root)
2205	goto out;
2206
2207	name = fs_path_alloc();
2208	if (!name)
2209	return -ENOMEM;
2210
2211	ret = get_first_ref(root: sctx->parent_root, ino, dir: &dir, dir_gen: &dir_gen, name);
2212	if (ret < `0`)
2213	goto out;
2214
2215	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, ino_gen: gen,
2216	name: name->start, name_len: fs_path_len(p: name));
2217
2218	out:
2219	fs_path_free(p: name);
2220	return ret;
2221	}
2222
2223	static inline struct name_cache_entry name_cache_search(struct* send_ctx *sctx,
2224	u64 ino, u64 gen)
2225	{
2226	struct btrfs_lru_cache_entry *entry;
2227
2228	entry = btrfs_lru_cache_lookup(cache: &sctx->name_cache, key: ino, gen);
2229	if (!entry)
2230	return NULL;
2231
2232	return container_of(entry, struct name_cache_entry, entry);
2233	}
2234
2235	/*
2236	* Used by get_cur_path for each ref up to the root.
2237	* Returns 0 if it succeeded.
2238	* Returns 1 if the inode is not existent or got overwritten. In that case, the
2239	* name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2240	* is returned, parent_ino/parent_gen are not guaranteed to be valid.
2241	* Returns <0 in case of error.
2242	*/
2243	static int __get_cur_name_and_parent(struct send_ctx *sctx,
2244	u64 ino, u64 gen,
2245	u64 *parent_ino,
2246	u64 *parent_gen,
2247	struct fs_path *dest)
2248	{
2249	int ret;
2250	int nce_ret;
2251	struct name_cache_entry *nce;
2252
2253	/*
2254	* First check if we already did a call to this function with the same
2255	* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2256	* return the cached result.
2257	*/
2258	nce = name_cache_search(sctx, ino, gen);
2259	if (nce) {
2260	if (ino < sctx->send_progress && nce->need_later_update) {
2261	btrfs_lru_cache_remove(cache: &sctx->name_cache, entry: &nce->entry);
2262	nce = NULL;
2263	} else {
2264	*parent_ino = nce->parent_ino;
2265	*parent_gen = nce->parent_gen;
2266	ret = fs_path_add(p: dest, name: nce->name, name_len: nce->name_len);
2267	if (ret < `0`)
2268	return ret;
2269	return nce->ret;
2270	}
2271	}
2272
2273	/*
2274	* If the inode is not existent yet, add the orphan name and return 1.
2275	* This should only happen for the parent dir that we determine in
2276	* record_new_ref_if_needed().
2277	*/
2278	ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
2279	if (ret < `0`)
2280	return ret;
2281
2282	if (!ret) {
2283	ret = gen_unique_name(sctx, ino, gen, dest);
2284	if (ret < `0`)
2285	return ret;
2286	ret = `1`;
2287	goto out_cache;
2288	}
2289
2290	/*
2291	* Depending on whether the inode was already processed or not, use
2292	* send_root or parent_root for ref lookup.
2293	*/
2294	if (ino < sctx->send_progress)
2295	ret = get_first_ref(root: sctx->send_root, ino,
2296	dir: parent_ino, dir_gen: parent_gen, name: dest);
2297	else
2298	ret = get_first_ref(root: sctx->parent_root, ino,
2299	dir: parent_ino, dir_gen: parent_gen, name: dest);
2300	if (ret < `0`)
2301	return ret;
2302
2303	/*
2304	* Check if the ref was overwritten by an inode's ref that was processed
2305	* earlier. If yes, treat as orphan and return 1.
2306	*/
2307	ret = did_overwrite_ref(sctx, dir: parent_ino, dir_gen: parent_gen, ino, ino_gen: gen,
2308	name: dest->start, name_len: fs_path_len(p: dest));
2309	if (ret < `0`)
2310	return ret;
2311	if (ret) {
2312	fs_path_reset(p: dest);
2313	ret = gen_unique_name(sctx, ino, gen, dest);
2314	if (ret < `0`)
2315	return ret;
2316	ret = `1`;
2317	}
2318
2319	out_cache:
2320	/*
2321	* Store the result of the lookup in the name cache.
2322	*/
2323	nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
2324	if (!nce)
2325	return -ENOMEM;
2326
2327	nce->entry.key = ino;
2328	nce->entry.gen = gen;
2329	nce->parent_ino = *parent_ino;
2330	nce->parent_gen = *parent_gen;
2331	nce->name_len = fs_path_len(p: dest);
2332	nce->ret = ret;
2333	memcpy(nce->name, dest->start, nce->name_len);
2334
2335	if (ino < sctx->send_progress)
2336	nce->need_later_update = `0`;
2337	else
2338	nce->need_later_update = `1`;
2339
2340	nce_ret = btrfs_lru_cache_store(cache: &sctx->name_cache, new_entry: &nce->entry, GFP_KERNEL);
2341	if (nce_ret < `0`) {
2342	kfree(objp: nce);
2343	return nce_ret;
2344	}
2345
2346	return ret;
2347	}
2348
2349	/*
2350	* Magic happens here. This function returns the first ref to an inode as it
2351	* would look like while receiving the stream at this point in time.
2352	* We walk the path up to the root. For every inode in between, we check if it
2353	* was already processed/sent. If yes, we continue with the parent as found
2354	* in send_root. If not, we continue with the parent as found in parent_root.
2355	* If we encounter an inode that was deleted at this point in time, we use the
2356	* inodes "orphan" name instead of the real name and stop. Same with new inodes
2357	* that were not created yet and overwritten inodes/refs.
2358	*
2359	* When do we have orphan inodes:
2360	* 1. When an inode is freshly created and thus no valid refs are available yet
2361	* 2. When a directory lost all it's refs (deleted) but still has dir items
2362	* inside which were not processed yet (pending for move/delete). If anyone
2363	* tried to get the path to the dir items, it would get a path inside that
2364	* orphan directory.
2365	* 3. When an inode is moved around or gets new links, it may overwrite the ref
2366	* of an unprocessed inode. If in that case the first ref would be
2367	* overwritten, the overwritten inode gets "orphanized". Later when we
2368	* process this overwritten inode, it is restored at a new place by moving
2369	* the orphan inode.
2370	*
2371	* sctx->send_progress tells this function at which point in time receiving
2372	* would be.
2373	*/
2374	static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2375	struct fs_path *dest)
2376	{
2377	int ret = `0`;
2378	struct fs_path *name = NULL;
2379	u64 parent_inode = `0`;
2380	u64 parent_gen = `0`;
2381	int stop = `0`;
2382	const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
2383
2384	if (is_cur_inode && fs_path_len(p: &sctx->cur_inode_path) > `0`) {
2385	if (dest != &sctx->cur_inode_path)
2386	return fs_path_copy(p: dest, from: &sctx->cur_inode_path);
2387
2388	return `0`;
2389	}
2390
2391	name = fs_path_alloc();
2392	if (!name) {
2393	ret = -ENOMEM;
2394	goto out;
2395	}
2396
2397	dest->reversed = `1`;
2398	fs_path_reset(p: dest);
2399
2400	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2401	struct waiting_dir_move *wdm;
2402
2403	fs_path_reset(p: name);
2404
2405	if (is_waiting_for_rm(sctx, dir_ino: ino, gen)) {
2406	ret = gen_unique_name(sctx, ino, gen, dest: name);
2407	if (ret < `0`)
2408	goto out;
2409	ret = fs_path_add_path(p: dest, p2: name);
2410	break;
2411	}
2412
2413	wdm = get_waiting_dir_move(sctx, ino);
2414	if (wdm && wdm->orphanized) {
2415	ret = gen_unique_name(sctx, ino, gen, dest: name);
2416	stop = `1`;
2417	} else if (wdm) {
2418	ret = get_first_ref(root: sctx->parent_root, ino,
2419	dir: &parent_inode, dir_gen: &parent_gen, name);
2420	} else {
2421	ret = __get_cur_name_and_parent(sctx, ino, gen,
2422	parent_ino: &parent_inode,
2423	parent_gen: &parent_gen, dest: name);
2424	if (ret)
2425	stop = `1`;
2426	}
2427
2428	if (ret < `0`)
2429	goto out;
2430
2431	ret = fs_path_add_path(p: dest, p2: name);
2432	if (ret < `0`)
2433	goto out;
2434
2435	ino = parent_inode;
2436	gen = parent_gen;
2437	}
2438
2439	out:
2440	fs_path_free(p: name);
2441	if (!ret) {
2442	fs_path_unreverse(p: dest);
2443	if (is_cur_inode && dest != &sctx->cur_inode_path)
2444	ret = fs_path_copy(p: &sctx->cur_inode_path, from: dest);
2445	}
2446
2447	return ret;
2448	}
2449
2450	/*
2451	* Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2452	*/
2453	static int send_subvol_begin(struct send_ctx *sctx)
2454	{
2455	int ret;
2456	struct btrfs_root *send_root = sctx->send_root;
2457	struct btrfs_root *parent_root = sctx->parent_root;
2458	BTRFS_PATH_AUTO_FREE(path);
2459	struct btrfs_key key;
2460	struct btrfs_root_ref *ref;
2461	struct extent_buffer *leaf;
2462	char AUTO_KFREE(name);
2463	int namelen;
2464
2465	path = btrfs_alloc_path();
2466	if (!path)
2467	return -ENOMEM;
2468
2469	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2470	if (!name)
2471	return -ENOMEM;
2472
2473	key.objectid = btrfs_root_id(root: send_root);
2474	key.type = BTRFS_ROOT_BACKREF_KEY;
2475	key.offset = `0`;
2476
2477	ret = btrfs_search_slot_for_read(root: send_root->fs_info->tree_root,
2478	key: &key, p: path, find_higher: `1`, return_any: `0`);
2479	if (ret < `0`)
2480	return ret;
2481	if (ret)
2482	return -ENOENT;
2483
2484	leaf = path->nodes[`0`];
2485	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
2486	if (key.type != BTRFS_ROOT_BACKREF_KEY \|\|
2487	key.objectid != btrfs_root_id(root: send_root)) {
2488	return -ENOENT;
2489	}
2490	ref = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_root_ref);
2491	namelen = btrfs_root_ref_name_len(eb: leaf, s: ref);
2492	read_extent_buffer(eb: leaf, dst: name, start: (unsigned long)(ref + `1`), len: namelen);
2493	btrfs_release_path(p: path);
2494
2495	if (parent_root) {
2496	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SNAPSHOT);
2497	if (ret < `0`)
2498	return ret;
2499	} else {
2500	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SUBVOL);
2501	if (ret < `0`)
2502	return ret;
2503	}
2504
2505	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2506
2507	if (!btrfs_is_empty_uuid(uuid: sctx->send_root->root_item.received_uuid))
2508	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2509	sctx->send_root->root_item.received_uuid);
2510	else
2511	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2512	sctx->send_root->root_item.uuid);
2513
2514	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2515	btrfs_root_ctransid(&sctx->send_root->root_item));
2516	if (parent_root) {
2517	if (!btrfs_is_empty_uuid(uuid: parent_root->root_item.received_uuid))
2518	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2519	parent_root->root_item.received_uuid);
2520	else
2521	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2522	parent_root->root_item.uuid);
2523	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2524	btrfs_root_ctransid(&sctx->parent_root->root_item));
2525	}
2526
2527	ret = send_cmd(sctx);
2528
2529	tlv_put_failure:
2530	return ret;
2531	}
2532
2533	static struct fs_path get_cur_inode_path(struct* send_ctx *sctx)
2534	{
2535	if (fs_path_len(p: &sctx->cur_inode_path) == `0`) {
2536	int ret;
2537
2538	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
2539	dest: &sctx->cur_inode_path);
2540	if (ret < `0`)
2541	return ERR_PTR(error: ret);
2542	}
2543
2544	return &sctx->cur_inode_path;
2545	}
2546
2547	static struct fs_path get_path_for_command(struct* send_ctx *sctx, u64 ino, u64 gen)
2548	{
2549	struct fs_path *path;
2550	int ret;
2551
2552	if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
2553	return get_cur_inode_path(sctx);
2554
2555	path = fs_path_alloc();
2556	if (!path)
2557	return ERR_PTR(error: -ENOMEM);
2558
2559	ret = get_cur_path(sctx, ino, gen, dest: path);
2560	if (ret < `0`) {
2561	fs_path_free(p: path);
2562	return ERR_PTR(error: ret);
2563	}
2564
2565	return path;
2566	}
2567
2568	static void free_path_for_command(const struct send_ctx sctx, struct* fs_path *path)
2569	{
2570	if (path != &sctx->cur_inode_path)
2571	fs_path_free(p: path);
2572	}
2573
2574	static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2575	{
2576	int ret = `0`;
2577	struct fs_path *p;
2578
2579	p = get_path_for_command(sctx, ino, gen);
2580	if (IS_ERR(ptr: p))
2581	return PTR_ERR(ptr: p);
2582
2583	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_TRUNCATE);
2584	if (ret < `0`)
2585	goto out;
2586
2587	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2588	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2589
2590	ret = send_cmd(sctx);
2591
2592	tlv_put_failure:
2593	out:
2594	free_path_for_command(sctx, path: p);
2595	return ret;
2596	}
2597
2598	static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2599	{
2600	int ret = `0`;
2601	struct fs_path *p;
2602
2603	p = get_path_for_command(sctx, ino, gen);
2604	if (IS_ERR(ptr: p))
2605	return PTR_ERR(ptr: p);
2606
2607	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHMOD);
2608	if (ret < `0`)
2609	goto out;
2610
2611	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2612	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & `07777`);
2613
2614	ret = send_cmd(sctx);
2615
2616	tlv_put_failure:
2617	out:
2618	free_path_for_command(sctx, path: p);
2619	return ret;
2620	}
2621
2622	static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
2623	{
2624	int ret = `0`;
2625	struct fs_path *p;
2626
2627	if (sctx->proto < `2`)
2628	return `0`;
2629
2630	p = get_path_for_command(sctx, ino, gen);
2631	if (IS_ERR(ptr: p))
2632	return PTR_ERR(ptr: p);
2633
2634	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_FILEATTR);
2635	if (ret < `0`)
2636	goto out;
2637
2638	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2639	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
2640
2641	ret = send_cmd(sctx);
2642
2643	tlv_put_failure:
2644	out:
2645	free_path_for_command(sctx, path: p);
2646	return ret;
2647	}
2648
2649	static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2650	{
2651	int ret = `0`;
2652	struct fs_path *p;
2653
2654	p = get_path_for_command(sctx, ino, gen);
2655	if (IS_ERR(ptr: p))
2656	return PTR_ERR(ptr: p);
2657
2658	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHOWN);
2659	if (ret < `0`)
2660	goto out;
2661
2662	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2663	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2664	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2665
2666	ret = send_cmd(sctx);
2667
2668	tlv_put_failure:
2669	out:
2670	free_path_for_command(sctx, path: p);
2671	return ret;
2672	}
2673
2674	static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2675	{
2676	int ret = `0`;
2677	struct fs_path *p = NULL;
2678	struct btrfs_inode_item *ii;
2679	BTRFS_PATH_AUTO_FREE(path);
2680	struct extent_buffer *eb;
2681	struct btrfs_key key;
2682	int slot;
2683
2684	p = get_path_for_command(sctx, ino, gen);
2685	if (IS_ERR(ptr: p))
2686	return PTR_ERR(ptr: p);
2687
2688	path = alloc_path_for_send();
2689	if (!path) {
2690	ret = -ENOMEM;
2691	goto out;
2692	}
2693
2694	key.objectid = ino;
2695	key.type = BTRFS_INODE_ITEM_KEY;
2696	key.offset = `0`;
2697	ret = btrfs_search_slot(NULL, root: sctx->send_root, key: &key, p: path, ins_len: `0`, cow: `0`);
2698	if (ret > `0`)
2699	ret = -ENOENT;
2700	if (ret < `0`)
2701	goto out;
2702
2703	eb = path->nodes[`0`];
2704	slot = path->slots[`0`];
2705	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2706
2707	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UTIMES);
2708	if (ret < `0`)
2709	goto out;
2710
2711	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2712	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2713	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2714	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2715	if (sctx->proto >= `2`)
2716	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
2717
2718	ret = send_cmd(sctx);
2719
2720	tlv_put_failure:
2721	out:
2722	free_path_for_command(sctx, path: p);
2723	return ret;
2724	}
2725
2726	/*
2727	* If the cache is full, we can't remove entries from it and do a call to
2728	* send_utimes() for each respective inode, because we might be finishing
2729	* processing an inode that is a directory and it just got renamed, and existing
2730	* entries in the cache may refer to inodes that have the directory in their
2731	* full path - in which case we would generate outdated paths (pre-rename)
2732	* for the inodes that the cache entries point to. Instead of pruning the
2733	* cache when inserting, do it after we finish processing each inode at
2734	* finish_inode_if_needed().
2735	*/
2736	static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
2737	{
2738	struct btrfs_lru_cache_entry *entry;
2739	int ret;
2740
2741	entry = btrfs_lru_cache_lookup(cache: &sctx->dir_utimes_cache, key: dir, gen);
2742	if (entry != NULL)
2743	return `0`;
2744
2745	/ Caching is optional, don't fail if we can't allocate memory. /
2746	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2747	if (!entry)
2748	return send_utimes(sctx, ino: dir, gen);
2749
2750	entry->key = dir;
2751	entry->gen = gen;
2752
2753	ret = btrfs_lru_cache_store(cache: &sctx->dir_utimes_cache, new_entry: entry, GFP_KERNEL);
2754	ASSERT(ret != -EEXIST);
2755	if (ret) {
2756	kfree(objp: entry);
2757	return send_utimes(sctx, ino: dir, gen);
2758	}
2759
2760	return `0`;
2761	}
2762
2763	static int trim_dir_utimes_cache(struct send_ctx *sctx)
2764	{
2765	while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
2766	struct btrfs_lru_cache_entry *lru;
2767	int ret;
2768
2769	lru = btrfs_lru_cache_lru_entry(cache: &sctx->dir_utimes_cache);
2770	ASSERT(lru != NULL);
2771
2772	ret = send_utimes(sctx, ino: lru->key, gen: lru->gen);
2773	if (ret)
2774	return ret;
2775
2776	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry: lru);
2777	}
2778
2779	return `0`;
2780	}
2781
2782	/*
2783	* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2784	* a valid path yet because we did not process the refs yet. So, the inode
2785	* is created as orphan.
2786	*/
2787	static int send_create_inode(struct send_ctx *sctx, u64 ino)
2788	{
2789	int ret = `0`;
2790	struct fs_path *p;
2791	int cmd;
2792	struct btrfs_inode_info info;
2793	u64 gen;
2794	u64 mode;
2795	u64 rdev;
2796
2797	p = fs_path_alloc();
2798	if (!p)
2799	return -ENOMEM;
2800
2801	if (ino != sctx->cur_ino) {
2802	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
2803	if (ret < `0`)
2804	goto out;
2805	gen = info.gen;
2806	mode = info.mode;
2807	rdev = info.rdev;
2808	} else {
2809	gen = sctx->cur_inode_gen;
2810	mode = sctx->cur_inode_mode;
2811	rdev = sctx->cur_inode_rdev;
2812	}
2813
2814	if (S_ISREG(mode)) {
2815	cmd = BTRFS_SEND_C_MKFILE;
2816	} else if (S_ISDIR(mode)) {
2817	cmd = BTRFS_SEND_C_MKDIR;
2818	} else if (S_ISLNK(mode)) {
2819	cmd = BTRFS_SEND_C_SYMLINK;
2820	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode)) {
2821	cmd = BTRFS_SEND_C_MKNOD;
2822	} else if (S_ISFIFO(mode)) {
2823	cmd = BTRFS_SEND_C_MKFIFO;
2824	} else if (S_ISSOCK(mode)) {
2825	cmd = BTRFS_SEND_C_MKSOCK;
2826	} else {
2827	btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2828	(int)(mode & S_IFMT));
2829	ret = -EOPNOTSUPP;
2830	goto out;
2831	}
2832
2833	ret = begin_cmd(sctx, cmd);
2834	if (ret < `0`)
2835	goto out;
2836
2837	ret = gen_unique_name(sctx, ino, gen, dest: p);
2838	if (ret < `0`)
2839	goto out;
2840
2841	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2842	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2843
2844	if (S_ISLNK(mode)) {
2845	fs_path_reset(p);
2846	ret = read_symlink(root: sctx->send_root, ino, dest: p);
2847	if (ret < `0`)
2848	goto out;
2849	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2850	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode) \|\|
2851	S_ISFIFO(mode) \|\| S_ISSOCK(mode)) {
2852	TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2853	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2854	}
2855
2856	ret = send_cmd(sctx);
2857	if (ret < `0`)
2858	goto out;
2859
2860
2861	tlv_put_failure:
2862	out:
2863	fs_path_free(p);
2864	return ret;
2865	}
2866
2867	static void cache_dir_created(struct send_ctx *sctx, u64 dir)
2868	{
2869	struct btrfs_lru_cache_entry *entry;
2870	int ret;
2871
2872	/ Caching is optional, ignore any failures. /
2873	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
2874	if (!entry)
2875	return;
2876
2877	entry->key = dir;
2878	entry->gen = `0`;
2879	ret = btrfs_lru_cache_store(cache: &sctx->dir_created_cache, new_entry: entry, GFP_KERNEL);
2880	if (ret < `0`)
2881	kfree(objp: entry);
2882	}
2883
2884	/*
2885	* We need some special handling for inodes that get processed before the parent
2886	* directory got created. See process_recorded_refs for details.
2887	* This function does the check if we already created the dir out of order.
2888	*/
2889	static int did_create_dir(struct send_ctx *sctx, u64 dir)
2890	{
2891	int ret = `0`;
2892	int iter_ret = `0`;
2893	BTRFS_PATH_AUTO_FREE(path);
2894	struct btrfs_key key;
2895	struct btrfs_key found_key;
2896	struct btrfs_key di_key;
2897	struct btrfs_dir_item *di;
2898
2899	if (btrfs_lru_cache_lookup(cache: &sctx->dir_created_cache, key: dir, gen: `0`))
2900	return `1`;
2901
2902	path = alloc_path_for_send();
2903	if (!path)
2904	return -ENOMEM;
2905
2906	key.objectid = dir;
2907	key.type = BTRFS_DIR_INDEX_KEY;
2908	key.offset = `0`;
2909
2910	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
2911	struct extent_buffer *eb = path->nodes[`0`];
2912
2913	if (found_key.objectid != key.objectid \|\|
2914	found_key.type != key.type) {
2915	ret = `0`;
2916	break;
2917	}
2918
2919	di = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_dir_item);
2920	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
2921
2922	if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2923	di_key.objectid < sctx->send_progress) {
2924	ret = `1`;
2925	cache_dir_created(sctx, dir);
2926	break;
2927	}
2928	}
2929	/ Catch error found during iteration /
2930	if (iter_ret < `0`)
2931	ret = iter_ret;
2932
2933	return ret;
2934	}
2935
2936	/*
2937	* Only creates the inode if it is:
2938	* 1. Not a directory
2939	* 2. Or a directory which was not created already due to out of order
2940	* directories. See did_create_dir and process_recorded_refs for details.
2941	*/
2942	static int send_create_inode_if_needed(struct send_ctx *sctx)
2943	{
2944	int ret;
2945
2946	if (S_ISDIR(sctx->cur_inode_mode)) {
2947	ret = did_create_dir(sctx, dir: sctx->cur_ino);
2948	if (ret < `0`)
2949	return ret;
2950	else if (ret > `0`)
2951	return `0`;
2952	}
2953
2954	ret = send_create_inode(sctx, ino: sctx->cur_ino);
2955
2956	if (ret == `0` && S_ISDIR(sctx->cur_inode_mode))
2957	cache_dir_created(sctx, dir: sctx->cur_ino);
2958
2959	return ret;
2960	}
2961
2962	struct recorded_ref {
2963	struct list_head list;
2964	char *name;
2965	struct fs_path *full_path;
2966	u64 dir;
2967	u64 dir_gen;
2968	int name_len;
2969	struct rb_node node;
2970	struct rb_root *root;
2971	};
2972
2973	static struct recorded_ref recorded_ref_alloc(void*)
2974	{
2975	struct recorded_ref *ref;
2976
2977	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
2978	if (!ref)
2979	return NULL;
2980	RB_CLEAR_NODE(&ref->node);
2981	INIT_LIST_HEAD(list: &ref->list);
2982	return ref;
2983	}
2984
2985	static void recorded_ref_free(struct recorded_ref *ref)
2986	{
2987	if (!ref)
2988	return;
2989	if (!RB_EMPTY_NODE(&ref->node))
2990	rb_erase(&ref->node, ref->root);
2991	list_del(entry: &ref->list);
2992	fs_path_free(p: ref->full_path);
2993	kfree(objp: ref);
2994	}
2995
2996	static void set_ref_path(struct recorded_ref ref, struct* fs_path *path)
2997	{
2998	ref->full_path = path;
2999	ref->name = (char *)kbasename(path: ref->full_path->start);
3000	ref->name_len = ref->full_path->end - ref->name;
3001	}
3002
3003	static int dup_ref(struct recorded_ref ref, struct* list_head *list)
3004	{
3005	struct recorded_ref *new;
3006
3007	new = recorded_ref_alloc();
3008	if (!new)
3009	return -ENOMEM;
3010
3011	new->dir = ref->dir;
3012	new->dir_gen = ref->dir_gen;
3013	list_add_tail(new: &new->list, head: list);
3014	return `0`;
3015	}
3016
3017	static void __free_recorded_refs(struct list_head *head)
3018	{
3019	struct recorded_ref *cur;
3020
3021	while (!list_empty(head)) {
3022	cur = list_first_entry(head, struct recorded_ref, list);
3023	recorded_ref_free(ref: cur);
3024	}
3025	}
3026
3027	static void free_recorded_refs(struct send_ctx *sctx)
3028	{
3029	__free_recorded_refs(head: &sctx->new_refs);
3030	__free_recorded_refs(head: &sctx->deleted_refs);
3031	}
3032
3033	/*
3034	* Renames/moves a file/dir to its orphan name. Used when the first
3035	* ref of an unprocessed inode gets overwritten and for all non empty
3036	* directories.
3037	*/
3038	static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
3039	struct fs_path *path)
3040	{
3041	int ret;
3042	struct fs_path *orphan;
3043
3044	orphan = fs_path_alloc();
3045	if (!orphan)
3046	return -ENOMEM;
3047
3048	ret = gen_unique_name(sctx, ino, gen, dest: orphan);
3049	if (ret < `0`)
3050	goto out;
3051
3052	ret = send_rename(sctx, from: path, to: orphan);
3053	if (ret < `0`)
3054	goto out;
3055
3056	if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
3057	ret = fs_path_copy(p: &sctx->cur_inode_path, from: orphan);
3058
3059	out:
3060	fs_path_free(p: orphan);
3061	return ret;
3062	}
3063
3064	static struct orphan_dir_info add_orphan_dir_info(struct* send_ctx *sctx,
3065	u64 dir_ino, u64 dir_gen)
3066	{
3067	struct rb_node **p = &sctx->orphan_dirs.rb_node;
3068	struct rb_node *parent = NULL;
3069	struct orphan_dir_info entry, odi;
3070
3071	while (*p) {
3072	parent = *p;
3073	entry = rb_entry(parent, struct orphan_dir_info, node);
3074	if (dir_ino < entry->ino)
3075	p = &(*p)->rb_left;
3076	else if (dir_ino > entry->ino)
3077	p = &(*p)->rb_right;
3078	else if (dir_gen < entry->gen)
3079	p = &(*p)->rb_left;
3080	else if (dir_gen > entry->gen)
3081	p = &(*p)->rb_right;
3082	else
3083	return entry;
3084	}
3085
3086	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
3087	if (!odi)
3088	return ERR_PTR(error: -ENOMEM);
3089	odi->ino = dir_ino;
3090	odi->gen = dir_gen;
3091	odi->last_dir_index_offset = `0`;
3092	odi->dir_high_seq_ino = `0`;
3093
3094	rb_link_node(node: &odi->node, parent, rb_link: p);
3095	rb_insert_color(&odi->node, &sctx->orphan_dirs);
3096	return odi;
3097	}
3098
3099	static struct orphan_dir_info get_orphan_dir_info(struct* send_ctx *sctx,
3100	u64 dir_ino, u64 gen)
3101	{
3102	struct rb_node *n = sctx->orphan_dirs.rb_node;
3103	struct orphan_dir_info *entry;
3104
3105	while (n) {
3106	entry = rb_entry(n, struct orphan_dir_info, node);
3107	if (dir_ino < entry->ino)
3108	n = n->rb_left;
3109	else if (dir_ino > entry->ino)
3110	n = n->rb_right;
3111	else if (gen < entry->gen)
3112	n = n->rb_left;
3113	else if (gen > entry->gen)
3114	n = n->rb_right;
3115	else
3116	return entry;
3117	}
3118	return NULL;
3119	}
3120
3121	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
3122	{
3123	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
3124
3125	return odi != NULL;
3126	}
3127
3128	static void free_orphan_dir_info(struct send_ctx *sctx,
3129	struct orphan_dir_info *odi)
3130	{
3131	if (!odi)
3132	return;
3133	rb_erase(&odi->node, &sctx->orphan_dirs);
3134	kfree(objp: odi);
3135	}
3136
3137	/*
3138	* Returns 1 if a directory can be removed at this point in time.
3139	* We check this by iterating all dir items and checking if the inode behind
3140	* the dir item was already processed.
3141	*/
3142	static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
3143	{
3144	int ret = `0`;
3145	int iter_ret = `0`;
3146	struct btrfs_root *root = sctx->parent_root;
3147	struct btrfs_path *path;
3148	struct btrfs_key key;
3149	struct btrfs_key found_key;
3150	struct btrfs_key loc;
3151	struct btrfs_dir_item *di;
3152	struct orphan_dir_info *odi = NULL;
3153	u64 dir_high_seq_ino = `0`;
3154	u64 last_dir_index_offset = `0`;
3155
3156	/*
3157	* Don't try to rmdir the top/root subvolume dir.
3158	*/
3159	if (dir == BTRFS_FIRST_FREE_OBJECTID)
3160	return `0`;
3161
3162	odi = get_orphan_dir_info(sctx, dir_ino: dir, gen: dir_gen);
3163	if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
3164	return `0`;
3165
3166	path = alloc_path_for_send();
3167	if (!path)
3168	return -ENOMEM;
3169
3170	if (!odi) {
3171	/*
3172	* Find the inode number associated with the last dir index
3173	* entry. This is very likely the inode with the highest number
3174	* of all inodes that have an entry in the directory. We can
3175	* then use it to avoid future calls to can_rmdir(), when
3176	* processing inodes with a lower number, from having to search
3177	* the parent root b+tree for dir index keys.
3178	*/
3179	key.objectid = dir;
3180	key.type = BTRFS_DIR_INDEX_KEY;
3181	key.offset = (u64)-`1`;
3182
3183	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
3184	if (ret < `0`) {
3185	goto out;
3186	} else if (ret > `0`) {
3187	/ Can't happen, the root is never empty. /
3188	ASSERT(path->slots[`0`] > `0`);
3189	if (WARN_ON(path->slots[`0`] == `0`)) {
3190	ret = -EUCLEAN;
3191	goto out;
3192	}
3193	path->slots[`0`]--;
3194	}
3195
3196	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
3197	if (key.objectid != dir \|\| key.type != BTRFS_DIR_INDEX_KEY) {
3198	/ No index keys, dir can be removed. /
3199	ret = `1`;
3200	goto out;
3201	}
3202
3203	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3204	struct btrfs_dir_item);
3205	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3206	dir_high_seq_ino = loc.objectid;
3207	if (sctx->cur_ino < dir_high_seq_ino) {
3208	ret = `0`;
3209	goto out;
3210	}
3211
3212	btrfs_release_path(p: path);
3213	}
3214
3215	key.objectid = dir;
3216	key.type = BTRFS_DIR_INDEX_KEY;
3217	key.offset = (odi ? odi->last_dir_index_offset : `0`);
3218
3219	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
3220	struct waiting_dir_move *dm;
3221
3222	if (found_key.objectid != key.objectid \|\|
3223	found_key.type != key.type)
3224	break;
3225
3226	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3227	struct btrfs_dir_item);
3228	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3229
3230	dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
3231	last_dir_index_offset = found_key.offset;
3232
3233	dm = get_waiting_dir_move(sctx, ino: loc.objectid);
3234	if (dm) {
3235	dm->rmdir_ino = dir;
3236	dm->rmdir_gen = dir_gen;
3237	ret = `0`;
3238	goto out;
3239	}
3240
3241	if (loc.objectid > sctx->cur_ino) {
3242	ret = `0`;
3243	goto out;
3244	}
3245	}
3246	if (iter_ret < `0`) {
3247	ret = iter_ret;
3248	goto out;
3249	}
3250	free_orphan_dir_info(sctx, odi);
3251
3252	ret = `1`;
3253
3254	out:
3255	btrfs_free_path(p: path);
3256
3257	if (ret)
3258	return ret;
3259
3260	if (!odi) {
3261	odi = add_orphan_dir_info(sctx, dir_ino: dir, dir_gen);
3262	if (IS_ERR(ptr: odi))
3263	return PTR_ERR(ptr: odi);
3264
3265	odi->gen = dir_gen;
3266	}
3267
3268	odi->last_dir_index_offset = last_dir_index_offset;
3269	odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
3270
3271	return `0`;
3272	}
3273
3274	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3275	{
3276	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3277
3278	return entry != NULL;
3279	}
3280
3281	static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3282	{
3283	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3284	struct rb_node *parent = NULL;
3285	struct waiting_dir_move entry, dm;
3286
3287	dm = kmalloc(sizeof(*dm), GFP_KERNEL);
3288	if (!dm)
3289	return -ENOMEM;
3290	dm->ino = ino;
3291	dm->rmdir_ino = `0`;
3292	dm->rmdir_gen = `0`;
3293	dm->orphanized = orphanized;
3294
3295	while (*p) {
3296	parent = *p;
3297	entry = rb_entry(parent, struct waiting_dir_move, node);
3298	if (ino < entry->ino) {
3299	p = &(*p)->rb_left;
3300	} else if (ino > entry->ino) {
3301	p = &(*p)->rb_right;
3302	} else {
3303	kfree(objp: dm);
3304	return -EEXIST;
3305	}
3306	}
3307
3308	rb_link_node(node: &dm->node, parent, rb_link: p);
3309	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3310	return `0`;
3311	}
3312
3313	static struct waiting_dir_move *
3314	get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3315	{
3316	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3317	struct waiting_dir_move *entry;
3318
3319	while (n) {
3320	entry = rb_entry(n, struct waiting_dir_move, node);
3321	if (ino < entry->ino)
3322	n = n->rb_left;
3323	else if (ino > entry->ino)
3324	n = n->rb_right;
3325	else
3326	return entry;
3327	}
3328	return NULL;
3329	}
3330
3331	static void free_waiting_dir_move(struct send_ctx *sctx,
3332	struct waiting_dir_move *dm)
3333	{
3334	if (!dm)
3335	return;
3336	rb_erase(&dm->node, &sctx->waiting_dir_moves);
3337	kfree(objp: dm);
3338	}
3339
3340	static int add_pending_dir_move(struct send_ctx *sctx,
3341	u64 ino,
3342	u64 ino_gen,
3343	u64 parent_ino,
3344	struct list_head *new_refs,
3345	struct list_head *deleted_refs,
3346	const bool is_orphan)
3347	{
3348	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3349	struct rb_node *parent = NULL;
3350	struct pending_dir_move entry = NULL, pm;
3351	struct recorded_ref *cur;
3352	int exists = `0`;
3353	int ret;
3354
3355	pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3356	if (!pm)
3357	return -ENOMEM;
3358	pm->parent_ino = parent_ino;
3359	pm->ino = ino;
3360	pm->gen = ino_gen;
3361	INIT_LIST_HEAD(list: &pm->list);
3362	INIT_LIST_HEAD(list: &pm->update_refs);
3363	RB_CLEAR_NODE(&pm->node);
3364
3365	while (*p) {
3366	parent = *p;
3367	entry = rb_entry(parent, struct pending_dir_move, node);
3368	if (parent_ino < entry->parent_ino) {
3369	p = &(*p)->rb_left;
3370	} else if (parent_ino > entry->parent_ino) {
3371	p = &(*p)->rb_right;
3372	} else {
3373	exists = `1`;
3374	break;
3375	}
3376	}
3377
3378	list_for_each_entry(cur, deleted_refs, list) {
3379	ret = dup_ref(ref: cur, list: &pm->update_refs);
3380	if (ret < `0`)
3381	goto out;
3382	}
3383	list_for_each_entry(cur, new_refs, list) {
3384	ret = dup_ref(ref: cur, list: &pm->update_refs);
3385	if (ret < `0`)
3386	goto out;
3387	}
3388
3389	ret = add_waiting_dir_move(sctx, ino: pm->ino, orphanized: is_orphan);
3390	if (ret)
3391	goto out;
3392
3393	if (exists) {
3394	list_add_tail(new: &pm->list, head: &entry->list);
3395	} else {
3396	rb_link_node(node: &pm->node, parent, rb_link: p);
3397	rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3398	}
3399	ret = `0`;
3400	out:
3401	if (ret) {
3402	__free_recorded_refs(head: &pm->update_refs);
3403	kfree(objp: pm);
3404	}
3405	return ret;
3406	}
3407
3408	static struct pending_dir_move get_pending_dir_moves(struct* send_ctx *sctx,
3409	u64 parent_ino)
3410	{
3411	struct rb_node *n = sctx->pending_dir_moves.rb_node;
3412	struct pending_dir_move *entry;
3413
3414	while (n) {
3415	entry = rb_entry(n, struct pending_dir_move, node);
3416	if (parent_ino < entry->parent_ino)
3417	n = n->rb_left;
3418	else if (parent_ino > entry->parent_ino)
3419	n = n->rb_right;
3420	else
3421	return entry;
3422	}
3423	return NULL;
3424	}
3425
3426	static int path_loop(struct send_ctx sctx, struct* fs_path *name,
3427	u64 ino, u64 gen, u64 *ancestor_ino)
3428	{
3429	int ret = `0`;
3430	u64 parent_inode = `0`;
3431	u64 parent_gen = `0`;
3432	u64 start_ino = ino;
3433
3434	*ancestor_ino = `0`;
3435	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3436	fs_path_reset(p: name);
3437
3438	if (is_waiting_for_rm(sctx, dir_ino: ino, gen))
3439	break;
3440	if (is_waiting_for_move(sctx, ino)) {
3441	if (*ancestor_ino == `0`)
3442	*ancestor_ino = ino;
3443	ret = get_first_ref(root: sctx->parent_root, ino,
3444	dir: &parent_inode, dir_gen: &parent_gen, name);
3445	} else {
3446	ret = __get_cur_name_and_parent(sctx, ino, gen,
3447	parent_ino: &parent_inode,
3448	parent_gen: &parent_gen, dest: name);
3449	if (ret > `0`) {
3450	ret = `0`;
3451	break;
3452	}
3453	}
3454	if (ret < `0`)
3455	break;
3456	if (parent_inode == start_ino) {
3457	ret = `1`;
3458	if (*ancestor_ino == `0`)
3459	*ancestor_ino = ino;
3460	break;
3461	}
3462	ino = parent_inode;
3463	gen = parent_gen;
3464	}
3465	return ret;
3466	}
3467
3468	static int apply_dir_move(struct send_ctx sctx, struct* pending_dir_move *pm)
3469	{
3470	struct fs_path *from_path = NULL;
3471	struct fs_path *to_path = NULL;
3472	struct fs_path *name = NULL;
3473	u64 orig_progress = sctx->send_progress;
3474	struct recorded_ref *cur;
3475	u64 parent_ino, parent_gen;
3476	struct waiting_dir_move *dm = NULL;
3477	u64 rmdir_ino = `0`;
3478	u64 rmdir_gen;
3479	u64 ancestor;
3480	bool is_orphan;
3481	int ret;
3482
3483	name = fs_path_alloc();
3484	from_path = fs_path_alloc();
3485	if (!name \|\| !from_path) {
3486	ret = -ENOMEM;
3487	goto out;
3488	}
3489
3490	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3491	ASSERT(dm);
3492	rmdir_ino = dm->rmdir_ino;
3493	rmdir_gen = dm->rmdir_gen;
3494	is_orphan = dm->orphanized;
3495	free_waiting_dir_move(sctx, dm);
3496
3497	if (is_orphan) {
3498	ret = gen_unique_name(sctx, ino: pm->ino,
3499	gen: pm->gen, dest: from_path);
3500	} else {
3501	ret = get_first_ref(root: sctx->parent_root, ino: pm->ino,
3502	dir: &parent_ino, dir_gen: &parent_gen, name);
3503	if (ret < `0`)
3504	goto out;
3505	ret = get_cur_path(sctx, ino: parent_ino, gen: parent_gen,
3506	dest: from_path);
3507	if (ret < `0`)
3508	goto out;
3509	ret = fs_path_add_path(p: from_path, p2: name);
3510	}
3511	if (ret < `0`)
3512	goto out;
3513
3514	sctx->send_progress = sctx->cur_ino + `1`;
3515	ret = path_loop(sctx, name, ino: pm->ino, gen: pm->gen, ancestor_ino: &ancestor);
3516	if (ret < `0`)
3517	goto out;
3518	if (ret) {
3519	LIST_HEAD(deleted_refs);
3520	ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3521	ret = add_pending_dir_move(sctx, ino: pm->ino, ino_gen: pm->gen, parent_ino: ancestor,
3522	new_refs: &pm->update_refs, deleted_refs: &deleted_refs,
3523	is_orphan);
3524	if (ret < `0`)
3525	goto out;
3526	if (rmdir_ino) {
3527	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3528	ASSERT(dm);
3529	dm->rmdir_ino = rmdir_ino;
3530	dm->rmdir_gen = rmdir_gen;
3531	}
3532	goto out;
3533	}
3534	fs_path_reset(p: name);
3535	to_path = name;
3536	name = NULL;
3537	ret = get_cur_path(sctx, ino: pm->ino, gen: pm->gen, dest: to_path);
3538	if (ret < `0`)
3539	goto out;
3540
3541	ret = send_rename(sctx, from: from_path, to: to_path);
3542	if (ret < `0`)
3543	goto out;
3544
3545	if (rmdir_ino) {
3546	struct orphan_dir_info *odi;
3547	u64 gen;
3548
3549	odi = get_orphan_dir_info(sctx, dir_ino: rmdir_ino, gen: rmdir_gen);
3550	if (!odi) {
3551	/ already deleted /
3552	goto finish;
3553	}
3554	gen = odi->gen;
3555
3556	ret = can_rmdir(sctx, dir: rmdir_ino, dir_gen: gen);
3557	if (ret < `0`)
3558	goto out;
3559	if (!ret)
3560	goto finish;
3561
3562	name = fs_path_alloc();
3563	if (!name) {
3564	ret = -ENOMEM;
3565	goto out;
3566	}
3567	ret = get_cur_path(sctx, ino: rmdir_ino, gen, dest: name);
3568	if (ret < `0`)
3569	goto out;
3570	ret = send_rmdir(sctx, path: name);
3571	if (ret < `0`)
3572	goto out;
3573	}
3574
3575	finish:
3576	ret = cache_dir_utimes(sctx, dir: pm->ino, gen: pm->gen);
3577	if (ret < `0`)
3578	goto out;
3579
3580	/*
3581	* After rename/move, need to update the utimes of both new parent(s)
3582	* and old parent(s).
3583	*/
3584	list_for_each_entry(cur, &pm->update_refs, list) {
3585	/*
3586	* The parent inode might have been deleted in the send snapshot
3587	*/
3588	ret = get_inode_info(root: sctx->send_root, ino: cur->dir, NULL);
3589	if (ret == -ENOENT) {
3590	ret = `0`;
3591	continue;
3592	}
3593	if (ret < `0`)
3594	goto out;
3595
3596	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
3597	if (ret < `0`)
3598	goto out;
3599	}
3600
3601	out:
3602	fs_path_free(p: name);
3603	fs_path_free(p: from_path);
3604	fs_path_free(p: to_path);
3605	sctx->send_progress = orig_progress;
3606
3607	return ret;
3608	}
3609
3610	static void free_pending_move(struct send_ctx sctx, struct* pending_dir_move *m)
3611	{
3612	if (!list_empty(head: &m->list))
3613	list_del(entry: &m->list);
3614	if (!RB_EMPTY_NODE(&m->node))
3615	rb_erase(&m->node, &sctx->pending_dir_moves);
3616	__free_recorded_refs(head: &m->update_refs);
3617	kfree(objp: m);
3618	}
3619
3620	static void tail_append_pending_moves(struct send_ctx *sctx,
3621	struct pending_dir_move *moves,
3622	struct list_head *stack)
3623	{
3624	if (list_empty(head: &moves->list)) {
3625	list_add_tail(new: &moves->list, head: stack);
3626	} else {
3627	LIST_HEAD(list);
3628	list_splice_init(list: &moves->list, head: &list);
3629	list_add_tail(new: &moves->list, head: stack);
3630	list_splice_tail(list: &list, head: stack);
3631	}
3632	if (!RB_EMPTY_NODE(&moves->node)) {
3633	rb_erase(&moves->node, &sctx->pending_dir_moves);
3634	RB_CLEAR_NODE(&moves->node);
3635	}
3636	}
3637
3638	static int apply_children_dir_moves(struct send_ctx *sctx)
3639	{
3640	struct pending_dir_move *pm;
3641	LIST_HEAD(stack);
3642	u64 parent_ino = sctx->cur_ino;
3643	int ret = `0`;
3644
3645	pm = get_pending_dir_moves(sctx, parent_ino);
3646	if (!pm)
3647	return `0`;
3648
3649	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3650
3651	while (!list_empty(head: &stack)) {
3652	pm = list_first_entry(&stack, struct pending_dir_move, list);
3653	parent_ino = pm->ino;
3654	ret = apply_dir_move(sctx, pm);
3655	free_pending_move(sctx, m: pm);
3656	if (ret)
3657	goto out;
3658	pm = get_pending_dir_moves(sctx, parent_ino);
3659	if (pm)
3660	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3661	}
3662	return `0`;
3663
3664	out:
3665	while (!list_empty(head: &stack)) {
3666	pm = list_first_entry(&stack, struct pending_dir_move, list);
3667	free_pending_move(sctx, m: pm);
3668	}
3669	return ret;
3670	}
3671
3672	/*
3673	* We might need to delay a directory rename even when no ancestor directory
3674	* (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3675	* renamed. This happens when we rename a directory to the old name (the name
3676	* in the parent root) of some other unrelated directory that got its rename
3677	* delayed due to some ancestor with higher number that got renamed.
3678	*
3679	* Example:
3680	*
3681	* Parent snapshot:
3682	* . (ino 256)
3683	* \|---- a/ (ino 257)
3684	* \| \|---- file (ino 260)
3685	* \|
3686	* \|---- b/ (ino 258)
3687	* \|---- c/ (ino 259)
3688	*
3689	* Send snapshot:
3690	* . (ino 256)
3691	* \|---- a/ (ino 258)
3692	* \|---- x/ (ino 259)
3693	* \|---- y/ (ino 257)
3694	* \|----- file (ino 260)
3695	*
3696	* Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3697	* from 'a' to 'x/y' happening first, which in turn depends on the rename of
3698	* inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3699	* must issue is:
3700	*
3701	* 1 - rename 259 from 'c' to 'x'
3702	* 2 - rename 257 from 'a' to 'x/y'
3703	* 3 - rename 258 from 'b' to 'a'
3704	*
3705	* Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3706	* be done right away and < 0 on error.
3707	*/
3708	static int wait_for_dest_dir_move(struct send_ctx *sctx,
3709	struct recorded_ref *parent_ref,
3710	const bool is_orphan)
3711	{
3712	BTRFS_PATH_AUTO_FREE(path);
3713	struct btrfs_key key;
3714	struct btrfs_key di_key;
3715	struct btrfs_dir_item *di;
3716	u64 left_gen;
3717	u64 right_gen;
3718	int ret = `0`;
3719	struct waiting_dir_move *wdm;
3720
3721	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3722	return `0`;
3723
3724	path = alloc_path_for_send();
3725	if (!path)
3726	return -ENOMEM;
3727
3728	key.objectid = parent_ref->dir;
3729	key.type = BTRFS_DIR_ITEM_KEY;
3730	key.offset = btrfs_name_hash(name: parent_ref->name, len: parent_ref->name_len);
3731
3732	ret = btrfs_search_slot(NULL, root: sctx->parent_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3733	if (ret < `0`)
3734	return ret;
3735	if (ret > `0`)
3736	return `0`;
3737
3738	di = btrfs_match_dir_item_name(path, name: parent_ref->name,
3739	name_len: parent_ref->name_len);
3740	if (!di)
3741	return `0`;
3742	/*
3743	* di_key.objectid has the number of the inode that has a dentry in the
3744	* parent directory with the same name that sctx->cur_ino is being
3745	* renamed to. We need to check if that inode is in the send root as
3746	* well and if it is currently marked as an inode with a pending rename,
3747	* if it is, we need to delay the rename of sctx->cur_ino as well, so
3748	* that it happens after that other inode is renamed.
3749	*/
3750	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &di_key);
3751	if (di_key.type != BTRFS_INODE_ITEM_KEY)
3752	return `0`;
3753
3754	ret = get_inode_gen(root: sctx->parent_root, ino: di_key.objectid, gen: &left_gen);
3755	if (ret < `0`)
3756	return ret;
3757	ret = get_inode_gen(root: sctx->send_root, ino: di_key.objectid, gen: &right_gen);
3758	if (ret < `0`) {
3759	if (ret == -ENOENT)
3760	ret = `0`;
3761	return ret;
3762	}
3763
3764	/ Different inode, no need to delay the rename of sctx->cur_ino /
3765	if (right_gen != left_gen)
3766	return `0`;
3767
3768	wdm = get_waiting_dir_move(sctx, ino: di_key.objectid);
3769	if (wdm && !wdm->orphanized) {
3770	ret = add_pending_dir_move(sctx,
3771	ino: sctx->cur_ino,
3772	ino_gen: sctx->cur_inode_gen,
3773	parent_ino: di_key.objectid,
3774	new_refs: &sctx->new_refs,
3775	deleted_refs: &sctx->deleted_refs,
3776	is_orphan);
3777	if (!ret)
3778	ret = `1`;
3779	}
3780	return ret;
3781	}
3782
3783	/*
3784	* Check if inode ino2, or any of its ancestors, is inode ino1.
3785	* Return 1 if true, 0 if false and < 0 on error.
3786	*/
3787	static int check_ino_in_path(struct btrfs_root *root,
3788	const u64 ino1,
3789	const u64 ino1_gen,
3790	const u64 ino2,
3791	const u64 ino2_gen,
3792	struct fs_path *fs_path)
3793	{
3794	u64 ino = ino2;
3795
3796	if (ino1 == ino2)
3797	return ino1_gen == ino2_gen;
3798
3799	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3800	u64 parent;
3801	u64 parent_gen;
3802	int ret;
3803
3804	fs_path_reset(p: fs_path);
3805	ret = get_first_ref(root, ino, dir: &parent, dir_gen: &parent_gen, name: fs_path);
3806	if (ret < `0`)
3807	return ret;
3808	if (parent == ino1)
3809	return parent_gen == ino1_gen;
3810	ino = parent;
3811	}
3812	return `0`;
3813	}
3814
3815	/*
3816	* Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3817	* possible path (in case ino2 is not a directory and has multiple hard links).
3818	* Return 1 if true, 0 if false and < 0 on error.
3819	*/
3820	static int is_ancestor(struct btrfs_root *root,
3821	const u64 ino1,
3822	const u64 ino1_gen,
3823	const u64 ino2,
3824	struct fs_path *fs_path)
3825	{
3826	bool free_fs_path = false;
3827	int ret = `0`;
3828	int iter_ret = `0`;
3829	BTRFS_PATH_AUTO_FREE(path);
3830	struct btrfs_key key;
3831
3832	if (!fs_path) {
3833	fs_path = fs_path_alloc();
3834	if (!fs_path)
3835	return -ENOMEM;
3836	free_fs_path = true;
3837	}
3838
3839	path = alloc_path_for_send();
3840	if (!path) {
3841	ret = -ENOMEM;
3842	goto out;
3843	}
3844
3845	key.objectid = ino2;
3846	key.type = BTRFS_INODE_REF_KEY;
3847	key.offset = `0`;
3848
3849	btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
3850	struct extent_buffer *leaf = path->nodes[`0`];
3851	int slot = path->slots[`0`];
3852	u32 cur_offset = `0`;
3853	u32 item_size;
3854
3855	if (key.objectid != ino2)
3856	break;
3857	if (key.type != BTRFS_INODE_REF_KEY &&
3858	key.type != BTRFS_INODE_EXTREF_KEY)
3859	break;
3860
3861	item_size = btrfs_item_size(eb: leaf, slot);
3862	while (cur_offset < item_size) {
3863	u64 parent;
3864	u64 parent_gen;
3865
3866	if (key.type == BTRFS_INODE_EXTREF_KEY) {
3867	unsigned long ptr;
3868	struct btrfs_inode_extref *extref;
3869
3870	ptr = btrfs_item_ptr_offset(leaf, slot);
3871	extref = (struct btrfs_inode_extref *)
3872	(ptr + cur_offset);
3873	parent = btrfs_inode_extref_parent(eb: leaf,
3874	s: extref);
3875	cur_offset += sizeof(*extref);
3876	cur_offset += btrfs_inode_extref_name_len(eb: leaf,
3877	s: extref);
3878	} else {
3879	parent = key.offset;
3880	cur_offset = item_size;
3881	}
3882
3883	ret = get_inode_gen(root, ino: parent, gen: &parent_gen);
3884	if (ret < `0`)
3885	goto out;
3886	ret = check_ino_in_path(root, ino1, ino1_gen,
3887	ino2: parent, ino2_gen: parent_gen, fs_path);
3888	if (ret)
3889	goto out;
3890	}
3891	}
3892	ret = `0`;
3893	if (iter_ret < `0`)
3894	ret = iter_ret;
3895
3896	out:
3897	if (free_fs_path)
3898	fs_path_free(p: fs_path);
3899	return ret;
3900	}
3901
3902	static int wait_for_parent_move(struct send_ctx *sctx,
3903	struct recorded_ref *parent_ref,
3904	const bool is_orphan)
3905	{
3906	int ret = `0`;
3907	u64 ino = parent_ref->dir;
3908	u64 ino_gen = parent_ref->dir_gen;
3909	u64 parent_ino_before, parent_ino_after;
3910	struct fs_path *path_before = NULL;
3911	struct fs_path *path_after = NULL;
3912	int len1, len2;
3913
3914	path_after = fs_path_alloc();
3915	path_before = fs_path_alloc();
3916	if (!path_after \|\| !path_before) {
3917	ret = -ENOMEM;
3918	goto out;
3919	}
3920
3921	/*
3922	* Our current directory inode may not yet be renamed/moved because some
3923	* ancestor (immediate or not) has to be renamed/moved first. So find if
3924	* such ancestor exists and make sure our own rename/move happens after
3925	* that ancestor is processed to avoid path build infinite loops (done
3926	* at get_cur_path()).
3927	*/
3928	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3929	u64 parent_ino_after_gen;
3930
3931	if (is_waiting_for_move(sctx, ino)) {
3932	/*
3933	* If the current inode is an ancestor of ino in the
3934	* parent root, we need to delay the rename of the
3935	* current inode, otherwise don't delayed the rename
3936	* because we can end up with a circular dependency
3937	* of renames, resulting in some directories never
3938	* getting the respective rename operations issued in
3939	* the send stream or getting into infinite path build
3940	* loops.
3941	*/
3942	ret = is_ancestor(root: sctx->parent_root,
3943	ino1: sctx->cur_ino, ino1_gen: sctx->cur_inode_gen,
3944	ino2: ino, fs_path: path_before);
3945	if (ret)
3946	break;
3947	}
3948
3949	fs_path_reset(p: path_before);
3950	fs_path_reset(p: path_after);
3951
3952	ret = get_first_ref(root: sctx->send_root, ino, dir: &parent_ino_after,
3953	dir_gen: &parent_ino_after_gen, name: path_after);
3954	if (ret < `0`)
3955	goto out;
3956	ret = get_first_ref(root: sctx->parent_root, ino, dir: &parent_ino_before,
3957	NULL, name: path_before);
3958	if (ret < `0` && ret != -ENOENT) {
3959	goto out;
3960	} else if (ret == -ENOENT) {
3961	ret = `0`;
3962	break;
3963	}
3964
3965	len1 = fs_path_len(p: path_before);
3966	len2 = fs_path_len(p: path_after);
3967	if (ino > sctx->cur_ino &&
3968	(parent_ino_before != parent_ino_after \|\| len1 != len2 \|\|
3969	memcmp(p: path_before->start, q: path_after->start, size: len1))) {
3970	u64 parent_ino_gen;
3971
3972	ret = get_inode_gen(root: sctx->parent_root, ino, gen: &parent_ino_gen);
3973	if (ret < `0`)
3974	goto out;
3975	if (ino_gen == parent_ino_gen) {
3976	ret = `1`;
3977	break;
3978	}
3979	}
3980	ino = parent_ino_after;
3981	ino_gen = parent_ino_after_gen;
3982	}
3983
3984	out:
3985	fs_path_free(p: path_before);
3986	fs_path_free(p: path_after);
3987
3988	if (ret == `1`) {
3989	ret = add_pending_dir_move(sctx,
3990	ino: sctx->cur_ino,
3991	ino_gen: sctx->cur_inode_gen,
3992	parent_ino: ino,
3993	new_refs: &sctx->new_refs,
3994	deleted_refs: &sctx->deleted_refs,
3995	is_orphan);
3996	if (!ret)
3997	ret = `1`;
3998	}
3999
4000	return ret;
4001	}
4002
4003	static int update_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4004	{
4005	int ret;
4006	struct fs_path *new_path;
4007
4008	/*
4009	* Our reference's name member points to its full_path member string, so
4010	* we use here a new path.
4011	*/
4012	new_path = fs_path_alloc();
4013	if (!new_path)
4014	return -ENOMEM;
4015
4016	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: new_path);
4017	if (ret < `0`) {
4018	fs_path_free(p: new_path);
4019	return ret;
4020	}
4021	ret = fs_path_add(p: new_path, name: ref->name, name_len: ref->name_len);
4022	if (ret < `0`) {
4023	fs_path_free(p: new_path);
4024	return ret;
4025	}
4026
4027	fs_path_free(p: ref->full_path);
4028	set_ref_path(ref, path: new_path);
4029
4030	return `0`;
4031	}
4032
4033	/*
4034	* When processing the new references for an inode we may orphanize an existing
4035	* directory inode because its old name conflicts with one of the new references
4036	* of the current inode. Later, when processing another new reference of our
4037	* inode, we might need to orphanize another inode, but the path we have in the
4038	* reference reflects the pre-orphanization name of the directory we previously
4039	* orphanized. For example:
4040	*
4041	* parent snapshot looks like:
4042	*
4043	* . (ino 256)
4044	* \|----- f1 (ino 257)
4045	* \|----- f2 (ino 258)
4046	* \|----- d1/ (ino 259)
4047	* \|----- d2/ (ino 260)
4048	*
4049	* send snapshot looks like:
4050	*
4051	* . (ino 256)
4052	* \|----- d1 (ino 258)
4053	* \|----- f2/ (ino 259)
4054	* \|----- f2_link/ (ino 260)
4055	* \| \|----- f1 (ino 257)
4056	* \|
4057	* \|----- d2 (ino 258)
4058	*
4059	* When processing inode 257 we compute the name for inode 259 as "d1", and we
4060	* cache it in the name cache. Later when we start processing inode 258, when
4061	* collecting all its new references we set a full path of "d1/d2" for its new
4062	* reference with name "d2". When we start processing the new references we
4063	* start by processing the new reference with name "d1", and this results in
4064	* orphanizing inode 259, since its old reference causes a conflict. Then we
4065	* move on the next new reference, with name "d2", and we find out we must
4066	* orphanize inode 260, as its old reference conflicts with ours - but for the
4067	* orphanization we use a source path corresponding to the path we stored in the
4068	* new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4069	* receiver fail since the path component "d1/" no longer exists, it was renamed
4070	* to "o259-6-0/" when processing the previous new reference. So in this case we
4071	* must recompute the path in the new reference and use it for the new
4072	* orphanization operation.
4073	*/
4074	static int refresh_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4075	{
4076	char AUTO_KFREE(name);
4077	int ret;
4078
4079	name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
4080	if (!name)
4081	return -ENOMEM;
4082
4083	fs_path_reset(p: ref->full_path);
4084	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: ref->full_path);
4085	if (ret < `0`)
4086	return ret;
4087
4088	ret = fs_path_add(p: ref->full_path, name, name_len: ref->name_len);
4089	if (ret < `0`)
4090	return ret;
4091
4092	/ Update the reference's base name pointer. /
4093	set_ref_path(ref, path: ref->full_path);
4094
4095	return `0`;
4096	}
4097
4098	static int rbtree_check_dir_ref_comp(const void k, const* struct rb_node *node)
4099	{
4100	const struct recorded_ref *data = k;
4101	const struct recorded_ref ref = rb_entry(node, struct* recorded_ref, node);
4102
4103	if (data->dir > ref->dir)
4104	return `1`;
4105	if (data->dir < ref->dir)
4106	return -`1`;
4107	if (data->dir_gen > ref->dir_gen)
4108	return `1`;
4109	if (data->dir_gen < ref->dir_gen)
4110	return -`1`;
4111	return `0`;
4112	}
4113
4114	static bool rbtree_check_dir_ref_less(struct rb_node node, const* struct rb_node *parent)
4115	{
4116	const struct recorded_ref entry = rb_entry(node, struct* recorded_ref, node);
4117
4118	return rbtree_check_dir_ref_comp(k: entry, node: parent) < `0`;
4119	}
4120
4121	static int record_check_dir_ref_in_tree(struct rb_root *root,
4122	struct recorded_ref ref, struct* list_head *list)
4123	{
4124	struct recorded_ref *tmp_ref;
4125	int ret;
4126
4127	if (rb_find(key: ref, tree: root, cmp: rbtree_check_dir_ref_comp))
4128	return `0`;
4129
4130	ret = dup_ref(ref, list);
4131	if (ret < `0`)
4132	return ret;
4133
4134	tmp_ref = list_last_entry(list, struct recorded_ref, list);
4135	rb_add(node: &tmp_ref->node, tree: root, less: rbtree_check_dir_ref_less);
4136	tmp_ref->root = root;
4137	return `0`;
4138	}
4139
4140	static int rename_current_inode(struct send_ctx *sctx,
4141	struct fs_path *current_path,
4142	struct fs_path *new_path)
4143	{
4144	int ret;
4145
4146	ret = send_rename(sctx, from: current_path, to: new_path);
4147	if (ret < `0`)
4148	return ret;
4149
4150	ret = fs_path_copy(p: &sctx->cur_inode_path, from: new_path);
4151	if (ret < `0`)
4152	return ret;
4153
4154	return fs_path_copy(p: current_path, from: new_path);
4155	}
4156
4157	/*
4158	* This does all the move/link/unlink/rmdir magic.
4159	*/
4160	static int process_recorded_refs(struct send_ctx sctx, int* *pending_move)
4161	{
4162	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4163	int ret = `0`;
4164	struct recorded_ref *cur;
4165	struct recorded_ref *cur2;
4166	LIST_HEAD(check_dirs);
4167	struct rb_root rbtree_check_dirs = RB_ROOT;
4168	struct fs_path *valid_path = NULL;
4169	u64 ow_inode = `0`;
4170	u64 ow_gen;
4171	u64 ow_mode;
4172	bool did_overwrite = false;
4173	bool is_orphan = false;
4174	bool can_rename = true;
4175	bool orphanized_dir = false;
4176	bool orphanized_ancestor = false;
4177
4178	/*
4179	* This should never happen as the root dir always has the same ref
4180	* which is always '..'
4181	*/
4182	if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
4183	btrfs_err(fs_info,
4184	"send: unexpected inode %llu in process_recorded_refs()",
4185	sctx->cur_ino);
4186	ret = -EINVAL;
4187	goto out;
4188	}
4189
4190	valid_path = fs_path_alloc();
4191	if (!valid_path) {
4192	ret = -ENOMEM;
4193	goto out;
4194	}
4195
4196	/*
4197	* First, check if the first ref of the current inode was overwritten
4198	* before. If yes, we know that the current inode was already orphanized
4199	* and thus use the orphan name. If not, we can use get_cur_path to
4200	* get the path of the first ref as it would like while receiving at
4201	* this point in time.
4202	* New inodes are always orphan at the beginning, so force to use the
4203	* orphan name in this case.
4204	* The first ref is stored in valid_path and will be updated if it
4205	* gets moved around.
4206	*/
4207	if (!sctx->cur_inode_new) {
4208	ret = did_overwrite_first_ref(sctx, ino: sctx->cur_ino,
4209	gen: sctx->cur_inode_gen);
4210	if (ret < `0`)
4211	goto out;
4212	if (ret)
4213	did_overwrite = true;
4214	}
4215	if (sctx->cur_inode_new \|\| did_overwrite) {
4216	ret = gen_unique_name(sctx, ino: sctx->cur_ino,
4217	gen: sctx->cur_inode_gen, dest: valid_path);
4218	if (ret < `0`)
4219	goto out;
4220	is_orphan = true;
4221	} else {
4222	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
4223	dest: valid_path);
4224	if (ret < `0`)
4225	goto out;
4226	}
4227
4228	/*
4229	* Before doing any rename and link operations, do a first pass on the
4230	* new references to orphanize any unprocessed inodes that may have a
4231	* reference that conflicts with one of the new references of the current
4232	* inode. This needs to happen first because a new reference may conflict
4233	* with the old reference of a parent directory, so we must make sure
4234	* that the path used for link and rename commands don't use an
4235	* orphanized name when an ancestor was not yet orphanized.
4236	*
4237	* Example:
4238	*
4239	* Parent snapshot:
4240	*
4241	* . (ino 256)
4242	* \|----- testdir/ (ino 259)
4243	* \| \|----- a (ino 257)
4244	* \|
4245	* \|----- b (ino 258)
4246	*
4247	* Send snapshot:
4248	*
4249	* . (ino 256)
4250	* \|----- testdir_2/ (ino 259)
4251	* \| \|----- a (ino 260)
4252	* \|
4253	* \|----- testdir (ino 257)
4254	* \|----- b (ino 257)
4255	* \|----- b2 (ino 258)
4256	*
4257	* Processing the new reference for inode 257 with name "b" may happen
4258	* before processing the new reference with name "testdir". If so, we
4259	* must make sure that by the time we send a link command to create the
4260	* hard link "b", inode 259 was already orphanized, since the generated
4261	* path in "valid_path" already contains the orphanized name for 259.
4262	* We are processing inode 257, so only later when processing 259 we do
4263	* the rename operation to change its temporary (orphanized) name to
4264	* "testdir_2".
4265	*/
4266	list_for_each_entry(cur, &sctx->new_refs, list) {
4267	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4268	if (ret < `0`)
4269	goto out;
4270	if (ret == inode_state_will_create)
4271	continue;
4272
4273	/*
4274	* Check if this new ref would overwrite the first ref of another
4275	* unprocessed inode. If yes, orphanize the overwritten inode.
4276	* If we find an overwritten ref that is not the first ref,
4277	* simply unlink it.
4278	*/
4279	ret = will_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4280	name: cur->name, name_len: cur->name_len,
4281	who_ino: &ow_inode, who_gen: &ow_gen, who_mode: &ow_mode);
4282	if (ret < `0`)
4283	goto out;
4284	if (ret) {
4285	ret = is_first_ref(root: sctx->parent_root,
4286	ino: ow_inode, dir: cur->dir, name: cur->name,
4287	name_len: cur->name_len);
4288	if (ret < `0`)
4289	goto out;
4290	if (ret) {
4291	struct name_cache_entry *nce;
4292	struct waiting_dir_move *wdm;
4293
4294	if (orphanized_dir) {
4295	ret = refresh_ref_path(sctx, ref: cur);
4296	if (ret < `0`)
4297	goto out;
4298	}
4299
4300	ret = orphanize_inode(sctx, ino: ow_inode, gen: ow_gen,
4301	path: cur->full_path);
4302	if (ret < `0`)
4303	goto out;
4304	if (S_ISDIR(ow_mode))
4305	orphanized_dir = true;
4306
4307	/*
4308	* If ow_inode has its rename operation delayed
4309	* make sure that its orphanized name is used in
4310	* the source path when performing its rename
4311	* operation.
4312	*/
4313	wdm = get_waiting_dir_move(sctx, ino: ow_inode);
4314	if (wdm)
4315	wdm->orphanized = true;
4316
4317	/*
4318	* Make sure we clear our orphanized inode's
4319	* name from the name cache. This is because the
4320	* inode ow_inode might be an ancestor of some
4321	* other inode that will be orphanized as well
4322	* later and has an inode number greater than
4323	* sctx->send_progress. We need to prevent
4324	* future name lookups from using the old name
4325	* and get instead the orphan name.
4326	*/
4327	nce = name_cache_search(sctx, ino: ow_inode, gen: ow_gen);
4328	if (nce)
4329	btrfs_lru_cache_remove(cache: &sctx->name_cache,
4330	entry: &nce->entry);
4331
4332	/*
4333	* ow_inode might currently be an ancestor of
4334	* cur_ino, therefore compute valid_path (the
4335	* current path of cur_ino) again because it
4336	* might contain the pre-orphanization name of
4337	* ow_inode, which is no longer valid.
4338	*/
4339	ret = is_ancestor(root: sctx->parent_root,
4340	ino1: ow_inode, ino1_gen: ow_gen,
4341	ino2: sctx->cur_ino, NULL);
4342	if (ret > `0`) {
4343	orphanized_ancestor = true;
4344	fs_path_reset(p: valid_path);
4345	fs_path_reset(p: &sctx->cur_inode_path);
4346	ret = get_cur_path(sctx, ino: sctx->cur_ino,
4347	gen: sctx->cur_inode_gen,
4348	dest: valid_path);
4349	}
4350	if (ret < `0`)
4351	goto out;
4352	} else {
4353	/*
4354	* If we previously orphanized a directory that
4355	* collided with a new reference that we already
4356	* processed, recompute the current path because
4357	* that directory may be part of the path.
4358	*/
4359	if (orphanized_dir) {
4360	ret = refresh_ref_path(sctx, ref: cur);
4361	if (ret < `0`)
4362	goto out;
4363	}
4364	ret = send_unlink(sctx, path: cur->full_path);
4365	if (ret < `0`)
4366	goto out;
4367	}
4368	}
4369
4370	}
4371
4372	list_for_each_entry(cur, &sctx->new_refs, list) {
4373	/*
4374	* We may have refs where the parent directory does not exist
4375	* yet. This happens if the parent directories inum is higher
4376	* than the current inum. To handle this case, we create the
4377	* parent directory out of order. But we need to check if this
4378	* did already happen before due to other refs in the same dir.
4379	*/
4380	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4381	if (ret < `0`)
4382	goto out;
4383	if (ret == inode_state_will_create) {
4384	ret = `0`;
4385	/*
4386	* First check if any of the current inodes refs did
4387	* already create the dir.
4388	*/
4389	list_for_each_entry(cur2, &sctx->new_refs, list) {
4390	if (cur == cur2)
4391	break;
4392	if (cur2->dir == cur->dir) {
4393	ret = `1`;
4394	break;
4395	}
4396	}
4397
4398	/*
4399	* If that did not happen, check if a previous inode
4400	* did already create the dir.
4401	*/
4402	if (!ret)
4403	ret = did_create_dir(sctx, dir: cur->dir);
4404	if (ret < `0`)
4405	goto out;
4406	if (!ret) {
4407	ret = send_create_inode(sctx, ino: cur->dir);
4408	if (ret < `0`)
4409	goto out;
4410	cache_dir_created(sctx, dir: cur->dir);
4411	}
4412	}
4413
4414	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4415	ret = wait_for_dest_dir_move(sctx, parent_ref: cur, is_orphan);
4416	if (ret < `0`)
4417	goto out;
4418	if (ret == `1`) {
4419	can_rename = false;
4420	*pending_move = `1`;
4421	}
4422	}
4423
4424	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4425	can_rename) {
4426	ret = wait_for_parent_move(sctx, parent_ref: cur, is_orphan);
4427	if (ret < `0`)
4428	goto out;
4429	if (ret == `1`) {
4430	can_rename = false;
4431	*pending_move = `1`;
4432	}
4433	}
4434
4435	/*
4436	* link/move the ref to the new place. If we have an orphan
4437	* inode, move it and update valid_path. If not, link or move
4438	* it depending on the inode mode.
4439	*/
4440	if (is_orphan && can_rename) {
4441	ret = rename_current_inode(sctx, current_path: valid_path, new_path: cur->full_path);
4442	if (ret < `0`)
4443	goto out;
4444	is_orphan = false;
4445	} else if (can_rename) {
4446	if (S_ISDIR(sctx->cur_inode_mode)) {
4447	/*
4448	* Dirs can't be linked, so move it. For moved
4449	* dirs, we always have one new and one deleted
4450	* ref. The deleted ref is ignored later.
4451	*/
4452	ret = rename_current_inode(sctx, current_path: valid_path,
4453	new_path: cur->full_path);
4454	if (ret < `0`)
4455	goto out;
4456	} else {
4457	/*
4458	* We might have previously orphanized an inode
4459	* which is an ancestor of our current inode,
4460	* so our reference's full path, which was
4461	* computed before any such orphanizations, must
4462	* be updated.
4463	*/
4464	if (orphanized_dir) {
4465	ret = update_ref_path(sctx, ref: cur);
4466	if (ret < `0`)
4467	goto out;
4468	}
4469	ret = send_link(sctx, path: cur->full_path,
4470	lnk: valid_path);
4471	if (ret < `0`)
4472	goto out;
4473	}
4474	}
4475	ret = record_check_dir_ref_in_tree(root: &rbtree_check_dirs, ref: cur, list: &check_dirs);
4476	if (ret < `0`)
4477	goto out;
4478	}
4479
4480	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4481	/*
4482	* Check if we can already rmdir the directory. If not,
4483	* orphanize it. For every dir item inside that gets deleted
4484	* later, we do this check again and rmdir it then if possible.
4485	* See the use of check_dirs for more details.
4486	*/
4487	ret = can_rmdir(sctx, dir: sctx->cur_ino, dir_gen: sctx->cur_inode_gen);
4488	if (ret < `0`)
4489	goto out;
4490	if (ret) {
4491	ret = send_rmdir(sctx, path: valid_path);
4492	if (ret < `0`)
4493	goto out;
4494	} else if (!is_orphan) {
4495	ret = orphanize_inode(sctx, ino: sctx->cur_ino,
4496	gen: sctx->cur_inode_gen, path: valid_path);
4497	if (ret < `0`)
4498	goto out;
4499	is_orphan = true;
4500	}
4501
4502	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4503	ret = record_check_dir_ref_in_tree(root: &rbtree_check_dirs, ref: cur, list: &check_dirs);
4504	if (ret < `0`)
4505	goto out;
4506	}
4507	} else if (S_ISDIR(sctx->cur_inode_mode) &&
4508	!list_empty(head: &sctx->deleted_refs)) {
4509	/*
4510	* We have a moved dir. Add the old parent to check_dirs
4511	*/
4512	cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
4513	ret = record_check_dir_ref_in_tree(root: &rbtree_check_dirs, ref: cur, list: &check_dirs);
4514	if (ret < `0`)
4515	goto out;
4516	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
4517	/*
4518	* We have a non dir inode. Go through all deleted refs and
4519	* unlink them if they were not already overwritten by other
4520	* inodes.
4521	*/
4522	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4523	ret = did_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4524	ino: sctx->cur_ino, ino_gen: sctx->cur_inode_gen,
4525	name: cur->name, name_len: cur->name_len);
4526	if (ret < `0`)
4527	goto out;
4528	if (!ret) {
4529	/*
4530	* If we orphanized any ancestor before, we need
4531	* to recompute the full path for deleted names,
4532	* since any such path was computed before we
4533	* processed any references and orphanized any
4534	* ancestor inode.
4535	*/
4536	if (orphanized_ancestor) {
4537	ret = update_ref_path(sctx, ref: cur);
4538	if (ret < `0`)
4539	goto out;
4540	}
4541	ret = send_unlink(sctx, path: cur->full_path);
4542	if (ret < `0`)
4543	goto out;
4544	if (is_current_inode_path(sctx, path: cur->full_path))
4545	fs_path_reset(p: &sctx->cur_inode_path);
4546	}
4547	ret = record_check_dir_ref_in_tree(root: &rbtree_check_dirs, ref: cur, list: &check_dirs);
4548	if (ret < `0`)
4549	goto out;
4550	}
4551	/*
4552	* If the inode is still orphan, unlink the orphan. This may
4553	* happen when a previous inode did overwrite the first ref
4554	* of this inode and no new refs were added for the current
4555	* inode. Unlinking does not mean that the inode is deleted in
4556	* all cases. There may still be links to this inode in other
4557	* places.
4558	*/
4559	if (is_orphan) {
4560	ret = send_unlink(sctx, path: valid_path);
4561	if (ret < `0`)
4562	goto out;
4563	}
4564	}
4565
4566	/*
4567	* We did collect all parent dirs where cur_inode was once located. We
4568	* now go through all these dirs and check if they are pending for
4569	* deletion and if it's finally possible to perform the rmdir now.
4570	* We also update the inode stats of the parent dirs here.
4571	*/
4572	list_for_each_entry(cur, &check_dirs, list) {
4573	/*
4574	* In case we had refs into dirs that were not processed yet,
4575	* we don't need to do the utime and rmdir logic for these dirs.
4576	* The dir will be processed later.
4577	*/
4578	if (cur->dir > sctx->cur_ino)
4579	continue;
4580
4581	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4582	if (ret < `0`)
4583	goto out;
4584
4585	if (ret == inode_state_did_create \|\|
4586	ret == inode_state_no_change) {
4587	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
4588	if (ret < `0`)
4589	goto out;
4590	} else if (ret == inode_state_did_delete) {
4591	ret = can_rmdir(sctx, dir: cur->dir, dir_gen: cur->dir_gen);
4592	if (ret < `0`)
4593	goto out;
4594	if (ret) {
4595	ret = get_cur_path(sctx, ino: cur->dir,
4596	gen: cur->dir_gen, dest: valid_path);
4597	if (ret < `0`)
4598	goto out;
4599	ret = send_rmdir(sctx, path: valid_path);
4600	if (ret < `0`)
4601	goto out;
4602	}
4603	}
4604	}
4605
4606	ret = `0`;
4607
4608	out:
4609	__free_recorded_refs(head: &check_dirs);
4610	free_recorded_refs(sctx);
4611	fs_path_free(p: valid_path);
4612	return ret;
4613	}
4614
4615	static int rbtree_ref_comp(const void k, const* struct rb_node *node)
4616	{
4617	const struct recorded_ref *data = k;
4618	const struct recorded_ref ref = rb_entry(node, struct* recorded_ref, node);
4619
4620	if (data->dir > ref->dir)
4621	return `1`;
4622	if (data->dir < ref->dir)
4623	return -`1`;
4624	if (data->dir_gen > ref->dir_gen)
4625	return `1`;
4626	if (data->dir_gen < ref->dir_gen)
4627	return -`1`;
4628	if (data->name_len > ref->name_len)
4629	return `1`;
4630	if (data->name_len < ref->name_len)
4631	return -`1`;
4632	return strcmp(data->name, ref->name);
4633	}
4634
4635	static bool rbtree_ref_less(struct rb_node node, const* struct rb_node *parent)
4636	{
4637	const struct recorded_ref entry = rb_entry(node, struct* recorded_ref, node);
4638
4639	return rbtree_ref_comp(k: entry, node: parent) < `0`;
4640	}
4641
4642	static int record_ref_in_tree(struct rb_root root, struct* list_head *refs,
4643	struct fs_path *name, u64 dir, u64 dir_gen,
4644	struct send_ctx *sctx)
4645	{
4646	int ret = `0`;
4647	struct fs_path *path = NULL;
4648	struct recorded_ref *ref = NULL;
4649
4650	path = fs_path_alloc();
4651	if (!path) {
4652	ret = -ENOMEM;
4653	goto out;
4654	}
4655
4656	ref = recorded_ref_alloc();
4657	if (!ref) {
4658	ret = -ENOMEM;
4659	goto out;
4660	}
4661
4662	ret = get_cur_path(sctx, ino: dir, gen: dir_gen, dest: path);
4663	if (ret < `0`)
4664	goto out;
4665	ret = fs_path_add_path(p: path, p2: name);
4666	if (ret < `0`)
4667	goto out;
4668
4669	ref->dir = dir;
4670	ref->dir_gen = dir_gen;
4671	set_ref_path(ref, path);
4672	list_add_tail(new: &ref->list, head: refs);
4673	rb_add(node: &ref->node, tree: root, less: rbtree_ref_less);
4674	ref->root = root;
4675	out:
4676	if (ret) {
4677	if (path && (!ref \|\| !ref->full_path))
4678	fs_path_free(p: path);
4679	recorded_ref_free(ref);
4680	}
4681	return ret;
4682	}
4683
4684	static int record_new_ref_if_needed(u64 dir, struct fs_path name, void* *ctx)
4685	{
4686	int ret;
4687	struct send_ctx *sctx = ctx;
4688	struct rb_node *node = NULL;
4689	struct recorded_ref data;
4690	struct recorded_ref *ref;
4691	u64 dir_gen;
4692
4693	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &dir_gen);
4694	if (ret < `0`)
4695	return ret;
4696
4697	data.dir = dir;
4698	data.dir_gen = dir_gen;
4699	set_ref_path(ref: &data, path: name);
4700	node = rb_find(key: &data, tree: &sctx->rbtree_deleted_refs, cmp: rbtree_ref_comp);
4701	if (node) {
4702	ref = rb_entry(node, struct recorded_ref, node);
4703	recorded_ref_free(ref);
4704	} else {
4705	ret = record_ref_in_tree(root: &sctx->rbtree_new_refs,
4706	refs: &sctx->new_refs, name, dir, dir_gen,
4707	sctx);
4708	}
4709
4710	return ret;
4711	}
4712
4713	static int record_deleted_ref_if_needed(u64 dir, struct fs_path name, void* *ctx)
4714	{
4715	int ret;
4716	struct send_ctx *sctx = ctx;
4717	struct rb_node *node = NULL;
4718	struct recorded_ref data;
4719	struct recorded_ref *ref;
4720	u64 dir_gen;
4721
4722	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &dir_gen);
4723	if (ret < `0`)
4724	return ret;
4725
4726	data.dir = dir;
4727	data.dir_gen = dir_gen;
4728	set_ref_path(ref: &data, path: name);
4729	node = rb_find(key: &data, tree: &sctx->rbtree_new_refs, cmp: rbtree_ref_comp);
4730	if (node) {
4731	ref = rb_entry(node, struct recorded_ref, node);
4732	recorded_ref_free(ref);
4733	} else {
4734	ret = record_ref_in_tree(root: &sctx->rbtree_deleted_refs,
4735	refs: &sctx->deleted_refs, name, dir,
4736	dir_gen, sctx);
4737	}
4738
4739	return ret;
4740	}
4741
4742	static int record_new_ref(struct send_ctx *sctx)
4743	{
4744	int ret;
4745
4746	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path, found_key: sctx->cmp_key,
4747	resolve: false, iterate: record_new_ref_if_needed, ctx: sctx);
4748	if (ret < `0`)
4749	return ret;
4750
4751	return `0`;
4752	}
4753
4754	static int record_deleted_ref(struct send_ctx *sctx)
4755	{
4756	int ret;
4757
4758	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path, found_key: sctx->cmp_key,
4759	resolve: false, iterate: record_deleted_ref_if_needed, ctx: sctx);
4760	if (ret < `0`)
4761	return ret;
4762
4763	return `0`;
4764	}
4765
4766	static int record_changed_ref(struct send_ctx *sctx)
4767	{
4768	int ret;
4769
4770	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path, found_key: sctx->cmp_key,
4771	resolve: false, iterate: record_new_ref_if_needed, ctx: sctx);
4772	if (ret < `0`)
4773	return ret;
4774	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path, found_key: sctx->cmp_key,
4775	resolve: false, iterate: record_deleted_ref_if_needed, ctx: sctx);
4776	if (ret < `0`)
4777	return ret;
4778
4779	return `0`;
4780	}
4781
4782	/*
4783	* Record and process all refs at once. Needed when an inode changes the
4784	* generation number, which means that it was deleted and recreated.
4785	*/
4786	static int process_all_refs(struct send_ctx *sctx,
4787	enum btrfs_compare_tree_result cmd)
4788	{
4789	int ret = `0`;
4790	int iter_ret = `0`;
4791	struct btrfs_root *root;
4792	BTRFS_PATH_AUTO_FREE(path);
4793	struct btrfs_key key;
4794	struct btrfs_key found_key;
4795	iterate_inode_ref_t cb;
4796	int pending_move = `0`;
4797
4798	path = alloc_path_for_send();
4799	if (!path)
4800	return -ENOMEM;
4801
4802	if (cmd == BTRFS_COMPARE_TREE_NEW) {
4803	root = sctx->send_root;
4804	cb = record_new_ref_if_needed;
4805	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4806	root = sctx->parent_root;
4807	cb = record_deleted_ref_if_needed;
4808	} else {
4809	btrfs_err(sctx->send_root->fs_info,
4810	"Wrong command %d in process_all_refs", cmd);
4811	return -EINVAL;
4812	}
4813
4814	key.objectid = sctx->cmp_key->objectid;
4815	key.type = BTRFS_INODE_REF_KEY;
4816	key.offset = `0`;
4817	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4818	if (found_key.objectid != key.objectid \|\|
4819	(found_key.type != BTRFS_INODE_REF_KEY &&
4820	found_key.type != BTRFS_INODE_EXTREF_KEY))
4821	break;
4822
4823	ret = iterate_inode_ref(root, path, found_key: &found_key, resolve: false, iterate: cb, ctx: sctx);
4824	if (ret < `0`)
4825	return ret;
4826	}
4827	/ Catch error found during iteration /
4828	if (iter_ret < `0`)
4829	return iter_ret;
4830
4831	btrfs_release_path(p: path);
4832
4833	/*
4834	* We don't actually care about pending_move as we are simply
4835	* re-creating this inode and will be rename'ing it into place once we
4836	* rename the parent directory.
4837	*/
4838	return process_recorded_refs(sctx, pending_move: &pending_move);
4839	}
4840
4841	static int send_set_xattr(struct send_ctx *sctx,
4842	const char name, int* name_len,
4843	const char data, int* data_len)
4844	{
4845	struct fs_path *path;
4846	int ret;
4847
4848	path = get_cur_inode_path(sctx);
4849	if (IS_ERR(ptr: path))
4850	return PTR_ERR(ptr: path);
4851
4852	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SET_XATTR);
4853	if (ret < `0`)
4854	return ret;
4855
4856	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4857	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4858	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4859
4860	ret = send_cmd(sctx);
4861
4862	tlv_put_failure:
4863	return ret;
4864	}
4865
4866	static int send_remove_xattr(struct send_ctx *sctx,
4867	struct fs_path *path,
4868	const char name, int* name_len)
4869	{
4870	int ret;
4871
4872	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_REMOVE_XATTR);
4873	if (ret < `0`)
4874	return ret;
4875
4876	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4877	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4878
4879	ret = send_cmd(sctx);
4880
4881	tlv_put_failure:
4882	return ret;
4883	}
4884
4885	static int __process_new_xattr(int num, struct btrfs_key *di_key,
4886	const char name, int* name_len, const char *data,
4887	int data_len, void *ctx)
4888	{
4889	struct send_ctx *sctx = ctx;
4890	struct posix_acl_xattr_header dummy_acl;
4891
4892	/ Capabilities are emitted by finish_inode_if_needed /
4893	if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4894	return `0`;
4895
4896	/*
4897	* This hack is needed because empty acls are stored as zero byte
4898	* data in xattrs. Problem with that is, that receiving these zero byte
4899	* acls will fail later. To fix this, we send a dummy acl list that
4900	* only contains the version number and no entries.
4901	*/
4902	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) \|\|
4903	!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4904	if (data_len == `0`) {
4905	dummy_acl.a_version =
4906	cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4907	data = (char *)&dummy_acl;
4908	data_len = sizeof(dummy_acl);
4909	}
4910	}
4911
4912	return send_set_xattr(sctx, name, name_len, data, data_len);
4913	}
4914
/*
 * iterate_dir_item() callback that sends a REMOVE_XATTR command for one
 * xattr of the current inode.
 *
 * Only @name/@name_len and @ctx (the struct send_ctx) are used; @num,
 * @di_key, @data and @data_len are ignored.
 *
 * Returns the result of send_remove_xattr(), or a negative errno if the
 * current inode path cannot be obtained.
 */
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
				   const char *name, int name_len,
				   const char *data, int data_len, void *ctx)
{
	struct send_ctx *sctx = ctx;
	struct fs_path *p;

	p = get_cur_inode_path(sctx);
	if (IS_ERR(p))
		return PTR_ERR(p);

	return send_remove_xattr(sctx, p, name, name_len);
}
4928
4929	static int process_new_xattr(struct send_ctx *sctx)
4930	{
4931	return iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
4932	iterate: __process_new_xattr, ctx: sctx);
4933	}
4934
4935	static int process_deleted_xattr(struct send_ctx *sctx)
4936	{
4937	return iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
4938	iterate: __process_deleted_xattr, ctx: sctx);
4939	}
4940
4941	struct find_xattr_ctx {
4942	const char *name;
4943	int name_len;
4944	int found_idx;
4945	char *found_data;
4946	int found_data_len;
4947	bool copy_data;
4948	};
4949
4950	static int __find_xattr(int num, struct btrfs_key di_key, const* char *name,
4951	int name_len, const char data, int* data_len, void *vctx)
4952	{
4953	struct find_xattr_ctx *ctx = vctx;
4954
4955	if (name_len == ctx->name_len &&
4956	strncmp(name, ctx->name, name_len) == `0`) {
4957	ctx->found_idx = num;
4958	ctx->found_data_len = data_len;
4959	if (ctx->copy_data) {
4960	ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4961	if (!ctx->found_data)
4962	return -ENOMEM;
4963	}
4964	return `1`;
4965	}
4966	return `0`;
4967	}
4968
4969	static int find_xattr(struct btrfs_root *root,
4970	struct btrfs_path *path,
4971	struct btrfs_key *key,
4972	const char name, int* name_len,
4973	char *data, int* *data_len)
4974	{
4975	int ret;
4976	struct find_xattr_ctx ctx;
4977
4978	ctx.name = name;
4979	ctx.name_len = name_len;
4980	ctx.found_idx = -`1`;
4981	ctx.found_data = NULL;
4982	ctx.found_data_len = `0`;
4983	ctx.copy_data = (data != NULL);
4984
4985	ret = iterate_dir_item(root, path, iterate: __find_xattr, ctx: &ctx);
4986	if (ret < `0`)
4987	return ret;
4988
4989	if (ctx.found_idx == -`1`)
4990	return -ENOENT;
4991	if (data) {
4992	*data = ctx.found_data;
4993	*data_len = ctx.found_data_len;
4994	} else {
4995	ASSERT(ctx.found_data == NULL);
4996	}
4997	return ctx.found_idx;
4998	}
4999
5000
5001	static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
5002	const char name, int* name_len,
5003	const char data, int* data_len,
5004	void *ctx)
5005	{
5006	int ret;
5007	struct send_ctx *sctx = ctx;
5008	char AUTO_KFREE(found_data);
5009	int found_data_len = `0`;
5010
5011	ret = find_xattr(root: sctx->parent_root, path: sctx->right_path,
5012	key: sctx->cmp_key, name, name_len, data: &found_data,
5013	data_len: &found_data_len);
5014	if (ret == -ENOENT) {
5015	ret = __process_new_xattr(num, di_key, name, name_len, data,
5016	data_len, ctx);
5017	} else if (ret >= `0`) {
5018	if (data_len != found_data_len \|\|
5019	memcmp(p: data, q: found_data, size: data_len)) {
5020	ret = __process_new_xattr(num, di_key, name, name_len,
5021	data, data_len, ctx);
5022	} else {
5023	ret = `0`;
5024	}
5025	}
5026
5027	return ret;
5028	}
5029
5030	static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
5031	const char name, int* name_len,
5032	const char data, int* data_len,
5033	void *ctx)
5034	{
5035	int ret;
5036	struct send_ctx *sctx = ctx;
5037
5038	ret = find_xattr(root: sctx->send_root, path: sctx->left_path, key: sctx->cmp_key,
5039	name, name_len, NULL, NULL);
5040	if (ret == -ENOENT)
5041	ret = __process_deleted_xattr(num, di_key, name, name_len, data,
5042	data_len, ctx);
5043	else if (ret >= `0`)
5044	ret = `0`;
5045
5046	return ret;
5047	}
5048
5049	static int process_changed_xattr(struct send_ctx *sctx)
5050	{
5051	int ret;
5052
5053	ret = iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
5054	iterate: __process_changed_new_xattr, ctx: sctx);
5055	if (ret < `0`)
5056	return ret;
5057
5058	return iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
5059	iterate: __process_changed_deleted_xattr, ctx: sctx);
5060	}
5061
5062	static int process_all_new_xattrs(struct send_ctx *sctx)
5063	{
5064	int ret = `0`;
5065	int iter_ret = `0`;
5066	struct btrfs_root *root;
5067	BTRFS_PATH_AUTO_FREE(path);
5068	struct btrfs_key key;
5069	struct btrfs_key found_key;
5070
5071	path = alloc_path_for_send();
5072	if (!path)
5073	return -ENOMEM;
5074
5075	root = sctx->send_root;
5076
5077	key.objectid = sctx->cmp_key->objectid;
5078	key.type = BTRFS_XATTR_ITEM_KEY;
5079	key.offset = `0`;
5080	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
5081	if (found_key.objectid != key.objectid \|\|
5082	found_key.type != key.type) {
5083	ret = `0`;
5084	break;
5085	}
5086
5087	ret = iterate_dir_item(root, path, iterate: __process_new_xattr, ctx: sctx);
5088	if (ret < `0`)
5089	break;
5090	}
5091	/ Catch error found during iteration /
5092	if (iter_ret < `0`)
5093	ret = iter_ret;
5094
5095	return ret;
5096	}
5097
5098	static int send_verity(struct send_ctx sctx, struct* fs_path *path,
5099	struct fsverity_descriptor *desc)
5100	{
5101	int ret;
5102
5103	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY);
5104	if (ret < `0`)
5105	return ret;
5106
5107	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
5108	TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
5109	le8_to_cpu(desc->hash_algorithm));
5110	TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
5111	`1U` << le8_to_cpu(desc->log_blocksize));
5112	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
5113	le8_to_cpu(desc->salt_size));
5114	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
5115	le32_to_cpu(desc->sig_size));
5116
5117	ret = send_cmd(sctx);
5118
5119	tlv_put_failure:
5120	return ret;
5121	}
5122
5123	static int process_verity(struct send_ctx *sctx)
5124	{
5125	int ret = `0`;
5126	struct btrfs_inode *inode;
5127	struct fs_path *p;
5128
5129	inode = btrfs_iget(ino: sctx->cur_ino, root: sctx->send_root);
5130	if (IS_ERR(ptr: inode))
5131	return PTR_ERR(ptr: inode);
5132
5133	ret = btrfs_get_verity_descriptor(inode: &inode->vfs_inode, NULL, buf_size: `0`);
5134	if (ret < `0`)
5135	goto iput;
5136
5137	if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) {
5138	ret = -EMSGSIZE;
5139	goto iput;
5140	}
5141	if (!sctx->verity_descriptor) {
5142	sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
5143	GFP_KERNEL);
5144	if (!sctx->verity_descriptor) {
5145	ret = -ENOMEM;
5146	goto iput;
5147	}
5148	}
5149
5150	ret = btrfs_get_verity_descriptor(inode: &inode->vfs_inode, buf: sctx->verity_descriptor, buf_size: ret);
5151	if (ret < `0`)
5152	goto iput;
5153
5154	p = get_cur_inode_path(sctx);
5155	if (IS_ERR(ptr: p)) {
5156	ret = PTR_ERR(ptr: p);
5157	goto iput;
5158	}
5159
5160	ret = send_verity(sctx, path: p, desc: sctx->verity_descriptor);
5161	iput:
5162	iput(&inode->vfs_inode);
5163	return ret;
5164	}
5165
5166	static inline u64 max_send_read_size(const struct send_ctx *sctx)
5167	{
5168	return sctx->send_max_size - SZ_16K;
5169	}
5170
5171	static int put_data_header(struct send_ctx *sctx, u32 len)
5172	{
5173	if (WARN_ON_ONCE(sctx->put_data))
5174	return -EINVAL;
5175	sctx->put_data = true;
5176	if (sctx->proto >= `2`) {
5177	/*
5178	* Since v2, the data attribute header doesn't include a length,
5179	* it is implicitly to the end of the command.
5180	*/
5181	if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len))
5182	return -EOVERFLOW;
5183	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: sctx->send_buf + sctx->send_size);
5184	sctx->send_size += sizeof(__le16);
5185	} else {
5186	struct btrfs_tlv_header *hdr;
5187
5188	if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len))
5189	return -EOVERFLOW;
5190	hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
5191	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: &hdr->tlv_type);
5192	put_unaligned_le16(val: len, p: &hdr->tlv_len);
5193	sctx->send_size += sizeof(*hdr);
5194	}
5195	return `0`;
5196	}
5197
5198	static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
5199	{
5200	struct btrfs_root *root = sctx->send_root;
5201	struct btrfs_fs_info *fs_info = root->fs_info;
5202	u64 cur = offset;
5203	const u64 end = offset + len;
5204	const pgoff_t last_index = ((end - `1`) >> PAGE_SHIFT);
5205	struct address_space *mapping = sctx->cur_inode->i_mapping;
5206	int ret;
5207
5208	ret = put_data_header(sctx, len);
5209	if (ret)
5210	return ret;
5211
5212	while (cur < end) {
5213	pgoff_t index = (cur >> PAGE_SHIFT);
5214	unsigned int cur_len;
5215	unsigned int pg_offset;
5216	struct folio *folio;
5217
5218	folio = filemap_lock_folio(mapping, index);
5219	if (IS_ERR(ptr: folio)) {
5220	page_cache_sync_readahead(mapping,
5221	ra: &sctx->ra, NULL, index,
5222	req_count: last_index + `1` - index);
5223
5224	folio = filemap_grab_folio(mapping, index);
5225	if (IS_ERR(ptr: folio)) {
5226	ret = PTR_ERR(ptr: folio);
5227	break;
5228	}
5229	}
5230	pg_offset = offset_in_folio(folio, cur);
5231	cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset);
5232
5233	if (folio_test_readahead(folio))
5234	page_cache_async_readahead(mapping, ra: &sctx->ra, NULL, folio,
5235	req_count: last_index + `1` - index);
5236
5237	if (!folio_test_uptodate(folio)) {
5238	btrfs_read_folio(NULL, folio);
5239	folio_lock(folio);
5240	if (unlikely(!folio_test_uptodate(folio))) {
5241	folio_unlock(folio);
5242	btrfs_err(fs_info,
5243	"send: IO error at offset %llu for inode %llu root %llu",
5244	folio_pos(folio), sctx->cur_ino,
5245	btrfs_root_id(sctx->send_root));
5246	folio_put(folio);
5247	ret = -EIO;
5248	break;
5249	}
5250	if (folio->mapping != mapping) {
5251	folio_unlock(folio);
5252	folio_put(folio);
5253	continue;
5254	}
5255	}
5256
5257	memcpy_from_folio(to: sctx->send_buf + sctx->send_size, folio,
5258	offset: pg_offset, len: cur_len);
5259	folio_unlock(folio);
5260	folio_put(folio);
5261	cur += cur_len;
5262	sctx->send_size += cur_len;
5263	}
5264
5265	return ret;
5266	}
5267
5268	/*
5269	* Read some bytes from the current inode/file and send a write command to
5270	* user space.
5271	*/
5272	static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5273	{
5274	int ret = `0`;
5275	struct fs_path *p;
5276
5277	p = get_cur_inode_path(sctx);
5278	if (IS_ERR(ptr: p))
5279	return PTR_ERR(ptr: p);
5280
5281	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5282	if (ret < `0`)
5283	return ret;
5284
5285	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5286	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5287	ret = put_file_data(sctx, offset, len);
5288	if (ret < `0`)
5289	return ret;
5290
5291	ret = send_cmd(sctx);
5292
5293	tlv_put_failure:
5294	return ret;
5295	}
5296
5297	/*
5298	* Send a clone command to user space.
5299	*/
5300	static int send_clone(struct send_ctx *sctx,
5301	u64 offset, u32 len,
5302	struct clone_root *clone_root)
5303	{
5304	int ret = `0`;
5305	struct fs_path *p;
5306	struct fs_path *cur_inode_path;
5307	u64 gen;
5308
5309	cur_inode_path = get_cur_inode_path(sctx);
5310	if (IS_ERR(ptr: cur_inode_path))
5311	return PTR_ERR(ptr: cur_inode_path);
5312
5313	p = fs_path_alloc();
5314	if (!p)
5315	return -ENOMEM;
5316
5317	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CLONE);
5318	if (ret < `0`)
5319	goto out;
5320
5321	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5322	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5323	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);
5324
5325	if (clone_root->root == sctx->send_root) {
5326	ret = get_inode_gen(root: sctx->send_root, ino: clone_root->ino, gen: &gen);
5327	if (ret < `0`)
5328	goto out;
5329	ret = get_cur_path(sctx, ino: clone_root->ino, gen, dest: p);
5330	} else {
5331	ret = get_inode_path(root: clone_root->root, ino: clone_root->ino, path: p);
5332	}
5333	if (ret < `0`)
5334	goto out;
5335
5336	/*
5337	* If the parent we're using has a received_uuid set then use that as
5338	* our clone source as that is what we will look for when doing a
5339	* receive.
5340	*
5341	* This covers the case that we create a snapshot off of a received
5342	* subvolume and then use that as the parent and try to receive on a
5343	* different host.
5344	*/
5345	if (!btrfs_is_empty_uuid(uuid: clone_root->root->root_item.received_uuid))
5346	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5347	clone_root->root->root_item.received_uuid);
5348	else
5349	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5350	clone_root->root->root_item.uuid);
5351	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5352	btrfs_root_ctransid(&clone_root->root->root_item));
5353	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5354	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5355	clone_root->offset);
5356
5357	ret = send_cmd(sctx);
5358
5359	tlv_put_failure:
5360	out:
5361	fs_path_free(p);
5362	return ret;
5363	}
5364
5365	/*
5366	* Send an update extent command to user space.
5367	*/
5368	static int send_update_extent(struct send_ctx *sctx,
5369	u64 offset, u32 len)
5370	{
5371	int ret = `0`;
5372	struct fs_path *p;
5373
5374	p = get_cur_inode_path(sctx);
5375	if (IS_ERR(ptr: p))
5376	return PTR_ERR(ptr: p);
5377
5378	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UPDATE_EXTENT);
5379	if (ret < `0`)
5380	return ret;
5381
5382	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5383	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5384	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5385
5386	ret = send_cmd(sctx);
5387
5388	tlv_put_failure:
5389	return ret;
5390	}
5391
5392	static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
5393	{
5394	struct fs_path *path;
5395	int ret;
5396
5397	path = get_cur_inode_path(sctx);
5398	if (IS_ERR(ptr: path))
5399	return PTR_ERR(ptr: path);
5400
5401	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_FALLOCATE);
5402	if (ret < `0`)
5403	return ret;
5404
5405	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
5406	TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
5407	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5408	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5409
5410	ret = send_cmd(sctx);
5411
5412	tlv_put_failure:
5413	return ret;
5414	}
5415
5416	static int send_hole(struct send_ctx *sctx, u64 end)
5417	{
5418	struct fs_path *p = NULL;
5419	u64 read_size = max_send_read_size(sctx);
5420	u64 offset = sctx->cur_inode_last_extent;
5421	int ret = `0`;
5422
5423	/*
5424	* Starting with send stream v2 we have fallocate and can use it to
5425	* punch holes instead of sending writes full of zeroes.
5426	*/
5427	if (proto_cmd_ok(sctx, cmd: BTRFS_SEND_C_FALLOCATE))
5428	return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
5429	offset, len: end - offset);
5430
5431	/*
5432	* A hole that starts at EOF or beyond it. Since we do not yet support
5433	* fallocate (for extent preallocation and hole punching), sending a
5434	* write of zeroes starting at EOF or beyond would later require issuing
5435	* a truncate operation which would undo the write and achieve nothing.
5436	*/
5437	if (offset >= sctx->cur_inode_size)
5438	return `0`;
5439
5440	/*
5441	* Don't go beyond the inode's i_size due to prealloc extents that start
5442	* after the i_size.
5443	*/
5444	end = min_t(u64, end, sctx->cur_inode_size);
5445
5446	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5447	return send_update_extent(sctx, offset, len: end - offset);
5448
5449	p = get_cur_inode_path(sctx);
5450	if (IS_ERR(ptr: p))
5451	return PTR_ERR(ptr: p);
5452
5453	while (offset < end) {
5454	u64 len = min(end - offset, read_size);
5455
5456	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5457	if (ret < `0`)
5458	break;
5459	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5460	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5461	ret = put_data_header(sctx, len);
5462	if (ret < `0`)
5463	break;
5464	memset(sctx->send_buf + sctx->send_size, `0`, len);
5465	sctx->send_size += len;
5466	ret = send_cmd(sctx);
5467	if (ret < `0`)
5468	break;
5469	offset += len;
5470	}
5471	sctx->cur_inode_next_write_offset = offset;
5472	tlv_put_failure:
5473	return ret;
5474	}
5475
5476	static int send_encoded_inline_extent(struct send_ctx *sctx,
5477	struct btrfs_path *path, u64 offset,
5478	u64 len)
5479	{
5480	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5481	struct fs_path *fspath;
5482	struct extent_buffer *leaf = path->nodes[`0`];
5483	struct btrfs_key key;
5484	struct btrfs_file_extent_item *ei;
5485	u64 ram_bytes;
5486	size_t inline_size;
5487	int ret;
5488
5489	fspath = get_cur_inode_path(sctx);
5490	if (IS_ERR(ptr: fspath))
5491	return PTR_ERR(ptr: fspath);
5492
5493	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5494	if (ret < `0`)
5495	return ret;
5496
5497	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5498	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5499	ram_bytes = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5500	inline_size = btrfs_file_extent_inline_item_len(eb: leaf, nr: path->slots[`0`]);
5501
5502	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5503	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5504	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5505	min(key.offset + ram_bytes - offset, len));
5506	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
5507	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
5508	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5509	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5510	if (ret < `0`)
5511	return ret;
5512	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5513
5514	ret = put_data_header(sctx, len: inline_size);
5515	if (ret < `0`)
5516	return ret;
5517	read_extent_buffer(eb: leaf, dst: sctx->send_buf + sctx->send_size,
5518	start: btrfs_file_extent_inline_start(e: ei), len: inline_size);
5519	sctx->send_size += inline_size;
5520
5521	ret = send_cmd(sctx);
5522
5523	tlv_put_failure:
5524	return ret;
5525	}
5526
5527	static int send_encoded_extent(struct send_ctx sctx, struct* btrfs_path *path,
5528	u64 offset, u64 len)
5529	{
5530	struct btrfs_root *root = sctx->send_root;
5531	struct btrfs_fs_info *fs_info = root->fs_info;
5532	struct btrfs_inode *inode;
5533	struct fs_path *fspath;
5534	struct extent_buffer *leaf = path->nodes[`0`];
5535	struct btrfs_key key;
5536	struct btrfs_file_extent_item *ei;
5537	u64 disk_bytenr, disk_num_bytes;
5538	u32 data_offset;
5539	struct btrfs_cmd_header *hdr;
5540	u32 crc;
5541	int ret;
5542
5543	inode = btrfs_iget(ino: sctx->cur_ino, root);
5544	if (IS_ERR(ptr: inode))
5545	return PTR_ERR(ptr: inode);
5546
5547	fspath = get_cur_inode_path(sctx);
5548	if (IS_ERR(ptr: fspath)) {
5549	ret = PTR_ERR(ptr: fspath);
5550	goto out;
5551	}
5552
5553	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5554	if (ret < `0`)
5555	goto out;
5556
5557	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5558	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5559	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: ei);
5560	disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei);
5561
5562	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5563	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5564	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5565	min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
5566	len));
5567	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
5568	btrfs_file_extent_ram_bytes(leaf, ei));
5569	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
5570	offset - key.offset + btrfs_file_extent_offset(leaf, ei));
5571	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5572	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5573	if (ret < `0`)
5574	goto out;
5575	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5576	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, `0`);
5577
5578	ret = put_data_header(sctx, len: disk_num_bytes);
5579	if (ret < `0`)
5580	goto out;
5581
5582	/*
5583	* We want to do I/O directly into the send buffer, so get the next page
5584	* boundary in the send buffer. This means that there may be a gap
5585	* between the beginning of the command and the file data.
5586	*/
5587	data_offset = PAGE_ALIGN(sctx->send_size);
5588	if (unlikely(data_offset > sctx->send_max_size \|\|
5589	sctx->send_max_size - data_offset < disk_num_bytes)) {
5590	ret = -EOVERFLOW;
5591	goto out;
5592	}
5593
5594	/*
5595	* Note that send_buf is a mapping of send_buf_pages, so this is really
5596	* reading into send_buf.
5597	*/
5598	ret = btrfs_encoded_read_regular_fill_pages(inode,
5599	disk_bytenr, disk_io_size: disk_num_bytes,
5600	pages: sctx->send_buf_pages +
5601	(data_offset >> PAGE_SHIFT),
5602	NULL);
5603	if (ret)
5604	goto out;
5605
5606	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
5607	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
5608	hdr->crc = `0`;
5609	crc = crc32c(crc: `0`, p: sctx->send_buf, len: sctx->send_size);
5610	crc = crc32c(crc, p: sctx->send_buf + data_offset, len: disk_num_bytes);
5611	hdr->crc = cpu_to_le32(crc);
5612
5613	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
5614	off: &sctx->send_off);
5615	if (!ret) {
5616	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf + data_offset,
5617	len: disk_num_bytes, off: &sctx->send_off);
5618	}
5619	sctx->send_size = `0`;
5620	sctx->put_data = false;
5621
5622	tlv_put_failure:
5623	out:
5624	iput(&inode->vfs_inode);
5625	return ret;
5626	}
5627
5628	static int send_extent_data(struct send_ctx sctx, struct* btrfs_path *path,
5629	const u64 offset, const u64 len)
5630	{
5631	const u64 end = offset + len;
5632	struct extent_buffer *leaf = path->nodes[`0`];
5633	struct btrfs_file_extent_item *ei;
5634	u64 read_size = max_send_read_size(sctx);
5635	u64 sent = `0`;
5636
5637	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5638	return send_update_extent(sctx, offset, len);
5639
5640	ei = btrfs_item_ptr(leaf, path->slots[`0`],
5641	struct btrfs_file_extent_item);
5642	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5643	btrfs_file_extent_compression(eb: leaf, s: ei) != BTRFS_COMPRESS_NONE) {
5644	bool is_inline = (btrfs_file_extent_type(eb: leaf, s: ei) ==
5645	BTRFS_FILE_EXTENT_INLINE);
5646
5647	/*
5648	* Send the compressed extent unless the compressed data is
5649	* larger than the decompressed data. This can happen if we're
5650	* not sending the entire extent, either because it has been
5651	* partially overwritten/truncated or because this is a part of
5652	* the extent that we couldn't clone in clone_range().
5653	*/
5654	if (is_inline &&
5655	btrfs_file_extent_inline_item_len(eb: leaf,
5656	nr: path->slots[`0`]) <= len) {
5657	return send_encoded_inline_extent(sctx, path, offset,
5658	len);
5659	} else if (!is_inline &&
5660	btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei) <= len) {
5661	return send_encoded_extent(sctx, path, offset, len);
5662	}
5663	}
5664
5665	if (sctx->cur_inode == NULL) {
5666	struct btrfs_inode *btrfs_inode;
5667	struct btrfs_root *root = sctx->send_root;
5668
5669	btrfs_inode = btrfs_iget(ino: sctx->cur_ino, root);
5670	if (IS_ERR(ptr: btrfs_inode))
5671	return PTR_ERR(ptr: btrfs_inode);
5672
5673	sctx->cur_inode = &btrfs_inode->vfs_inode;
5674	memset(&sctx->ra, `0`, sizeof(struct file_ra_state));
5675	file_ra_state_init(ra: &sctx->ra, mapping: sctx->cur_inode->i_mapping);
5676
5677	/*
5678	* It's very likely there are no pages from this inode in the page
5679	* cache, so after reading extents and sending their data, we clean
5680	* the page cache to avoid trashing the page cache (adding pressure
5681	* to the page cache and forcing eviction of other data more useful
5682	* for applications).
5683	*
5684	* We decide if we should clean the page cache simply by checking
5685	* if the inode's mapping nrpages is 0 when we first open it, and
5686	* not by using something like filemap_range_has_page() before
5687	* reading an extent because when we ask the readahead code to
5688	* read a given file range, it may (and almost always does) read
5689	* pages from beyond that range (see the documentation for
5690	* page_cache_sync_readahead()), so it would not be reliable,
5691	* because after reading the first extent future calls to
5692	* filemap_range_has_page() would return true because the readahead
5693	* on the previous extent resulted in reading pages of the current
5694	* extent as well.
5695	*/
5696	sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == `0`);
5697	sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5698	}
5699
5700	while (sent < len) {
5701	u64 size = min(len - sent, read_size);
5702	int ret;
5703
5704	ret = send_write(sctx, offset: offset + sent, len: size);
5705	if (ret < `0`)
5706	return ret;
5707	sent += size;
5708	}
5709
5710	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
5711	/*
5712	* Always operate only on ranges that are a multiple of the page
5713	* size. This is not only to prevent zeroing parts of a page in
5714	* the case of subpage sector size, but also to guarantee we evict
5715	* pages, as passing a range that is smaller than page size does
5716	* not evict the respective page (only zeroes part of its content).
5717	*
5718	* Always start from the end offset of the last range cleared.
5719	* This is because the readahead code may (and very often does)
5720	* reads pages beyond the range we request for readahead. So if
5721	* we have an extent layout like this:
5722	*
5723	* [ extent A ] [ extent B ] [ extent C ]
5724	*
5725	* When we ask page_cache_sync_readahead() to read extent A, it
5726	* may also trigger reads for pages of extent B. If we are doing
5727	* an incremental send and extent B has not changed between the
5728	* parent and send snapshots, some or all of its pages may end
5729	* up being read and placed in the page cache. So when truncating
5730	* the page cache we always start from the end offset of the
5731	* previously processed extent up to the end of the current
5732	* extent.
5733	*/
5734	truncate_inode_pages_range(mapping: &sctx->cur_inode->i_data,
5735	lstart: sctx->page_cache_clear_start,
5736	lend: end - `1`);
5737	sctx->page_cache_clear_start = end;
5738	}
5739
5740	return `0`;
5741	}
5742
5743	/*
5744	* Search for a capability xattr related to sctx->cur_ino. If the capability is
5745	* found, call send_set_xattr function to emit it.
5746	*
5747	* Return 0 if there isn't a capability, or when the capability was emitted
5748	* successfully, or < 0 if an error occurred.
5749	*/
5750	static int send_capabilities(struct send_ctx *sctx)
5751	{
5752	BTRFS_PATH_AUTO_FREE(path);
5753	struct btrfs_dir_item *di;
5754	struct extent_buffer *leaf;
5755	unsigned long data_ptr;
5756	char AUTO_KFREE(buf);
5757	int buf_len;
5758	int ret = `0`;
5759
5760	path = alloc_path_for_send();
5761	if (!path)
5762	return -ENOMEM;
5763
5764	di = btrfs_lookup_xattr(NULL, root: sctx->send_root, path, dir: sctx->cur_ino,
5765	XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), mod: `0`);
5766	if (!di) {
5767	/ There is no xattr for this inode /
5768	return `0`;
5769	} else if (IS_ERR(ptr: di)) {
5770	return PTR_ERR(ptr: di);
5771	}
5772
5773	leaf = path->nodes[`0`];
5774	buf_len = btrfs_dir_data_len(eb: leaf, s: di);
5775
5776	buf = kmalloc(buf_len, GFP_KERNEL);
5777	if (!buf)
5778	return -ENOMEM;
5779
5780	data_ptr = (unsigned long)(di + `1`) + btrfs_dir_name_len(eb: leaf, s: di);
5781	read_extent_buffer(eb: leaf, dst: buf, start: data_ptr, len: buf_len);
5782
5783	ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
5784	strlen(XATTR_NAME_CAPS), data: buf, data_len: buf_len);
5785	return ret;
5786	}
5787
5788	static int clone_range(struct send_ctx sctx, struct* btrfs_path *dst_path,
5789	struct clone_root clone_root, const* u64 disk_byte,
5790	u64 data_offset, u64 offset, u64 len)
5791	{
5792	BTRFS_PATH_AUTO_FREE(path);
5793	struct btrfs_key key;
5794	int ret;
5795	struct btrfs_inode_info info;
5796	u64 clone_src_i_size = `0`;
5797
5798	/*
5799	* Prevent cloning from a zero offset with a length matching the sector
5800	* size because in some scenarios this will make the receiver fail.
5801	*
5802	* For example, if in the source filesystem the extent at offset 0
5803	* has a length of sectorsize and it was written using direct IO, then
5804	* it can never be an inline extent (even if compression is enabled).
5805	* Then this extent can be cloned in the original filesystem to a non
5806	* zero file offset, but it may not be possible to clone in the
5807	* destination filesystem because it can be inlined due to compression
5808	* on the destination filesystem (as the receiver's write operations are
5809	* always done using buffered IO). The same happens when the original
5810	* filesystem does not have compression enabled but the destination
5811	* filesystem has.
5812	*/
5813	if (clone_root->offset == `0` &&
5814	len == sctx->send_root->fs_info->sectorsize)
5815	return send_extent_data(sctx, path: dst_path, offset, len);
5816
5817	path = alloc_path_for_send();
5818	if (!path)
5819	return -ENOMEM;
5820
5821	/*
5822	* There are inodes that have extents that lie behind its i_size. Don't
5823	* accept clones from these extents.
5824	*/
5825	ret = get_inode_info(root: clone_root->root, ino: clone_root->ino, info: &info);
5826	btrfs_release_path(p: path);
5827	if (ret < `0`)
5828	return ret;
5829	clone_src_i_size = info.size;
5830
5831	/*
5832	* We can't send a clone operation for the entire range if we find
5833	* extent items in the respective range in the source file that
5834	* refer to different extents or if we find holes.
5835	* So check for that and do a mix of clone and regular write/copy
5836	* operations if needed.
5837	*
5838	* Example:
5839	*
5840	* mkfs.btrfs -f /dev/sda
5841	* mount /dev/sda /mnt
5842	* xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5843	* cp --reflink=always /mnt/foo /mnt/bar
5844	* xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5845	* btrfs subvolume snapshot -r /mnt /mnt/snap
5846	*
5847	* If when we send the snapshot and we are processing file bar (which
5848	* has a higher inode number than foo) we blindly send a clone operation
5849	* for the [0, 100K[ range from foo to bar, the receiver ends up getting
5850	* a file bar that matches the content of file foo - iow, doesn't match
5851	* the content from bar in the original filesystem.
5852	*/
5853	key.objectid = clone_root->ino;
5854	key.type = BTRFS_EXTENT_DATA_KEY;
5855	key.offset = clone_root->offset;
5856	ret = btrfs_search_slot(NULL, root: clone_root->root, key: &key, p: path, ins_len: `0`, cow: `0`);
5857	if (ret < `0`)
5858	return ret;
5859	if (ret > `0` && path->slots[`0`] > `0`) {
5860	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
5861	if (key.objectid == clone_root->ino &&
5862	key.type == BTRFS_EXTENT_DATA_KEY)
5863	path->slots[`0`]--;
5864	}
5865
5866	while (true) {
5867	struct extent_buffer *leaf = path->nodes[`0`];
5868	int slot = path->slots[`0`];
5869	struct btrfs_file_extent_item *ei;
5870	u8 type;
5871	u64 ext_len;
5872	u64 clone_len;
5873	u64 clone_data_offset;
5874	bool crossed_src_i_size = false;
5875
5876	if (slot >= btrfs_header_nritems(eb: leaf)) {
5877	ret = btrfs_next_leaf(root: clone_root->root, path);
5878	if (ret < `0`)
5879	return ret;
5880	else if (ret > `0`)
5881	break;
5882	continue;
5883	}
5884
5885	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5886
5887	/*
5888	* We might have an implicit trailing hole (NO_HOLES feature
5889	* enabled). We deal with it after leaving this loop.
5890	*/
5891	if (key.objectid != clone_root->ino \|\|
5892	key.type != BTRFS_EXTENT_DATA_KEY)
5893	break;
5894
5895	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5896	type = btrfs_file_extent_type(eb: leaf, s: ei);
5897	if (type == BTRFS_FILE_EXTENT_INLINE) {
5898	ext_len = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5899	ext_len = PAGE_ALIGN(ext_len);
5900	} else {
5901	ext_len = btrfs_file_extent_num_bytes(eb: leaf, s: ei);
5902	}
5903
5904	if (key.offset + ext_len <= clone_root->offset)
5905	goto next;
5906
5907	if (key.offset > clone_root->offset) {
5908	/ Implicit hole, NO_HOLES feature enabled. /
5909	u64 hole_len = key.offset - clone_root->offset;
5910
5911	if (hole_len > len)
5912	hole_len = len;
5913	ret = send_extent_data(sctx, path: dst_path, offset,
5914	len: hole_len);
5915	if (ret < `0`)
5916	return ret;
5917
5918	len -= hole_len;
5919	if (len == `0`)
5920	break;
5921	offset += hole_len;
5922	clone_root->offset += hole_len;
5923	data_offset += hole_len;
5924	}
5925
5926	if (key.offset >= clone_root->offset + len)
5927	break;
5928
5929	if (key.offset >= clone_src_i_size)
5930	break;
5931
5932	if (key.offset + ext_len > clone_src_i_size) {
5933	ext_len = clone_src_i_size - key.offset;
5934	crossed_src_i_size = true;
5935	}
5936
5937	clone_data_offset = btrfs_file_extent_offset(eb: leaf, s: ei);
5938	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte) {
5939	clone_root->offset = key.offset;
5940	if (clone_data_offset < data_offset &&
5941	clone_data_offset + ext_len > data_offset) {
5942	u64 extent_offset;
5943
5944	extent_offset = data_offset - clone_data_offset;
5945	ext_len -= extent_offset;
5946	clone_data_offset += extent_offset;
5947	clone_root->offset += extent_offset;
5948	}
5949	}
5950
5951	clone_len = min_t(u64, ext_len, len);
5952
5953	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte &&
5954	clone_data_offset == data_offset) {
5955	const u64 src_end = clone_root->offset + clone_len;
5956	const u64 sectorsize = SZ_64K;
5957
5958	/*
5959	* We can't clone the last block, when its size is not
5960	* sector size aligned, into the middle of a file. If we
5961	* do so, the receiver will get a failure (-EINVAL) when
5962	* trying to clone or will silently corrupt the data in
5963	* the destination file if it's on a kernel without the
5964	* fix introduced by commit ac765f83f1397646
5965	* ("Btrfs: fix data corruption due to cloning of eof
5966	* block).
5967	*
5968	* So issue a clone of the aligned down range plus a
5969	* regular write for the eof block, if we hit that case.
5970	*
5971	* Also, we use the maximum possible sector size, 64K,
5972	* because we don't know what's the sector size of the
5973	* filesystem that receives the stream, so we have to
5974	* assume the largest possible sector size.
5975	*/
5976	if (src_end == clone_src_i_size &&
5977	!IS_ALIGNED(src_end, sectorsize) &&
5978	offset + clone_len < sctx->cur_inode_size) {
5979	u64 slen;
5980
5981	slen = ALIGN_DOWN(src_end - clone_root->offset,
5982	sectorsize);
5983	if (slen > `0`) {
5984	ret = send_clone(sctx, offset, len: slen,
5985	clone_root);
5986	if (ret < `0`)
5987	return ret;
5988	}
5989	ret = send_extent_data(sctx, path: dst_path,
5990	offset: offset + slen,
5991	len: clone_len - slen);
5992	} else {
5993	ret = send_clone(sctx, offset, len: clone_len,
5994	clone_root);
5995	}
5996	} else if (crossed_src_i_size && clone_len < len) {
5997	/*
5998	* If we are at i_size of the clone source inode and we
5999	* can not clone from it, terminate the loop. This is
6000	* to avoid sending two write operations, one with a
6001	* length matching clone_len and the final one after
6002	* this loop with a length of len - clone_len.
6003	*
6004	* When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6005	* was passed to the send ioctl), this helps avoid
6006	* sending an encoded write for an offset that is not
6007	* sector size aligned, in case the i_size of the source
6008	* inode is not sector size aligned. That will make the
6009	* receiver fallback to decompression of the data and
6010	* writing it using regular buffered IO, therefore while
6011	* not incorrect, it's not optimal due decompression and
6012	* possible re-compression at the receiver.
6013	*/
6014	break;
6015	} else {
6016	ret = send_extent_data(sctx, path: dst_path, offset,
6017	len: clone_len);
6018	}
6019
6020	if (ret < `0`)
6021	return ret;
6022
6023	len -= clone_len;
6024	if (len == `0`)
6025	break;
6026	offset += clone_len;
6027	clone_root->offset += clone_len;
6028
6029	/*
6030	* If we are cloning from the file we are currently processing,
6031	* and using the send root as the clone root, we must stop once
6032	* the current clone offset reaches the current eof of the file
6033	* at the receiver, otherwise we would issue an invalid clone
6034	* operation (source range going beyond eof) and cause the
6035	* receiver to fail. So if we reach the current eof, bail out
6036	* and fallback to a regular write.
6037	*/
6038	if (clone_root->root == sctx->send_root &&
6039	clone_root->ino == sctx->cur_ino &&
6040	clone_root->offset >= sctx->cur_inode_next_write_offset)
6041	break;
6042
6043	data_offset += clone_len;
6044	next:
6045	path->slots[`0`]++;
6046	}
6047
6048	if (len > `0`)
6049	ret = send_extent_data(sctx, path: dst_path, offset, len);
6050	else
6051	ret = `0`;
6052	return ret;
6053	}
6054
6055	static int send_write_or_clone(struct send_ctx *sctx,
6056	struct btrfs_path *path,
6057	struct btrfs_key *key,
6058	struct clone_root *clone_root)
6059	{
6060	int ret = `0`;
6061	u64 offset = key->offset;
6062	u64 end;
6063	u64 bs = sctx->send_root->fs_info->sectorsize;
6064	struct btrfs_file_extent_item *ei;
6065	u64 disk_byte;
6066	u64 data_offset;
6067	u64 num_bytes;
6068	struct btrfs_inode_info info = { `0` };
6069
6070	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
6071	if (offset >= end)
6072	return `0`;
6073
6074	num_bytes = end - offset;
6075
6076	if (!clone_root)
6077	goto write_data;
6078
6079	if (IS_ALIGNED(end, bs))
6080	goto clone_data;
6081
6082	/*
6083	* If the extent end is not aligned, we can clone if the extent ends at
6084	* the i_size of the inode and the clone range ends at the i_size of the
6085	* source inode, otherwise the clone operation fails with -EINVAL.
6086	*/
6087	if (end != sctx->cur_inode_size)
6088	goto write_data;
6089
6090	ret = get_inode_info(root: clone_root->root, ino: clone_root->ino, info: &info);
6091	if (ret < `0`)
6092	return ret;
6093
6094	if (clone_root->offset + num_bytes == info.size) {
6095	/*
6096	* The final size of our file matches the end offset, but it may
6097	* be that its current size is larger, so we have to truncate it
6098	* to any value between the start offset of the range and the
6099	* final i_size, otherwise the clone operation is invalid
6100	* because it's unaligned and it ends before the current EOF.
6101	* We do this truncate to the final i_size when we finish
6102	* processing the inode, but it's too late by then. And here we
6103	* truncate to the start offset of the range because it's always
6104	* sector size aligned while if it were the final i_size it
6105	* would result in dirtying part of a page, filling part of a
6106	* page with zeroes and then having the clone operation at the
6107	* receiver trigger IO and wait for it due to the dirty page.
6108	*/
6109	if (sctx->parent_root != NULL) {
6110	ret = send_truncate(sctx, ino: sctx->cur_ino,
6111	gen: sctx->cur_inode_gen, size: offset);
6112	if (ret < `0`)
6113	return ret;
6114	}
6115	goto clone_data;
6116	}
6117
6118	write_data:
6119	ret = send_extent_data(sctx, path, offset, len: num_bytes);
6120	sctx->cur_inode_next_write_offset = end;
6121	return ret;
6122
6123	clone_data:
6124	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6125	struct btrfs_file_extent_item);
6126	disk_byte = btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei);
6127	data_offset = btrfs_file_extent_offset(eb: path->nodes[`0`], s: ei);
6128	ret = clone_range(sctx, dst_path: path, clone_root, disk_byte, data_offset, offset,
6129	len: num_bytes);
6130	sctx->cur_inode_next_write_offset = end;
6131	return ret;
6132	}
6133
6134	static int is_extent_unchanged(struct send_ctx *sctx,
6135	struct btrfs_path *left_path,
6136	struct btrfs_key *ekey)
6137	{
6138	int ret = `0`;
6139	struct btrfs_key key;
6140	BTRFS_PATH_AUTO_FREE(path);
6141	struct extent_buffer *eb;
6142	int slot;
6143	struct btrfs_key found_key;
6144	struct btrfs_file_extent_item *ei;
6145	u64 left_disknr;
6146	u64 right_disknr;
6147	u64 left_offset;
6148	u64 right_offset;
6149	u64 left_offset_fixed;
6150	u64 left_len;
6151	u64 right_len;
6152	u64 left_gen;
6153	u64 right_gen;
6154	u8 left_type;
6155	u8 right_type;
6156
6157	path = alloc_path_for_send();
6158	if (!path)
6159	return -ENOMEM;
6160
6161	eb = left_path->nodes[`0`];
6162	slot = left_path->slots[`0`];
6163	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6164	left_type = btrfs_file_extent_type(eb, s: ei);
6165
6166	if (left_type != BTRFS_FILE_EXTENT_REG)
6167	return `0`;
6168
6169	left_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6170	left_len = btrfs_file_extent_num_bytes(eb, s: ei);
6171	left_offset = btrfs_file_extent_offset(eb, s: ei);
6172	left_gen = btrfs_file_extent_generation(eb, s: ei);
6173
6174	/*
6175	* Following comments will refer to these graphics. L is the left
6176	* extents which we are checking at the moment. 1-8 are the right
6177	* extents that we iterate.
6178	*
6179	* \|-----L-----\|
6180	* \|-1-\|-2a-\|-3-\|-4-\|-5-\|-6-\|
6181	*
6182	* \|-----L-----\|
6183	* \|--1--\|-2b-\|...(same as above)
6184	*
6185	* Alternative situation. Happens on files where extents got split.
6186	* \|-----L-----\|
6187	* \|-----------7-----------\|-6-\|
6188	*
6189	* Alternative situation. Happens on files which got larger.
6190	* \|-----L-----\|
6191	* \|-8-\|
6192	* Nothing follows after 8.
6193	*/
6194
6195	key.objectid = ekey->objectid;
6196	key.type = BTRFS_EXTENT_DATA_KEY;
6197	key.offset = ekey->offset;
6198	ret = btrfs_search_slot_for_read(root: sctx->parent_root, key: &key, p: path, find_higher: `0`, return_any: `0`);
6199	if (ret < `0`)
6200	return ret;
6201	if (ret)
6202	return `0`;
6203
6204	/*
6205	* Handle special case where the right side has no extents at all.
6206	*/
6207	eb = path->nodes[`0`];
6208	slot = path->slots[`0`];
6209	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6210	if (found_key.objectid != key.objectid \|\|
6211	found_key.type != key.type)
6212	/ If we're a hole then just pretend nothing changed /
6213	return (left_disknr ? `0` : `1`);
6214
6215	/*
6216	* We're now on 2a, 2b or 7.
6217	*/
6218	key = found_key;
6219	while (key.offset < ekey->offset + left_len) {
6220	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6221	right_type = btrfs_file_extent_type(eb, s: ei);
6222	if (right_type != BTRFS_FILE_EXTENT_REG &&
6223	right_type != BTRFS_FILE_EXTENT_INLINE)
6224	return `0`;
6225
6226	if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6227	right_len = btrfs_file_extent_ram_bytes(eb, s: ei);
6228	right_len = PAGE_ALIGN(right_len);
6229	} else {
6230	right_len = btrfs_file_extent_num_bytes(eb, s: ei);
6231	}
6232
6233	/*
6234	* Are we at extent 8? If yes, we know the extent is changed.
6235	* This may only happen on the first iteration.
6236	*/
6237	if (found_key.offset + right_len <= ekey->offset)
6238	/ If we're a hole just pretend nothing changed /
6239	return (left_disknr ? `0` : `1`);
6240
6241	/*
6242	* We just wanted to see if when we have an inline extent, what
6243	* follows it is a regular extent (wanted to check the above
6244	* condition for inline extents too). This should normally not
6245	* happen but it's possible for example when we have an inline
6246	* compressed extent representing data with a size matching
6247	* the page size (currently the same as sector size).
6248	*/
6249	if (right_type == BTRFS_FILE_EXTENT_INLINE)
6250	return `0`;
6251
6252	right_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6253	right_offset = btrfs_file_extent_offset(eb, s: ei);
6254	right_gen = btrfs_file_extent_generation(eb, s: ei);
6255
6256	left_offset_fixed = left_offset;
6257	if (key.offset < ekey->offset) {
6258	/ Fix the right offset for 2a and 7. /
6259	right_offset += ekey->offset - key.offset;
6260	} else {
6261	/ Fix the left offset for all behind 2a and 2b /
6262	left_offset_fixed += key.offset - ekey->offset;
6263	}
6264
6265	/*
6266	* Check if we have the same extent.
6267	*/
6268	if (left_disknr != right_disknr \|\|
6269	left_offset_fixed != right_offset \|\|
6270	left_gen != right_gen)
6271	return `0`;
6272
6273	/*
6274	* Go to the next extent.
6275	*/
6276	ret = btrfs_next_item(root: sctx->parent_root, p: path);
6277	if (ret < `0`)
6278	return ret;
6279	if (!ret) {
6280	eb = path->nodes[`0`];
6281	slot = path->slots[`0`];
6282	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6283	}
6284	if (ret \|\| found_key.objectid != key.objectid \|\|
6285	found_key.type != key.type) {
6286	key.offset += right_len;
6287	break;
6288	}
6289	if (found_key.offset != key.offset + right_len)
6290	return `0`;
6291
6292	key = found_key;
6293	}
6294
6295	/*
6296	* We're now behind the left extent (treat as unchanged) or at the end
6297	* of the right side (treat as changed).
6298	*/
6299	if (key.offset >= ekey->offset + left_len)
6300	ret = `1`;
6301	else
6302	ret = `0`;
6303
6304	return ret;
6305	}
6306
6307	static int get_last_extent(struct send_ctx *sctx, u64 offset)
6308	{
6309	BTRFS_PATH_AUTO_FREE(path);
6310	struct btrfs_root *root = sctx->send_root;
6311	struct btrfs_key key;
6312	int ret;
6313
6314	path = alloc_path_for_send();
6315	if (!path)
6316	return -ENOMEM;
6317
6318	sctx->cur_inode_last_extent = `0`;
6319
6320	key.objectid = sctx->cur_ino;
6321	key.type = BTRFS_EXTENT_DATA_KEY;
6322	key.offset = offset;
6323	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `0`, return_any: `1`);
6324	if (ret < `0`)
6325	return ret;
6326	ret = `0`;
6327	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
6328	if (key.objectid != sctx->cur_ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
6329	return ret;
6330
6331	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6332	return ret;
6333	}
6334
6335	static int range_is_hole_in_parent(struct send_ctx *sctx,
6336	const u64 start,
6337	const u64 end)
6338	{
6339	BTRFS_PATH_AUTO_FREE(path);
6340	struct btrfs_key key;
6341	struct btrfs_root *root = sctx->parent_root;
6342	u64 search_start = start;
6343	int ret;
6344
6345	path = alloc_path_for_send();
6346	if (!path)
6347	return -ENOMEM;
6348
6349	key.objectid = sctx->cur_ino;
6350	key.type = BTRFS_EXTENT_DATA_KEY;
6351	key.offset = search_start;
6352	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
6353	if (ret < `0`)
6354	return ret;
6355	if (ret > `0` && path->slots[`0`] > `0`)
6356	path->slots[`0`]--;
6357
6358	while (search_start < end) {
6359	struct extent_buffer *leaf = path->nodes[`0`];
6360	int slot = path->slots[`0`];
6361	struct btrfs_file_extent_item *fi;
6362	u64 extent_end;
6363
6364	if (slot >= btrfs_header_nritems(eb: leaf)) {
6365	ret = btrfs_next_leaf(root, path);
6366	if (ret < `0`)
6367	return ret;
6368	if (ret > `0`)
6369	break;
6370	continue;
6371	}
6372
6373	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
6374	if (key.objectid < sctx->cur_ino \|\|
6375	key.type < BTRFS_EXTENT_DATA_KEY)
6376	goto next;
6377	if (key.objectid > sctx->cur_ino \|\|
6378	key.type > BTRFS_EXTENT_DATA_KEY \|\|
6379	key.offset >= end)
6380	break;
6381
6382	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6383	extent_end = btrfs_file_extent_end(path);
6384	if (extent_end <= start)
6385	goto next;
6386	if (btrfs_file_extent_type(eb: leaf, s: fi) == BTRFS_FILE_EXTENT_INLINE)
6387	return `0`;
6388	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: fi) == `0`) {
6389	search_start = extent_end;
6390	goto next;
6391	}
6392	return `0`;
6393	next:
6394	path->slots[`0`]++;
6395	}
6396	return `1`;
6397	}
6398
6399	static int maybe_send_hole(struct send_ctx sctx, struct* btrfs_path *path,
6400	struct btrfs_key *key)
6401	{
6402	int ret = `0`;
6403
6404	if (sctx->cur_ino != key->objectid \|\| !need_send_hole(sctx))
6405	return `0`;
6406
6407	/*
6408	* Get last extent's end offset (exclusive) if we haven't determined it
6409	* yet (we're processing the first file extent item that is new), or if
6410	* we're at the first slot of a leaf and the last extent's end is less
6411	* than the current extent's offset, because we might have skipped
6412	* entire leaves that contained only file extent items for our current
6413	* inode. These leaves have a generation number smaller (older) than the
6414	* one in the current leaf and the leaf our last extent came from, and
6415	* are located between these 2 leaves.
6416	*/
6417	if ((sctx->cur_inode_last_extent == (u64)-`1`) \|\|
6418	(path->slots[`0`] == `0` && sctx->cur_inode_last_extent < key->offset)) {
6419	ret = get_last_extent(sctx, offset: key->offset - `1`);
6420	if (ret)
6421	return ret;
6422	}
6423
6424	if (sctx->cur_inode_last_extent < key->offset) {
6425	ret = range_is_hole_in_parent(sctx,
6426	start: sctx->cur_inode_last_extent,
6427	end: key->offset);
6428	if (ret < `0`)
6429	return ret;
6430	else if (ret == `0`)
6431	ret = send_hole(sctx, end: key->offset);
6432	else
6433	ret = `0`;
6434	}
6435	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6436	return ret;
6437	}
6438
6439	static int process_extent(struct send_ctx *sctx,
6440	struct btrfs_path *path,
6441	struct btrfs_key *key)
6442	{
6443	struct clone_root *found_clone = NULL;
6444	int ret = `0`;
6445
6446	if (S_ISLNK(sctx->cur_inode_mode))
6447	return `0`;
6448
6449	if (sctx->parent_root && !sctx->cur_inode_new) {
6450	ret = is_extent_unchanged(sctx, left_path: path, ekey: key);
6451	if (ret < `0`)
6452	goto out;
6453	if (ret) {
6454	ret = `0`;
6455	goto out_hole;
6456	}
6457	} else {
6458	struct btrfs_file_extent_item *ei;
6459	u8 type;
6460
6461	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6462	struct btrfs_file_extent_item);
6463	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
6464	if (type == BTRFS_FILE_EXTENT_PREALLOC \|\|
6465	type == BTRFS_FILE_EXTENT_REG) {
6466	/*
6467	* The send spec does not have a prealloc command yet,
6468	* so just leave a hole for prealloc'ed extents until
6469	* we have enough commands queued up to justify rev'ing
6470	* the send spec.
6471	*/
6472	if (type == BTRFS_FILE_EXTENT_PREALLOC) {
6473	ret = `0`;
6474	goto out;
6475	}
6476
6477	/ Have a hole, just skip it. /
6478	if (btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei) == `0`) {
6479	ret = `0`;
6480	goto out;
6481	}
6482	}
6483	}
6484
6485	ret = find_extent_clone(sctx, path, ino: key->objectid, data_offset: key->offset,
6486	ino_size: sctx->cur_inode_size, found: &found_clone);
6487	if (ret != -ENOENT && ret < `0`)
6488	goto out;
6489
6490	ret = send_write_or_clone(sctx, path, key, clone_root: found_clone);
6491	if (ret)
6492	goto out;
6493	out_hole:
6494	ret = maybe_send_hole(sctx, path, key);
6495	out:
6496	return ret;
6497	}
6498
6499	static int process_all_extents(struct send_ctx *sctx)
6500	{
6501	int ret = `0`;
6502	int iter_ret = `0`;
6503	struct btrfs_root *root;
6504	BTRFS_PATH_AUTO_FREE(path);
6505	struct btrfs_key key;
6506	struct btrfs_key found_key;
6507
6508	root = sctx->send_root;
6509	path = alloc_path_for_send();
6510	if (!path)
6511	return -ENOMEM;
6512
6513	key.objectid = sctx->cmp_key->objectid;
6514	key.type = BTRFS_EXTENT_DATA_KEY;
6515	key.offset = `0`;
6516	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
6517	if (found_key.objectid != key.objectid \|\|
6518	found_key.type != key.type) {
6519	ret = `0`;
6520	break;
6521	}
6522
6523	ret = process_extent(sctx, path, key: &found_key);
6524	if (ret < `0`)
6525	break;
6526	}
6527	/ Catch error found during iteration /
6528	if (iter_ret < `0`)
6529	ret = iter_ret;
6530
6531	return ret;
6532	}
6533
6534	static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
6535	int *pending_move,
6536	int *refs_processed)
6537	{
6538	int ret = `0`;
6539
6540	if (sctx->cur_ino == `0`)
6541	goto out;
6542	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6543	sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6544	goto out;
6545	if (list_empty(head: &sctx->new_refs) && list_empty(head: &sctx->deleted_refs))
6546	goto out;
6547
6548	ret = process_recorded_refs(sctx, pending_move);
6549	if (ret < `0`)
6550	goto out;
6551
6552	*refs_processed = `1`;
6553	out:
6554	return ret;
6555	}
6556
6557	static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
6558	{
6559	int ret = `0`;
6560	struct btrfs_inode_info info;
6561	u64 left_mode;
6562	u64 left_uid;
6563	u64 left_gid;
6564	u64 left_fileattr;
6565	u64 right_mode;
6566	u64 right_uid;
6567	u64 right_gid;
6568	u64 right_fileattr;
6569	int need_chmod = `0`;
6570	int need_chown = `0`;
6571	bool need_fileattr = false;
6572	int need_truncate = `1`;
6573	int pending_move = `0`;
6574	int refs_processed = `0`;
6575
6576	if (sctx->ignore_cur_inode)
6577	return `0`;
6578
6579	ret = process_recorded_refs_if_needed(sctx, at_end, pending_move: &pending_move,
6580	refs_processed: &refs_processed);
6581	if (ret < `0`)
6582	goto out;
6583
6584	/*
6585	* We have processed the refs and thus need to advance send_progress.
6586	* Now, calls to get_cur_xxx will take the updated refs of the current
6587	* inode into account.
6588	*
6589	* On the other hand, if our current inode is a directory and couldn't
6590	* be moved/renamed because its parent was renamed/moved too and it has
6591	* a higher inode number, we can only move/rename our current inode
6592	* after we moved/renamed its parent. Therefore in this case operate on
6593	* the old path (pre move/rename) of our current inode, and the
6594	* move/rename will be performed later.
6595	*/
6596	if (refs_processed && !pending_move)
6597	sctx->send_progress = sctx->cur_ino + `1`;
6598
6599	if (sctx->cur_ino == `0` \|\| sctx->cur_inode_deleted)
6600	goto out;
6601	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6602	goto out;
6603	ret = get_inode_info(root: sctx->send_root, ino: sctx->cur_ino, info: &info);
6604	if (ret < `0`)
6605	goto out;
6606	left_mode = info.mode;
6607	left_uid = info.uid;
6608	left_gid = info.gid;
6609	left_fileattr = info.fileattr;
6610
6611	if (!sctx->parent_root \|\| sctx->cur_inode_new) {
6612	need_chown = `1`;
6613	if (!S_ISLNK(sctx->cur_inode_mode))
6614	need_chmod = `1`;
6615	if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6616	need_truncate = `0`;
6617	} else {
6618	u64 old_size;
6619
6620	ret = get_inode_info(root: sctx->parent_root, ino: sctx->cur_ino, info: &info);
6621	if (ret < `0`)
6622	goto out;
6623	old_size = info.size;
6624	right_mode = info.mode;
6625	right_uid = info.uid;
6626	right_gid = info.gid;
6627	right_fileattr = info.fileattr;
6628
6629	if (left_uid != right_uid \|\| left_gid != right_gid)
6630	need_chown = `1`;
6631	if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6632	need_chmod = `1`;
6633	if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
6634	need_fileattr = true;
6635	if ((old_size == sctx->cur_inode_size) \|\|
6636	(sctx->cur_inode_size > old_size &&
6637	sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6638	need_truncate = `0`;
6639	}
6640
6641	if (S_ISREG(sctx->cur_inode_mode)) {
6642	if (need_send_hole(sctx)) {
6643	if (sctx->cur_inode_last_extent == (u64)-`1` \|\|
6644	sctx->cur_inode_last_extent <
6645	sctx->cur_inode_size) {
6646	ret = get_last_extent(sctx, offset: (u64)-`1`);
6647	if (ret)
6648	goto out;
6649	}
6650	if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
6651	ret = range_is_hole_in_parent(sctx,
6652	start: sctx->cur_inode_last_extent,
6653	end: sctx->cur_inode_size);
6654	if (ret < `0`) {
6655	goto out;
6656	} else if (ret == `0`) {
6657	ret = send_hole(sctx, end: sctx->cur_inode_size);
6658	if (ret < `0`)
6659	goto out;
6660	} else {
6661	/ Range is already a hole, skip. /
6662	ret = `0`;
6663	}
6664	}
6665	}
6666	if (need_truncate) {
6667	ret = send_truncate(sctx, ino: sctx->cur_ino,
6668	gen: sctx->cur_inode_gen,
6669	size: sctx->cur_inode_size);
6670	if (ret < `0`)
6671	goto out;
6672	}
6673	}
6674
6675	if (need_chown) {
6676	ret = send_chown(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6677	uid: left_uid, gid: left_gid);
6678	if (ret < `0`)
6679	goto out;
6680	}
6681	if (need_chmod) {
6682	ret = send_chmod(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6683	mode: left_mode);
6684	if (ret < `0`)
6685	goto out;
6686	}
6687	if (need_fileattr) {
6688	ret = send_fileattr(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6689	fileattr: left_fileattr);
6690	if (ret < `0`)
6691	goto out;
6692	}
6693
6694	if (proto_cmd_ok(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY)
6695	&& sctx->cur_inode_needs_verity) {
6696	ret = process_verity(sctx);
6697	if (ret < `0`)
6698	goto out;
6699	}
6700
6701	ret = send_capabilities(sctx);
6702	if (ret < `0`)
6703	goto out;
6704
6705	/*
6706	* If other directory inodes depended on our current directory
6707	* inode's move/rename, now do their move/rename operations.
6708	*/
6709	if (!is_waiting_for_move(sctx, ino: sctx->cur_ino)) {
6710	ret = apply_children_dir_moves(sctx);
6711	if (ret)
6712	goto out;
6713	/*
6714	* Need to send that every time, no matter if it actually
6715	* changed between the two trees as we have done changes to
6716	* the inode before. If our inode is a directory and it's
6717	* waiting to be moved/renamed, we will send its utimes when
6718	* it's moved/renamed, therefore we don't need to do it here.
6719	*/
6720	sctx->send_progress = sctx->cur_ino + `1`;
6721
6722	/*
6723	* If the current inode is a non-empty directory, delay issuing
6724	* the utimes command for it, as it's very likely we have inodes
6725	* with an higher number inside it. We want to issue the utimes
6726	* command only after adding all dentries to it.
6727	*/
6728	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > `0`)
6729	ret = cache_dir_utimes(sctx, dir: sctx->cur_ino, gen: sctx->cur_inode_gen);
6730	else
6731	ret = send_utimes(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen);
6732
6733	if (ret < `0`)
6734	goto out;
6735	}
6736
6737	out:
6738	if (!ret)
6739	ret = trim_dir_utimes_cache(sctx);
6740
6741	return ret;
6742	}
6743
6744	static void close_current_inode(struct send_ctx *sctx)
6745	{
6746	u64 i_size;
6747
6748	if (sctx->cur_inode == NULL)
6749	return;
6750
6751	i_size = i_size_read(inode: sctx->cur_inode);
6752
6753	/*
6754	* If we are doing an incremental send, we may have extents between the
6755	* last processed extent and the i_size that have not been processed
6756	* because they haven't changed but we may have read some of their pages
6757	* through readahead, see the comments at send_extent_data().
6758	*/
6759	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
6760	truncate_inode_pages_range(mapping: &sctx->cur_inode->i_data,
6761	lstart: sctx->page_cache_clear_start,
6762	round_up(i_size, PAGE_SIZE) - `1`);
6763
6764	iput(sctx->cur_inode);
6765	sctx->cur_inode = NULL;
6766	}
6767
6768	static int changed_inode(struct send_ctx *sctx,
6769	enum btrfs_compare_tree_result result)
6770	{
6771	int ret = `0`;
6772	struct btrfs_key *key = sctx->cmp_key;
6773	struct btrfs_inode_item *left_ii = NULL;
6774	struct btrfs_inode_item *right_ii = NULL;
6775	u64 left_gen = `0`;
6776	u64 right_gen = `0`;
6777
6778	close_current_inode(sctx);
6779
6780	sctx->cur_ino = key->objectid;
6781	sctx->cur_inode_new_gen = false;
6782	sctx->cur_inode_last_extent = (u64)-`1`;
6783	sctx->cur_inode_next_write_offset = `0`;
6784	sctx->ignore_cur_inode = false;
6785	fs_path_reset(p: &sctx->cur_inode_path);
6786
6787	/*
6788	* Set send_progress to current inode. This will tell all get_cur_xxx
6789	* functions that the current inode's refs are not updated yet. Later,
6790	* when process_recorded_refs is finished, it is set to cur_ino + 1.
6791	*/
6792	sctx->send_progress = sctx->cur_ino;
6793
6794	if (result == BTRFS_COMPARE_TREE_NEW \|\|
6795	result == BTRFS_COMPARE_TREE_CHANGED) {
6796	left_ii = btrfs_item_ptr(sctx->left_path->nodes[`0`],
6797	sctx->left_path->slots[`0`],
6798	struct btrfs_inode_item);
6799	left_gen = btrfs_inode_generation(eb: sctx->left_path->nodes[`0`],
6800	s: left_ii);
6801	} else {
6802	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6803	sctx->right_path->slots[`0`],
6804	struct btrfs_inode_item);
6805	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6806	s: right_ii);
6807	}
6808	if (result == BTRFS_COMPARE_TREE_CHANGED) {
6809	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6810	sctx->right_path->slots[`0`],
6811	struct btrfs_inode_item);
6812
6813	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6814	s: right_ii);
6815
6816	/*
6817	* The cur_ino = root dir case is special here. We can't treat
6818	* the inode as deleted+reused because it would generate a
6819	* stream that tries to delete/mkdir the root dir.
6820	*/
6821	if (left_gen != right_gen &&
6822	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6823	sctx->cur_inode_new_gen = true;
6824	}
6825
6826	/*
6827	* Normally we do not find inodes with a link count of zero (orphans)
6828	* because the most common case is to create a snapshot and use it
6829	* for a send operation. However other less common use cases involve
6830	* using a subvolume and send it after turning it to RO mode just
6831	* after deleting all hard links of a file while holding an open
6832	* file descriptor against it or turning a RO snapshot into RW mode,
6833	* keep an open file descriptor against a file, delete it and then
6834	* turn the snapshot back to RO mode before using it for a send
6835	* operation. The former is what the receiver operation does.
6836	* Therefore, if we want to send these snapshots soon after they're
6837	* received, we need to handle orphan inodes as well. Moreover, orphans
6838	* can appear not only in the send snapshot but also in the parent
6839	* snapshot. Here are several cases:
6840	*
6841	* Case 1: BTRFS_COMPARE_TREE_NEW
6842	* \| send snapshot \| action
6843	* --------------------------------
6844	* nlink \| 0 \| ignore
6845	*
6846	* Case 2: BTRFS_COMPARE_TREE_DELETED
6847	* \| parent snapshot \| action
6848	* ----------------------------------
6849	* nlink \| 0 \| as usual
6850	* Note: No unlinks will be sent because there're no paths for it.
6851	*
6852	* Case 3: BTRFS_COMPARE_TREE_CHANGED
6853	* \| \| parent snapshot \| send snapshot \| action
6854	* -----------------------------------------------------------------------
6855	* subcase 1 \| nlink \| 0 \| 0 \| ignore
6856	* subcase 2 \| nlink \| >0 \| 0 \| new_gen(deletion)
6857	* subcase 3 \| nlink \| 0 \| >0 \| new_gen(creation)
6858	*
6859	*/
6860	if (result == BTRFS_COMPARE_TREE_NEW) {
6861	if (btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii) == `0`) {
6862	sctx->ignore_cur_inode = true;
6863	goto out;
6864	}
6865	sctx->cur_inode_gen = left_gen;
6866	sctx->cur_inode_new = true;
6867	sctx->cur_inode_deleted = false;
6868	sctx->cur_inode_size = btrfs_inode_size(
6869	eb: sctx->left_path->nodes[`0`], s: left_ii);
6870	sctx->cur_inode_mode = btrfs_inode_mode(
6871	eb: sctx->left_path->nodes[`0`], s: left_ii);
6872	sctx->cur_inode_rdev = btrfs_inode_rdev(
6873	eb: sctx->left_path->nodes[`0`], s: left_ii);
6874	if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6875	ret = send_create_inode_if_needed(sctx);
6876	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
6877	sctx->cur_inode_gen = right_gen;
6878	sctx->cur_inode_new = false;
6879	sctx->cur_inode_deleted = true;
6880	sctx->cur_inode_size = btrfs_inode_size(
6881	eb: sctx->right_path->nodes[`0`], s: right_ii);
6882	sctx->cur_inode_mode = btrfs_inode_mode(
6883	eb: sctx->right_path->nodes[`0`], s: right_ii);
6884	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6885	u32 new_nlinks, old_nlinks;
6886
6887	new_nlinks = btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii);
6888	old_nlinks = btrfs_inode_nlink(eb: sctx->right_path->nodes[`0`], s: right_ii);
6889	if (new_nlinks == `0` && old_nlinks == `0`) {
6890	sctx->ignore_cur_inode = true;
6891	goto out;
6892	} else if (new_nlinks == `0` \|\| old_nlinks == `0`) {
6893	sctx->cur_inode_new_gen = `1`;
6894	}
6895	/*
6896	* We need to do some special handling in case the inode was
6897	* reported as changed with a changed generation number. This
6898	* means that the original inode was deleted and new inode
6899	* reused the same inum. So we have to treat the old inode as
6900	* deleted and the new one as new.
6901	*/
6902	if (sctx->cur_inode_new_gen) {
6903	/*
6904	* First, process the inode as if it was deleted.
6905	*/
6906	if (old_nlinks > `0`) {
6907	sctx->cur_inode_gen = right_gen;
6908	sctx->cur_inode_new = false;
6909	sctx->cur_inode_deleted = true;
6910	sctx->cur_inode_size = btrfs_inode_size(
6911	eb: sctx->right_path->nodes[`0`], s: right_ii);
6912	sctx->cur_inode_mode = btrfs_inode_mode(
6913	eb: sctx->right_path->nodes[`0`], s: right_ii);
6914	ret = process_all_refs(sctx,
6915	cmd: BTRFS_COMPARE_TREE_DELETED);
6916	if (ret < `0`)
6917	goto out;
6918	}
6919
6920	/*
6921	* Now process the inode as if it was new.
6922	*/
6923	if (new_nlinks > `0`) {
6924	sctx->cur_inode_gen = left_gen;
6925	sctx->cur_inode_new = true;
6926	sctx->cur_inode_deleted = false;
6927	sctx->cur_inode_size = btrfs_inode_size(
6928	eb: sctx->left_path->nodes[`0`],
6929	s: left_ii);
6930	sctx->cur_inode_mode = btrfs_inode_mode(
6931	eb: sctx->left_path->nodes[`0`],
6932	s: left_ii);
6933	sctx->cur_inode_rdev = btrfs_inode_rdev(
6934	eb: sctx->left_path->nodes[`0`],
6935	s: left_ii);
6936	ret = send_create_inode_if_needed(sctx);
6937	if (ret < `0`)
6938	goto out;
6939
6940	ret = process_all_refs(sctx, cmd: BTRFS_COMPARE_TREE_NEW);
6941	if (ret < `0`)
6942	goto out;
6943	/*
6944	* Advance send_progress now as we did not get
6945	* into process_recorded_refs_if_needed in the
6946	* new_gen case.
6947	*/
6948	sctx->send_progress = sctx->cur_ino + `1`;
6949
6950	/*
6951	* Now process all extents and xattrs of the
6952	* inode as if they were all new.
6953	*/
6954	ret = process_all_extents(sctx);
6955	if (ret < `0`)
6956	goto out;
6957	ret = process_all_new_xattrs(sctx);
6958	if (ret < `0`)
6959	goto out;
6960	}
6961	} else {
6962	sctx->cur_inode_gen = left_gen;
6963	sctx->cur_inode_new = false;
6964	sctx->cur_inode_new_gen = false;
6965	sctx->cur_inode_deleted = false;
6966	sctx->cur_inode_size = btrfs_inode_size(
6967	eb: sctx->left_path->nodes[`0`], s: left_ii);
6968	sctx->cur_inode_mode = btrfs_inode_mode(
6969	eb: sctx->left_path->nodes[`0`], s: left_ii);
6970	}
6971	}
6972
6973	out:
6974	return ret;
6975	}
6976
6977	/*
6978	* We have to process new refs before deleted refs, but compare_trees gives us
6979	* the new and deleted refs mixed. To fix this, we record the new/deleted refs
6980	* first and later process them in process_recorded_refs.
6981	* For the cur_inode_new_gen case, we skip recording completely because
6982	* changed_inode did already initiate processing of refs. The reason for this is
6983	* that in this case, compare_tree actually compares the refs of 2 different
6984	* inodes. To fix this, process_all_refs is used in changed_inode to handle all
6985	* refs of the right tree as deleted and all refs of the left tree as new.
6986	*/
6987	static int changed_ref(struct send_ctx *sctx,
6988	enum btrfs_compare_tree_result result)
6989	{
6990	int ret = `0`;
6991
6992	if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
6993	inconsistent_snapshot_error(sctx, result, what: "reference");
6994	return -EIO;
6995	}
6996
6997	if (!sctx->cur_inode_new_gen &&
6998	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
6999	if (result == BTRFS_COMPARE_TREE_NEW)
7000	ret = record_new_ref(sctx);
7001	else if (result == BTRFS_COMPARE_TREE_DELETED)
7002	ret = record_deleted_ref(sctx);
7003	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7004	ret = record_changed_ref(sctx);
7005	}
7006
7007	return ret;
7008	}
7009
7010	/*
7011	* Process new/deleted/changed xattrs. We skip processing in the
7012	* cur_inode_new_gen case because changed_inode did already initiate processing
7013	* of xattrs. The reason is the same as in changed_ref
7014	*/
7015	static int changed_xattr(struct send_ctx *sctx,
7016	enum btrfs_compare_tree_result result)
7017	{
7018	int ret = `0`;
7019
7020	if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
7021	inconsistent_snapshot_error(sctx, result, what: "xattr");
7022	return -EIO;
7023	}
7024
7025	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7026	if (result == BTRFS_COMPARE_TREE_NEW)
7027	ret = process_new_xattr(sctx);
7028	else if (result == BTRFS_COMPARE_TREE_DELETED)
7029	ret = process_deleted_xattr(sctx);
7030	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7031	ret = process_changed_xattr(sctx);
7032	}
7033
7034	return ret;
7035	}
7036
7037	/*
7038	* Process new/deleted/changed extents. We skip processing in the
7039	* cur_inode_new_gen case because changed_inode did already initiate processing
7040	* of extents. The reason is the same as in changed_ref
7041	*/
7042	static int changed_extent(struct send_ctx *sctx,
7043	enum btrfs_compare_tree_result result)
7044	{
7045	int ret = `0`;
7046
7047	/*
7048	* We have found an extent item that changed without the inode item
7049	* having changed. This can happen either after relocation (where the
7050	* disk_bytenr of an extent item is replaced at
7051	* relocation.c:replace_file_extents()) or after deduplication into a
7052	* file in both the parent and send snapshots (where an extent item can
7053	* get modified or replaced with a new one). Note that deduplication
7054	* updates the inode item, but it only changes the iversion (sequence
7055	* field in the inode item) of the inode, so if a file is deduplicated
7056	* the same amount of times in both the parent and send snapshots, its
7057	* iversion becomes the same in both snapshots, whence the inode item is
7058	* the same on both snapshots.
7059	*/
7060	if (sctx->cur_ino != sctx->cmp_key->objectid)
7061	return `0`;
7062
7063	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7064	if (result != BTRFS_COMPARE_TREE_DELETED)
7065	ret = process_extent(sctx, path: sctx->left_path,
7066	key: sctx->cmp_key);
7067	}
7068
7069	return ret;
7070	}
7071
7072	static int changed_verity(struct send_ctx sctx, enum* btrfs_compare_tree_result result)
7073	{
7074	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7075	if (result == BTRFS_COMPARE_TREE_NEW)
7076	sctx->cur_inode_needs_verity = true;
7077	}
7078	return `0`;
7079	}
7080
7081	static int dir_changed(struct send_ctx *sctx, u64 dir)
7082	{
7083	u64 orig_gen, new_gen;
7084	int ret;
7085
7086	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &new_gen);
7087	if (ret)
7088	return ret;
7089
7090	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &orig_gen);
7091	if (ret)
7092	return ret;
7093
7094	return (orig_gen != new_gen) ? `1` : `0`;
7095	}
7096
7097	static int compare_refs(struct send_ctx sctx, struct* btrfs_path *path,
7098	struct btrfs_key *key)
7099	{
7100	struct btrfs_inode_extref *extref;
7101	struct extent_buffer *leaf;
7102	u64 dirid = `0`, last_dirid = `0`;
7103	unsigned long ptr;
7104	u32 item_size;
7105	u32 cur_offset = `0`;
7106	int ref_name_len;
7107	int ret = `0`;
7108
7109	/ Easy case, just check this one dirid /
7110	if (key->type == BTRFS_INODE_REF_KEY) {
7111	dirid = key->offset;
7112
7113	ret = dir_changed(sctx, dir: dirid);
7114	goto out;
7115	}
7116
7117	leaf = path->nodes[`0`];
7118	item_size = btrfs_item_size(eb: leaf, slot: path->slots[`0`]);
7119	ptr = btrfs_item_ptr_offset(leaf, path->slots[`0`]);
7120	while (cur_offset < item_size) {
7121	extref = (struct btrfs_inode_extref *)(ptr +
7122	cur_offset);
7123	dirid = btrfs_inode_extref_parent(eb: leaf, s: extref);
7124	ref_name_len = btrfs_inode_extref_name_len(eb: leaf, s: extref);
7125	cur_offset += ref_name_len + sizeof(*extref);
7126	if (dirid == last_dirid)
7127	continue;
7128	ret = dir_changed(sctx, dir: dirid);
7129	if (ret)
7130	break;
7131	last_dirid = dirid;
7132	}
7133	out:
7134	return ret;
7135	}
7136
7137	/*
7138	* Updates compare related fields in sctx and simply forwards to the actual
7139	* changed_xxx functions.
7140	*/
7141	static int changed_cb(struct btrfs_path *left_path,
7142	struct btrfs_path *right_path,
7143	struct btrfs_key *key,
7144	enum btrfs_compare_tree_result result,
7145	struct send_ctx *sctx)
7146	{
7147	int ret;
7148
7149	/*
7150	* We can not hold the commit root semaphore here. This is because in
7151	* the case of sending and receiving to the same filesystem, using a
7152	* pipe, could result in a deadlock:
7153	*
7154	* 1) The task running send blocks on the pipe because it's full;
7155	*
7156	* 2) The task running receive, which is the only consumer of the pipe,
7157	* is waiting for a transaction commit (for example due to a space
7158	* reservation when doing a write or triggering a transaction commit
7159	* when creating a subvolume);
7160	*
7161	* 3) The transaction is waiting to write lock the commit root semaphore,
7162	* but can not acquire it since it's being held at 1).
7163	*
7164	* Down this call chain we write to the pipe through kernel_write().
7165	* The same type of problem can also happen when sending to a file that
7166	* is stored in the same filesystem - when reserving space for a write
7167	* into the file, we can trigger a transaction commit.
7168	*
7169	* Our caller has supplied us with clones of leaves from the send and
7170	* parent roots, so we're safe here from a concurrent relocation and
7171	* further reallocation of metadata extents while we are here. Below we
7172	* also assert that the leaves are clones.
7173	*/
7174	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
7175
7176	/*
7177	* We always have a send root, so left_path is never NULL. We will not
7178	* have a leaf when we have reached the end of the send root but have
7179	* not yet reached the end of the parent root.
7180	*/
7181	if (left_path->nodes[`0`])
7182	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7183	&left_path->nodes[`0`]->bflags));
7184	/*
7185	* When doing a full send we don't have a parent root, so right_path is
7186	* NULL. When doing an incremental send, we may have reached the end of
7187	* the parent root already, so we don't have a leaf at right_path.
7188	*/
7189	if (right_path && right_path->nodes[`0`])
7190	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7191	&right_path->nodes[`0`]->bflags));
7192
7193	if (result == BTRFS_COMPARE_TREE_SAME) {
7194	if (key->type == BTRFS_INODE_REF_KEY \|\|
7195	key->type == BTRFS_INODE_EXTREF_KEY) {
7196	ret = compare_refs(sctx, path: left_path, key);
7197	if (!ret)
7198	return `0`;
7199	if (ret < `0`)
7200	return ret;
7201	} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
7202	return maybe_send_hole(sctx, path: left_path, key);
7203	} else {
7204	return `0`;
7205	}
7206	result = BTRFS_COMPARE_TREE_CHANGED;
7207	}
7208
7209	sctx->left_path = left_path;
7210	sctx->right_path = right_path;
7211	sctx->cmp_key = key;
7212
7213	ret = finish_inode_if_needed(sctx, at_end: `0`);
7214	if (ret < `0`)
7215	goto out;
7216
7217	/ Ignore non-FS objects /
7218	if (key->objectid == BTRFS_FREE_INO_OBJECTID \|\|
7219	key->objectid == BTRFS_FREE_SPACE_OBJECTID)
7220	goto out;
7221
7222	if (key->type == BTRFS_INODE_ITEM_KEY) {
7223	ret = changed_inode(sctx, result);
7224	} else if (!sctx->ignore_cur_inode) {
7225	if (key->type == BTRFS_INODE_REF_KEY \|\|
7226	key->type == BTRFS_INODE_EXTREF_KEY)
7227	ret = changed_ref(sctx, result);
7228	else if (key->type == BTRFS_XATTR_ITEM_KEY)
7229	ret = changed_xattr(sctx, result);
7230	else if (key->type == BTRFS_EXTENT_DATA_KEY)
7231	ret = changed_extent(sctx, result);
7232	else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
7233	key->offset == `0`)
7234	ret = changed_verity(sctx, result);
7235	}
7236
7237	out:
7238	return ret;
7239	}
7240
7241	static int search_key_again(const struct send_ctx *sctx,
7242	struct btrfs_root *root,
7243	struct btrfs_path *path,
7244	const struct btrfs_key *key)
7245	{
7246	int ret;
7247
7248	if (!path->need_commit_sem)
7249	lockdep_assert_held_read(&root->fs_info->commit_root_sem);
7250
7251	/*
7252	* Roots used for send operations are readonly and no one can add,
7253	* update or remove keys from them, so we should be able to find our
7254	* key again. The only exception is deduplication, which can operate on
7255	* readonly roots and add, update or remove keys to/from them - but at
7256	* the moment we don't allow it to run in parallel with send.
7257	*/
7258	ret = btrfs_search_slot(NULL, root, key, p: path, ins_len: `0`, cow: `0`);
7259	ASSERT(ret <= `0`);
7260	if (unlikely(ret > `0`)) {
7261	btrfs_print_tree(c: path->nodes[path->lowest_level], follow: false);
7262	btrfs_err(root->fs_info,
7263	"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d",
7264	BTRFS_KEY_FMT_VALUE(key),
7265	(root == sctx->parent_root ? "parent" : "send"),
7266	btrfs_root_id(root), path->lowest_level,
7267	path->slots[path->lowest_level]);
7268	return -EUCLEAN;
7269	}
7270
7271	return ret;
7272	}
7273
7274	static int full_send_tree(struct send_ctx *sctx)
7275	{
7276	int ret;
7277	struct btrfs_root *send_root = sctx->send_root;
7278	struct btrfs_key key;
7279	struct btrfs_fs_info *fs_info = send_root->fs_info;
7280	BTRFS_PATH_AUTO_FREE(path);
7281
7282	path = alloc_path_for_send();
7283	if (!path)
7284	return -ENOMEM;
7285	path->reada = READA_FORWARD_ALWAYS;
7286
7287	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7288	key.type = BTRFS_INODE_ITEM_KEY;
7289	key.offset = `0`;
7290
7291	down_read(sem: &fs_info->commit_root_sem);
7292	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7293	up_read(sem: &fs_info->commit_root_sem);
7294
7295	ret = btrfs_search_slot_for_read(root: send_root, key: &key, p: path, find_higher: `1`, return_any: `0`);
7296	if (ret < `0`)
7297	return ret;
7298	if (ret)
7299	goto out_finish;
7300
7301	while (`1`) {
7302	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
7303
7304	ret = changed_cb(left_path: path, NULL, key: &key,
7305	result: BTRFS_COMPARE_TREE_NEW, sctx);
7306	if (ret < `0`)
7307	return ret;
7308
7309	down_read(sem: &fs_info->commit_root_sem);
7310	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7311	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7312	up_read(sem: &fs_info->commit_root_sem);
7313	/*
7314	* A transaction used for relocating a block group was
7315	* committed or is about to finish its commit. Release
7316	* our path (leaf) and restart the search, so that we
7317	* avoid operating on any file extent items that are
7318	* stale, with a disk_bytenr that reflects a pre
7319	* relocation value. This way we avoid as much as
7320	* possible to fallback to regular writes when checking
7321	* if we can clone file ranges.
7322	*/
7323	btrfs_release_path(p: path);
7324	ret = search_key_again(sctx, root: send_root, path, key: &key);
7325	if (ret < `0`)
7326	return ret;
7327	} else {
7328	up_read(sem: &fs_info->commit_root_sem);
7329	}
7330
7331	ret = btrfs_next_item(root: send_root, p: path);
7332	if (ret < `0`)
7333	return ret;
7334	if (ret) {
7335	ret = `0`;
7336	break;
7337	}
7338	}
7339
7340	out_finish:
7341	return finish_inode_if_needed(sctx, at_end: `1`);
7342	}
7343
7344	static int replace_node_with_clone(struct btrfs_path path, int* level)
7345	{
7346	struct extent_buffer *clone;
7347
7348	clone = btrfs_clone_extent_buffer(src: path->nodes[level]);
7349	if (!clone)
7350	return -ENOMEM;
7351
7352	free_extent_buffer(eb: path->nodes[level]);
7353	path->nodes[level] = clone;
7354
7355	return `0`;
7356	}
7357
7358	static int tree_move_down(struct btrfs_path path, int* *level, u64 reada_min_gen)
7359	{
7360	struct extent_buffer *eb;
7361	struct extent_buffer parent = path->nodes[level];
7362	int slot = path->slots[*level];
7363	const int nritems = btrfs_header_nritems(eb: parent);
7364	u64 reada_max;
7365	u64 reada_done = `0`;
7366
7367	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
7368	ASSERT(*level != `0`);
7369
7370	eb = btrfs_read_node_slot(parent, slot);
7371	if (IS_ERR(ptr: eb))
7372	return PTR_ERR(ptr: eb);
7373
7374	/*
7375	* Trigger readahead for the next leaves we will process, so that it is
7376	* very likely that when we need them they are already in memory and we
7377	* will not block on disk IO. For nodes we only do readahead for one,
7378	* since the time window between processing nodes is typically larger.
7379	*/
7380	reada_max = (*level == `1` ? SZ_128K : eb->fs_info->nodesize);
7381
7382	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
7383	if (btrfs_node_ptr_generation(eb: parent, nr: slot) > reada_min_gen) {
7384	btrfs_readahead_node_child(node: parent, slot);
7385	reada_done += eb->fs_info->nodesize;
7386	}
7387	}
7388
7389	path->nodes[*level - `1`] = eb;
7390	path->slots[*level - `1`] = `0`;
7391	(*level)--;
7392
7393	if (*level == `0`)
7394	return replace_node_with_clone(path, level: `0`);
7395
7396	return `0`;
7397	}
7398
7399	static int tree_move_next_or_upnext(struct btrfs_path *path,
7400	int level, int* root_level)
7401	{
7402	int ret = `0`;
7403	int nritems;
7404	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7405
7406	path->slots[*level]++;
7407
7408	while (path->slots[*level] >= nritems) {
7409	if (*level == root_level) {
7410	path->slots[*level] = nritems - `1`;
7411	return -`1`;
7412	}
7413
7414	/ move upnext /
7415	path->slots[*level] = `0`;
7416	free_extent_buffer(eb: path->nodes[*level]);
7417	path->nodes[*level] = NULL;
7418	(*level)++;
7419	path->slots[*level]++;
7420
7421	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7422	ret = `1`;
7423	}
7424	return ret;
7425	}
7426
7427	/*
7428	* Returns 1 if it had to move up and next. 0 is returned if it moved only next
7429	* or down.
7430	*/
7431	static int tree_advance(struct btrfs_path *path,
7432	int level, int* root_level,
7433	int allow_down,
7434	struct btrfs_key *key,
7435	u64 reada_min_gen)
7436	{
7437	int ret;
7438
7439	if (*level == `0` \|\| !allow_down) {
7440	ret = tree_move_next_or_upnext(path, level, root_level);
7441	} else {
7442	ret = tree_move_down(path, level, reada_min_gen);
7443	}
7444
7445	/*
7446	* Even if we have reached the end of a tree, ret is -1, update the key
7447	* anyway, so that in case we need to restart due to a block group
7448	* relocation, we can assert that the last key of the root node still
7449	* exists in the tree.
7450	*/
7451	if (*level == `0`)
7452	btrfs_item_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7453	nr: path->slots[*level]);
7454	else
7455	btrfs_node_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7456	nr: path->slots[*level]);
7457
7458	return ret;
7459	}
7460
7461	static int tree_compare_item(struct btrfs_path *left_path,
7462	struct btrfs_path *right_path,
7463	char *tmp_buf)
7464	{
7465	int cmp;
7466	int len1, len2;
7467	unsigned long off1, off2;
7468
7469	len1 = btrfs_item_size(eb: left_path->nodes[`0`], slot: left_path->slots[`0`]);
7470	len2 = btrfs_item_size(eb: right_path->nodes[`0`], slot: right_path->slots[`0`]);
7471	if (len1 != len2)
7472	return `1`;
7473
7474	off1 = btrfs_item_ptr_offset(left_path->nodes[`0`], left_path->slots[`0`]);
7475	off2 = btrfs_item_ptr_offset(right_path->nodes[`0`],
7476	right_path->slots[`0`]);
7477
7478	read_extent_buffer(eb: left_path->nodes[`0`], dst: tmp_buf, start: off1, len: len1);
7479
7480	cmp = memcmp_extent_buffer(eb: right_path->nodes[`0`], ptrv: tmp_buf, start: off2, len: len1);
7481	if (cmp)
7482	return `1`;
7483	return `0`;
7484	}
7485
7486	/*
7487	* A transaction used for relocating a block group was committed or is about to
7488	* finish its commit. Release our paths and restart the search, so that we are
7489	* not using stale extent buffers:
7490	*
7491	* 1) For levels > 0, we are only holding references of extent buffers, without
7492	* any locks on them, which does not prevent them from having been relocated
7493	* and reallocated after the last time we released the commit root semaphore.
7494	* The exception are the root nodes, for which we always have a clone, see
7495	* the comment at btrfs_compare_trees();
7496	*
7497	* 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
7498	* we are safe from the concurrent relocation and reallocation. However they
7499	* can have file extent items with a pre relocation disk_bytenr value, so we
7500	* restart the start from the current commit roots and clone the new leaves so
7501	* that we get the post relocation disk_bytenr values. Not doing so, could
7502	* make us clone the wrong data in case there are new extents using the old
7503	* disk_bytenr that happen to be shared.
7504	*/
7505	static int restart_after_relocation(struct btrfs_path *left_path,
7506	struct btrfs_path *right_path,
7507	const struct btrfs_key *left_key,
7508	const struct btrfs_key *right_key,
7509	int left_level,
7510	int right_level,
7511	const struct send_ctx *sctx)
7512	{
7513	int root_level;
7514	int ret;
7515
7516	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
7517
7518	btrfs_release_path(p: left_path);
7519	btrfs_release_path(p: right_path);
7520
7521	/*
7522	* Since keys can not be added or removed to/from our roots because they
7523	* are readonly and we do not allow deduplication to run in parallel
7524	* (which can add, remove or change keys), the layout of the trees should
7525	* not change.
7526	*/
7527	left_path->lowest_level = left_level;
7528	ret = search_key_again(sctx, root: sctx->send_root, path: left_path, key: left_key);
7529	if (ret < `0`)
7530	return ret;
7531
7532	right_path->lowest_level = right_level;
7533	ret = search_key_again(sctx, root: sctx->parent_root, path: right_path, key: right_key);
7534	if (ret < `0`)
7535	return ret;
7536
7537	/*
7538	* If the lowest level nodes are leaves, clone them so that they can be
7539	* safely used by changed_cb() while not under the protection of the
7540	* commit root semaphore, even if relocation and reallocation happens in
7541	* parallel.
7542	*/
7543	if (left_level == `0`) {
7544	ret = replace_node_with_clone(path: left_path, level: `0`);
7545	if (ret < `0`)
7546	return ret;
7547	}
7548
7549	if (right_level == `0`) {
7550	ret = replace_node_with_clone(path: right_path, level: `0`);
7551	if (ret < `0`)
7552	return ret;
7553	}
7554
7555	/*
7556	* Now clone the root nodes (unless they happen to be the leaves we have
7557	* already cloned). This is to protect against concurrent snapshotting of
7558	* the send and parent roots (see the comment at btrfs_compare_trees()).
7559	*/
7560	root_level = btrfs_header_level(eb: sctx->send_root->commit_root);
7561	if (root_level > `0`) {
7562	ret = replace_node_with_clone(path: left_path, level: root_level);
7563	if (ret < `0`)
7564	return ret;
7565	}
7566
7567	root_level = btrfs_header_level(eb: sctx->parent_root->commit_root);
7568	if (root_level > `0`) {
7569	ret = replace_node_with_clone(path: right_path, level: root_level);
7570	if (ret < `0`)
7571	return ret;
7572	}
7573
7574	return `0`;
7575	}
7576
7577	/*
7578	* This function compares two trees and calls the provided callback for
7579	* every changed/new/deleted item it finds.
7580	* If shared tree blocks are encountered, whole subtrees are skipped, making
7581	* the compare pretty fast on snapshotted subvolumes.
7582	*
7583	* This currently works on commit roots only. As commit roots are read only,
7584	* we don't do any locking. The commit roots are protected with transactions.
7585	* Transactions are ended and rejoined when a commit is tried in between.
7586	*
7587	* This function checks for modifications done to the trees while comparing.
7588	* If it detects a change, it aborts immediately.
7589	*/
7590	static int btrfs_compare_trees(struct btrfs_root *left_root,
7591	struct btrfs_root right_root, struct* send_ctx *sctx)
7592	{
7593	struct btrfs_fs_info *fs_info = left_root->fs_info;
7594	int ret;
7595	int cmp;
7596	BTRFS_PATH_AUTO_FREE(left_path);
7597	BTRFS_PATH_AUTO_FREE(right_path);
7598	struct btrfs_key left_key;
7599	struct btrfs_key right_key;
7600	char *tmp_buf = NULL;
7601	int left_root_level;
7602	int right_root_level;
7603	int left_level;
7604	int right_level;
7605	int left_end_reached = `0`;
7606	int right_end_reached = `0`;
7607	int advance_left = `0`;
7608	int advance_right = `0`;
7609	u64 left_blockptr;
7610	u64 right_blockptr;
7611	u64 left_gen;
7612	u64 right_gen;
7613	u64 reada_min_gen;
7614
7615	left_path = btrfs_alloc_path();
7616	if (!left_path) {
7617	ret = -ENOMEM;
7618	goto out;
7619	}
7620	right_path = btrfs_alloc_path();
7621	if (!right_path) {
7622	ret = -ENOMEM;
7623	goto out;
7624	}
7625
7626	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
7627	if (!tmp_buf) {
7628	ret = -ENOMEM;
7629	goto out;
7630	}
7631
7632	left_path->search_commit_root = true;
7633	left_path->skip_locking = true;
7634	right_path->search_commit_root = true;
7635	right_path->skip_locking = true;
7636
7637	/*
7638	* Strategy: Go to the first items of both trees. Then do
7639	*
7640	* If both trees are at level 0
7641	* Compare keys of current items
7642	* If left < right treat left item as new, advance left tree
7643	* and repeat
7644	* If left > right treat right item as deleted, advance right tree
7645	* and repeat
7646	* If left == right do deep compare of items, treat as changed if
7647	* needed, advance both trees and repeat
7648	* If both trees are at the same level but not at level 0
7649	* Compare keys of current nodes/leafs
7650	* If left < right advance left tree and repeat
7651	* If left > right advance right tree and repeat
7652	* If left == right compare blockptrs of the next nodes/leafs
7653	* If they match advance both trees but stay at the same level
7654	* and repeat
7655	* If they don't match advance both trees while allowing to go
7656	* deeper and repeat
7657	* If tree levels are different
7658	* Advance the tree that needs it and repeat
7659	*
7660	* Advancing a tree means:
7661	* If we are at level 0, try to go to the next slot. If that's not
7662	* possible, go one level up and repeat. Stop when we found a level
7663	* where we could go to the next slot. We may at this point be on a
7664	* node or a leaf.
7665	*
7666	* If we are not at level 0 and not on shared tree blocks, go one
7667	* level deeper.
7668	*
7669	* If we are not at level 0 and on shared tree blocks, go one slot to
7670	* the right if possible or go up and right.
7671	*/
7672
7673	down_read(sem: &fs_info->commit_root_sem);
7674	left_level = btrfs_header_level(eb: left_root->commit_root);
7675	left_root_level = left_level;
7676	/*
7677	* We clone the root node of the send and parent roots to prevent races
7678	* with snapshot creation of these roots. Snapshot creation COWs the
7679	* root node of a tree, so after the transaction is committed the old
7680	* extent can be reallocated while this send operation is still ongoing.
7681	* So we clone them, under the commit root semaphore, to be race free.
7682	*/
7683	left_path->nodes[left_level] =
7684	btrfs_clone_extent_buffer(src: left_root->commit_root);
7685	if (!left_path->nodes[left_level]) {
7686	ret = -ENOMEM;
7687	goto out_unlock;
7688	}
7689
7690	right_level = btrfs_header_level(eb: right_root->commit_root);
7691	right_root_level = right_level;
7692	right_path->nodes[right_level] =
7693	btrfs_clone_extent_buffer(src: right_root->commit_root);
7694	if (!right_path->nodes[right_level]) {
7695	ret = -ENOMEM;
7696	goto out_unlock;
7697	}
7698	/*
7699	* Our right root is the parent root, while the left root is the "send"
7700	* root. We know that all new nodes/leaves in the left root must have
7701	* a generation greater than the right root's generation, so we trigger
7702	* readahead for those nodes and leaves of the left root, as we know we
7703	* will need to read them at some point.
7704	*/
7705	reada_min_gen = btrfs_header_generation(eb: right_root->commit_root);
7706
7707	if (left_level == `0`)
7708	btrfs_item_key_to_cpu(eb: left_path->nodes[left_level],
7709	cpu_key: &left_key, nr: left_path->slots[left_level]);
7710	else
7711	btrfs_node_key_to_cpu(eb: left_path->nodes[left_level],
7712	cpu_key: &left_key, nr: left_path->slots[left_level]);
7713	if (right_level == `0`)
7714	btrfs_item_key_to_cpu(eb: right_path->nodes[right_level],
7715	cpu_key: &right_key, nr: right_path->slots[right_level]);
7716	else
7717	btrfs_node_key_to_cpu(eb: right_path->nodes[right_level],
7718	cpu_key: &right_key, nr: right_path->slots[right_level]);
7719
7720	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7721
7722	while (`1`) {
7723	if (need_resched() \|\|
7724	rwsem_is_contended(sem: &fs_info->commit_root_sem)) {
7725	up_read(sem: &fs_info->commit_root_sem);
7726	cond_resched();
7727	down_read(sem: &fs_info->commit_root_sem);
7728	}
7729
7730	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7731	ret = restart_after_relocation(left_path, right_path,
7732	left_key: &left_key, right_key: &right_key,
7733	left_level, right_level,
7734	sctx);
7735	if (ret < `0`)
7736	goto out_unlock;
7737	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7738	}
7739
7740	if (advance_left && !left_end_reached) {
7741	ret = tree_advance(path: left_path, level: &left_level,
7742	root_level: left_root_level,
7743	allow_down: advance_left != ADVANCE_ONLY_NEXT,
7744	key: &left_key, reada_min_gen);
7745	if (ret == -`1`)
7746	left_end_reached = ADVANCE;
7747	else if (ret < `0`)
7748	goto out_unlock;
7749	advance_left = `0`;
7750	}
7751	if (advance_right && !right_end_reached) {
7752	ret = tree_advance(path: right_path, level: &right_level,
7753	root_level: right_root_level,
7754	allow_down: advance_right != ADVANCE_ONLY_NEXT,
7755	key: &right_key, reada_min_gen);
7756	if (ret == -`1`)
7757	right_end_reached = ADVANCE;
7758	else if (ret < `0`)
7759	goto out_unlock;
7760	advance_right = `0`;
7761	}
7762
7763	if (left_end_reached && right_end_reached) {
7764	ret = `0`;
7765	goto out_unlock;
7766	} else if (left_end_reached) {
7767	if (right_level == `0`) {
7768	up_read(sem: &fs_info->commit_root_sem);
7769	ret = changed_cb(left_path, right_path,
7770	key: &right_key,
7771	result: BTRFS_COMPARE_TREE_DELETED,
7772	sctx);
7773	if (ret < `0`)
7774	goto out;
7775	down_read(sem: &fs_info->commit_root_sem);
7776	}
7777	advance_right = ADVANCE;
7778	continue;
7779	} else if (right_end_reached) {
7780	if (left_level == `0`) {
7781	up_read(sem: &fs_info->commit_root_sem);
7782	ret = changed_cb(left_path, right_path,
7783	key: &left_key,
7784	result: BTRFS_COMPARE_TREE_NEW,
7785	sctx);
7786	if (ret < `0`)
7787	goto out;
7788	down_read(sem: &fs_info->commit_root_sem);
7789	}
7790	advance_left = ADVANCE;
7791	continue;
7792	}
7793
7794	if (left_level == `0` && right_level == `0`) {
7795	up_read(sem: &fs_info->commit_root_sem);
7796	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7797	if (cmp < `0`) {
7798	ret = changed_cb(left_path, right_path,
7799	key: &left_key,
7800	result: BTRFS_COMPARE_TREE_NEW,
7801	sctx);
7802	advance_left = ADVANCE;
7803	} else if (cmp > `0`) {
7804	ret = changed_cb(left_path, right_path,
7805	key: &right_key,
7806	result: BTRFS_COMPARE_TREE_DELETED,
7807	sctx);
7808	advance_right = ADVANCE;
7809	} else {
7810	enum btrfs_compare_tree_result result;
7811
7812	WARN_ON(!extent_buffer_uptodate(left_path->nodes[`0`]));
7813	ret = tree_compare_item(left_path, right_path,
7814	tmp_buf);
7815	if (ret)
7816	result = BTRFS_COMPARE_TREE_CHANGED;
7817	else
7818	result = BTRFS_COMPARE_TREE_SAME;
7819	ret = changed_cb(left_path, right_path,
7820	key: &left_key, result, sctx);
7821	advance_left = ADVANCE;
7822	advance_right = ADVANCE;
7823	}
7824
7825	if (ret < `0`)
7826	goto out;
7827	down_read(sem: &fs_info->commit_root_sem);
7828	} else if (left_level == right_level) {
7829	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7830	if (cmp < `0`) {
7831	advance_left = ADVANCE;
7832	} else if (cmp > `0`) {
7833	advance_right = ADVANCE;
7834	} else {
7835	left_blockptr = btrfs_node_blockptr(
7836	eb: left_path->nodes[left_level],
7837	nr: left_path->slots[left_level]);
7838	right_blockptr = btrfs_node_blockptr(
7839	eb: right_path->nodes[right_level],
7840	nr: right_path->slots[right_level]);
7841	left_gen = btrfs_node_ptr_generation(
7842	eb: left_path->nodes[left_level],
7843	nr: left_path->slots[left_level]);
7844	right_gen = btrfs_node_ptr_generation(
7845	eb: right_path->nodes[right_level],
7846	nr: right_path->slots[right_level]);
7847	if (left_blockptr == right_blockptr &&
7848	left_gen == right_gen) {
7849	/*
7850	* As we're on a shared block, don't
7851	* allow to go deeper.
7852	*/
7853	advance_left = ADVANCE_ONLY_NEXT;
7854	advance_right = ADVANCE_ONLY_NEXT;
7855	} else {
7856	advance_left = ADVANCE;
7857	advance_right = ADVANCE;
7858	}
7859	}
7860	} else if (left_level < right_level) {
7861	advance_right = ADVANCE;
7862	} else {
7863	advance_left = ADVANCE;
7864	}
7865	}
7866
7867	out_unlock:
7868	up_read(sem: &fs_info->commit_root_sem);
7869	out:
7870	kvfree(addr: tmp_buf);
7871	return ret;
7872	}
7873
7874	static int send_subvol(struct send_ctx *sctx)
7875	{
7876	int ret;
7877
7878	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
7879	ret = send_header(sctx);
7880	if (ret < `0`)
7881	goto out;
7882	}
7883
7884	ret = send_subvol_begin(sctx);
7885	if (ret < `0`)
7886	goto out;
7887
7888	if (sctx->parent_root) {
7889	ret = btrfs_compare_trees(left_root: sctx->send_root, right_root: sctx->parent_root, sctx);
7890	if (ret < `0`)
7891	goto out;
7892	ret = finish_inode_if_needed(sctx, at_end: `1`);
7893	if (ret < `0`)
7894	goto out;
7895	} else {
7896	ret = full_send_tree(sctx);
7897	if (ret < `0`)
7898	goto out;
7899	}
7900
7901	out:
7902	free_recorded_refs(sctx);
7903	return ret;
7904	}
7905
7906	/*
7907	* If orphan cleanup did remove any orphans from a root, it means the tree
7908	* was modified and therefore the commit root is not the same as the current
7909	* root anymore. This is a problem, because send uses the commit root and
7910	* therefore can see inode items that don't exist in the current root anymore,
7911	* and for example make calls to btrfs_iget, which will do tree lookups based
7912	* on the current root and not on the commit root. Those lookups will fail,
7913	* returning a -ESTALE error, and making send fail with that error. So make
7914	* sure a send does not see any orphans we have just removed, and that it will
7915	* see the same inodes regardless of whether a transaction commit happened
7916	* before it started (meaning that the commit root will be the same as the
7917	* current root) or not.
7918	*/
7919	static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
7920	{
7921	struct btrfs_root *root = sctx->parent_root;
7922
7923	if (root && root->node != root->commit_root)
7924	return btrfs_commit_current_transaction(root);
7925
7926	for (int i = `0`; i < sctx->clone_roots_cnt; i++) {
7927	root = sctx->clone_roots[i].root;
7928	if (root->node != root->commit_root)
7929	return btrfs_commit_current_transaction(root);
7930	}
7931
7932	return `0`;
7933	}
7934
7935	/*
7936	* Make sure any existing delalloc is flushed for any root used by a send
7937	* operation so that we do not miss any data and we do not race with writeback
7938	* finishing and changing a tree while send is using the tree. This could
7939	* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
7940	* a send operation then uses the subvolume.
7941	* After flushing delalloc ensure_commit_roots_uptodate() must be called.
7942	*/
7943	static int flush_delalloc_roots(struct send_ctx *sctx)
7944	{
7945	struct btrfs_root *root = sctx->parent_root;
7946	int ret;
7947	int i;
7948
7949	if (root) {
7950	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
7951	if (ret)
7952	return ret;
7953	btrfs_wait_ordered_extents(root, U64_MAX, NULL);
7954	}
7955
7956	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
7957	root = sctx->clone_roots[i].root;
7958	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
7959	if (ret)
7960	return ret;
7961	btrfs_wait_ordered_extents(root, U64_MAX, NULL);
7962	}
7963
7964	return `0`;
7965	}
7966
7967	static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
7968	{
7969	spin_lock(lock: &root->root_item_lock);
7970	root->send_in_progress--;
7971	/*
7972	* Not much left to do, we don't know why it's unbalanced and
7973	* can't blindly reset it to 0.
7974	*/
7975	if (root->send_in_progress < `0`)
7976	btrfs_err(root->fs_info,
7977	"send_in_progress unbalanced %d root %llu",
7978	root->send_in_progress, btrfs_root_id(root));
7979	spin_unlock(lock: &root->root_item_lock);
7980	}
7981
7982	static void dedupe_in_progress_warn(const struct btrfs_root *root)
7983	{
7984	btrfs_warn_rl(root->fs_info,
7985	"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
7986	btrfs_root_id(root), root->dedupe_in_progress);
7987	}
7988
7989	long btrfs_ioctl_send(struct btrfs_root send_root, const* struct btrfs_ioctl_send_args *arg)
7990	{
7991	int ret = `0`;
7992	struct btrfs_fs_info *fs_info = send_root->fs_info;
7993	struct btrfs_root *clone_root;
7994	struct send_ctx *sctx = NULL;
7995	u32 i;
7996	u64 *clone_sources_tmp = NULL;
7997	int clone_sources_to_rollback = `0`;
7998	size_t alloc_size;
7999	int sort_clone_roots = `0`;
8000	struct btrfs_lru_cache_entry *entry;
8001	struct btrfs_lru_cache_entry *tmp;
8002
8003	if (!capable(CAP_SYS_ADMIN))
8004	return -EPERM;
8005
8006	/*
8007	* The subvolume must remain read-only during send, protect against
8008	* making it RW. This also protects against deletion.
8009	*/
8010	spin_lock(lock: &send_root->root_item_lock);
8011	/*
8012	* Unlikely but possible, if the subvolume is marked for deletion but
8013	* is slow to remove the directory entry, send can still be started.
8014	*/
8015	if (btrfs_root_dead(root: send_root)) {
8016	spin_unlock(lock: &send_root->root_item_lock);
8017	return -EPERM;
8018	}
8019	/ Userspace tools do the checks and warn the user if it's not RO. /
8020	if (!btrfs_root_readonly(root: send_root)) {
8021	spin_unlock(lock: &send_root->root_item_lock);
8022	return -EPERM;
8023	}
8024	if (send_root->dedupe_in_progress) {
8025	dedupe_in_progress_warn(root: send_root);
8026	spin_unlock(lock: &send_root->root_item_lock);
8027	return -EAGAIN;
8028	}
8029	send_root->send_in_progress++;
8030	spin_unlock(lock: &send_root->root_item_lock);
8031
8032	/*
8033	* Check that we don't overflow at later allocations, we request
8034	* clone_sources_count + 1 items, and compare to unsigned long inside
8035	* access_ok. Also set an upper limit for allocation size so this can't
8036	* easily exhaust memory. Max number of clone sources is about 200K.
8037	*/
8038	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
8039	ret = -EINVAL;
8040	goto out;
8041	}
8042
8043	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
8044	ret = -EOPNOTSUPP;
8045	goto out;
8046	}
8047
8048	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
8049	if (!sctx) {
8050	ret = -ENOMEM;
8051	goto out;
8052	}
8053
8054	init_path(p: &sctx->cur_inode_path);
8055	INIT_LIST_HEAD(list: &sctx->new_refs);
8056	INIT_LIST_HEAD(list: &sctx->deleted_refs);
8057
8058	btrfs_lru_cache_init(cache: &sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
8059	btrfs_lru_cache_init(cache: &sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
8060	btrfs_lru_cache_init(cache: &sctx->dir_created_cache,
8061	SEND_MAX_DIR_CREATED_CACHE_SIZE);
8062	/*
8063	* This cache is periodically trimmed to a fixed size elsewhere, see
8064	* cache_dir_utimes() and trim_dir_utimes_cache().
8065	*/
8066	btrfs_lru_cache_init(cache: &sctx->dir_utimes_cache, max_size: `0`);
8067
8068	sctx->pending_dir_moves = RB_ROOT;
8069	sctx->waiting_dir_moves = RB_ROOT;
8070	sctx->orphan_dirs = RB_ROOT;
8071	sctx->rbtree_new_refs = RB_ROOT;
8072	sctx->rbtree_deleted_refs = RB_ROOT;
8073
8074	sctx->flags = arg->flags;
8075
8076	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
8077	if (arg->version > BTRFS_SEND_STREAM_VERSION) {
8078	ret = -EPROTO;
8079	goto out;
8080	}
8081	/ Zero means "use the highest version" /
8082	sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
8083	} else {
8084	sctx->proto = `1`;
8085	}
8086	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < `2`) {
8087	ret = -EINVAL;
8088	goto out;
8089	}
8090
8091	sctx->send_filp = fget(fd: arg->send_fd);
8092	if (!sctx->send_filp \|\| !(sctx->send_filp->f_mode & FMODE_WRITE)) {
8093	ret = -EBADF;
8094	goto out;
8095	}
8096
8097	sctx->send_root = send_root;
8098	sctx->clone_roots_cnt = arg->clone_sources_count;
8099
8100	if (sctx->proto >= `2`) {
8101	u32 send_buf_num_pages;
8102
8103	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
8104	sctx->send_buf = vmalloc(sctx->send_max_size);
8105	if (!sctx->send_buf) {
8106	ret = -ENOMEM;
8107	goto out;
8108	}
8109	send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
8110	sctx->send_buf_pages = kcalloc(send_buf_num_pages,
8111	sizeof(*sctx->send_buf_pages),
8112	GFP_KERNEL);
8113	if (!sctx->send_buf_pages) {
8114	ret = -ENOMEM;
8115	goto out;
8116	}
8117	for (i = `0`; i < send_buf_num_pages; i++) {
8118	sctx->send_buf_pages[i] =
8119	vmalloc_to_page(addr: sctx->send_buf + (i << PAGE_SHIFT));
8120	}
8121	} else {
8122	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
8123	sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
8124	}
8125	if (!sctx->send_buf) {
8126	ret = -ENOMEM;
8127	goto out;
8128	}
8129
8130	sctx->clone_roots = kvcalloc(arg->clone_sources_count + `1`,
8131	sizeof(*sctx->clone_roots),
8132	GFP_KERNEL);
8133	if (!sctx->clone_roots) {
8134	ret = -ENOMEM;
8135	goto out;
8136	}
8137
8138	alloc_size = array_size(sizeof(*arg->clone_sources),
8139	arg->clone_sources_count);
8140
8141	if (arg->clone_sources_count) {
8142	clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
8143	if (!clone_sources_tmp) {
8144	ret = -ENOMEM;
8145	goto out;
8146	}
8147
8148	ret = copy_from_user(to: clone_sources_tmp, from: arg->clone_sources,
8149	n: alloc_size);
8150	if (ret) {
8151	ret = -EFAULT;
8152	goto out;
8153	}
8154
8155	for (i = `0`; i < arg->clone_sources_count; i++) {
8156	clone_root = btrfs_get_fs_root(fs_info,
8157	objectid: clone_sources_tmp[i], check_ref: true);
8158	if (IS_ERR(ptr: clone_root)) {
8159	ret = PTR_ERR(ptr: clone_root);
8160	goto out;
8161	}
8162	spin_lock(lock: &clone_root->root_item_lock);
8163	if (!btrfs_root_readonly(root: clone_root) \|\|
8164	btrfs_root_dead(root: clone_root)) {
8165	spin_unlock(lock: &clone_root->root_item_lock);
8166	btrfs_put_root(root: clone_root);
8167	ret = -EPERM;
8168	goto out;
8169	}
8170	if (clone_root->dedupe_in_progress) {
8171	dedupe_in_progress_warn(root: clone_root);
8172	spin_unlock(lock: &clone_root->root_item_lock);
8173	btrfs_put_root(root: clone_root);
8174	ret = -EAGAIN;
8175	goto out;
8176	}
8177	clone_root->send_in_progress++;
8178	spin_unlock(lock: &clone_root->root_item_lock);
8179
8180	sctx->clone_roots[i].root = clone_root;
8181	clone_sources_to_rollback = i + `1`;
8182	}
8183	kvfree(addr: clone_sources_tmp);
8184	clone_sources_tmp = NULL;
8185	}
8186
8187	if (arg->parent_root) {
8188	sctx->parent_root = btrfs_get_fs_root(fs_info, objectid: arg->parent_root,
8189	check_ref: true);
8190	if (IS_ERR(ptr: sctx->parent_root)) {
8191	ret = PTR_ERR(ptr: sctx->parent_root);
8192	goto out;
8193	}
8194
8195	spin_lock(lock: &sctx->parent_root->root_item_lock);
8196	sctx->parent_root->send_in_progress++;
8197	if (!btrfs_root_readonly(root: sctx->parent_root) \|\|
8198	btrfs_root_dead(root: sctx->parent_root)) {
8199	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8200	ret = -EPERM;
8201	goto out;
8202	}
8203	if (sctx->parent_root->dedupe_in_progress) {
8204	dedupe_in_progress_warn(root: sctx->parent_root);
8205	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8206	ret = -EAGAIN;
8207	goto out;
8208	}
8209	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8210	}
8211
8212	/*
8213	* Clones from send_root are allowed, but only if the clone source
8214	* is behind the current send position. This is checked while searching
8215	* for possible clone sources.
8216	*/
8217	sctx->clone_roots[sctx->clone_roots_cnt++].root =
8218	btrfs_grab_root(root: sctx->send_root);
8219
8220	/ We do a bsearch later /
8221	sort(base: sctx->clone_roots, num: sctx->clone_roots_cnt,
8222	size: sizeof(*sctx->clone_roots), cmp_func: __clone_root_cmp_sort,
8223	NULL);
8224	sort_clone_roots = `1`;
8225
8226	ret = flush_delalloc_roots(sctx);
8227	if (ret)
8228	goto out;
8229
8230	ret = ensure_commit_roots_uptodate(sctx);
8231	if (ret)
8232	goto out;
8233
8234	ret = send_subvol(sctx);
8235	if (ret < `0`)
8236	goto out;
8237
8238	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
8239	ret = send_utimes(sctx, ino: entry->key, gen: entry->gen);
8240	if (ret < `0`)
8241	goto out;
8242	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry);
8243	}
8244
8245	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
8246	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_END);
8247	if (ret < `0`)
8248	goto out;
8249	ret = send_cmd(sctx);
8250	if (ret < `0`)
8251	goto out;
8252	}
8253
8254	out:
8255	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
8256	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
8257	struct rb_node *n;
8258	struct pending_dir_move *pm;
8259
8260	n = rb_first(root: &sctx->pending_dir_moves);
8261	pm = rb_entry(n, struct pending_dir_move, node);
8262	while (!list_empty(head: &pm->list)) {
8263	struct pending_dir_move *pm2;
8264
8265	pm2 = list_first_entry(&pm->list,
8266	struct pending_dir_move, list);
8267	free_pending_move(sctx, m: pm2);
8268	}
8269	free_pending_move(sctx, m: pm);
8270	}
8271
8272	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
8273	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
8274	struct rb_node *n;
8275	struct waiting_dir_move *dm;
8276
8277	n = rb_first(root: &sctx->waiting_dir_moves);
8278	dm = rb_entry(n, struct waiting_dir_move, node);
8279	rb_erase(&dm->node, &sctx->waiting_dir_moves);
8280	kfree(objp: dm);
8281	}
8282
8283	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
8284	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
8285	struct rb_node *n;
8286	struct orphan_dir_info *odi;
8287
8288	n = rb_first(root: &sctx->orphan_dirs);
8289	odi = rb_entry(n, struct orphan_dir_info, node);
8290	free_orphan_dir_info(sctx, odi);
8291	}
8292
8293	if (sort_clone_roots) {
8294	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
8295	btrfs_root_dec_send_in_progress(
8296	root: sctx->clone_roots[i].root);
8297	btrfs_put_root(root: sctx->clone_roots[i].root);
8298	}
8299	} else {
8300	for (i = `0`; sctx && i < clone_sources_to_rollback; i++) {
8301	btrfs_root_dec_send_in_progress(
8302	root: sctx->clone_roots[i].root);
8303	btrfs_put_root(root: sctx->clone_roots[i].root);
8304	}
8305
8306	btrfs_root_dec_send_in_progress(root: send_root);
8307	}
8308	if (sctx && !IS_ERR_OR_NULL(ptr: sctx->parent_root)) {
8309	btrfs_root_dec_send_in_progress(root: sctx->parent_root);
8310	btrfs_put_root(root: sctx->parent_root);
8311	}
8312
8313	kvfree(addr: clone_sources_tmp);
8314
8315	if (sctx) {
8316	if (sctx->send_filp)
8317	fput(sctx->send_filp);
8318
8319	kvfree(addr: sctx->clone_roots);
8320	kfree(objp: sctx->send_buf_pages);
8321	kvfree(addr: sctx->send_buf);
8322	kvfree(addr: sctx->verity_descriptor);
8323
8324	close_current_inode(sctx);
8325
8326	btrfs_lru_cache_clear(cache: &sctx->name_cache);
8327	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
8328	btrfs_lru_cache_clear(cache: &sctx->dir_created_cache);
8329	btrfs_lru_cache_clear(cache: &sctx->dir_utimes_cache);
8330
8331	if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
8332	kfree(objp: sctx->cur_inode_path.buf);
8333
8334	kfree(objp: sctx);
8335	}
8336
8337	return ret;
8338	}
8339

/* Source code of linux/fs/btrfs/send.c */