inode.c source code [linux/fs/ceph/inode.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/module.h>
5	#include <linux/fs.h>
6	#include <linux/slab.h>
7	#include <linux/string.h>
8	#include <linux/uaccess.h>
9	#include <linux/kernel.h>
10	#include <linux/writeback.h>
11	#include <linux/vmalloc.h>
12	#include <linux/xattr.h>
13	#include <linux/posix_acl.h>
14	#include <linux/random.h>
15	#include <linux/sort.h>
16	#include <linux/iversion.h>
17	#include <linux/fscrypt.h>
18
19	#include "super.h"
20	#include "mds_client.h"
21	#include "cache.h"
22	#include "crypto.h"
23	#include <linux/ceph/decode.h>
24
25	/*
26	* Ceph inode operations
27	*
28	* Implement basic inode helpers (get, alloc) and inode ops (getattr,
29	* setattr, etc.), xattr helpers, and helpers for assimilating
30	* metadata returned by the MDS into our cache.
31	*
32	* Also define helpers for doing asynchronous writeback, invalidation,
33	* and truncation for the benefit of those who can't afford to block
34	* (typically because they are in the message handler path).
35	*/
36
37	static const struct inode_operations ceph_symlink_iops;
38	static const struct inode_operations ceph_encrypted_symlink_iops;
39
40	static void ceph_inode_work(struct work_struct *work);
41
42	/*
43	* find or create an inode, given the ceph ino number
44	*/
45	static int ceph_set_ino_cb(struct inode inode, void* *data)
46	{
47	struct ceph_inode_info *ci = ceph_inode(inode);
48	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
49
50	ci->i_vino = (struct* ceph_vino *)data;
51	inode->i_ino = ceph_vino_to_ino_t(vino: ci->i_vino);
52	inode_set_iversion_raw(inode, val: `0`);
53	percpu_counter_inc(fbc: &mdsc->metric.total_inodes);
54
55	return `0`;
56	}
57
58	/*
59	* Check if the parent inode matches the vino from directory reply info
60	*/
61	static inline bool ceph_vino_matches_parent(struct inode *parent,
62	struct ceph_vino vino)
63	{
64	return ceph_ino(inode: parent) == vino.ino && ceph_snap(inode: parent) == vino.snap;
65	}
66
67	/*
68	* Validate that the directory inode referenced by @req->r_parent matches the
69	* inode number and snapshot id contained in the reply's directory record. If
70	* they do not match – which can theoretically happen if the parent dentry was
71	* moved between the time the request was issued and the reply arrived – fall
72	* back to looking up the correct inode in the inode cache.
73	*
74	* A reference is always returned. Callers that receive a different inode
75	* than the original @parent are responsible for dropping the extra reference
76	* once the reply has been processed.
77	*/
78	static struct inode ceph_get_reply_dir(struct* super_block *sb,
79	struct inode *parent,
80	struct ceph_mds_reply_info_parsed *rinfo)
81	{
82	struct ceph_vino vino;
83
84	if (unlikely(!rinfo->diri.in))
85	return parent; / nothing to compare against /
86
87	/ If we didn't have a cached parent inode to begin with, just bail out. /
88	if (!parent)
89	return NULL;
90
91	vino.ino = le64_to_cpu(rinfo->diri.in->ino);
92	vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
93
94	if (likely(ceph_vino_matches_parent(parent, vino)))
95	return parent; / matches – use the original reference /
96
97	/ Mismatch – this should be rare. Emit a WARN and obtain the correct inode. /
98	WARN_ONCE(`1`, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
99	ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
100
101	return ceph_get_inode(sb, vino, NULL);
102	}
103
104	/**
105	* ceph_new_inode - allocate a new inode in advance of an expected create
106	* @dir: parent directory for new inode
107	* @dentry: dentry that may eventually point to new inode
108	* @mode: mode of new inode
109	* @as_ctx: pointer to inherited security context
110	*
111	* Allocate a new inode in advance of an operation to create a new inode.
112	* This allocates the inode and sets up the acl_sec_ctx with appropriate
113	* info for the new inode.
114	*
115	* Returns a pointer to the new inode or an ERR_PTR.
116	*/
117	struct inode ceph_new_inode(struct* inode dir, struct* dentry *dentry,
118	umode_t mode, struct* ceph_acl_sec_ctx *as_ctx)
119	{
120	int err;
121	struct inode *inode;
122
123	inode = new_inode(sb: dir->i_sb);
124	if (!inode)
125	return ERR_PTR(error: -ENOMEM);
126
127	inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
128
129	if (!S_ISLNK(*mode)) {
130	err = ceph_pre_init_acls(dir, mode, as_ctx);
131	if (err < `0`)
132	goto out_err;
133	}
134
135	inode_state_assign_raw(inode, flags: `0`);
136	inode->i_mode = *mode;
137
138	err = ceph_security_init_secctx(dentry, mode: *mode, ctx: as_ctx);
139	if (err < `0`)
140	goto out_err;
141
142	/*
143	* We'll skip setting fscrypt context for snapshots, leaving that for
144	* the handle_reply().
145	*/
146	if (ceph_snap(inode: dir) != CEPH_SNAPDIR) {
147	err = ceph_fscrypt_prepare_context(dir, inode, as: as_ctx);
148	if (err)
149	goto out_err;
150	}
151
152	return inode;
153	out_err:
154	iput(inode);
155	return ERR_PTR(error: err);
156	}
157
158	void ceph_as_ctx_to_req(struct ceph_mds_request *req,
159	struct ceph_acl_sec_ctx *as_ctx)
160	{
161	if (as_ctx->pagelist) {
162	req->r_pagelist = as_ctx->pagelist;
163	as_ctx->pagelist = NULL;
164	}
165	ceph_fscrypt_as_ctx_to_req(req, as: as_ctx);
166	}
167
168	/**
169	* ceph_get_inode - find or create/hash a new inode
170	* @sb: superblock to search and allocate in
171	* @vino: vino to search for
172	* @newino: optional new inode to insert if one isn't found (may be NULL)
173	*
174	* Search for or insert a new inode into the hash for the given vino, and
175	* return a reference to it. If new is non-NULL, its reference is consumed.
176	*/
177	struct inode ceph_get_inode(struct* super_block sb, struct* ceph_vino vino,
178	struct inode *newino)
179	{
180	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
181	struct ceph_client *cl = mdsc->fsc->client;
182	struct inode *inode;
183
184	if (ceph_vino_is_reserved(vino))
185	return ERR_PTR(error: -EREMOTEIO);
186
187	if (newino) {
188	inode = inode_insert5(inode: newino, hashval: (unsigned long)vino.ino,
189	test: ceph_ino_compare, set: ceph_set_ino_cb, data: &vino);
190	if (inode != newino)
191	iput(newino);
192	} else {
193	inode = iget5_locked(sb, (unsigned long)vino.ino,
194	test: ceph_ino_compare, set: ceph_set_ino_cb, &vino);
195	}
196
197	if (!inode) {
198	doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
199	return ERR_PTR(error: -ENOMEM);
200	}
201
202	doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
203	ceph_present_inode(inode), ceph_vinop(inode), inode,
204	!!(inode_state_read_once(inode) & I_NEW));
205	return inode;
206	}
207
208	/*
209	* get/construct snapdir inode for a given directory
210	*/
211	struct inode ceph_get_snapdir(struct* inode *parent)
212	{
213	struct ceph_client *cl = ceph_inode_to_client(inode: parent);
214	struct ceph_vino vino = {
215	.ino = ceph_ino(inode: parent),
216	.snap = CEPH_SNAPDIR,
217	};
218	struct inode *inode = ceph_get_inode(sb: parent->i_sb, vino, NULL);
219	struct ceph_inode_info *ci = ceph_inode(inode);
220	int ret = -ENOTDIR;
221
222	if (IS_ERR(ptr: inode))
223	return inode;
224
225	if (!S_ISDIR(parent->i_mode)) {
226	pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
227	parent->i_mode);
228	goto err;
229	}
230
231	if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
232	pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
233	inode->i_mode);
234	goto err;
235	}
236
237	inode->i_mode = parent->i_mode;
238	inode->i_uid = parent->i_uid;
239	inode->i_gid = parent->i_gid;
240	inode_set_mtime_to_ts(inode, ts: inode_get_mtime(inode: parent));
241	inode_set_ctime_to_ts(inode, ts: inode_get_ctime(inode: parent));
242	inode_set_atime_to_ts(inode, ts: inode_get_atime(inode: parent));
243	ci->i_rbytes = `0`;
244	ci->i_btime = ceph_inode(inode: parent)->i_btime;
245
246	#ifdef CONFIG_FS_ENCRYPTION
247	/ if encrypted, just borrow fscrypt_auth from parent /
248	if (IS_ENCRYPTED(parent)) {
249	struct ceph_inode_info *pci = ceph_inode(inode: parent);
250
251	ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
252	pci->fscrypt_auth_len,
253	GFP_KERNEL);
254	if (ci->fscrypt_auth) {
255	inode->i_flags \|= S_ENCRYPTED;
256	ci->fscrypt_auth_len = pci->fscrypt_auth_len;
257	} else {
258	doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
259	ret = -ENOMEM;
260	goto err;
261	}
262	}
263	#endif
264	if (inode_state_read_once(inode) & I_NEW) {
265	inode->i_op = &ceph_snapdir_iops;
266	inode->i_fop = &ceph_snapdir_fops;
267	ci->i_snap_caps = CEPH_CAP_PIN; / so we can open /
268	unlock_new_inode(inode);
269	}
270
271	return inode;
272	err:
273	if ((inode_state_read_once(inode) & I_NEW))
274	discard_new_inode(inode);
275	else
276	iput(inode);
277	return ERR_PTR(error: ret);
278	}
279
280	const struct inode_operations ceph_file_iops = {
281	.permission = ceph_permission,
282	.setattr = ceph_setattr,
283	.getattr = ceph_getattr,
284	.listxattr = ceph_listxattr,
285	.get_inode_acl = ceph_get_acl,
286	.set_acl = ceph_set_acl,
287	};
288
289
290	/*
291	* We use a 'frag tree' to keep track of the MDS's directory fragments
292	* for a given inode (usually there is just a single fragment). We
293	* need to know when a child frag is delegated to a new MDS, or when
294	* it is flagged as replicated, so we can direct our requests
295	* accordingly.
296	*/
297
298	/*
299	* find/create a frag in the tree
300	*/
301	static struct ceph_inode_frag __get_or_create_frag(struct* ceph_inode_info *ci,
302	u32 f)
303	{
304	struct inode *inode = &ci->netfs.inode;
305	struct ceph_client *cl = ceph_inode_to_client(inode);
306	struct rb_node **p;
307	struct rb_node *parent = NULL;
308	struct ceph_inode_frag *frag;
309	int c;
310
311	p = &ci->i_fragtree.rb_node;
312	while (*p) {
313	parent = *p;
314	frag = rb_entry(parent, struct ceph_inode_frag, node);
315	c = ceph_frag_compare(a: f, b: frag->frag);
316	if (c < `0`)
317	p = &(*p)->rb_left;
318	else if (c > `0`)
319	p = &(*p)->rb_right;
320	else
321	return frag;
322	}
323
324	frag = kmalloc(sizeof(*frag), GFP_NOFS);
325	if (!frag)
326	return ERR_PTR(error: -ENOMEM);
327
328	frag->frag = f;
329	frag->split_by = `0`;
330	frag->mds = -`1`;
331	frag->ndist = `0`;
332
333	rb_link_node(node: &frag->node, parent, rb_link: p);
334	rb_insert_color(&frag->node, &ci->i_fragtree);
335
336	doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
337	return frag;
338	}
339
340	/*
341	* find a specific frag @f
342	*/
343	struct ceph_inode_frag __ceph_find_frag(struct* ceph_inode_info *ci, u32 f)
344	{
345	struct rb_node *n = ci->i_fragtree.rb_node;
346
347	while (n) {
348	struct ceph_inode_frag *frag =
349	rb_entry(n, struct ceph_inode_frag, node);
350	int c = ceph_frag_compare(a: f, b: frag->frag);
351	if (c < `0`)
352	n = n->rb_left;
353	else if (c > `0`)
354	n = n->rb_right;
355	else
356	return frag;
357	}
358	return NULL;
359	}
360
361	/*
362	* Choose frag containing the given value @v. If @pfrag is
363	* specified, copy the frag delegation info to the caller if
364	* it is present.
365	*/
366	static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
367	struct ceph_inode_frag pfrag, int* *found)
368	{
369	struct ceph_client *cl = ceph_inode_to_client(inode: &ci->netfs.inode);
370	u32 t = ceph_frag_make(b: `0`, v: `0`);
371	struct ceph_inode_frag *frag;
372	unsigned nway, i;
373	u32 n;
374
375	if (found)
376	*found = `0`;
377
378	while (`1`) {
379	WARN_ON(!ceph_frag_contains_value(t, v));
380	frag = __ceph_find_frag(ci, f: t);
381	if (!frag)
382	break; / t is a leaf /
383	if (frag->split_by == `0`) {
384	if (pfrag)
385	memcpy(pfrag, frag, sizeof(*pfrag));
386	if (found)
387	*found = `1`;
388	break;
389	}
390
391	/ choose child /
392	nway = `1` << frag->split_by;
393	doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
394	frag->split_by, nway);
395	for (i = `0`; i < nway; i++) {
396	n = ceph_frag_make_child(f: t, by: frag->split_by, i);
397	if (ceph_frag_contains_value(f: n, v)) {
398	t = n;
399	break;
400	}
401	}
402	BUG_ON(i == nway);
403	}
404	doutc(cl, "frag(%x) = %x\n", v, t);
405
406	return t;
407	}
408
409	u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
410	struct ceph_inode_frag pfrag, int* *found)
411	{
412	u32 ret;
413	mutex_lock(&ci->i_fragtree_mutex);
414	ret = __ceph_choose_frag(ci, v, pfrag, found);
415	mutex_unlock(lock: &ci->i_fragtree_mutex);
416	return ret;
417	}
418
419	/*
420	* Process dirfrag (delegation) info from the mds. Include leaf
421	* fragment in tree ONLY if ndist > 0. Otherwise, only
422	* branches/splits are included in i_fragtree)
423	*/
424	static int ceph_fill_dirfrag(struct inode *inode,
425	struct ceph_mds_reply_dirfrag *dirinfo)
426	{
427	struct ceph_inode_info *ci = ceph_inode(inode);
428	struct ceph_client *cl = ceph_inode_to_client(inode);
429	struct ceph_inode_frag *frag;
430	u32 id = le32_to_cpu(dirinfo->frag);
431	int mds = le32_to_cpu(dirinfo->auth);
432	int ndist = le32_to_cpu(dirinfo->ndist);
433	int diri_auth = -`1`;
434	int i;
435	int err = `0`;
436
437	spin_lock(lock: &ci->i_ceph_lock);
438	if (ci->i_auth_cap)
439	diri_auth = ci->i_auth_cap->mds;
440	spin_unlock(lock: &ci->i_ceph_lock);
441
442	if (mds == -`1`) / CDIR_AUTH_PARENT /
443	mds = diri_auth;
444
445	mutex_lock(&ci->i_fragtree_mutex);
446	if (ndist == `0` && mds == diri_auth) {
447	/ no delegation info needed. /
448	frag = __ceph_find_frag(ci, f: id);
449	if (!frag)
450	goto out;
451	if (frag->split_by == `0`) {
452	/ tree leaf, remove /
453	doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
454	inode, ceph_vinop(inode), id);
455	rb_erase(&frag->node, &ci->i_fragtree);
456	kfree(objp: frag);
457	} else {
458	/ tree branch, keep and clear /
459	doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
460	inode, ceph_vinop(inode), id);
461	frag->mds = -`1`;
462	frag->ndist = `0`;
463	}
464	goto out;
465	}
466
467
468	/ find/add this frag to store mds delegation info /
469	frag = __get_or_create_frag(ci, f: id);
470	if (IS_ERR(ptr: frag)) {
471	/ this is not the end of the world; we can continue*
472	with bad/inaccurate delegation info /*
473	pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
474	inode, ceph_vinop(inode),
475	le32_to_cpu(dirinfo->frag));
476	err = -ENOMEM;
477	goto out;
478	}
479
480	frag->mds = mds;
481	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
482	for (i = `0`; i < frag->ndist; i++)
483	frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
484	doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
485	ceph_vinop(inode), frag->frag, frag->ndist);
486
487	out:
488	mutex_unlock(lock: &ci->i_fragtree_mutex);
489	return err;
490	}
491
492	static int frag_tree_split_cmp(const void l, const* void *r)
493	{
494	struct ceph_frag_tree_split ls = (struct* ceph_frag_tree_split*)l;
495	struct ceph_frag_tree_split rs = (struct* ceph_frag_tree_split*)r;
496	return ceph_frag_compare(le32_to_cpu(ls->frag),
497	le32_to_cpu(rs->frag));
498	}
499
500	static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
501	{
502	if (!frag)
503	return f == ceph_frag_make(b: `0`, v: `0`);
504	if (ceph_frag_bits(f) != ceph_frag_bits(f: frag->frag) + frag->split_by)
505	return false;
506	return ceph_frag_contains_value(f: frag->frag, v: ceph_frag_value(f));
507	}
508
509	static int ceph_fill_fragtree(struct inode *inode,
510	struct ceph_frag_tree_head *fragtree,
511	struct ceph_mds_reply_dirfrag *dirinfo)
512	{
513	struct ceph_client *cl = ceph_inode_to_client(inode);
514	struct ceph_inode_info *ci = ceph_inode(inode);
515	struct ceph_inode_frag frag, prev_frag = NULL;
516	struct rb_node *rb_node;
517	unsigned i, split_by, nsplits;
518	u32 id;
519	bool update = false;
520
521	mutex_lock(&ci->i_fragtree_mutex);
522	nsplits = le32_to_cpu(fragtree->nsplits);
523	if (nsplits != ci->i_fragtree_nsplits) {
524	update = true;
525	} else if (nsplits) {
526	i = get_random_u32_below(ceil: nsplits);
527	id = le32_to_cpu(fragtree->splits[i].frag);
528	if (!__ceph_find_frag(ci, f: id))
529	update = true;
530	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
531	rb_node = rb_first(root: &ci->i_fragtree);
532	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
533	if (frag->frag != ceph_frag_make(b: `0`, v: `0`) \|\| rb_next(rb_node))
534	update = true;
535	}
536	if (!update && dirinfo) {
537	id = le32_to_cpu(dirinfo->frag);
538	if (id != __ceph_choose_frag(ci, v: id, NULL, NULL))
539	update = true;
540	}
541	if (!update)
542	goto out_unlock;
543
544	if (nsplits > `1`) {
545	sort(base: fragtree->splits, num: nsplits, size: sizeof(fragtree->splits[`0`]),
546	cmp_func: frag_tree_split_cmp, NULL);
547	}
548
549	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
550	rb_node = rb_first(root: &ci->i_fragtree);
551	for (i = `0`; i < nsplits; i++) {
552	id = le32_to_cpu(fragtree->splits[i].frag);
553	split_by = le32_to_cpu(fragtree->splits[i].by);
554	if (split_by == `0` \|\| ceph_frag_bits(f: id) + split_by > `24`) {
555	pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
556	"frag %x split by %d\n", inode,
557	ceph_vinop(inode), i, nsplits, id, split_by);
558	continue;
559	}
560	frag = NULL;
561	while (rb_node) {
562	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
563	if (ceph_frag_compare(a: frag->frag, b: id) >= `0`) {
564	if (frag->frag != id)
565	frag = NULL;
566	else
567	rb_node = rb_next(rb_node);
568	break;
569	}
570	rb_node = rb_next(rb_node);
571	/ delete stale split/leaf node /
572	if (frag->split_by > `0` \|\|
573	!is_frag_child(f: frag->frag, frag: prev_frag)) {
574	rb_erase(&frag->node, &ci->i_fragtree);
575	if (frag->split_by > `0`)
576	ci->i_fragtree_nsplits--;
577	kfree(objp: frag);
578	}
579	frag = NULL;
580	}
581	if (!frag) {
582	frag = __get_or_create_frag(ci, f: id);
583	if (IS_ERR(ptr: frag))
584	continue;
585	}
586	if (frag->split_by == `0`)
587	ci->i_fragtree_nsplits++;
588	frag->split_by = split_by;
589	doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
590	prev_frag = frag;
591	}
592	while (rb_node) {
593	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
594	rb_node = rb_next(rb_node);
595	/ delete stale split/leaf node /
596	if (frag->split_by > `0` \|\|
597	!is_frag_child(f: frag->frag, frag: prev_frag)) {
598	rb_erase(&frag->node, &ci->i_fragtree);
599	if (frag->split_by > `0`)
600	ci->i_fragtree_nsplits--;
601	kfree(objp: frag);
602	}
603	}
604	out_unlock:
605	mutex_unlock(lock: &ci->i_fragtree_mutex);
606	return `0`;
607	}
608
609	/*
610	* initialize a newly allocated inode.
611	*/
612	struct inode ceph_alloc_inode(struct* super_block *sb)
613	{
614	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
615	struct ceph_inode_info *ci;
616	int i;
617
618	ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
619	if (!ci)
620	return NULL;
621
622	doutc(fsc->client, "%p\n", &ci->netfs.inode);
623
624	/ Set parameters for the netfs library /
625	netfs_inode_init(ctx: &ci->netfs, ops: &ceph_netfs_ops, use_zero_point: false);
626
627	spin_lock_init(&ci->i_ceph_lock);
628
629	ci->i_version = `0`;
630	ci->i_inline_version = `0`;
631	ci->i_time_warp_seq = `0`;
632	ci->i_ceph_flags = `0`;
633	atomic64_set(v: &ci->i_ordered_count, i: `1`);
634	atomic64_set(v: &ci->i_release_count, i: `1`);
635	atomic64_set(v: &ci->i_complete_seq[`0`], i: `0`);
636	atomic64_set(v: &ci->i_complete_seq[`1`], i: `0`);
637	ci->i_symlink = NULL;
638
639	ci->i_max_bytes = `0`;
640	ci->i_max_files = `0`;
641
642	memset(&ci->i_dir_layout, `0`, sizeof(ci->i_dir_layout));
643	memset(&ci->i_cached_layout, `0`, sizeof(ci->i_cached_layout));
644	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
645
646	ci->i_fragtree = RB_ROOT;
647	mutex_init(&ci->i_fragtree_mutex);
648
649	ci->i_xattrs.blob = NULL;
650	ci->i_xattrs.prealloc_blob = NULL;
651	ci->i_xattrs.dirty = false;
652	ci->i_xattrs.index = RB_ROOT;
653	ci->i_xattrs.count = `0`;
654	ci->i_xattrs.names_size = `0`;
655	ci->i_xattrs.vals_size = `0`;
656	ci->i_xattrs.version = `0`;
657	ci->i_xattrs.index_version = `0`;
658
659	ci->i_caps = RB_ROOT;
660	ci->i_auth_cap = NULL;
661	ci->i_dirty_caps = `0`;
662	ci->i_flushing_caps = `0`;
663	INIT_LIST_HEAD(list: &ci->i_dirty_item);
664	INIT_LIST_HEAD(list: &ci->i_flushing_item);
665	ci->i_prealloc_cap_flush = NULL;
666	INIT_LIST_HEAD(list: &ci->i_cap_flush_list);
667	init_waitqueue_head(&ci->i_cap_wq);
668	ci->i_hold_caps_max = `0`;
669	INIT_LIST_HEAD(list: &ci->i_cap_delay_list);
670	INIT_LIST_HEAD(list: &ci->i_cap_snaps);
671	ci->i_head_snapc = NULL;
672	ci->i_snap_caps = `0`;
673
674	ci->i_last_rd = ci->i_last_wr = jiffies - `3600` * HZ;
675	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++)
676	ci->i_nr_by_mode[i] = `0`;
677
678	mutex_init(&ci->i_truncate_mutex);
679	ci->i_truncate_seq = `0`;
680	ci->i_truncate_size = `0`;
681	ci->i_truncate_pending = `0`;
682	ci->i_truncate_pagecache_size = `0`;
683
684	ci->i_max_size = `0`;
685	ci->i_reported_size = `0`;
686	ci->i_wanted_max_size = `0`;
687	ci->i_requested_max_size = `0`;
688
689	ci->i_pin_ref = `0`;
690	ci->i_rd_ref = `0`;
691	ci->i_rdcache_ref = `0`;
692	ci->i_wr_ref = `0`;
693	ci->i_wb_ref = `0`;
694	ci->i_fx_ref = `0`;
695	ci->i_wrbuffer_ref = `0`;
696	ci->i_wrbuffer_ref_head = `0`;
697	atomic_set(v: &ci->i_filelock_ref, i: `0`);
698	atomic_set(v: &ci->i_shared_gen, i: `1`);
699	ci->i_rdcache_gen = `0`;
700	ci->i_rdcache_revoking = `0`;
701
702	INIT_LIST_HEAD(list: &ci->i_unsafe_dirops);
703	INIT_LIST_HEAD(list: &ci->i_unsafe_iops);
704	spin_lock_init(&ci->i_unsafe_lock);
705
706	ci->i_snap_realm = NULL;
707	INIT_LIST_HEAD(list: &ci->i_snap_realm_item);
708	INIT_LIST_HEAD(list: &ci->i_snap_flush_item);
709
710	INIT_WORK(&ci->i_work, ceph_inode_work);
711	ci->i_work_mask = `0`;
712	memset(&ci->i_btime, `'\0'`, sizeof(ci->i_btime));
713	#ifdef CONFIG_FS_ENCRYPTION
714	ci->i_crypt_info = NULL;
715	ci->fscrypt_auth = NULL;
716	ci->fscrypt_auth_len = `0`;
717	#endif
718	return &ci->netfs.inode;
719	}
720
721	void ceph_free_inode(struct inode *inode)
722	{
723	struct ceph_inode_info *ci = ceph_inode(inode);
724
725	kfree(objp: ci->i_symlink);
726	#ifdef CONFIG_FS_ENCRYPTION
727	kfree(objp: ci->fscrypt_auth);
728	#endif
729	fscrypt_free_inode(inode);
730	kmem_cache_free(s: ceph_inode_cachep, objp: ci);
731	}
732
733	void ceph_evict_inode(struct inode *inode)
734	{
735	struct ceph_inode_info *ci = ceph_inode(inode);
736	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
737	struct ceph_client *cl = ceph_inode_to_client(inode);
738	struct ceph_inode_frag *frag;
739	struct rb_node *n;
740
741	doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
742
743	percpu_counter_dec(fbc: &mdsc->metric.total_inodes);
744
745	netfs_wait_for_outstanding_io(inode);
746	truncate_inode_pages_final(mapping: &inode->i_data);
747	if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
748	ceph_fscache_unuse_cookie(inode, update: true);
749	clear_inode(inode);
750
751	ceph_fscache_unregister_inode_cookie(ci);
752	fscrypt_put_encryption_info(inode);
753
754	__ceph_remove_caps(ci);
755
756	if (__ceph_has_quota(ci, which: QUOTA_GET_ANY))
757	ceph_adjust_quota_realms_count(inode, inc: false);
758
759	/*
760	* we may still have a snap_realm reference if there are stray
761	* caps in i_snap_caps.
762	*/
763	if (ci->i_snap_realm) {
764	if (ceph_snap(inode) == CEPH_NOSNAP) {
765	doutc(cl, " dropping residual ref to snap realm %p\n",
766	ci->i_snap_realm);
767	ceph_change_snap_realm(inode, NULL);
768	} else {
769	ceph_put_snapid_map(mdsc, sm: ci->i_snapid_map);
770	ci->i_snap_realm = NULL;
771	}
772	}
773
774	while ((n = rb_first(root: &ci->i_fragtree)) != NULL) {
775	frag = rb_entry(n, struct ceph_inode_frag, node);
776	rb_erase(n, &ci->i_fragtree);
777	kfree(objp: frag);
778	}
779	ci->i_fragtree_nsplits = `0`;
780
781	__ceph_destroy_xattrs(ci);
782	if (ci->i_xattrs.blob)
783	ceph_buffer_put(b: ci->i_xattrs.blob);
784	if (ci->i_xattrs.prealloc_blob)
785	ceph_buffer_put(b: ci->i_xattrs.prealloc_blob);
786
787	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
788	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
789	}
790
791	static inline blkcnt_t calc_inode_blocks(u64 size)
792	{
793	return (size + (`1`<<`9`) - `1`) >> `9`;
794	}
795
796	/*
797	* Helpers to fill in size, ctime, mtime, and atime. We have to be
798	* careful because either the client or MDS may have more up to date
799	* info, depending on which capabilities are held, and whether
800	* time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
801	* and size are monotonically increasing, except when utimes() or
802	* truncate() increments the corresponding _seq values.)
803	*/
804	int ceph_fill_file_size(struct inode inode, int* issued,
805	u32 truncate_seq, u64 truncate_size, u64 size)
806	{
807	struct ceph_client *cl = ceph_inode_to_client(inode);
808	struct ceph_inode_info *ci = ceph_inode(inode);
809	int queue_trunc = `0`;
810	loff_t isize = i_size_read(inode);
811
812	if (ceph_seq_cmp(a: truncate_seq, b: ci->i_truncate_seq) > `0` \|\|
813	(truncate_seq == ci->i_truncate_seq && size > isize)) {
814	doutc(cl, "size %lld -> %llu\n", isize, size);
815	if (size > `0` && S_ISDIR(inode->i_mode)) {
816	pr_err_client(cl, "non-zero size for directory\n");
817	size = `0`;
818	}
819	i_size_write(inode, i_size: size);
820	inode->i_blocks = calc_inode_blocks(size);
821	/*
822	* If we're expanding, then we should be able to just update
823	* the existing cookie.
824	*/
825	if (size > isize)
826	ceph_fscache_update(inode);
827	ci->i_reported_size = size;
828	if (truncate_seq != ci->i_truncate_seq) {
829	doutc(cl, "truncate_seq %u -> %u\n",
830	ci->i_truncate_seq, truncate_seq);
831	ci->i_truncate_seq = truncate_seq;
832
833	/ the MDS should have revoked these caps /
834	WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD \|
835	CEPH_CAP_FILE_LAZYIO));
836	/*
837	* If we hold relevant caps, or in the case where we're
838	* not the only client referencing this file and we
839	* don't hold those caps, then we need to check whether
840	* the file is either opened or mmaped
841	*/
842	if ((issued & (CEPH_CAP_FILE_CACHE\|
843	CEPH_CAP_FILE_BUFFER)) \|\|
844	mapping_mapped(mapping: inode->i_mapping) \|\|
845	__ceph_is_file_opened(ci)) {
846	ci->i_truncate_pending++;
847	queue_trunc = `1`;
848	}
849	}
850	}
851
852	/*
853	* It's possible that the new sizes of the two consecutive
854	* size truncations will be in the same fscrypt last block,
855	* and we need to truncate the corresponding page caches
856	* anyway.
857	*/
858	if (ceph_seq_cmp(a: truncate_seq, b: ci->i_truncate_seq) >= `0`) {
859	doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
860	ci->i_truncate_size, truncate_size,
861	!!IS_ENCRYPTED(inode));
862
863	ci->i_truncate_size = truncate_size;
864
865	if (IS_ENCRYPTED(inode)) {
866	doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
867	ci->i_truncate_pagecache_size, size);
868	ci->i_truncate_pagecache_size = size;
869	} else {
870	ci->i_truncate_pagecache_size = truncate_size;
871	}
872	}
873	return queue_trunc;
874	}
875
876	void ceph_fill_file_time(struct inode inode, int* issued,
877	u64 time_warp_seq, struct timespec64 *ctime,
878	struct timespec64 mtime, struct* timespec64 *atime)
879	{
880	struct ceph_client *cl = ceph_inode_to_client(inode);
881	struct ceph_inode_info *ci = ceph_inode(inode);
882	struct timespec64 iatime = inode_get_atime(inode);
883	struct timespec64 ictime = inode_get_ctime(inode);
884	struct timespec64 imtime = inode_get_mtime(inode);
885	int warn = `0`;
886
887	if (issued & (CEPH_CAP_FILE_EXCL\|
888	CEPH_CAP_FILE_WR\|
889	CEPH_CAP_FILE_BUFFER\|
890	CEPH_CAP_AUTH_EXCL\|
891	CEPH_CAP_XATTR_EXCL)) {
892	if (ci->i_version == `0` \|\|
893	timespec64_compare(lhs: ctime, rhs: &ictime) > `0`) {
894	doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
895	inode_set_ctime_to_ts(inode, ts: *ctime);
896	}
897	if (ci->i_version == `0` \|\|
898	ceph_seq_cmp(a: time_warp_seq, b: ci->i_time_warp_seq) > `0`) {
899	/ the MDS did a utimes() /
900	doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
901	ci->i_time_warp_seq, (int)time_warp_seq);
902
903	inode_set_mtime_to_ts(inode, ts: *mtime);
904	inode_set_atime_to_ts(inode, ts: *atime);
905	ci->i_time_warp_seq = time_warp_seq;
906	} else if (time_warp_seq == ci->i_time_warp_seq) {
907	/ nobody did utimes(); take the max /
908	if (timespec64_compare(lhs: mtime, rhs: &imtime) > `0`) {
909	doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
910	inode_set_mtime_to_ts(inode, ts: *mtime);
911	}
912	if (timespec64_compare(lhs: atime, rhs: &iatime) > `0`) {
913	doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
914	inode_set_atime_to_ts(inode, ts: *atime);
915	}
916	} else if (issued & CEPH_CAP_FILE_EXCL) {
917	/ we did a utimes(); ignore mds values /
918	} else {
919	warn = `1`;
920	}
921	} else {
922	/ we have no write\|excl caps; whatever the MDS says is true /
923	if (ceph_seq_cmp(a: time_warp_seq, b: ci->i_time_warp_seq) >= `0`) {
924	inode_set_ctime_to_ts(inode, ts: *ctime);
925	inode_set_mtime_to_ts(inode, ts: *mtime);
926	inode_set_atime_to_ts(inode, ts: *atime);
927	ci->i_time_warp_seq = time_warp_seq;
928	} else {
929	warn = `1`;
930	}
931	}
932	if (warn) / time_warp_seq shouldn't go backwards /
933	doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
934	time_warp_seq, ci->i_time_warp_seq);
935	}
936
937	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
938	static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
939	const char *encsym,
940	int enclen, u8 **decsym)
941	{
942	struct ceph_client *cl = mdsc->fsc->client;
943	int declen;
944	u8 *sym;
945
946	sym = kmalloc(enclen + `1`, GFP_NOFS);
947	if (!sym)
948	return -ENOMEM;
949
950	declen = base64_decode(src: encsym, len: enclen, dst: sym, padding: false, variant: BASE64_IMAP);
951	if (declen < `0`) {
952	pr_err_client(cl,
953	"can't decode symlink (%d). Content: %.*s\n",
954	declen, enclen, encsym);
955	kfree(objp: sym);
956	return -EIO;
957	}
958	sym[declen + `1`] = `'\0'`;
959	*decsym = sym;
960	return declen;
961	}
962	#else
963	static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
964	const char *encsym,
965	int symlen, u8 **decsym)
966	{
967	return -EOPNOTSUPP;
968	}
969	#endif
970
971	/*
972	* Populate an inode based on info from mds. May be called on new or
973	* existing inodes.
974	*/
975	int ceph_fill_inode(struct inode inode, struct* page *locked_page,
976	struct ceph_mds_reply_info_in *iinfo,
977	struct ceph_mds_reply_dirfrag *dirinfo,
978	struct ceph_mds_session session, int* cap_fmode,
979	struct ceph_cap_reservation *caps_reservation)
980	{
981	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
982	struct ceph_client *cl = mdsc->fsc->client;
983	struct ceph_mds_reply_inode *info = iinfo->in;
984	struct ceph_inode_info *ci = ceph_inode(inode);
985	int issued, new_issued, info_caps;
986	struct timespec64 mtime, atime, ctime;
987	struct ceph_buffer *xattr_blob = NULL;
988	struct ceph_buffer *old_blob = NULL;
989	struct ceph_string *pool_ns = NULL;
990	struct ceph_cap *new_cap = NULL;
991	int err = `0`;
992	bool wake = false;
993	bool queue_trunc = false;
994	bool new_version = false;
995	bool fill_inline = false;
996	umode_t mode = le32_to_cpu(info->mode);
997	dev_t rdev = le32_to_cpu(info->rdev);
998
999	lockdep_assert_held(&mdsc->snap_rwsem);
1000
1001	doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
1002	le64_to_cpu(info->version), ci->i_version);
1003
1004	/ Once I_NEW is cleared, we can't change type or dev numbers /
1005	if (inode_state_read_once(inode) & I_NEW) {
1006	inode->i_mode = mode;
1007	} else {
1008	if (inode_wrong_type(inode, mode)) {
1009	pr_warn_once_client(cl,
1010	"inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
1011	ceph_vinop(inode), inode->i_mode, mode);
1012	return -ESTALE;
1013	}
1014
1015	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && inode->i_rdev != rdev) {
1016	pr_warn_once_client(cl,
1017	"dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
1018	ceph_vinop(inode), MAJOR(inode->i_rdev),
1019	MINOR(inode->i_rdev), MAJOR(rdev),
1020	MINOR(rdev));
1021	return -ESTALE;
1022	}
1023	}
1024
1025	info_caps = le32_to_cpu(info->cap.caps);
1026
1027	/ prealloc new cap struct /
1028	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
1029	new_cap = ceph_get_cap(mdsc, ctx: caps_reservation);
1030	if (!new_cap)
1031	return -ENOMEM;
1032	}
1033
1034	/*
1035	* prealloc xattr data, if it looks like we'll need it. only
1036	* if len > 4 (meaning there are actually xattrs; the first 4
1037	* bytes are the xattr count).
1038	*/
1039	if (iinfo->xattr_len > `4`) {
1040	xattr_blob = ceph_buffer_new(len: iinfo->xattr_len, GFP_NOFS);
1041	if (!xattr_blob)
1042	pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
1043	iinfo->xattr_len);
1044	}
1045
1046	if (iinfo->pool_ns_len > `0`)
1047	pool_ns = ceph_find_or_create_string(str: iinfo->pool_ns_data,
1048	len: iinfo->pool_ns_len);
1049
1050	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
1051	ci->i_snapid_map = ceph_get_snapid_map(mdsc, snap: ceph_snap(inode));
1052
1053	spin_lock(lock: &ci->i_ceph_lock);
1054
1055	/*
1056	* provided version will be odd if inode value is projected,
1057	* even if stable. skip the update if we have newer stable
1058	* info (ours>=theirs, e.g. due to racing mds replies), unless
1059	* we are getting projected (unstable) info (in which case the
1060	* version is odd, and we want ours>theirs).
1061	* us them
1062	* 2 2 skip
1063	* 3 2 skip
1064	* 3 3 update
1065	*/
1066	if (ci->i_version == `0` \|\|
1067	((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1068	le64_to_cpu(info->version) > (ci->i_version & ~`1`)))
1069	new_version = true;
1070
1071	/ Update change_attribute /
1072	inode_set_max_iversion_raw(inode, val: iinfo->change_attr);
1073
1074	__ceph_caps_issued(ci, implemented: &issued);
1075	issued \|= __ceph_caps_dirty(ci);
1076	new_issued = ~issued & info_caps;
1077
1078	__ceph_update_quota(ci, max_bytes: iinfo->max_bytes, max_files: iinfo->max_files);
1079
1080	#ifdef CONFIG_FS_ENCRYPTION
1081	if (iinfo->fscrypt_auth_len &&
1082	((inode_state_read_once(inode) & I_NEW) \|\| (ci->fscrypt_auth_len == `0`))) {
1083	kfree(objp: ci->fscrypt_auth);
1084	ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
1085	ci->fscrypt_auth = iinfo->fscrypt_auth;
1086	iinfo->fscrypt_auth = NULL;
1087	iinfo->fscrypt_auth_len = `0`;
1088	inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
1089	}
1090	#endif
1091
1092	if ((new_version \|\| (new_issued & CEPH_CAP_AUTH_SHARED)) &&
1093	(issued & CEPH_CAP_AUTH_EXCL) == `0`) {
1094	inode->i_mode = mode;
1095	inode->i_uid = make_kuid(from: &init_user_ns, le32_to_cpu(info->uid));
1096	inode->i_gid = make_kgid(from: &init_user_ns, le32_to_cpu(info->gid));
1097	doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
1098	ceph_vinop(inode), inode->i_mode,
1099	from_kuid(&init_user_ns, inode->i_uid),
1100	from_kgid(&init_user_ns, inode->i_gid));
1101	ceph_decode_timespec64(ts: &ci->i_btime, tv: &iinfo->btime);
1102	ceph_decode_timespec64(ts: &ci->i_snap_btime, tv: &iinfo->snap_btime);
1103	}
1104
1105	/ directories have fl_stripe_unit set to zero /
1106	if (IS_ENCRYPTED(inode))
1107	inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
1108	else if (le32_to_cpu(info->layout.fl_stripe_unit))
1109	inode->i_blkbits =
1110	fls(le32_to_cpu(info->layout.fl_stripe_unit)) - `1`;
1111	else
1112	inode->i_blkbits = CEPH_BLOCK_SHIFT;
1113
1114	if ((new_version \|\| (new_issued & CEPH_CAP_LINK_SHARED)) &&
1115	(issued & CEPH_CAP_LINK_EXCL) == `0`)
1116	set_nlink(inode, le32_to_cpu(info->nlink));
1117
1118	if (new_version \|\| (new_issued & CEPH_CAP_ANY_RD)) {
1119	/ be careful with mtime, atime, size /
1120	ceph_decode_timespec64(ts: &atime, tv: &info->atime);
1121	ceph_decode_timespec64(ts: &mtime, tv: &info->mtime);
1122	ceph_decode_timespec64(ts: &ctime, tv: &info->ctime);
1123	ceph_fill_file_time(inode, issued,
1124	le32_to_cpu(info->time_warp_seq),
1125	ctime: &ctime, mtime: &mtime, atime: &atime);
1126	}
1127
1128	if (new_version \|\| (info_caps & CEPH_CAP_FILE_SHARED)) {
1129	ci->i_files = le64_to_cpu(info->files);
1130	ci->i_subdirs = le64_to_cpu(info->subdirs);
1131	}
1132
1133	if (new_version \|\|
1134	(new_issued & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR))) {
1135	u64 size = le64_to_cpu(info->size);
1136	s64 old_pool = ci->i_layout.pool_id;
1137	struct ceph_string *old_ns;
1138
1139	ceph_file_layout_from_legacy(fl: &ci->i_layout, legacy: &info->layout);
1140	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
1141	lockdep_is_held(&ci->i_ceph_lock));
1142	rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
1143
1144	if (ci->i_layout.pool_id != old_pool \|\| pool_ns != old_ns)
1145	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
1146
1147	pool_ns = old_ns;
1148
1149	if (IS_ENCRYPTED(inode) && size &&
1150	iinfo->fscrypt_file_len == sizeof(__le64)) {
1151	u64 fsize = __le64_to_cpu((__le64 )iinfo->fscrypt_file);
1152
1153	if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
1154	size = fsize;
1155	} else {
1156	pr_warn_client(cl,
1157	"fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
1158	info->size, size);
1159	}
1160	}
1161
1162	queue_trunc = ceph_fill_file_size(inode, issued,
1163	le32_to_cpu(info->truncate_seq),
1164	le64_to_cpu(info->truncate_size),
1165	size);
1166	/ only update max_size on auth cap /
1167	if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1168	ci->i_max_size != le64_to_cpu(info->max_size)) {
1169	doutc(cl, "max_size %lld -> %llu\n",
1170	ci->i_max_size, le64_to_cpu(info->max_size));
1171	ci->i_max_size = le64_to_cpu(info->max_size);
1172	}
1173	}
1174
1175	/ layout and rstat are not tracked by capability, update them if*
1176	* the inode info is from auth mds */
1177	if (new_version \|\| (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
1178	if (S_ISDIR(inode->i_mode)) {
1179	ci->i_dir_layout = iinfo->dir_layout;
1180	ci->i_rbytes = le64_to_cpu(info->rbytes);
1181	ci->i_rfiles = le64_to_cpu(info->rfiles);
1182	ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
1183	ci->i_dir_pin = iinfo->dir_pin;
1184	ci->i_rsnaps = iinfo->rsnaps;
1185	ceph_decode_timespec64(ts: &ci->i_rctime, tv: &info->rctime);
1186	}
1187	}
1188
1189	/ xattrs /
1190	/ note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. /
1191	if ((ci->i_xattrs.version == `0` \|\| !(issued & CEPH_CAP_XATTR_EXCL)) &&
1192	le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
1193	if (ci->i_xattrs.blob)
1194	old_blob = ci->i_xattrs.blob;
1195	ci->i_xattrs.blob = xattr_blob;
1196	if (xattr_blob)
1197	memcpy(ci->i_xattrs.blob->vec.iov_base,
1198	iinfo->xattr_data, iinfo->xattr_len);
1199	ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
1200	ceph_forget_all_cached_acls(inode);
1201	ceph_security_invalidate_secctx(inode);
1202	xattr_blob = NULL;
1203	}
1204
1205	/ finally update i_version /
1206	if (le64_to_cpu(info->version) > ci->i_version)
1207	ci->i_version = le64_to_cpu(info->version);
1208
1209	inode->i_mapping->a_ops = &ceph_aops;
1210
1211	switch (inode->i_mode & S_IFMT) {
1212	case S_IFIFO:
1213	case S_IFBLK:
1214	case S_IFCHR:
1215	case S_IFSOCK:
1216	inode->i_blkbits = PAGE_SHIFT;
1217	init_special_inode(inode, inode->i_mode, rdev);
1218	inode->i_op = &ceph_file_iops;
1219	break;
1220	case S_IFREG:
1221	inode->i_op = &ceph_file_iops;
1222	inode->i_fop = &ceph_file_fops;
1223	break;
1224	case S_IFLNK:
1225	if (!ci->i_symlink) {
1226	u32 symlen = iinfo->symlink_len;
1227	char *sym;
1228
1229	spin_unlock(lock: &ci->i_ceph_lock);
1230
1231	if (IS_ENCRYPTED(inode)) {
1232	if (symlen != i_size_read(inode))
1233	pr_err_client(cl,
1234	"%p %llx.%llx BAD symlink size %lld\n",
1235	inode, ceph_vinop(inode),
1236	i_size_read(inode));
1237
1238	err = decode_encrypted_symlink(mdsc, encsym: iinfo->symlink,
1239	enclen: symlen, decsym: (u8 **)&sym);
1240	if (err < `0`) {
1241	pr_err_client(cl,
1242	"decoding encrypted symlink failed: %d\n",
1243	err);
1244	goto out;
1245	}
1246	symlen = err;
1247	i_size_write(inode, i_size: symlen);
1248	inode->i_blocks = calc_inode_blocks(size: symlen);
1249	} else {
1250	if (symlen != i_size_read(inode)) {
1251	pr_err_client(cl,
1252	"%p %llx.%llx BAD symlink size %lld\n",
1253	inode, ceph_vinop(inode),
1254	i_size_read(inode));
1255	i_size_write(inode, i_size: symlen);
1256	inode->i_blocks = calc_inode_blocks(size: symlen);
1257	}
1258
1259	err = -ENOMEM;
1260	sym = kstrndup(s: iinfo->symlink, len: symlen, GFP_NOFS);
1261	if (!sym)
1262	goto out;
1263	}
1264
1265	spin_lock(lock: &ci->i_ceph_lock);
1266	if (!ci->i_symlink)
1267	ci->i_symlink = sym;
1268	else
1269	kfree(objp: sym); / lost a race /
1270	}
1271
1272	if (IS_ENCRYPTED(inode)) {
1273	/*
1274	* Encrypted symlinks need to be decrypted before we can
1275	* cache their targets in i_link. Don't touch it here.
1276	*/
1277	inode->i_op = &ceph_encrypted_symlink_iops;
1278	} else {
1279	inode->i_link = ci->i_symlink;
1280	inode->i_op = &ceph_symlink_iops;
1281	}
1282	break;
1283	case S_IFDIR:
1284	inode->i_op = &ceph_dir_iops;
1285	inode->i_fop = &ceph_dir_fops;
1286	break;
1287	default:
1288	pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
1289	ceph_vinop(inode), inode->i_mode);
1290	}
1291
1292	/ were we issued a capability? /
1293	if (info_caps) {
1294	if (ceph_snap(inode) == CEPH_NOSNAP) {
1295	ceph_add_cap(inode, session,
1296	le64_to_cpu(info->cap.cap_id),
1297	issued: info_caps,
1298	le32_to_cpu(info->cap.wanted),
1299	le32_to_cpu(info->cap.seq),
1300	le32_to_cpu(info->cap.mseq),
1301	le64_to_cpu(info->cap.realm),
1302	flags: info->cap.flags, new_cap: &new_cap);
1303
1304	/ set dir completion flag? /
1305	if (S_ISDIR(inode->i_mode) &&
1306	ci->i_files == `0` && ci->i_subdirs == `0` &&
1307	(info_caps & CEPH_CAP_FILE_SHARED) &&
1308	(issued & CEPH_CAP_FILE_EXCL) == `0` &&
1309	!__ceph_dir_is_complete(ci)) {
1310	doutc(cl, " marking %p complete (empty)\n",
1311	inode);
1312	i_size_write(inode, i_size: `0`);
1313	__ceph_dir_set_complete(ci,
1314	release_count: atomic64_read(v: &ci->i_release_count),
1315	ordered_count: atomic64_read(v: &ci->i_ordered_count));
1316	}
1317
1318	wake = true;
1319	} else {
1320	doutc(cl, " %p got snap_caps %s\n", inode,
1321	ceph_cap_string(info_caps));
1322	ci->i_snap_caps \|= info_caps;
1323	}
1324	}
1325
1326	if (iinfo->inline_version > `0` &&
1327	iinfo->inline_version >= ci->i_inline_version) {
1328	int cache_caps = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
1329	ci->i_inline_version = iinfo->inline_version;
1330	if (ceph_has_inline_data(ci) &&
1331	(locked_page \|\| (info_caps & cache_caps)))
1332	fill_inline = true;
1333	}
1334
1335	if (cap_fmode >= `0`) {
1336	if (!info_caps)
1337	pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
1338	ceph_vinop(inode));
1339	__ceph_touch_fmode(ci, mdsc, fmode: cap_fmode);
1340	}
1341
1342	spin_unlock(lock: &ci->i_ceph_lock);
1343
1344	ceph_fscache_register_inode_cookie(inode);
1345
1346	if (fill_inline)
1347	ceph_fill_inline_data(inode, locked_page,
1348	data: iinfo->inline_data, len: iinfo->inline_len);
1349
1350	if (wake)
1351	wake_up_all(&ci->i_cap_wq);
1352
1353	/ queue truncate if we saw i_size decrease /
1354	if (queue_trunc)
1355	ceph_queue_vmtruncate(inode);
1356
1357	/ populate frag tree /
1358	if (S_ISDIR(inode->i_mode))
1359	ceph_fill_fragtree(inode, fragtree: &info->fragtree, dirinfo);
1360
1361	/ update delegation info? /
1362	if (dirinfo)
1363	ceph_fill_dirfrag(inode, dirinfo);
1364
1365	err = `0`;
1366	out:
1367	if (new_cap)
1368	ceph_put_cap(mdsc, cap: new_cap);
1369	ceph_buffer_put(b: old_blob);
1370	ceph_buffer_put(b: xattr_blob);
1371	ceph_put_string(str: pool_ns);
1372	return err;
1373	}
1374
1375	/*
1376	* caller should hold session s_mutex and dentry->d_lock.
1377	*/
1378	static void __update_dentry_lease(struct inode dir, struct* dentry *dentry,
1379	struct ceph_mds_reply_lease *lease,
1380	struct ceph_mds_session *session,
1381	unsigned long from_time,
1382	struct ceph_mds_session **old_lease_session)
1383	{
1384	struct ceph_client *cl = ceph_inode_to_client(inode: dir);
1385	struct ceph_dentry_info *di = ceph_dentry(dentry);
1386	unsigned mask = le16_to_cpu(lease->mask);
1387	long unsigned duration = le32_to_cpu(lease->duration_ms);
1388	long unsigned ttl = from_time + (duration * HZ) / `1000`;
1389	long unsigned half_ttl = from_time + (duration * HZ / `2`) / `1000`;
1390
1391	doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
1392
1393	/ only track leases on regular dentries /
1394	if (ceph_snap(inode: dir) != CEPH_NOSNAP)
1395	return;
1396
1397	if (mask & CEPH_LEASE_PRIMARY_LINK)
1398	di->flags \|= CEPH_DENTRY_PRIMARY_LINK;
1399	else
1400	di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1401
1402	di->lease_shared_gen = atomic_read(v: &ceph_inode(inode: dir)->i_shared_gen);
1403	if (!(mask & CEPH_LEASE_VALID)) {
1404	__ceph_dentry_dir_lease_touch(di);
1405	return;
1406	}
1407
1408	if (di->lease_gen == atomic_read(v: &session->s_cap_gen) &&
1409	time_before(ttl, di->time))
1410	return; / we already have a newer lease. /
1411
1412	if (di->lease_session && di->lease_session != session) {
1413	*old_lease_session = di->lease_session;
1414	di->lease_session = NULL;
1415	}
1416
1417	if (!di->lease_session)
1418	di->lease_session = ceph_get_mds_session(s: session);
1419	di->lease_gen = atomic_read(v: &session->s_cap_gen);
1420	di->lease_seq = le32_to_cpu(lease->seq);
1421	di->lease_renew_after = half_ttl;
1422	di->lease_renew_from = `0`;
1423	di->time = ttl;
1424
1425	__ceph_dentry_lease_touch(di);
1426	}
1427
1428	static inline void update_dentry_lease(struct inode dir, struct* dentry *dentry,
1429	struct ceph_mds_reply_lease *lease,
1430	struct ceph_mds_session *session,
1431	unsigned long from_time)
1432	{
1433	struct ceph_mds_session *old_lease_session = NULL;
1434	spin_lock(lock: &dentry->d_lock);
1435	__update_dentry_lease(dir, dentry, lease, session, from_time,
1436	old_lease_session: &old_lease_session);
1437	spin_unlock(lock: &dentry->d_lock);
1438	ceph_put_mds_session(s: old_lease_session);
1439	}
1440
1441	/*
1442	* update dentry lease without having parent inode locked
1443	*/
1444	static void update_dentry_lease_careful(struct dentry *dentry,
1445	struct ceph_mds_reply_lease *lease,
1446	struct ceph_mds_session *session,
1447	unsigned long from_time,
1448	char *dname, u32 dname_len,
1449	struct ceph_vino *pdvino,
1450	struct ceph_vino *ptvino)
1451
1452	{
1453	struct inode *dir;
1454	struct ceph_mds_session *old_lease_session = NULL;
1455
1456	spin_lock(lock: &dentry->d_lock);
1457	/ make sure dentry's name matches target /
1458	if (dentry->d_name.len != dname_len \|\|
1459	memcmp(p: dentry->d_name.name, q: dname, size: dname_len))
1460	goto out_unlock;
1461
1462	dir = d_inode(dentry: dentry->d_parent);
1463	/ make sure parent matches dvino /
1464	if (!ceph_ino_compare(inode: dir, data: pdvino))
1465	goto out_unlock;
1466
1467	/ make sure dentry's inode matches target. NULL ptvino means that*
1468	* we expect a negative dentry */
1469	if (ptvino) {
1470	if (d_really_is_negative(dentry))
1471	goto out_unlock;
1472	if (!ceph_ino_compare(inode: d_inode(dentry), data: ptvino))
1473	goto out_unlock;
1474	} else {
1475	if (d_really_is_positive(dentry))
1476	goto out_unlock;
1477	}
1478
1479	__update_dentry_lease(dir, dentry, lease, session,
1480	from_time, old_lease_session: &old_lease_session);
1481	out_unlock:
1482	spin_unlock(lock: &dentry->d_lock);
1483	ceph_put_mds_session(s: old_lease_session);
1484	}
1485
1486	/*
1487	* splice a dentry to an inode.
1488	* caller must hold directory i_rwsem for this to be safe.
1489	*/
1490	static int splice_dentry(struct dentry pdn, struct** inode *in)
1491	{
1492	struct ceph_client *cl = ceph_inode_to_client(inode: in);
1493	struct dentry dn = pdn;
1494	struct dentry *realdn;
1495
1496	BUG_ON(d_inode(dn));
1497
1498	if (S_ISDIR(in->i_mode)) {
1499	/ If inode is directory, d_splice_alias() below will remove*
1500	* 'realdn' from its origin parent. We need to ensure that
1501	* origin parent's readdir cache will not reference 'realdn'
1502	*/
1503	realdn = d_find_any_alias(inode: in);
1504	if (realdn) {
1505	struct ceph_dentry_info *di = ceph_dentry(dentry: realdn);
1506	spin_lock(lock: &realdn->d_lock);
1507
1508	realdn->d_op->d_prune(realdn);
1509
1510	di->time = jiffies;
1511	di->lease_shared_gen = `0`;
1512	di->offset = `0`;
1513
1514	spin_unlock(lock: &realdn->d_lock);
1515	dput(realdn);
1516	}
1517	}
1518
1519	/ dn must be unhashed /
1520	if (!d_unhashed(dentry: dn))
1521	d_drop(dentry: dn);
1522	realdn = d_splice_alias(in, dn);
1523	if (IS_ERR(ptr: realdn)) {
1524	pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
1525	PTR_ERR(realdn), dn, in, ceph_vinop(in));
1526	return PTR_ERR(ptr: realdn);
1527	}
1528
1529	if (realdn) {
1530	doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
1531	dn, d_count(dn), realdn, d_count(realdn),
1532	d_inode(realdn), ceph_vinop(d_inode(realdn)));
1533	dput(dn);
1534	*pdn = realdn;
1535	} else {
1536	BUG_ON(!ceph_dentry(dn));
1537	doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
1538	d_inode(dn), ceph_vinop(d_inode(dn)));
1539	}
1540	return `0`;
1541	}
1542
1543	/*
1544	* Incorporate results into the local cache. This is either just
1545	* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
1546	* after a lookup).
1547	*
1548	* A reply may contain
1549	* a directory inode along with a dentry.
1550	* and/or a target inode
1551	*
1552	* Called with snap_rwsem (read).
1553	*/
1554	int ceph_fill_trace(struct super_block sb, struct* ceph_mds_request *req)
1555	{
1556	struct ceph_mds_session *session = req->r_session;
1557	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1558	struct inode *in = NULL;
1559	struct ceph_vino tvino, dvino;
1560	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1561	struct ceph_client *cl = fsc->client;
1562	struct inode *parent_dir = NULL;
1563	int err = `0`;
1564
1565	doutc(cl, "%p is_dentry %d is_target %d\n", req,
1566	rinfo->head->is_dentry, rinfo->head->is_target);
1567
1568	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
1569	doutc(cl, "reply is empty!\n");
1570	if (rinfo->head->result == `0` && req->r_parent)
1571	ceph_invalidate_dir_request(req);
1572	return `0`;
1573	}
1574
1575	if (rinfo->head->is_dentry) {
1576	/*
1577	* r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
1578	* so we need to get the correct inode
1579	*/
1580	parent_dir = ceph_get_reply_dir(sb, parent: req->r_parent, rinfo);
1581	if (unlikely(IS_ERR(parent_dir))) {
1582	err = PTR_ERR(ptr: parent_dir);
1583	goto done;
1584	}
1585	if (parent_dir) {
1586	err = ceph_fill_inode(inode: parent_dir, NULL, iinfo: &rinfo->diri,
1587	dirinfo: rinfo->dirfrag, session, cap_fmode: -`1`,
1588	caps_reservation: &req->r_caps_reservation);
1589	if (err < `0`)
1590	goto done;
1591	} else {
1592	WARN_ON_ONCE(`1`);
1593	}
1594
1595	if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1596	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1597	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1598	bool is_nokey = false;
1599	struct qstr dname;
1600	struct dentry dn, parent;
1601	struct fscrypt_str oname = FSTR_INIT(NULL, `0`);
1602	struct ceph_fname fname = { .dir = parent_dir,
1603	.name = rinfo->dname,
1604	.ctext = rinfo->altname,
1605	.name_len = rinfo->dname_len,
1606	.ctext_len = rinfo->altname_len };
1607
1608	BUG_ON(!rinfo->head->is_target);
1609	BUG_ON(req->r_dentry);
1610
1611	parent = d_find_any_alias(inode: parent_dir);
1612	BUG_ON(!parent);
1613
1614	err = ceph_fname_alloc_buffer(parent: parent_dir, fname: &oname);
1615	if (err < `0`) {
1616	dput(parent);
1617	goto done;
1618	}
1619
1620	err = ceph_fname_to_usr(fname: &fname, NULL, oname: &oname, is_nokey: &is_nokey);
1621	if (err < `0`) {
1622	dput(parent);
1623	ceph_fname_free_buffer(parent: parent_dir, fname: &oname);
1624	goto done;
1625	}
1626	dname.name = oname.name;
1627	dname.len = oname.len;
1628	dname.hash = full_name_hash(salt: parent, dname.name, dname.len);
1629	tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1630	tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1631	retry_lookup:
1632	dn = d_lookup(parent, &dname);
1633	doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
1634	parent, dname.len, dname.name, dn);
1635
1636	if (!dn) {
1637	dn = d_alloc(parent, &dname);
1638	doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
1639	dname.len, dname.name, dn);
1640	if (!dn) {
1641	dput(parent);
1642	ceph_fname_free_buffer(parent: parent_dir, fname: &oname);
1643	err = -ENOMEM;
1644	goto done;
1645	}
1646	if (is_nokey) {
1647	spin_lock(lock: &dn->d_lock);
1648	dn->d_flags \|= DCACHE_NOKEY_NAME;
1649	spin_unlock(lock: &dn->d_lock);
1650	}
1651	err = `0`;
1652	} else if (d_really_is_positive(dentry: dn) &&
1653	(ceph_ino(inode: d_inode(dentry: dn)) != tvino.ino \|\|
1654	ceph_snap(inode: d_inode(dentry: dn)) != tvino.snap)) {
1655	doutc(cl, " dn %p points to wrong inode %p\n",
1656	dn, d_inode(dn));
1657	ceph_dir_clear_ordered(inode: parent_dir);
1658	d_delete(dn);
1659	dput(dn);
1660	goto retry_lookup;
1661	}
1662	ceph_fname_free_buffer(parent: parent_dir, fname: &oname);
1663
1664	req->r_dentry = dn;
1665	dput(parent);
1666	}
1667	}
1668
1669	if (rinfo->head->is_target) {
1670	/ Should be filled in by handle_reply /
1671	BUG_ON(!req->r_target_inode);
1672
1673	in = req->r_target_inode;
1674	err = ceph_fill_inode(inode: in, locked_page: req->r_locked_page, iinfo: &rinfo->targeti,
1675	NULL, session,
1676	cap_fmode: (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1677	!test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
1678	rinfo->head->result == `0`) ? req->r_fmode : -`1`,
1679	caps_reservation: &req->r_caps_reservation);
1680	if (err < `0`) {
1681	pr_err_client(cl, "badness %p %llx.%llx\n", in,
1682	ceph_vinop(in));
1683	req->r_target_inode = NULL;
1684	if (inode_state_read_once(inode: in) & I_NEW)
1685	discard_new_inode(in);
1686	else
1687	iput(in);
1688	goto done;
1689	}
1690	if (inode_state_read_once(inode: in) & I_NEW)
1691	unlock_new_inode(in);
1692	}
1693
1694	/*
1695	* ignore null lease/binding on snapdir ENOENT, or else we
1696	* will have trouble splicing in the virtual snapdir later
1697	*/
1698	if (rinfo->head->is_dentry &&
1699	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1700	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1701	(rinfo->head->is_target \|\| strncmp(req->r_dentry->d_name.name,
1702	fsc->mount_options->snapdir_name,
1703	req->r_dentry->d_name.len))) {
1704	/*
1705	* lookup link rename : null -> possibly existing inode
1706	* mknod symlink mkdir : null -> new inode
1707	* unlink : linked -> null
1708	*/
1709	struct inode *dir = req->r_parent;
1710	struct dentry *dn = req->r_dentry;
1711	bool have_dir_cap, have_lease;
1712
1713	BUG_ON(!dn);
1714	BUG_ON(!dir);
1715	BUG_ON(d_inode(dn->d_parent) != dir);
1716
1717	dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1718	dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1719
1720	BUG_ON(ceph_ino(dir) != dvino.ino);
1721	BUG_ON(ceph_snap(dir) != dvino.snap);
1722
1723	/ do we have a lease on the whole dir? /
1724	have_dir_cap =
1725	(le32_to_cpu(rinfo->diri.in->cap.caps) &
1726	CEPH_CAP_FILE_SHARED);
1727
1728	/ do we have a dn lease? /
1729	have_lease = have_dir_cap \|\|
1730	le32_to_cpu(rinfo->dlease->duration_ms);
1731	if (!have_lease)
1732	doutc(cl, "no dentry lease or dir cap\n");
1733
1734	/ rename? /
1735	if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1736	struct inode *olddir = req->r_old_dentry_dir;
1737	BUG_ON(!olddir);
1738
1739	doutc(cl, " src %p '%pd' dst %p '%pd'\n",
1740	req->r_old_dentry, req->r_old_dentry, dn, dn);
1741	doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
1742
1743	/ d_move screws up sibling dentries' offsets /
1744	ceph_dir_clear_ordered(inode: dir);
1745	ceph_dir_clear_ordered(inode: olddir);
1746
1747	d_move(req->r_old_dentry, dn);
1748	doutc(cl, " src %p '%pd' dst %p '%pd'\n",
1749	req->r_old_dentry, req->r_old_dentry, dn, dn);
1750
1751	/ ensure target dentry is invalidated, despite*
1752	rehashing bug in vfs_rename_dir /*
1753	ceph_invalidate_dentry_lease(dentry: dn);
1754
1755	doutc(cl, "dn %p gets new offset %lld\n",
1756	req->r_old_dentry,
1757	ceph_dentry(req->r_old_dentry)->offset);
1758
1759	/ swap r_dentry and r_old_dentry in case that*
1760	* splice_dentry() gets called later. This is safe
1761	* because no other place will use them */
1762	req->r_dentry = req->r_old_dentry;
1763	req->r_old_dentry = dn;
1764	dn = req->r_dentry;
1765	}
1766
1767	/ null dentry? /
1768	if (!rinfo->head->is_target) {
1769	doutc(cl, "null dentry\n");
1770	if (d_really_is_positive(dentry: dn)) {
1771	doutc(cl, "d_delete %p\n", dn);
1772	ceph_dir_clear_ordered(inode: dir);
1773	d_delete(dn);
1774	} else if (have_lease) {
1775	if (d_unhashed(dentry: dn))
1776	d_add(dn, NULL);
1777	}
1778
1779	if (!d_unhashed(dentry: dn) && have_lease)
1780	update_dentry_lease(dir, dentry: dn,
1781	lease: rinfo->dlease, session,
1782	from_time: req->r_request_started);
1783	goto done;
1784	}
1785
1786	if (unlikely(!in)) {
1787	err = -EINVAL;
1788	goto done;
1789	}
1790
1791	/ attach proper inode /
1792	if (d_really_is_negative(dentry: dn)) {
1793	ceph_dir_clear_ordered(inode: dir);
1794	ihold(inode: in);
1795	err = splice_dentry(pdn: &req->r_dentry, in);
1796	if (err < `0`)
1797	goto done;
1798	dn = req->r_dentry; / may have spliced /
1799	} else if (d_really_is_positive(dentry: dn) && d_inode(dentry: dn) != in) {
1800	doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
1801	dn, d_inode(dn), ceph_vinop(d_inode(dn)),
1802	ceph_vinop(in));
1803	d_invalidate(dn);
1804	have_lease = false;
1805	}
1806
1807	if (have_lease) {
1808	update_dentry_lease(dir, dentry: dn,
1809	lease: rinfo->dlease, session,
1810	from_time: req->r_request_started);
1811	}
1812	doutc(cl, " final dn %p\n", dn);
1813	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP \|\|
1814	req->r_op == CEPH_MDS_OP_MKSNAP) &&
1815	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1816	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1817	struct inode *dir = req->r_parent;
1818
1819	/ fill out a snapdir LOOKUPSNAP dentry /
1820	BUG_ON(!dir);
1821	BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1822	BUG_ON(!req->r_dentry);
1823	doutc(cl, " linking snapped dir %p to dn %p\n", in,
1824	req->r_dentry);
1825	ceph_dir_clear_ordered(inode: dir);
1826
1827	if (unlikely(!in)) {
1828	err = -EINVAL;
1829	goto done;
1830	}
1831
1832	ihold(inode: in);
1833	err = splice_dentry(pdn: &req->r_dentry, in);
1834	if (err < `0`)
1835	goto done;
1836	} else if (rinfo->head->is_dentry && req->r_dentry) {
1837	/ parent inode is not locked, be careful /
1838	struct ceph_vino *ptvino = NULL;
1839	dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1840	dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1841	if (rinfo->head->is_target) {
1842	tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1843	tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1844	ptvino = &tvino;
1845	}
1846	update_dentry_lease_careful(dentry: req->r_dentry, lease: rinfo->dlease,
1847	session, from_time: req->r_request_started,
1848	dname: rinfo->dname, dname_len: rinfo->dname_len,
1849	pdvino: &dvino, ptvino);
1850	}
1851	done:
1852	/ Drop extra ref from ceph_get_reply_dir() if it returned a new inode /
1853	if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
1854	iput(parent_dir);
1855	doutc(cl, "done err=%d\n", err);
1856	return err;
1857	}
1858
1859	/*
1860	* Prepopulate our cache with readdir results, leases, etc.
1861	*/
1862	static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1863	struct ceph_mds_session *session)
1864	{
1865	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1866	struct ceph_client *cl = session->s_mdsc->fsc->client;
1867	int i, err = `0`;
1868
1869	for (i = `0`; i < rinfo->dir_nr; i++) {
1870	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1871	struct ceph_vino vino;
1872	struct inode *in;
1873	int rc;
1874
1875	vino.ino = le64_to_cpu(rde->inode.in->ino);
1876	vino.snap = le64_to_cpu(rde->inode.in->snapid);
1877
1878	in = ceph_get_inode(sb: req->r_dentry->d_sb, vino, NULL);
1879	if (IS_ERR(ptr: in)) {
1880	err = PTR_ERR(ptr: in);
1881	doutc(cl, "badness got %d\n", err);
1882	continue;
1883	}
1884	rc = ceph_fill_inode(inode: in, NULL, iinfo: &rde->inode, NULL, session,
1885	cap_fmode: -`1`, caps_reservation: &req->r_caps_reservation);
1886	if (rc < `0`) {
1887	pr_err_client(cl, "inode badness on %p got %d\n", in,
1888	rc);
1889	err = rc;
1890	if (inode_state_read_once(inode: in) & I_NEW) {
1891	ihold(inode: in);
1892	discard_new_inode(in);
1893	}
1894	} else if (inode_state_read_once(inode: in) & I_NEW) {
1895	unlock_new_inode(in);
1896	}
1897
1898	iput(in);
1899	}
1900
1901	return err;
1902	}
1903
1904	void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
1905	{
1906	if (ctl->folio) {
1907	folio_release_kmap(folio: ctl->folio, addr: ctl->dentries);
1908	ctl->folio = NULL;
1909	}
1910	}
1911
1912	static int fill_readdir_cache(struct inode dir, struct* dentry *dn,
1913	struct ceph_readdir_cache_control *ctl,
1914	struct ceph_mds_request *req)
1915	{
1916	struct ceph_client *cl = ceph_inode_to_client(inode: dir);
1917	struct ceph_inode_info *ci = ceph_inode(inode: dir);
1918	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
1919	unsigned idx = ctl->index % nsize;
1920	pgoff_t pgoff = ctl->index / nsize;
1921
1922	if (!ctl->folio \|\| pgoff != ctl->folio->index) {
1923	ceph_readdir_cache_release(ctl);
1924	fgf_t fgf = FGP_LOCK;
1925
1926	if (idx == `0`)
1927	fgf \|= FGP_ACCESSED \| FGP_CREAT;
1928
1929	ctl->folio = __filemap_get_folio(mapping: &dir->i_data, index: pgoff,
1930	fgf_flags: fgf, gfp: mapping_gfp_mask(mapping: &dir->i_data));
1931	if (IS_ERR(ptr: ctl->folio)) {
1932	int err = PTR_ERR(ptr: ctl->folio);
1933
1934	ctl->folio = NULL;
1935	ctl->index = -`1`;
1936	return idx == `0` ? err : `0`;
1937	}
1938	/ reading/filling the cache are serialized by*
1939	* i_rwsem, no need to use folio lock */
1940	folio_unlock(folio: ctl->folio);
1941	ctl->dentries = kmap_local_folio(folio: ctl->folio, offset: `0`);
1942	if (idx == `0`)
1943	memset(ctl->dentries, `0`, PAGE_SIZE);
1944	}
1945
1946	if (req->r_dir_release_cnt == atomic64_read(v: &ci->i_release_count) &&
1947	req->r_dir_ordered_cnt == atomic64_read(v: &ci->i_ordered_count)) {
1948	doutc(cl, "dn %p idx %d\n", dn, ctl->index);
1949	ctl->dentries[idx] = dn;
1950	ctl->index++;
1951	} else {
1952	doutc(cl, "disable readdir cache\n");
1953	ctl->index = -`1`;
1954	}
1955	return `0`;
1956	}
1957
1958	int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1959	struct ceph_mds_session *session)
1960	{
1961	struct dentry *parent = req->r_dentry;
1962	struct inode *inode = d_inode(dentry: parent);
1963	struct ceph_inode_info *ci = ceph_inode(inode);
1964	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1965	struct ceph_client *cl = session->s_mdsc->fsc->client;
1966	struct qstr dname;
1967	struct dentry *dn;
1968	struct inode *in;
1969	int err = `0`, skipped = `0`, ret, i;
1970	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
1971	u32 last_hash = `0`;
1972	u32 fpos_offset;
1973	struct ceph_readdir_cache_control cache_ctl = {};
1974
1975	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
1976	return readdir_prepopulate_inodes_only(req, session);
1977
1978	if (rinfo->hash_order) {
1979	if (req->r_path2) {
1980	last_hash = ceph_str_hash(type: ci->i_dir_layout.dl_dir_hash,
1981	s: req->r_path2,
1982	strlen(req->r_path2));
1983	last_hash = ceph_frag_value(f: last_hash);
1984	} else if (rinfo->offset_hash) {
1985	/ mds understands offset_hash /
1986	WARN_ON_ONCE(req->r_readdir_offset != `2`);
1987	last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
1988	}
1989	}
1990
1991	if (rinfo->dir_dir &&
1992	le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1993	doutc(cl, "got new frag %x -> %x\n", frag,
1994	le32_to_cpu(rinfo->dir_dir->frag));
1995	frag = le32_to_cpu(rinfo->dir_dir->frag);
1996	if (!rinfo->hash_order)
1997	req->r_readdir_offset = `2`;
1998	}
1999
2000	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
2001	doutc(cl, "%d items under SNAPDIR dn %p\n",
2002	rinfo->dir_nr, parent);
2003	} else {
2004	doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
2005	if (rinfo->dir_dir)
2006	ceph_fill_dirfrag(inode: d_inode(dentry: parent), dirinfo: rinfo->dir_dir);
2007
2008	if (ceph_frag_is_leftmost(f: frag) &&
2009	req->r_readdir_offset == `2` &&
2010	!(rinfo->hash_order && last_hash)) {
2011	/ note dir version at start of readdir so we can*
2012	* tell if any dentries get dropped */
2013	req->r_dir_release_cnt =
2014	atomic64_read(v: &ci->i_release_count);
2015	req->r_dir_ordered_cnt =
2016	atomic64_read(v: &ci->i_ordered_count);
2017	req->r_readdir_cache_idx = `0`;
2018	}
2019	}
2020
2021	cache_ctl.index = req->r_readdir_cache_idx;
2022	fpos_offset = req->r_readdir_offset;
2023
2024	/ FIXME: release caps/leases if error occurs /
2025	for (i = `0`; i < rinfo->dir_nr; i++) {
2026	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
2027	struct ceph_vino tvino;
2028
2029	dname.name = rde->name;
2030	dname.len = rde->name_len;
2031	dname.hash = full_name_hash(salt: parent, dname.name, dname.len);
2032
2033	tvino.ino = le64_to_cpu(rde->inode.in->ino);
2034	tvino.snap = le64_to_cpu(rde->inode.in->snapid);
2035
2036	if (rinfo->hash_order) {
2037	u32 hash = ceph_frag_value(f: rde->raw_hash);
2038	if (hash != last_hash)
2039	fpos_offset = `2`;
2040	last_hash = hash;
2041	rde->offset = ceph_make_fpos(high: hash, off: fpos_offset++, hash_order: true);
2042	} else {
2043	rde->offset = ceph_make_fpos(high: frag, off: fpos_offset++, hash_order: false);
2044	}
2045
2046	retry_lookup:
2047	dn = d_lookup(parent, &dname);
2048	doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
2049	parent, dname.len, dname.name, dn);
2050
2051	if (!dn) {
2052	dn = d_alloc(parent, &dname);
2053	doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
2054	dname.len, dname.name, dn);
2055	if (!dn) {
2056	doutc(cl, "d_alloc badness\n");
2057	err = -ENOMEM;
2058	goto out;
2059	}
2060	if (rde->is_nokey) {
2061	spin_lock(lock: &dn->d_lock);
2062	dn->d_flags \|= DCACHE_NOKEY_NAME;
2063	spin_unlock(lock: &dn->d_lock);
2064	}
2065	} else if (d_really_is_positive(dentry: dn) &&
2066	(ceph_ino(inode: d_inode(dentry: dn)) != tvino.ino \|\|
2067	ceph_snap(inode: d_inode(dentry: dn)) != tvino.snap)) {
2068	struct ceph_dentry_info *di = ceph_dentry(dentry: dn);
2069	doutc(cl, " dn %p points to wrong inode %p\n",
2070	dn, d_inode(dn));
2071
2072	spin_lock(lock: &dn->d_lock);
2073	if (di->offset > `0` &&
2074	di->lease_shared_gen ==
2075	atomic_read(v: &ci->i_shared_gen)) {
2076	__ceph_dir_clear_ordered(ci);
2077	di->offset = `0`;
2078	}
2079	spin_unlock(lock: &dn->d_lock);
2080
2081	d_delete(dn);
2082	dput(dn);
2083	goto retry_lookup;
2084	}
2085
2086	/ inode /
2087	if (d_really_is_positive(dentry: dn)) {
2088	in = d_inode(dentry: dn);
2089	} else {
2090	in = ceph_get_inode(sb: parent->d_sb, vino: tvino, NULL);
2091	if (IS_ERR(ptr: in)) {
2092	doutc(cl, "new_inode badness\n");
2093	d_drop(dentry: dn);
2094	dput(dn);
2095	err = PTR_ERR(ptr: in);
2096	goto out;
2097	}
2098	}
2099
2100	ret = ceph_fill_inode(inode: in, NULL, iinfo: &rde->inode, NULL, session,
2101	cap_fmode: -`1`, caps_reservation: &req->r_caps_reservation);
2102	if (ret < `0`) {
2103	pr_err_client(cl, "badness on %p %llx.%llx\n", in,
2104	ceph_vinop(in));
2105	if (d_really_is_negative(dentry: dn)) {
2106	if (inode_state_read_once(inode: in) & I_NEW) {
2107	ihold(inode: in);
2108	discard_new_inode(in);
2109	}
2110	iput(in);
2111	}
2112	d_drop(dentry: dn);
2113	err = ret;
2114	goto next_item;
2115	}
2116	if (inode_state_read_once(inode: in) & I_NEW)
2117	unlock_new_inode(in);
2118
2119	if (d_really_is_negative(dentry: dn)) {
2120	if (ceph_security_xattr_deadlock(in)) {
2121	doutc(cl, " skip splicing dn %p to inode %p"
2122	" (security xattr deadlock)\n", dn, in);
2123	iput(in);
2124	skipped++;
2125	goto next_item;
2126	}
2127
2128	err = splice_dentry(pdn: &dn, in);
2129	if (err < `0`)
2130	goto next_item;
2131	}
2132
2133	ceph_dentry(dentry: dn)->offset = rde->offset;
2134
2135	update_dentry_lease(dir: d_inode(dentry: parent), dentry: dn,
2136	lease: rde->lease, session: req->r_session,
2137	from_time: req->r_request_started);
2138
2139	if (err == `0` && skipped == `0` && cache_ctl.index >= `0`) {
2140	ret = fill_readdir_cache(dir: d_inode(dentry: parent), dn,
2141	ctl: &cache_ctl, req);
2142	if (ret < `0`)
2143	err = ret;
2144	}
2145	next_item:
2146	dput(dn);
2147	}
2148	out:
2149	if (err == `0` && skipped == `0`) {
2150	set_bit(CEPH_MDS_R_DID_PREPOPULATE, addr: &req->r_req_flags);
2151	req->r_readdir_cache_idx = cache_ctl.index;
2152	}
2153	ceph_readdir_cache_release(ctl: &cache_ctl);
2154	doutc(cl, "done\n");
2155	return err;
2156	}
2157
2158	bool ceph_inode_set_size(struct inode *inode, loff_t size)
2159	{
2160	struct ceph_client *cl = ceph_inode_to_client(inode);
2161	struct ceph_inode_info *ci = ceph_inode(inode);
2162	bool ret;
2163
2164	spin_lock(lock: &ci->i_ceph_lock);
2165	doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
2166	i_size_write(inode, i_size: size);
2167	ceph_fscache_update(inode);
2168	inode->i_blocks = calc_inode_blocks(size);
2169
2170	ret = __ceph_should_report_size(ci);
2171
2172	spin_unlock(lock: &ci->i_ceph_lock);
2173
2174	return ret;
2175	}
2176
2177	void ceph_queue_inode_work(struct inode inode, int* work_bit)
2178	{
2179	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2180	struct ceph_client *cl = fsc->client;
2181	struct ceph_inode_info *ci = ceph_inode(inode);
2182	set_bit(nr: work_bit, addr: &ci->i_work_mask);
2183
2184	ihold(inode);
2185	if (queue_work(wq: fsc->inode_wq, work: &ci->i_work)) {
2186	doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
2187	ceph_vinop(inode), ci->i_work_mask);
2188	} else {
2189	doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
2190	inode, ceph_vinop(inode), ci->i_work_mask);
2191	iput(inode);
2192	}
2193	}
2194
2195	static void ceph_do_invalidate_pages(struct inode *inode)
2196	{
2197	struct ceph_client *cl = ceph_inode_to_client(inode);
2198	struct ceph_inode_info *ci = ceph_inode(inode);
2199	u32 orig_gen;
2200	int check = `0`;
2201
2202	ceph_fscache_invalidate(inode, dio_write: false);
2203
2204	mutex_lock(&ci->i_truncate_mutex);
2205
2206	if (ceph_inode_is_shutdown(inode)) {
2207	pr_warn_ratelimited_client(cl,
2208	"%p %llx.%llx is shut down\n", inode,
2209	ceph_vinop(inode));
2210	mapping_set_error(mapping: inode->i_mapping, error: -EIO);
2211	truncate_pagecache(inode, new: `0`);
2212	mutex_unlock(lock: &ci->i_truncate_mutex);
2213	goto out;
2214	}
2215
2216	spin_lock(lock: &ci->i_ceph_lock);
2217	doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
2218	ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
2219	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2220	if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2221	check = `1`;
2222	spin_unlock(lock: &ci->i_ceph_lock);
2223	mutex_unlock(lock: &ci->i_truncate_mutex);
2224	goto out;
2225	}
2226	orig_gen = ci->i_rdcache_gen;
2227	spin_unlock(lock: &ci->i_ceph_lock);
2228
2229	if (invalidate_inode_pages2(mapping: inode->i_mapping) < `0`) {
2230	pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
2231	ceph_vinop(inode));
2232	}
2233
2234	spin_lock(lock: &ci->i_ceph_lock);
2235	if (orig_gen == ci->i_rdcache_gen &&
2236	orig_gen == ci->i_rdcache_revoking) {
2237	doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
2238	ceph_vinop(inode), ci->i_rdcache_gen);
2239	ci->i_rdcache_revoking--;
2240	check = `1`;
2241	} else {
2242	doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
2243	inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
2244	ci->i_rdcache_revoking);
2245	if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2246	check = `1`;
2247	}
2248	spin_unlock(lock: &ci->i_ceph_lock);
2249	mutex_unlock(lock: &ci->i_truncate_mutex);
2250	out:
2251	if (check)
2252	ceph_check_caps(ci, flags: `0`);
2253	}
2254
2255	/*
2256	* Make sure any pending truncation is applied before doing anything
2257	* that may depend on it.
2258	*/
2259	void __ceph_do_pending_vmtruncate(struct inode *inode)
2260	{
2261	struct ceph_client *cl = ceph_inode_to_client(inode);
2262	struct ceph_inode_info *ci = ceph_inode(inode);
2263	u64 to;
2264	int wrbuffer_refs, finish = `0`;
2265
2266	mutex_lock(&ci->i_truncate_mutex);
2267	retry:
2268	spin_lock(lock: &ci->i_ceph_lock);
2269	if (ci->i_truncate_pending == `0`) {
2270	doutc(cl, "%p %llx.%llx none pending\n", inode,
2271	ceph_vinop(inode));
2272	spin_unlock(lock: &ci->i_ceph_lock);
2273	mutex_unlock(lock: &ci->i_truncate_mutex);
2274	return;
2275	}
2276
2277	/*
2278	* make sure any dirty snapped pages are flushed before we
2279	* possibly truncate them.. so write AND block!
2280	*/
2281	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
2282	spin_unlock(lock: &ci->i_ceph_lock);
2283	doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
2284	ceph_vinop(inode));
2285	filemap_write_and_wait_range(mapping: &inode->i_data, lstart: `0`,
2286	lend: inode->i_sb->s_maxbytes);
2287	goto retry;
2288	}
2289
2290	/ there should be no reader or writer /
2291	WARN_ON_ONCE(ci->i_rd_ref \|\| ci->i_wr_ref);
2292
2293	to = ci->i_truncate_pagecache_size;
2294	wrbuffer_refs = ci->i_wrbuffer_ref;
2295	doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
2296	ci->i_truncate_pending, to);
2297	spin_unlock(lock: &ci->i_ceph_lock);
2298
2299	ceph_fscache_resize(inode, to);
2300	truncate_pagecache(inode, new: to);
2301
2302	spin_lock(lock: &ci->i_ceph_lock);
2303	if (to == ci->i_truncate_pagecache_size) {
2304	ci->i_truncate_pending = `0`;
2305	finish = `1`;
2306	}
2307	spin_unlock(lock: &ci->i_ceph_lock);
2308	if (!finish)
2309	goto retry;
2310
2311	mutex_unlock(lock: &ci->i_truncate_mutex);
2312
2313	if (wrbuffer_refs == `0`)
2314	ceph_check_caps(ci, flags: `0`);
2315
2316	wake_up_all(&ci->i_cap_wq);
2317	}
2318
2319	static void ceph_inode_work(struct work_struct *work)
2320	{
2321	struct ceph_inode_info ci = container_of(work, struct* ceph_inode_info,
2322	i_work);
2323	struct inode *inode = &ci->netfs.inode;
2324	struct ceph_client *cl = ceph_inode_to_client(inode);
2325
2326	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, addr: &ci->i_work_mask)) {
2327	doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
2328	filemap_fdatawrite(&inode->i_data);
2329	}
2330	if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, addr: &ci->i_work_mask))
2331	ceph_do_invalidate_pages(inode);
2332
2333	if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, addr: &ci->i_work_mask))
2334	__ceph_do_pending_vmtruncate(inode);
2335
2336	if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, addr: &ci->i_work_mask))
2337	ceph_check_caps(ci, flags: `0`);
2338
2339	if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, addr: &ci->i_work_mask))
2340	ceph_flush_snaps(ci, NULL);
2341
2342	iput(inode);
2343	}
2344
2345	static const char ceph_encrypted_get_link(struct* dentry *dentry,
2346	struct inode *inode,
2347	struct delayed_call *done)
2348	{
2349	struct ceph_inode_info *ci = ceph_inode(inode);
2350
2351	if (!dentry)
2352	return ERR_PTR(error: -ECHILD);
2353
2354	return fscrypt_get_symlink(inode, caddr: ci->i_symlink, max_size: i_size_read(inode),
2355	done);
2356	}
2357
2358	static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
2359	const struct path *path,
2360	struct kstat *stat, u32 request_mask,
2361	unsigned int query_flags)
2362	{
2363	int ret;
2364
2365	ret = ceph_getattr(idmap, path, stat, request_mask, flags: query_flags);
2366	if (ret)
2367	return ret;
2368	return fscrypt_symlink_getattr(path, stat);
2369	}
2370
2371	/*
2372	* symlinks
2373	*/
2374	static const struct inode_operations ceph_symlink_iops = {
2375	.get_link = simple_get_link,
2376	.setattr = ceph_setattr,
2377	.getattr = ceph_getattr,
2378	.listxattr = ceph_listxattr,
2379	};
2380
2381	static const struct inode_operations ceph_encrypted_symlink_iops = {
2382	.get_link = ceph_encrypted_get_link,
2383	.setattr = ceph_setattr,
2384	.getattr = ceph_encrypted_symlink_getattr,
2385	.listxattr = ceph_listxattr,
2386	};
2387
2388	/*
2389	* Transfer the encrypted last block to the MDS and the MDS
2390	* will help update it when truncating a smaller size.
2391	*
2392	* We don't support a PAGE_SIZE that is smaller than the
2393	* CEPH_FSCRYPT_BLOCK_SIZE.
2394	*/
2395	static int fill_fscrypt_truncate(struct inode *inode,
2396	struct ceph_mds_request *req,
2397	struct iattr *attr)
2398	{
2399	struct ceph_client *cl = ceph_inode_to_client(inode);
2400	struct ceph_inode_info *ci = ceph_inode(inode);
2401	int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
2402	loff_t pos, orig_pos = round_down(attr->ia_size,
2403	CEPH_FSCRYPT_BLOCK_SIZE);
2404	u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
2405	struct ceph_pagelist *pagelist = NULL;
2406	struct kvec iov = {`0`};
2407	struct iov_iter iter;
2408	struct page *page = NULL;
2409	struct ceph_fscrypt_truncate_size_header header;
2410	int retry_op = `0`;
2411	int len = CEPH_FSCRYPT_BLOCK_SIZE;
2412	loff_t i_size = i_size_read(inode);
2413	int got, ret, issued;
2414	u64 objver;
2415
2416	ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, want: `0`, endoff: -`1`, got: &got);
2417	if (ret < `0`)
2418	return ret;
2419
2420	issued = __ceph_caps_issued(ci, NULL);
2421
2422	doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
2423	i_size, attr->ia_size, ceph_cap_string(got),
2424	ceph_cap_string(issued));
2425
2426	/ Try to writeback the dirty pagecaches /
2427	if (issued & (CEPH_CAP_FILE_BUFFER)) {
2428	loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - `1`;
2429
2430	ret = filemap_write_and_wait_range(mapping: inode->i_mapping,
2431	lstart: orig_pos, lend);
2432	if (ret < `0`)
2433	goto out;
2434	}
2435
2436	page = __page_cache_alloc(GFP_KERNEL);
2437	if (page == NULL) {
2438	ret = -ENOMEM;
2439	goto out;
2440	}
2441
2442	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
2443	if (!pagelist) {
2444	ret = -ENOMEM;
2445	goto out;
2446	}
2447
2448	iov.iov_base = kmap_local_page(page);
2449	iov.iov_len = len;
2450	iov_iter_kvec(i: &iter, READ, kvec: &iov, nr_segs: `1`, count: len);
2451
2452	pos = orig_pos;
2453	ret = __ceph_sync_read(inode, ki_pos: &pos, to: &iter, retry_op: &retry_op, last_objver: &objver);
2454	if (ret < `0`)
2455	goto out;
2456
2457	/ Insert the header first /
2458	header.ver = `1`;
2459	header.compat = `1`;
2460	header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
2461
2462	/*
2463	* Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
2464	* because in MDS it may need this to do the truncate.
2465	*/
2466	header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
2467
2468	/*
2469	* If we hit a hole here, we should just skip filling
2470	* the fscrypt for the request, because once the fscrypt
2471	* is enabled, the file will be split into many blocks
2472	* with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
2473	* has a hole, the hole size should be multiple of block
2474	* size.
2475	*
2476	* If the Rados object doesn't exist, it will be set to 0.
2477	*/
2478	if (!objver) {
2479	doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
2480
2481	header.data_len = cpu_to_le32(`8` + `8` + `4`);
2482	header.file_offset = `0`;
2483	ret = `0`;
2484	} else {
2485	header.data_len = cpu_to_le32(`8` + `8` + `4` + CEPH_FSCRYPT_BLOCK_SIZE);
2486	header.file_offset = cpu_to_le64(orig_pos);
2487
2488	doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
2489	CEPH_FSCRYPT_BLOCK_SIZE);
2490
2491	/ truncate and zero out the extra contents for the last block /
2492	memset(iov.iov_base + boff, `0`, PAGE_SIZE - boff);
2493
2494	/ encrypt the last block /
2495	ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
2496	CEPH_FSCRYPT_BLOCK_SIZE,
2497	offs: `0`, lblk_num: block);
2498	if (ret)
2499	goto out;
2500	}
2501
2502	/ Insert the header /
2503	ret = ceph_pagelist_append(pl: pagelist, d: &header, l: sizeof(header));
2504	if (ret)
2505	goto out;
2506
2507	if (header.block_size) {
2508	/ Append the last block contents to pagelist /
2509	ret = ceph_pagelist_append(pl: pagelist, d: iov.iov_base,
2510	CEPH_FSCRYPT_BLOCK_SIZE);
2511	if (ret)
2512	goto out;
2513	}
2514	req->r_pagelist = pagelist;
2515	out:
2516	doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
2517	ceph_vinop(inode), ceph_cap_string(got));
2518	ceph_put_cap_refs(ci, had: got);
2519	if (iov.iov_base)
2520	kunmap_local(iov.iov_base);
2521	if (page)
2522	__free_pages(page, order: `0`);
2523	if (ret && pagelist)
2524	ceph_pagelist_release(pl: pagelist);
2525	return ret;
2526	}
2527
2528	int __ceph_setattr(struct mnt_idmap idmap, struct* inode *inode,
2529	struct iattr attr, struct* ceph_iattr *cia)
2530	{
2531	struct ceph_inode_info *ci = ceph_inode(inode);
2532	unsigned int ia_valid = attr->ia_valid;
2533	struct ceph_mds_request *req;
2534	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
2535	struct ceph_client *cl = ceph_inode_to_client(inode);
2536	struct ceph_cap_flush *prealloc_cf;
2537	loff_t isize = i_size_read(inode);
2538	int issued;
2539	int release = `0`, dirtied = `0`;
2540	int mask = `0`;
2541	int err = `0`;
2542	int inode_dirty_flags = `0`;
2543	bool lock_snap_rwsem = false;
2544	bool fill_fscrypt;
2545	int truncate_retry = `20`; / The RMW will take around 50ms /
2546	struct dentry *dentry;
2547	char *path;
2548	bool do_sync = false;
2549
2550	dentry = d_find_alias(inode);
2551	if (!dentry) {
2552	do_sync = true;
2553	} else {
2554	struct ceph_path_info path_info;
2555	path = ceph_mdsc_build_path(mdsc, dentry, path_info: &path_info, for_wire: `0`);
2556	if (IS_ERR(ptr: path)) {
2557	do_sync = true;
2558	err = `0`;
2559	} else {
2560	err = ceph_mds_check_access(mdsc, tpath: path, MAY_WRITE);
2561	}
2562	ceph_mdsc_free_path_info(path_info: &path_info);
2563	dput(dentry);
2564
2565	/ For none EACCES cases will let the MDS do the mds auth check /
2566	if (err == -EACCES) {
2567	return err;
2568	} else if (err < `0`) {
2569	do_sync = true;
2570	err = `0`;
2571	}
2572	}
2573
2574	retry:
2575	prealloc_cf = ceph_alloc_cap_flush();
2576	if (!prealloc_cf)
2577	return -ENOMEM;
2578
2579	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_SETATTR,
2580	mode: USE_AUTH_MDS);
2581	if (IS_ERR(ptr: req)) {
2582	ceph_free_cap_flush(cf: prealloc_cf);
2583	return PTR_ERR(ptr: req);
2584	}
2585
2586	fill_fscrypt = false;
2587	spin_lock(lock: &ci->i_ceph_lock);
2588	issued = __ceph_caps_issued(ci, NULL);
2589
2590	if (!ci->i_head_snapc &&
2591	(issued & (CEPH_CAP_ANY_EXCL \| CEPH_CAP_FILE_WR))) {
2592	lock_snap_rwsem = true;
2593	if (!down_read_trylock(sem: &mdsc->snap_rwsem)) {
2594	spin_unlock(lock: &ci->i_ceph_lock);
2595	down_read(sem: &mdsc->snap_rwsem);
2596	spin_lock(lock: &ci->i_ceph_lock);
2597	issued = __ceph_caps_issued(ci, NULL);
2598	}
2599	}
2600
2601	doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
2602	ceph_cap_string(issued));
2603	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
2604	if (cia && cia->fscrypt_auth) {
2605	u32 len = ceph_fscrypt_auth_len(fa: cia->fscrypt_auth);
2606
2607	if (len > sizeof(*cia->fscrypt_auth)) {
2608	err = -EINVAL;
2609	spin_unlock(lock: &ci->i_ceph_lock);
2610	goto out;
2611	}
2612
2613	doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
2614	ceph_vinop(inode), ci->fscrypt_auth_len, len);
2615
2616	/ It should never be re-set once set /
2617	WARN_ON_ONCE(ci->fscrypt_auth);
2618
2619	if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2620	dirtied \|= CEPH_CAP_AUTH_EXCL;
2621	kfree(objp: ci->fscrypt_auth);
2622	ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
2623	ci->fscrypt_auth_len = len;
2624	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2625	ci->fscrypt_auth_len != len \|\|
2626	memcmp(p: ci->fscrypt_auth, q: cia->fscrypt_auth, size: len)) {
2627	req->r_fscrypt_auth = cia->fscrypt_auth;
2628	mask \|= CEPH_SETATTR_FSCRYPT_AUTH;
2629	release \|= CEPH_CAP_AUTH_SHARED;
2630	}
2631	cia->fscrypt_auth = NULL;
2632	}
2633	#else
2634	if (cia && cia->fscrypt_auth) {
2635	err = -EINVAL;
2636	spin_unlock(&ci->i_ceph_lock);
2637	goto out;
2638	}
2639	#endif /* CONFIG_FS_ENCRYPTION */
2640
2641	if (ia_valid & ATTR_UID) {
2642	kuid_t fsuid = from_vfsuid(idmap, fs_userns: i_user_ns(inode), vfsuid: attr->ia_vfsuid);
2643
2644	doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
2645	ceph_vinop(inode),
2646	from_kuid(&init_user_ns, inode->i_uid),
2647	from_kuid(&init_user_ns, attr->ia_uid));
2648	if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2649	inode->i_uid = fsuid;
2650	dirtied \|= CEPH_CAP_AUTH_EXCL;
2651	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2652	!uid_eq(left: fsuid, right: inode->i_uid)) {
2653	req->r_args.setattr.uid = cpu_to_le32(
2654	from_kuid(&init_user_ns, fsuid));
2655	mask \|= CEPH_SETATTR_UID;
2656	release \|= CEPH_CAP_AUTH_SHARED;
2657	}
2658	}
2659	if (ia_valid & ATTR_GID) {
2660	kgid_t fsgid = from_vfsgid(idmap, fs_userns: i_user_ns(inode), vfsgid: attr->ia_vfsgid);
2661
2662	doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
2663	ceph_vinop(inode),
2664	from_kgid(&init_user_ns, inode->i_gid),
2665	from_kgid(&init_user_ns, attr->ia_gid));
2666	if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2667	inode->i_gid = fsgid;
2668	dirtied \|= CEPH_CAP_AUTH_EXCL;
2669	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2670	!gid_eq(left: fsgid, right: inode->i_gid)) {
2671	req->r_args.setattr.gid = cpu_to_le32(
2672	from_kgid(&init_user_ns, fsgid));
2673	mask \|= CEPH_SETATTR_GID;
2674	release \|= CEPH_CAP_AUTH_SHARED;
2675	}
2676	}
2677	if (ia_valid & ATTR_MODE) {
2678	doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
2679	ceph_vinop(inode), inode->i_mode, attr->ia_mode);
2680	if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2681	inode->i_mode = attr->ia_mode;
2682	dirtied \|= CEPH_CAP_AUTH_EXCL;
2683	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2684	attr->ia_mode != inode->i_mode) {
2685	inode->i_mode = attr->ia_mode;
2686	req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
2687	mask \|= CEPH_SETATTR_MODE;
2688	release \|= CEPH_CAP_AUTH_SHARED;
2689	}
2690	}
2691
2692	if (ia_valid & ATTR_ATIME) {
2693	struct timespec64 atime = inode_get_atime(inode);
2694
2695	doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
2696	inode, ceph_vinop(inode), &atime, &attr->ia_atime);
2697	if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
2698	ci->i_time_warp_seq++;
2699	inode_set_atime_to_ts(inode, ts: attr->ia_atime);
2700	dirtied \|= CEPH_CAP_FILE_EXCL;
2701	} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
2702	timespec64_compare(lhs: &atime,
2703	rhs: &attr->ia_atime) < `0`) {
2704	inode_set_atime_to_ts(inode, ts: attr->ia_atime);
2705	dirtied \|= CEPH_CAP_FILE_WR;
2706	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2707	!timespec64_equal(a: &atime, b: &attr->ia_atime)) {
2708	ceph_encode_timespec64(tv: &req->r_args.setattr.atime,
2709	ts: &attr->ia_atime);
2710	mask \|= CEPH_SETATTR_ATIME;
2711	release \|= CEPH_CAP_FILE_SHARED \|
2712	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2713	}
2714	}
2715	if (ia_valid & ATTR_SIZE) {
2716	doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
2717	ceph_vinop(inode), isize, attr->ia_size);
2718	/*
2719	* Only when the new size is smaller and not aligned to
2720	* CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
2721	*/
2722	if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
2723	(attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
2724	mask \|= CEPH_SETATTR_SIZE;
2725	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2726	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2727	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
2728	mask \|= CEPH_SETATTR_FSCRYPT_FILE;
2729	req->r_args.setattr.size =
2730	cpu_to_le64(round_up(attr->ia_size,
2731	CEPH_FSCRYPT_BLOCK_SIZE));
2732	req->r_args.setattr.old_size =
2733	cpu_to_le64(round_up(isize,
2734	CEPH_FSCRYPT_BLOCK_SIZE));
2735	req->r_fscrypt_file = attr->ia_size;
2736	fill_fscrypt = true;
2737	} else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
2738	if (attr->ia_size > isize) {
2739	i_size_write(inode, i_size: attr->ia_size);
2740	inode->i_blocks = calc_inode_blocks(size: attr->ia_size);
2741	ci->i_reported_size = attr->ia_size;
2742	dirtied \|= CEPH_CAP_FILE_EXCL;
2743	ia_valid \|= ATTR_MTIME;
2744	}
2745	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2746	attr->ia_size != isize) {
2747	mask \|= CEPH_SETATTR_SIZE;
2748	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2749	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2750	if (IS_ENCRYPTED(inode) && attr->ia_size) {
2751	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
2752	mask \|= CEPH_SETATTR_FSCRYPT_FILE;
2753	req->r_args.setattr.size =
2754	cpu_to_le64(round_up(attr->ia_size,
2755	CEPH_FSCRYPT_BLOCK_SIZE));
2756	req->r_args.setattr.old_size =
2757	cpu_to_le64(round_up(isize,
2758	CEPH_FSCRYPT_BLOCK_SIZE));
2759	req->r_fscrypt_file = attr->ia_size;
2760	} else {
2761	req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2762	req->r_args.setattr.old_size = cpu_to_le64(isize);
2763	req->r_fscrypt_file = `0`;
2764	}
2765	}
2766	}
2767	if (ia_valid & ATTR_MTIME) {
2768	struct timespec64 mtime = inode_get_mtime(inode);
2769
2770	doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
2771	inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
2772	if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
2773	ci->i_time_warp_seq++;
2774	inode_set_mtime_to_ts(inode, ts: attr->ia_mtime);
2775	dirtied \|= CEPH_CAP_FILE_EXCL;
2776	} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
2777	timespec64_compare(lhs: &mtime, rhs: &attr->ia_mtime) < `0`) {
2778	inode_set_mtime_to_ts(inode, ts: attr->ia_mtime);
2779	dirtied \|= CEPH_CAP_FILE_WR;
2780	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2781	!timespec64_equal(a: &mtime, b: &attr->ia_mtime)) {
2782	ceph_encode_timespec64(tv: &req->r_args.setattr.mtime,
2783	ts: &attr->ia_mtime);
2784	mask \|= CEPH_SETATTR_MTIME;
2785	release \|= CEPH_CAP_FILE_SHARED \|
2786	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2787	}
2788	}
2789
2790	/ these do nothing /
2791	if (ia_valid & ATTR_CTIME) {
2792	struct timespec64 ictime = inode_get_ctime(inode);
2793	bool only = (ia_valid & (ATTR_SIZE\|ATTR_MTIME\|ATTR_ATIME\|
2794	ATTR_MODE\|ATTR_UID\|ATTR_GID)) == `0`;
2795	doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
2796	inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
2797	only ? "ctime only" : "ignored");
2798	if (only) {
2799	/*
2800	* if kernel wants to dirty ctime but nothing else,
2801	* we need to choose a cap to dirty under, or do
2802	* a almost-no-op setattr
2803	*/
2804	if (issued & CEPH_CAP_AUTH_EXCL)
2805	dirtied \|= CEPH_CAP_AUTH_EXCL;
2806	else if (issued & CEPH_CAP_FILE_EXCL)
2807	dirtied \|= CEPH_CAP_FILE_EXCL;
2808	else if (issued & CEPH_CAP_XATTR_EXCL)
2809	dirtied \|= CEPH_CAP_XATTR_EXCL;
2810	else
2811	mask \|= CEPH_SETATTR_CTIME;
2812	}
2813	}
2814	if (ia_valid & ATTR_FILE)
2815	doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
2816	ceph_vinop(inode));
2817
2818	if (dirtied) {
2819	inode_dirty_flags = __ceph_mark_dirty_caps(ci, mask: dirtied,
2820	pcf: &prealloc_cf);
2821	inode_set_ctime_to_ts(inode, ts: attr->ia_ctime);
2822	inode_inc_iversion_raw(inode);
2823	}
2824
2825	release &= issued;
2826	spin_unlock(lock: &ci->i_ceph_lock);
2827	if (lock_snap_rwsem) {
2828	up_read(sem: &mdsc->snap_rwsem);
2829	lock_snap_rwsem = false;
2830	}
2831
2832	if (inode_dirty_flags)
2833	__mark_inode_dirty(inode, inode_dirty_flags);
2834
2835	if (mask) {
2836	req->r_inode = inode;
2837	ihold(inode);
2838	req->r_inode_drop = release;
2839	req->r_args.setattr.mask = cpu_to_le32(mask);
2840	req->r_num_caps = `1`;
2841	req->r_stamp = attr->ia_ctime;
2842	if (fill_fscrypt) {
2843	err = fill_fscrypt_truncate(inode, req, attr);
2844	if (err)
2845	goto out;
2846	}
2847
2848	/*
2849	* The truncate request will return -EAGAIN when the
2850	* last block has been updated just before the MDS
2851	* successfully gets the xlock for the FILE lock. To
2852	* avoid corrupting the file contents we need to retry
2853	* it.
2854	*/
2855	err = ceph_mdsc_do_request(mdsc, NULL, req);
2856	if (err == -EAGAIN && truncate_retry--) {
2857	doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
2858	inode, ceph_vinop(inode), err,
2859	ceph_cap_string(dirtied), mask);
2860	ceph_mdsc_put_request(req);
2861	ceph_free_cap_flush(cf: prealloc_cf);
2862	goto retry;
2863	}
2864	}
2865	out:
2866	doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
2867	ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
2868
2869	ceph_mdsc_put_request(req);
2870	ceph_free_cap_flush(cf: prealloc_cf);
2871
2872	if (err >= `0` && (mask & CEPH_SETATTR_SIZE))
2873	__ceph_do_pending_vmtruncate(inode);
2874
2875	return err;
2876	}
2877
2878	/*
2879	* setattr
2880	*/
2881	int ceph_setattr(struct mnt_idmap idmap, struct* dentry *dentry,
2882	struct iattr *attr)
2883	{
2884	struct inode *inode = d_inode(dentry);
2885	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2886	int err;
2887
2888	if (ceph_snap(inode) != CEPH_NOSNAP)
2889	return -EROFS;
2890
2891	if (ceph_inode_is_shutdown(inode))
2892	return -ESTALE;
2893
2894	err = fscrypt_prepare_setattr(dentry, attr);
2895	if (err)
2896	return err;
2897
2898	err = setattr_prepare(idmap, dentry, attr);
2899	if (err != `0`)
2900	return err;
2901
2902	if ((attr->ia_valid & ATTR_SIZE) &&
2903	attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
2904	return -EFBIG;
2905
2906	if ((attr->ia_valid & ATTR_SIZE) &&
2907	ceph_quota_is_max_bytes_exceeded(inode, newlen: attr->ia_size))
2908	return -EDQUOT;
2909
2910	err = __ceph_setattr(idmap, inode, attr, NULL);
2911
2912	if (err >= `0` && (attr->ia_valid & ATTR_MODE))
2913	err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
2914
2915	return err;
2916	}
2917
2918	int ceph_try_to_choose_auth_mds(struct inode inode, int* mask)
2919	{
2920	int issued = ceph_caps_issued(ci: ceph_inode(inode));
2921
2922	/*
2923	* If any 'x' caps is issued we can just choose the auth MDS
2924	* instead of the random replica MDSes. Because only when the
2925	* Locker is in LOCK_EXEC state will the loner client could
2926	* get the 'x' caps. And if we send the getattr requests to
2927	* any replica MDS it must auth pin and tries to rdlock from
2928	* the auth MDS, and then the auth MDS need to do the Locker
2929	* state transition to LOCK_SYNC. And after that the lock state
2930	* will change back.
2931	*
2932	* This cost much when doing the Locker state transition and
2933	* usually will need to revoke caps from clients.
2934	*
2935	* And for the 'Xs' caps for getxattr we will also choose the
2936	* auth MDS, because the MDS side code is buggy due to setxattr
2937	* won't notify the replica MDSes when the values changed and
2938	* the replica MDS will return the old values. Though we will
2939	* fix it in MDS code, but this still makes sense for old ceph.
2940	*/
2941	if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
2942	\|\| (mask & (CEPH_STAT_RSTAT \| CEPH_STAT_CAP_XATTR)))
2943	return USE_AUTH_MDS;
2944	else
2945	return USE_ANY_MDS;
2946	}
2947
2948	/*
2949	* Verify that we have a lease on the given mask. If not,
2950	* do a getattr against an mds.
2951	*/
2952	int __ceph_do_getattr(struct inode inode, struct* page *locked_page,
2953	int mask, bool force)
2954	{
2955	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb: inode->i_sb);
2956	struct ceph_client *cl = fsc->client;
2957	struct ceph_mds_client *mdsc = fsc->mdsc;
2958	struct ceph_mds_request *req;
2959	int mode;
2960	int err;
2961
2962	if (ceph_snap(inode) == CEPH_SNAPDIR) {
2963	doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
2964	ceph_vinop(inode));
2965	return `0`;
2966	}
2967
2968	doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
2969	ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
2970	if (!force && ceph_caps_issued_mask_metric(ci: ceph_inode(inode), mask, touch: `1`))
2971	return `0`;
2972
2973	mode = ceph_try_to_choose_auth_mds(inode, mask);
2974	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETATTR, mode);
2975	if (IS_ERR(ptr: req))
2976	return PTR_ERR(ptr: req);
2977	req->r_inode = inode;
2978	ihold(inode);
2979	req->r_num_caps = `1`;
2980	req->r_args.getattr.mask = cpu_to_le32(mask);
2981	req->r_locked_page = locked_page;
2982	err = ceph_mdsc_do_request(mdsc, NULL, req);
2983	if (locked_page && err == `0`) {
2984	u64 inline_version = req->r_reply_info.targeti.inline_version;
2985	if (inline_version == `0`) {
2986	/ the reply is supposed to contain inline data /
2987	err = -EINVAL;
2988	} else if (inline_version == CEPH_INLINE_NONE \|\|
2989	inline_version == `1`) {
2990	err = -ENODATA;
2991	} else {
2992	err = req->r_reply_info.targeti.inline_len;
2993	}
2994	}
2995	ceph_mdsc_put_request(req);
2996	doutc(cl, "result=%d\n", err);
2997	return err;
2998	}
2999
3000	int ceph_do_getvxattr(struct inode inode, const* char name, void* *value,
3001	size_t size)
3002	{
3003	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb: inode->i_sb);
3004	struct ceph_client *cl = fsc->client;
3005	struct ceph_mds_client *mdsc = fsc->mdsc;
3006	struct ceph_mds_request *req;
3007	int mode = USE_AUTH_MDS;
3008	int err;
3009	char *xattr_value;
3010	size_t xattr_value_len;
3011
3012	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETVXATTR, mode);
3013	if (IS_ERR(ptr: req)) {
3014	err = -ENOMEM;
3015	goto out;
3016	}
3017
3018	req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
3019	req->r_path2 = kstrdup(s: name, GFP_NOFS);
3020	if (!req->r_path2) {
3021	err = -ENOMEM;
3022	goto put;
3023	}
3024
3025	ihold(inode);
3026	req->r_inode = inode;
3027	err = ceph_mdsc_do_request(mdsc, NULL, req);
3028	if (err < `0`)
3029	goto put;
3030
3031	xattr_value = req->r_reply_info.xattr_info.xattr_value;
3032	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
3033
3034	doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
3035
3036	err = (int)xattr_value_len;
3037	if (size == `0`)
3038	goto put;
3039
3040	if (xattr_value_len > size) {
3041	err = -ERANGE;
3042	goto put;
3043	}
3044
3045	memcpy(value, xattr_value, xattr_value_len);
3046	put:
3047	ceph_mdsc_put_request(req);
3048	out:
3049	doutc(cl, "result=%d\n", err);
3050	return err;
3051	}
3052
3053
3054	/*
3055	* Check inode permissions. We verify we have a valid value for
3056	* the AUTH cap, then call the generic handler.
3057	*/
3058	int ceph_permission(struct mnt_idmap idmap, struct* inode *inode,
3059	int mask)
3060	{
3061	int err;
3062
3063	if (mask & MAY_NOT_BLOCK)
3064	return -ECHILD;
3065
3066	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, force: false);
3067
3068	if (!err)
3069	err = generic_permission(idmap, inode, mask);
3070	return err;
3071	}
3072
3073	/ Craft a mask of needed caps given a set of requested statx attrs. /
3074	static int statx_to_caps(u32 want, umode_t mode)
3075	{
3076	int mask = `0`;
3077
3078	if (want & (STATX_MODE\|STATX_UID\|STATX_GID\|STATX_CTIME\|STATX_BTIME\|STATX_CHANGE_COOKIE))
3079	mask \|= CEPH_CAP_AUTH_SHARED;
3080
3081	if (want & (STATX_NLINK\|STATX_CTIME\|STATX_CHANGE_COOKIE)) {
3082	/*
3083	* The link count for directories depends on inode->i_subdirs,
3084	* and that is only updated when Fs caps are held.
3085	*/
3086	if (S_ISDIR(mode))
3087	mask \|= CEPH_CAP_FILE_SHARED;
3088	else
3089	mask \|= CEPH_CAP_LINK_SHARED;
3090	}
3091
3092	if (want & (STATX_ATIME\|STATX_MTIME\|STATX_CTIME\|STATX_SIZE\|STATX_BLOCKS\|STATX_CHANGE_COOKIE))
3093	mask \|= CEPH_CAP_FILE_SHARED;
3094
3095	if (want & (STATX_CTIME\|STATX_CHANGE_COOKIE))
3096	mask \|= CEPH_CAP_XATTR_SHARED;
3097
3098	return mask;
3099	}
3100
3101	/*
3102	* Get all the attributes. If we have sufficient caps for the requested attrs,
3103	* then we can avoid talking to the MDS at all.
3104	*/
3105	int ceph_getattr(struct mnt_idmap idmap, const* struct path *path,
3106	struct kstat stat, u32 request_mask, unsigned* int flags)
3107	{
3108	struct inode *inode = d_inode(dentry: path->dentry);
3109	struct super_block *sb = inode->i_sb;
3110	struct ceph_inode_info *ci = ceph_inode(inode);
3111	u32 valid_mask = STATX_BASIC_STATS;
3112	int err = `0`;
3113
3114	if (ceph_inode_is_shutdown(inode))
3115	return -ESTALE;
3116
3117	/ Skip the getattr altogether if we're asked not to sync /
3118	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
3119	err = ceph_do_getattr(inode,
3120	mask: statx_to_caps(want: request_mask, mode: inode->i_mode),
3121	force: flags & AT_STATX_FORCE_SYNC);
3122	if (err)
3123	return err;
3124	}
3125
3126	generic_fillattr(idmap, request_mask, inode, stat);
3127	stat->ino = ceph_present_inode(inode);
3128
3129	/*
3130	* btime on newly-allocated inodes is 0, so if this is still set to
3131	* that, then assume that it's not valid.
3132	*/
3133	if (ci->i_btime.tv_sec \|\| ci->i_btime.tv_nsec) {
3134	stat->btime = ci->i_btime;
3135	valid_mask \|= STATX_BTIME;
3136	}
3137
3138	if (request_mask & STATX_CHANGE_COOKIE) {
3139	stat->change_cookie = inode_peek_iversion_raw(inode);
3140	valid_mask \|= STATX_CHANGE_COOKIE;
3141	}
3142
3143	if (ceph_snap(inode) == CEPH_NOSNAP)
3144	stat->dev = sb->s_dev;
3145	else
3146	stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : `0`;
3147
3148	if (S_ISDIR(inode->i_mode)) {
3149	if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
3150	stat->size = ci->i_rbytes;
3151	} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
3152	struct ceph_inode_info *pci;
3153	struct ceph_snap_realm *realm;
3154	struct inode *parent;
3155
3156	parent = ceph_lookup_inode(sb, ino: ceph_ino(inode));
3157	if (IS_ERR(ptr: parent))
3158	return PTR_ERR(ptr: parent);
3159
3160	pci = ceph_inode(inode: parent);
3161	spin_lock(lock: &pci->i_ceph_lock);
3162	realm = pci->i_snap_realm;
3163	if (realm)
3164	stat->size = realm->num_snaps;
3165	else
3166	stat->size = `0`;
3167	spin_unlock(lock: &pci->i_ceph_lock);
3168	iput(parent);
3169	} else {
3170	stat->size = ci->i_files + ci->i_subdirs;
3171	}
3172	stat->blocks = `0`;
3173	stat->blksize = `65536`;
3174	/*
3175	* Some applications rely on the number of st_nlink
3176	* value on directories to be either 0 (if unlinked)
3177	* or 2 + number of subdirectories.
3178	*/
3179	if (stat->nlink == `1`)
3180	/ '.' + '..' + subdirs /
3181	stat->nlink = `1` + `1` + ci->i_subdirs;
3182	}
3183
3184	stat->attributes \|= STATX_ATTR_CHANGE_MONOTONIC;
3185	if (IS_ENCRYPTED(inode))
3186	stat->attributes \|= STATX_ATTR_ENCRYPTED;
3187	stat->attributes_mask \|= (STATX_ATTR_CHANGE_MONOTONIC \|
3188	STATX_ATTR_ENCRYPTED);
3189
3190	stat->result_mask = request_mask & valid_mask;
3191	return err;
3192	}
3193
3194	void ceph_inode_shutdown(struct inode *inode)
3195	{
3196	struct ceph_inode_info *ci = ceph_inode(inode);
3197	struct rb_node *p;
3198	int iputs = `0`;
3199	bool invalidate = false;
3200
3201	spin_lock(lock: &ci->i_ceph_lock);
3202	ci->i_ceph_flags \|= CEPH_I_SHUTDOWN;
3203	p = rb_first(root: &ci->i_caps);
3204	while (p) {
3205	struct ceph_cap cap = rb_entry(p, struct* ceph_cap, ci_node);
3206
3207	p = rb_next(p);
3208	iputs += ceph_purge_inode_cap(inode, cap, invalidate: &invalidate);
3209	}
3210	spin_unlock(lock: &ci->i_ceph_lock);
3211
3212	if (invalidate)
3213	ceph_queue_invalidate(inode);
3214	while (iputs--)
3215	iput(inode);
3216	}
3217

source code of linux/fs/ceph/inode.c