addr.c source code [linux/fs/ceph/addr.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/backing-dev.h>
5	#include <linux/fs.h>
6	#include <linux/mm.h>
7	#include <linux/swap.h>
8	#include <linux/pagemap.h>
9	#include <linux/slab.h>
10	#include <linux/pagevec.h>
11	#include <linux/task_io_accounting_ops.h>
12	#include <linux/signal.h>
13	#include <linux/iversion.h>
14	#include <linux/ktime.h>
15	#include <linux/netfs.h>
16	#include <trace/events/netfs.h>
17
18	#include "super.h"
19	#include "mds_client.h"
20	#include "cache.h"
21	#include "metric.h"
22	#include "crypto.h"
23	#include <linux/ceph/osd_client.h>
24	#include <linux/ceph/striper.h>
25
/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page.  This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode.  In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages imply there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order; new snaps go to the tail).  The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty.  (Unless a sync write is currently in
 * progress.  In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_.  Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped), at which point we are writing the most
 * recently dirtied pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */
61
/*
 * Writeback congestion thresholds.
 *
 * congestion_kb is expressed in KiB; shifting right by (PAGE_SHIFT - 10)
 * converts it to a number of pages.  The "off" threshold is the "on"
 * threshold minus a quarter of it.
 *
 * The macro argument is parenthesized so that an expression argument
 * (e.g. "a | b") is shifted as a whole.
 */
#define CONGESTION_ON_THRESH(congestion_kb) \
	((congestion_kb) >> (PAGE_SHIFT - 10))
#define CONGESTION_OFF_THRESH(congestion_kb) \
	(CONGESTION_ON_THRESH(congestion_kb) - \
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
66
67	static int ceph_netfs_check_write_begin(struct file file, loff_t pos, unsigned* int len,
68	struct folio *foliop, void* **_fsdata);
69
70	static inline struct ceph_snap_context page_snap_context(struct* page *page)
71	{
72	if (PagePrivate(page))
73	return (void *)page->private;
74	return NULL;
75	}
76
77	/*
78	* Dirty a page. Optimistically adjust accounting, on the assumption
79	* that we won't race with invalidate. If we do, readjust.
80	*/
81	static bool ceph_dirty_folio(struct address_space mapping, struct* folio *folio)
82	{
83	struct inode *inode = mapping->host;
84	struct ceph_client *cl = ceph_inode_to_client(inode);
85	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
86	struct ceph_inode_info *ci;
87	struct ceph_snap_context *snapc;
88
89	if (folio_test_dirty(folio)) {
90	doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
91	ceph_vinop(inode), folio, folio->index);
92	VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
93	return false;
94	}
95
96	atomic64_inc(v: &mdsc->dirty_folios);
97
98	ci = ceph_inode(inode);
99
100	/ dirty the head /
101	spin_lock(lock: &ci->i_ceph_lock);
102	if (__ceph_have_pending_cap_snap(ci)) {
103	struct ceph_cap_snap *capsnap =
104	list_last_entry(&ci->i_cap_snaps,
105	struct ceph_cap_snap,
106	ci_item);
107	snapc = ceph_get_snap_context(sc: capsnap->context);
108	capsnap->dirty_pages++;
109	} else {
110	BUG_ON(!ci->i_head_snapc);
111	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
112	++ci->i_wrbuffer_ref_head;
113	}
114	if (ci->i_wrbuffer_ref == `0`)
115	ihold(inode);
116	++ci->i_wrbuffer_ref;
117	doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
118	"snapc %p seq %lld (%d snaps)\n",
119	ceph_vinop(inode), folio, folio->index,
120	ci->i_wrbuffer_ref-`1`, ci->i_wrbuffer_ref_head-`1`,
121	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
122	snapc, snapc->seq, snapc->num_snaps);
123	spin_unlock(lock: &ci->i_ceph_lock);
124
125	/*
126	* Reference snap context in folio->private. Also set
127	* PagePrivate so that we get invalidate_folio callback.
128	*/
129	VM_WARN_ON_FOLIO(folio->private, folio);
130	folio_attach_private(folio, data: snapc);
131
132	return ceph_fscache_dirty_folio(mapping, folio);
133	}
134
135	/*
136	* If we are truncating the full folio (i.e. offset == 0), adjust the
137	* dirty folio counters appropriately. Only called if there is private
138	* data on the folio.
139	*/
140	static void ceph_invalidate_folio(struct folio *folio, size_t offset,
141	size_t length)
142	{
143	struct inode *inode = folio->mapping->host;
144	struct ceph_client *cl = ceph_inode_to_client(inode);
145	struct ceph_inode_info *ci = ceph_inode(inode);
146	struct ceph_snap_context *snapc;
147
148
149	if (offset != `0` \|\| length != folio_size(folio)) {
150	doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
151	ceph_vinop(inode), folio->index, offset, length);
152	return;
153	}
154
155	WARN_ON(!folio_test_locked(folio));
156	if (folio_test_private(folio)) {
157	doutc(cl, "%llx.%llx idx %lu full dirty page\n",
158	ceph_vinop(inode), folio->index);
159
160	snapc = folio_detach_private(folio);
161	ceph_put_wrbuffer_cap_refs(ci, nr: `1`, snapc);
162	ceph_put_snap_context(sc: snapc);
163	}
164
165	netfs_invalidate_folio(folio, offset, length);
166	}
167
168	static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
169	{
170	struct inode *inode = rreq->inode;
171	struct ceph_inode_info *ci = ceph_inode(inode);
172	struct ceph_file_layout *lo = &ci->i_layout;
173	unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
174	loff_t end = rreq->start + rreq->len, new_end;
175	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
176	unsigned long max_len;
177	u32 blockoff;
178
179	if (priv) {
180	/ Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM /
181	if (priv->file_ra_disabled)
182	max_pages = `0`;
183	else
184	max_pages = priv->file_ra_pages;
185
186	}
187
188	/ Readahead is disabled /
189	if (!max_pages)
190	return;
191
192	max_len = max_pages << PAGE_SHIFT;
193
194	/*
195	* Try to expand the length forward by rounding up it to the next
196	* block, but do not exceed the file size, unless the original
197	* request already exceeds it.
198	*/
199	new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
200	if (new_end > end && new_end <= rreq->start + max_len)
201	rreq->len = new_end - rreq->start;
202
203	/ Try to expand the start downward /
204	div_u64_rem(dividend: rreq->start, divisor: lo->stripe_unit, remainder: &blockoff);
205	if (rreq->len + blockoff <= max_len) {
206	rreq->start -= blockoff;
207	rreq->len += blockoff;
208	}
209	}
210
211	static void finish_netfs_read(struct ceph_osd_request *req)
212	{
213	struct inode *inode = req->r_inode;
214	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
215	struct ceph_client *cl = fsc->client;
216	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(osd_req: req, which: `0`);
217	struct netfs_io_subrequest *subreq = req->r_priv;
218	struct ceph_osd_req_op *op = &req->r_ops[`0`];
219	int err = req->r_result;
220	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
221
222	ceph_update_read_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
223	r_end: req->r_end_latency, size: osd_data->length, rc: err);
224
225	doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
226	subreq->len, i_size_read(req->r_inode));
227
228	/ no object means success but no data /
229	if (err == -ENOENT) {
230	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
231	__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
232	err = `0`;
233	} else if (err == -EBLOCKLISTED) {
234	fsc->blocklisted = true;
235	}
236
237	if (err >= `0`) {
238	if (sparse && err > `0`)
239	err = ceph_sparse_ext_map_end(op);
240	if (err < subreq->len &&
241	subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
242	subreq->rreq->origin != NETFS_DIO_READ)
243	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
244	if (IS_ENCRYPTED(inode) && err > `0`) {
245	err = ceph_fscrypt_decrypt_extents(inode,
246	page: osd_data->pages, off: subreq->start,
247	map: op->extent.sparse_ext,
248	ext_cnt: op->extent.sparse_ext_cnt);
249	if (err > subreq->len)
250	err = subreq->len;
251	}
252	if (err > `0`)
253	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
254	}
255
256	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
257	ceph_put_page_vector(pages: osd_data->pages,
258	num_pages: calc_pages_for(off: osd_data->alignment,
259	len: osd_data->length), dirty: false);
260	}
261	if (err > `0`) {
262	subreq->transferred = err;
263	err = `0`;
264	}
265	subreq->error = err;
266	trace_netfs_sreq(sreq: subreq, what: netfs_sreq_trace_io_progress);
267	netfs_read_subreq_terminated(subreq);
268	iput(req->r_inode);
269	ceph_dec_osd_stopping_blocker(mdsc: fsc->mdsc);
270	}
271
272	static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
273	{
274	struct netfs_io_request *rreq = subreq->rreq;
275	struct inode *inode = rreq->inode;
276	struct ceph_mds_reply_info_parsed *rinfo;
277	struct ceph_mds_reply_info_in *iinfo;
278	struct ceph_mds_request *req;
279	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
280	struct ceph_inode_info *ci = ceph_inode(inode);
281	ssize_t err = `0`;
282	size_t len;
283	int mode;
284
285	if (rreq->origin != NETFS_UNBUFFERED_READ &&
286	rreq->origin != NETFS_DIO_READ)
287	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
288	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
289
290	if (subreq->start >= inode->i_size)
291	goto out;
292
293	/ We need to fetch the inline data. /
294	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
295	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETATTR, mode);
296	if (IS_ERR(ptr: req)) {
297	err = PTR_ERR(ptr: req);
298	goto out;
299	}
300	req->r_ino1 = ci->i_vino;
301	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
302	req->r_num_caps = `2`;
303
304	trace_netfs_sreq(sreq: subreq, what: netfs_sreq_trace_submit);
305	err = ceph_mdsc_do_request(mdsc, NULL, req);
306	if (err < `0`)
307	goto out;
308
309	rinfo = &req->r_reply_info;
310	iinfo = &rinfo->targeti;
311	if (iinfo->inline_version == CEPH_INLINE_NONE) {
312	/ The data got uninlined /
313	ceph_mdsc_put_request(req);
314	return false;
315	}
316
317	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
318	err = copy_to_iter(addr: iinfo->inline_data + subreq->start, bytes: len, i: &subreq->io_iter);
319	if (err == `0`) {
320	err = -EFAULT;
321	} else {
322	subreq->transferred += err;
323	err = `0`;
324	}
325
326	ceph_mdsc_put_request(req);
327	out:
328	subreq->error = err;
329	trace_netfs_sreq(sreq: subreq, what: netfs_sreq_trace_io_progress);
330	netfs_read_subreq_terminated(subreq);
331	return true;
332	}
333
334	static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
335	{
336	struct netfs_io_request *rreq = subreq->rreq;
337	struct inode *inode = rreq->inode;
338	struct ceph_inode_info *ci = ceph_inode(inode);
339	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
340	u64 objno, objoff;
341	u32 xlen;
342
343	/ Truncate the extent at the end of the current block /
344	ceph_calc_file_object_mapping(l: &ci->i_layout, off: subreq->start, len: subreq->len,
345	objno: &objno, objoff: &objoff, xlen: &xlen);
346	rreq->io_streams[`0`].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
347	return `0`;
348	}
349
350	static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
351	{
352	struct netfs_io_request *rreq = subreq->rreq;
353	struct inode *inode = rreq->inode;
354	struct ceph_inode_info *ci = ceph_inode(inode);
355	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
356	struct ceph_client *cl = fsc->client;
357	struct ceph_osd_request *req = NULL;
358	struct ceph_vino vino = ceph_vino(inode);
359	int err;
360	u64 len;
361	bool sparse = IS_ENCRYPTED(inode) \|\| ceph_test_mount_opt(fsc, SPARSEREAD);
362	u64 off = subreq->start;
363	int extent_cnt;
364
365	if (ceph_inode_is_shutdown(inode)) {
366	err = -EIO;
367	goto out;
368	}
369
370	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
371	return;
372
373	// TODO: This rounding here is slightly dodgy. It should* work, for*
374	// now, as the cache only deals in blocks that are a multiple of
375	// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
376	// happen is for the fscrypt driving to be moved into netfslib and the
377	// data in the cache also to be stored encrypted.
378	len = subreq->len;
379	ceph_fscrypt_adjust_off_and_len(inode, off: &off, len: &len);
380
381	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout, vino,
382	offset: off, len: &len, which: `0`, num_ops: `1`, opcode: sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
383	flags: CEPH_OSD_FLAG_READ, NULL, truncate_seq: ci->i_truncate_seq,
384	truncate_size: ci->i_truncate_size, use_mempool: false);
385	if (IS_ERR(ptr: req)) {
386	err = PTR_ERR(ptr: req);
387	req = NULL;
388	goto out;
389	}
390
391	if (sparse) {
392	extent_cnt = __ceph_sparse_read_ext_count(inode, len);
393	err = ceph_alloc_sparse_ext_map(op: &req->r_ops[`0`], cnt: extent_cnt);
394	if (err)
395	goto out;
396	}
397
398	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
399	ceph_vinop(inode), subreq->start, subreq->len, len);
400
401	/*
402	* FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
403	* encrypted inodes. We'd need infrastructure that handles an iov_iter
404	* instead of page arrays, and we don't have that as of yet. Once the
405	* dust settles on the write helpers and encrypt/decrypt routines for
406	* netfs, we should be able to rework this.
407	*/
408	if (IS_ENCRYPTED(inode)) {
409	struct page **pages;
410	size_t page_off;
411
412	/*
413	* FIXME: io_iter.count needs to be corrected to aligned
414	* length. Otherwise, iov_iter_get_pages_alloc2() operates
415	* with the initial unaligned length value. As a result,
416	* ceph_msg_data_cursor_init() triggers BUG_ON() in the case
417	* if msg->sparse_read_total > msg->data_length.
418	*/
419	subreq->io_iter.count = len;
420
421	err = iov_iter_get_pages_alloc2(i: &subreq->io_iter, pages: &pages, maxsize: len, start: &page_off);
422	if (err < `0`) {
423	doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
424	ceph_vinop(inode), err);
425	goto out;
426	}
427
428	/ should always give us a page-aligned read /
429	WARN_ON_ONCE(page_off);
430	len = err;
431	err = `0`;
432
433	osd_req_op_extent_osd_data_pages(req, which: `0`, pages, length: len, alignment: `0`, pages_from_pool: false,
434	own_pages: false);
435	} else {
436	osd_req_op_extent_osd_iter(osd_req: req, which: `0`, iter: &subreq->io_iter);
437	}
438	if (!ceph_inc_osd_stopping_blocker(mdsc: fsc->mdsc)) {
439	err = -EIO;
440	goto out;
441	}
442	req->r_callback = finish_netfs_read;
443	req->r_priv = subreq;
444	req->r_inode = inode;
445	ihold(inode);
446
447	trace_netfs_sreq(sreq: subreq, what: netfs_sreq_trace_submit);
448	ceph_osdc_start_request(osdc: req->r_osdc, req);
449	out:
450	ceph_osdc_put_request(req);
451	if (err) {
452	subreq->error = err;
453	netfs_read_subreq_terminated(subreq);
454	}
455	doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
456	}
457
458	static int ceph_init_request(struct netfs_io_request rreq, struct* file *file)
459	{
460	struct inode *inode = rreq->inode;
461	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
462	struct ceph_client *cl = ceph_inode_to_client(inode);
463	int got = `0`, want = CEPH_CAP_FILE_CACHE;
464	struct ceph_netfs_request_data *priv;
465	int ret = `0`;
466
467	/ [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. /
468	__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
469
470	if (rreq->origin != NETFS_READAHEAD)
471	return `0`;
472
473	priv = kzalloc(sizeof(*priv), GFP_NOFS);
474	if (!priv)
475	return -ENOMEM;
476
477	if (file) {
478	struct ceph_rw_context *rw_ctx;
479	struct ceph_file_info *fi = file->private_data;
480
481	priv->file_ra_pages = file->f_ra.ra_pages;
482	priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;
483
484	rw_ctx = ceph_find_rw_context(cf: fi);
485	if (rw_ctx) {
486	rreq->netfs_priv = priv;
487	return `0`;
488	}
489	}
490
491	/*
492	* readahead callers do not necessarily hold Fcb caps
493	* (e.g. fadvise, madvise).
494	*/
495	ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, nonblock: true, got: &got);
496	if (ret < `0`) {
497	doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
498	goto out;
499	}
500
501	if (!(got & want)) {
502	doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
503	ret = -EACCES;
504	goto out;
505	}
506	if (ret == `0`) {
507	ret = -EACCES;
508	goto out;
509	}
510
511	priv->caps = got;
512	rreq->netfs_priv = priv;
513	rreq->io_streams[`0`].sreq_max_len = fsc->mount_options->rsize;
514
515	out:
516	if (ret < `0`) {
517	if (got)
518	ceph_put_cap_refs(ci: ceph_inode(inode), had: got);
519	kfree(objp: priv);
520	}
521
522	return ret;
523	}
524
525	static void ceph_netfs_free_request(struct netfs_io_request *rreq)
526	{
527	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
528
529	if (!priv)
530	return;
531
532	if (priv->caps)
533	ceph_put_cap_refs(ci: ceph_inode(inode: rreq->inode), had: priv->caps);
534	kfree(objp: priv);
535	rreq->netfs_priv = NULL;
536	}
537
538	const struct netfs_request_ops ceph_netfs_ops = {
539	.init_request = ceph_init_request,
540	.free_request = ceph_netfs_free_request,
541	.prepare_read = ceph_netfs_prepare_read,
542	.issue_read = ceph_netfs_issue_read,
543	.expand_readahead = ceph_netfs_expand_readahead,
544	.check_write_begin = ceph_netfs_check_write_begin,
545	};
546
547	#ifdef CONFIG_CEPH_FSCACHE
548	static void ceph_set_page_fscache(struct page *page)
549	{
550	folio_start_private_2(page_folio(page)); / [DEPRECATED] /
551	}
552
553	static void ceph_fscache_write_terminated(void *priv, ssize_t error)
554	{
555	struct inode *inode = priv;
556
557	if (IS_ERR_VALUE(error) && error != -ENOBUFS)
558	ceph_fscache_invalidate(inode, dio_write: false);
559	}
560
561	static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
562	{
563	struct ceph_inode_info *ci = ceph_inode(inode);
564	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
565
566	fscache_write_to_cache(cookie, mapping: inode->i_mapping, start: off, len, i_size: i_size_read(inode),
567	term_func: ceph_fscache_write_terminated, term_func_priv: inode, using_pgpriv2: true, caching);
568	}
569	#else
570	static inline void ceph_set_page_fscache(struct page *page)
571	{
572	}
573
574	static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
575	{
576	}
577	#endif /* CONFIG_CEPH_FSCACHE */
578
579	struct ceph_writeback_ctl
580	{
581	loff_t i_size;
582	u64 truncate_size;
583	u32 truncate_seq;
584	bool size_stable;
585
586	bool head_snapc;
587	struct ceph_snap_context *snapc;
588	struct ceph_snap_context *last_snapc;
589
590	bool done;
591	bool should_loop;
592	bool range_whole;
593	pgoff_t start_index;
594	pgoff_t index;
595	pgoff_t end;
596	xa_mark_t tag;
597
598	pgoff_t strip_unit_end;
599	unsigned int wsize;
600	unsigned int nr_folios;
601	unsigned int max_pages;
602	unsigned int locked_pages;
603
604	int op_idx;
605	int num_ops;
606	u64 offset;
607	u64 len;
608
609	struct folio_batch fbatch;
610	unsigned int processed_in_fbatch;
611
612	bool from_pool;
613	struct page **pages;
614	struct page **data_pages;
615	};
616
617	/*
618	* Get ref for the oldest snapc for an inode with dirty data... that is, the
619	* only snap context we are allowed to write back.
620	*/
621	static struct ceph_snap_context *
622	get_oldest_context(struct inode inode, struct* ceph_writeback_ctl *ctl,
623	struct ceph_snap_context *page_snapc)
624	{
625	struct ceph_inode_info *ci = ceph_inode(inode);
626	struct ceph_client *cl = ceph_inode_to_client(inode);
627	struct ceph_snap_context *snapc = NULL;
628	struct ceph_cap_snap *capsnap = NULL;
629
630	spin_lock(lock: &ci->i_ceph_lock);
631	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
632	doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
633	capsnap, capsnap->context, capsnap->dirty_pages);
634	if (!capsnap->dirty_pages)
635	continue;
636
637	/ get i_size, truncate_{seq,size} for page_snapc? /
638	if (snapc && capsnap->context != page_snapc)
639	continue;
640
641	if (ctl) {
642	if (capsnap->writing) {
643	ctl->i_size = i_size_read(inode);
644	ctl->size_stable = false;
645	} else {
646	ctl->i_size = capsnap->size;
647	ctl->size_stable = true;
648	}
649	ctl->truncate_size = capsnap->truncate_size;
650	ctl->truncate_seq = capsnap->truncate_seq;
651	ctl->head_snapc = false;
652	}
653
654	if (snapc)
655	break;
656
657	snapc = ceph_get_snap_context(sc: capsnap->context);
658	if (!page_snapc \|\|
659	page_snapc == snapc \|\|
660	page_snapc->seq > snapc->seq)
661	break;
662	}
663	if (!snapc && ci->i_wrbuffer_ref_head) {
664	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
665	doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
666	ci->i_wrbuffer_ref_head);
667	if (ctl) {
668	ctl->i_size = i_size_read(inode);
669	ctl->truncate_size = ci->i_truncate_size;
670	ctl->truncate_seq = ci->i_truncate_seq;
671	ctl->size_stable = false;
672	ctl->head_snapc = true;
673	}
674	}
675	spin_unlock(lock: &ci->i_ceph_lock);
676	return snapc;
677	}
678
679	static u64 get_writepages_data_length(struct inode *inode,
680	struct page *page, u64 start)
681	{
682	struct ceph_inode_info *ci = ceph_inode(inode);
683	struct ceph_snap_context *snapc;
684	struct ceph_cap_snap *capsnap = NULL;
685	u64 end = i_size_read(inode);
686	u64 ret;
687
688	snapc = page_snap_context(page: ceph_fscrypt_pagecache_page(page));
689	if (snapc != ci->i_head_snapc) {
690	bool found = false;
691	spin_lock(lock: &ci->i_ceph_lock);
692	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
693	if (capsnap->context == snapc) {
694	if (!capsnap->writing)
695	end = capsnap->size;
696	found = true;
697	break;
698	}
699	}
700	spin_unlock(lock: &ci->i_ceph_lock);
701	WARN_ON(!found);
702	}
703	if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
704	end = ceph_fscrypt_page_offset(page) + thp_size(page);
705	ret = end > start ? end - start : `0`;
706	if (ret && fscrypt_is_bounce_page(page))
707	ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
708	return ret;
709	}
710
711	/*
712	* Write a folio, but leave it locked.
713	*
714	* If we get a write error, mark the mapping for error, but still adjust the
715	* dirty page accounting (i.e., folio is no longer dirty).
716	*/
717	static int write_folio_nounlock(struct folio *folio,
718	struct writeback_control *wbc)
719	{
720	struct page *page = &folio->page;
721	struct inode *inode = folio->mapping->host;
722	struct ceph_inode_info *ci = ceph_inode(inode);
723	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
724	struct ceph_client *cl = fsc->client;
725	struct ceph_snap_context snapc, oldest;
726	loff_t page_off = folio_pos(folio);
727	int err;
728	loff_t len = folio_size(folio);
729	loff_t wlen;
730	struct ceph_writeback_ctl ceph_wbc;
731	struct ceph_osd_client *osdc = &fsc->client->osdc;
732	struct ceph_osd_request *req;
733	bool caching = ceph_is_cache_enabled(inode);
734	struct page *bounce_page = NULL;
735
736	doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
737	folio->index);
738
739	if (ceph_inode_is_shutdown(inode))
740	return -EIO;
741
742	/ verify this is a writeable snap context /
743	snapc = page_snap_context(page: &folio->page);
744	if (!snapc) {
745	doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
746	folio);
747	return `0`;
748	}
749	oldest = get_oldest_context(inode, ctl: &ceph_wbc, page_snapc: snapc);
750	if (snapc->seq > oldest->seq) {
751	doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
752	ceph_vinop(inode), folio, snapc);
753	/ we should only noop if called by kswapd /
754	WARN_ON(!(current->flags & PF_MEMALLOC));
755	ceph_put_snap_context(sc: oldest);
756	folio_redirty_for_writepage(wbc, folio);
757	return `0`;
758	}
759	ceph_put_snap_context(sc: oldest);
760
761	/ is this a partial page at end of file? /
762	if (page_off >= ceph_wbc.i_size) {
763	doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
764	ceph_vinop(inode), folio->index, ceph_wbc.i_size);
765	folio_invalidate(folio, offset: `0`, length: folio_size(folio));
766	return `0`;
767	}
768
769	if (ceph_wbc.i_size < page_off + len)
770	len = ceph_wbc.i_size - page_off;
771
772	wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
773	doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
774	ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
775	snapc->seq);
776
777	if (atomic_long_inc_return(v: &fsc->writeback_count) >
778	CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
779	fsc->write_congested = true;
780
781	req = ceph_osdc_new_request(osdc, layout: &ci->i_layout, vino: ceph_vino(inode),
782	offset: page_off, len: &wlen, which: `0`, num_ops: `1`, opcode: CEPH_OSD_OP_WRITE,
783	flags: CEPH_OSD_FLAG_WRITE, snapc,
784	truncate_seq: ceph_wbc.truncate_seq,
785	truncate_size: ceph_wbc.truncate_size, use_mempool: true);
786	if (IS_ERR(ptr: req)) {
787	folio_redirty_for_writepage(wbc, folio);
788	return PTR_ERR(ptr: req);
789	}
790
791	if (wlen < len)
792	len = wlen;
793
794	folio_start_writeback(folio);
795	if (caching)
796	ceph_set_page_fscache(page: &folio->page);
797	ceph_fscache_write_to_cache(inode, off: page_off, len, caching);
798
799	if (IS_ENCRYPTED(inode)) {
800	bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
801	CEPH_FSCRYPT_BLOCK_SIZE, offs: `0`,
802	GFP_NOFS);
803	if (IS_ERR(ptr: bounce_page)) {
804	folio_redirty_for_writepage(wbc, folio);
805	folio_end_writeback(folio);
806	ceph_osdc_put_request(req);
807	return PTR_ERR(ptr: bounce_page);
808	}
809	}
810
811	/ it may be a short write due to an object boundary /
812	WARN_ON_ONCE(len > folio_size(folio));
813	osd_req_op_extent_osd_data_pages(req, which: `0`,
814	pages: bounce_page ? &bounce_page : &page, length: wlen, alignment: `0`,
815	pages_from_pool: false, own_pages: false);
816	doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
817	ceph_vinop(inode), page_off, len, wlen,
818	IS_ENCRYPTED(inode) ? "" : "not ");
819
820	req->r_mtime = inode_get_mtime(inode);
821	ceph_osdc_start_request(osdc, req);
822	err = ceph_osdc_wait_request(osdc, req);
823
824	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
825	r_end: req->r_end_latency, size: len, rc: err);
826	fscrypt_free_bounce_page(bounce_page);
827	ceph_osdc_put_request(req);
828	if (err == `0`)
829	err = len;
830
831	if (err < `0`) {
832	struct writeback_control tmp_wbc;
833	if (!wbc)
834	wbc = &tmp_wbc;
835	if (err == -ERESTARTSYS) {
836	/ killed by SIGKILL /
837	doutc(cl, "%llx.%llx interrupted page %p\n",
838	ceph_vinop(inode), folio);
839	folio_redirty_for_writepage(wbc, folio);
840	folio_end_writeback(folio);
841	return err;
842	}
843	if (err == -EBLOCKLISTED)
844	fsc->blocklisted = true;
845	doutc(cl, "%llx.%llx setting mapping error %d %p\n",
846	ceph_vinop(inode), err, folio);
847	mapping_set_error(mapping: &inode->i_data, error: err);
848	wbc->pages_skipped++;
849	} else {
850	doutc(cl, "%llx.%llx cleaned page %p\n",
851	ceph_vinop(inode), folio);
852	err = `0`; / vfs expects us to return 0 /
853	}
854	oldest = folio_detach_private(folio);
855	WARN_ON_ONCE(oldest != snapc);
856	folio_end_writeback(folio);
857	ceph_put_wrbuffer_cap_refs(ci, nr: `1`, snapc);
858	ceph_put_snap_context(sc: snapc); / page's reference /
859
860	if (atomic_long_dec_return(v: &fsc->writeback_count) <
861	CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
862	fsc->write_congested = false;
863
864	return err;
865	}
866
867	/*
868	* async writeback completion handler.
869	*
870	* If we get an error, set the mapping error bit, but not the individual
871	* page error bits.
872	*/
873	static void writepages_finish(struct ceph_osd_request *req)
874	{
875	struct inode *inode = req->r_inode;
876	struct ceph_inode_info *ci = ceph_inode(inode);
877	struct ceph_client *cl = ceph_inode_to_client(inode);
878	struct ceph_osd_data *osd_data;
879	struct page *page;
880	int num_pages, total_pages = `0`;
881	int i, j;
882	int rc = req->r_result;
883	struct ceph_snap_context *snapc = req->r_snapc;
884	struct address_space *mapping = inode->i_mapping;
885	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
886	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
887	unsigned int len = `0`;
888	bool remove_page;
889
890	doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
891	if (rc < `0`) {
892	mapping_set_error(mapping, error: rc);
893	ceph_set_error_write(ci);
894	if (rc == -EBLOCKLISTED)
895	fsc->blocklisted = true;
896	} else {
897	ceph_clear_error_write(ci);
898	}
899
900	/*
901	* We lost the cache cap, need to truncate the page before
902	* it is unlocked, otherwise we'd truncate it later in the
903	* page truncation thread, possibly losing some data that
904	* raced its way in
905	*/
906	remove_page = !(ceph_caps_issued(ci) &
907	(CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO));
908
909	/ clean all pages /
910	for (i = `0`; i < req->r_num_ops; i++) {
911	if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
912	pr_warn_client(cl,
913	"%llx.%llx incorrect op %d req %p index %d tid %llu\n",
914	ceph_vinop(inode), req->r_ops[i].op, req, i,
915	req->r_tid);
916	break;
917	}
918
919	osd_data = osd_req_op_extent_osd_data(osd_req: req, which: i);
920	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
921	len += osd_data->length;
922	num_pages = calc_pages_for(off: (u64)osd_data->alignment,
923	len: (u64)osd_data->length);
924	total_pages += num_pages;
925	for (j = `0`; j < num_pages; j++) {
926	page = osd_data->pages[j];
927	if (fscrypt_is_bounce_page(page)) {
928	page = fscrypt_pagecache_page(bounce_page: page);
929	fscrypt_free_bounce_page(bounce_page: osd_data->pages[j]);
930	osd_data->pages[j] = page;
931	}
932	BUG_ON(!page);
933	WARN_ON(!PageUptodate(page));
934
935	if (atomic_long_dec_return(v: &fsc->writeback_count) <
936	CONGESTION_OFF_THRESH(
937	fsc->mount_options->congestion_kb))
938	fsc->write_congested = false;
939
940	ceph_put_snap_context(sc: detach_page_private(page));
941	end_page_writeback(page);
942
943	if (atomic64_dec_return(v: &mdsc->dirty_folios) <= `0`) {
944	wake_up_all(&mdsc->flush_end_wq);
945	WARN_ON(atomic64_read(&mdsc->dirty_folios) < `0`);
946	}
947
948	doutc(cl, "unlocking %p\n", page);
949
950	if (remove_page)
951	generic_error_remove_folio(mapping: inode->i_mapping,
952	page_folio(page));
953
954	unlock_page(page);
955	}
956	doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
957	ceph_vinop(inode), osd_data->length,
958	rc >= `0` ? num_pages : `0`);
959
960	release_pages(osd_data->pages, nr: num_pages);
961	}
962
963	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
964	r_end: req->r_end_latency, size: len, rc);
965
966	ceph_put_wrbuffer_cap_refs(ci, nr: total_pages, snapc);
967
968	osd_data = osd_req_op_extent_osd_data(osd_req: req, which: `0`);
969	if (osd_data->pages_from_pool)
970	mempool_free(element: osd_data->pages, pool: ceph_wb_pagevec_pool);
971	else
972	kfree(objp: osd_data->pages);
973	ceph_osdc_put_request(req);
974	ceph_dec_osd_stopping_blocker(mdsc: fsc->mdsc);
975	}
976
977	static inline
978	bool is_forced_umount(struct address_space *mapping)
979	{
980	struct inode *inode = mapping->host;
981	struct ceph_inode_info *ci = ceph_inode(inode);
982	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
983	struct ceph_client *cl = fsc->client;
984
985	if (ceph_inode_is_shutdown(inode)) {
986	if (ci->i_wrbuffer_ref > `0`) {
987	pr_warn_ratelimited_client(cl,
988	"%llx.%llx %lld forced umount\n",
989	ceph_vinop(inode), ceph_ino(inode));
990	}
991	mapping_set_error(mapping, error: -EIO);
992	return true;
993	}
994
995	return false;
996	}
997
998	static inline
999	unsigned int ceph_define_write_size(struct address_space *mapping)
1000	{
1001	struct inode *inode = mapping->host;
1002	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1003	unsigned int wsize = i_blocksize(node: inode);
1004
1005	if (fsc->mount_options->wsize < wsize)
1006	wsize = fsc->mount_options->wsize;
1007
1008	return wsize;
1009	}
1010
1011	static inline
1012	void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
1013	{
1014	folio_batch_init(fbatch: &ceph_wbc->fbatch);
1015	ceph_wbc->processed_in_fbatch = `0`;
1016	}
1017
1018	static inline
1019	void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
1020	{
1021	folio_batch_release(fbatch: &ceph_wbc->fbatch);
1022	ceph_folio_batch_init(ceph_wbc);
1023	}
1024
1025	static inline
1026	void ceph_init_writeback_ctl(struct address_space *mapping,
1027	struct writeback_control *wbc,
1028	struct ceph_writeback_ctl *ceph_wbc)
1029	{
1030	ceph_wbc->snapc = NULL;
1031	ceph_wbc->last_snapc = NULL;
1032
1033	ceph_wbc->strip_unit_end = `0`;
1034	ceph_wbc->wsize = ceph_define_write_size(mapping);
1035
1036	ceph_wbc->nr_folios = `0`;
1037	ceph_wbc->max_pages = `0`;
1038	ceph_wbc->locked_pages = `0`;
1039
1040	ceph_wbc->done = false;
1041	ceph_wbc->should_loop = false;
1042	ceph_wbc->range_whole = false;
1043
1044	ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : `0`;
1045	ceph_wbc->index = ceph_wbc->start_index;
1046	ceph_wbc->end = -`1`;
1047
1048	ceph_wbc->tag = wbc_to_tag(wbc);
1049
1050	ceph_wbc->op_idx = -`1`;
1051	ceph_wbc->num_ops = `0`;
1052	ceph_wbc->offset = `0`;
1053	ceph_wbc->len = `0`;
1054	ceph_wbc->from_pool = false;
1055
1056	ceph_folio_batch_init(ceph_wbc);
1057
1058	ceph_wbc->pages = NULL;
1059	ceph_wbc->data_pages = NULL;
1060	}
1061
1062	static inline
1063	int ceph_define_writeback_range(struct address_space *mapping,
1064	struct writeback_control *wbc,
1065	struct ceph_writeback_ctl *ceph_wbc)
1066	{
1067	struct inode *inode = mapping->host;
1068	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1069	struct ceph_client *cl = fsc->client;
1070
1071	/ find oldest snap context with dirty data /
1072	ceph_wbc->snapc = get_oldest_context(inode, ctl: ceph_wbc, NULL);
1073	if (!ceph_wbc->snapc) {
1074	/ hmm, why does writepages get called when there*
1075	is no dirty data? /*
1076	doutc(cl, " no snap context with dirty data?\n");
1077	return -ENODATA;
1078	}
1079
1080	doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
1081	ceph_wbc->snapc, ceph_wbc->snapc->seq,
1082	ceph_wbc->snapc->num_snaps);
1083
1084	ceph_wbc->should_loop = false;
1085
1086	if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
1087	/ where to start/end? /
1088	if (wbc->range_cyclic) {
1089	ceph_wbc->index = ceph_wbc->start_index;
1090	ceph_wbc->end = -`1`;
1091	if (ceph_wbc->index > `0`)
1092	ceph_wbc->should_loop = true;
1093	doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
1094	} else {
1095	ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
1096	ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
1097	if (wbc->range_start == `0` && wbc->range_end == LLONG_MAX)
1098	ceph_wbc->range_whole = true;
1099	doutc(cl, " not cyclic, %lu to %lu\n",
1100	ceph_wbc->index, ceph_wbc->end);
1101	}
1102	} else if (!ceph_wbc->head_snapc) {
1103	/ Do not respect wbc->range_{start,end}. Dirty pages*
1104	* in that range can be associated with newer snapc.
1105	* They are not writeable until we write all dirty pages
1106	* associated with 'snapc' get written */
1107	if (ceph_wbc->index > `0`)
1108	ceph_wbc->should_loop = true;
1109	doutc(cl, " non-head snapc, range whole\n");
1110	}
1111
1112	ceph_put_snap_context(sc: ceph_wbc->last_snapc);
1113	ceph_wbc->last_snapc = ceph_wbc->snapc;
1114
1115	return `0`;
1116	}
1117
1118	static inline
1119	bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
1120	{
1121	return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
1122	}
1123
1124	static inline
1125	bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
1126	unsigned index)
1127	{
1128	return index < ceph_wbc->nr_folios &&
1129	ceph_wbc->locked_pages < ceph_wbc->max_pages;
1130	}
1131
1132	static
1133	int ceph_check_page_before_write(struct address_space *mapping,
1134	struct writeback_control *wbc,
1135	struct ceph_writeback_ctl *ceph_wbc,
1136	struct folio *folio)
1137	{
1138	struct inode *inode = mapping->host;
1139	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1140	struct ceph_client *cl = fsc->client;
1141	struct ceph_snap_context *pgsnapc;
1142
1143	/ only dirty folios, or our accounting breaks /
1144	if (unlikely(!folio_test_dirty(folio) \|\| folio->mapping != mapping)) {
1145	doutc(cl, "!dirty or !mapping %p\n", folio);
1146	return -ENODATA;
1147	}
1148
1149	/ only if matching snap context /
1150	pgsnapc = page_snap_context(page: &folio->page);
1151	if (pgsnapc != ceph_wbc->snapc) {
1152	doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
1153	pgsnapc, pgsnapc->seq,
1154	ceph_wbc->snapc, ceph_wbc->snapc->seq);
1155
1156	if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
1157	wbc->sync_mode != WB_SYNC_NONE)
1158	ceph_wbc->should_loop = true;
1159
1160	return -ENODATA;
1161	}
1162
1163	if (folio_pos(folio) >= ceph_wbc->i_size) {
1164	doutc(cl, "folio at %lu beyond eof %llu\n",
1165	folio->index, ceph_wbc->i_size);
1166
1167	if ((ceph_wbc->size_stable \|\|
1168	folio_pos(folio) >= i_size_read(inode)) &&
1169	folio_clear_dirty_for_io(folio))
1170	folio_invalidate(folio, offset: `0`, length: folio_size(folio));
1171
1172	return -ENODATA;
1173	}
1174
1175	if (ceph_wbc->strip_unit_end &&
1176	(folio->index > ceph_wbc->strip_unit_end)) {
1177	doutc(cl, "end of strip unit %p\n", folio);
1178	return -E2BIG;
1179	}
1180
1181	return `0`;
1182	}
1183
1184	static inline
1185	void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
1186	unsigned int max_pages)
1187	{
1188	ceph_wbc->pages = kmalloc_array(max_pages,
1189	sizeof(*ceph_wbc->pages),
1190	GFP_NOFS);
1191	if (!ceph_wbc->pages) {
1192	ceph_wbc->from_pool = true;
1193	ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
1194	BUG_ON(!ceph_wbc->pages);
1195	}
1196	}
1197
1198	static inline
1199	void ceph_allocate_page_array(struct address_space *mapping,
1200	struct ceph_writeback_ctl *ceph_wbc,
1201	struct folio *folio)
1202	{
1203	struct inode *inode = mapping->host;
1204	struct ceph_inode_info *ci = ceph_inode(inode);
1205	u64 objnum;
1206	u64 objoff;
1207	u32 xlen;
1208
1209	/ prepare async write request /
1210	ceph_wbc->offset = (u64)folio_pos(folio);
1211	ceph_calc_file_object_mapping(l: &ci->i_layout,
1212	off: ceph_wbc->offset, len: ceph_wbc->wsize,
1213	objno: &objnum, objoff: &objoff, xlen: &xlen);
1214
1215	ceph_wbc->num_ops = `1`;
1216	ceph_wbc->strip_unit_end = folio->index + ((xlen - `1`) >> PAGE_SHIFT);
1217
1218	BUG_ON(ceph_wbc->pages);
1219	ceph_wbc->max_pages = calc_pages_for(off: `0`, len: (u64)xlen);
1220	__ceph_allocate_page_array(ceph_wbc, max_pages: ceph_wbc->max_pages);
1221
1222	ceph_wbc->len = `0`;
1223	}
1224
1225	static inline
1226	bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
1227	const struct folio *folio)
1228	{
1229	return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
1230	}
1231
1232	static inline
1233	bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
1234	{
1235	return ceph_wbc->num_ops >=
1236	(ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
1237	}
1238
1239	static inline
1240	bool is_write_congestion_happened(struct ceph_fs_client *fsc)
1241	{
1242	return atomic_long_inc_return(v: &fsc->writeback_count) >
1243	CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
1244	}
1245
1246	static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
1247	struct writeback_control *wbc,
1248	struct ceph_writeback_ctl ceph_wbc, struct* folio *folio)
1249	{
1250	struct inode *inode = mapping->host;
1251	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1252	struct ceph_client *cl = fsc->client;
1253	struct page **pages = ceph_wbc->pages;
1254	unsigned int index = ceph_wbc->locked_pages;
1255	gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
1256
1257	if (IS_ENCRYPTED(inode)) {
1258	pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
1259	PAGE_SIZE,
1260	offs: `0`,
1261	gfp_flags);
1262	if (IS_ERR(ptr: pages[index])) {
1263	int err = PTR_ERR(ptr: pages[index]);
1264
1265	if (err == -EINVAL) {
1266	pr_err_client(cl, "inode->i_blkbits=%hhu\n",
1267	inode->i_blkbits);
1268	}
1269
1270	/ better not fail on first page! /
1271	BUG_ON(ceph_wbc->locked_pages == `0`);
1272
1273	pages[index] = NULL;
1274	return err;
1275	}
1276	} else {
1277	pages[index] = &folio->page;
1278	}
1279
1280	ceph_wbc->locked_pages++;
1281
1282	return `0`;
1283	}
1284
1285	static
1286	int ceph_process_folio_batch(struct address_space *mapping,
1287	struct writeback_control *wbc,
1288	struct ceph_writeback_ctl *ceph_wbc)
1289	{
1290	struct inode *inode = mapping->host;
1291	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1292	struct ceph_client *cl = fsc->client;
1293	struct folio *folio = NULL;
1294	unsigned i;
1295	int rc = `0`;
1296
1297	for (i = `0`; can_next_page_be_processed(ceph_wbc, index: i); i++) {
1298	folio = ceph_wbc->fbatch.folios[i];
1299
1300	if (!folio)
1301	continue;
1302
1303	doutc(cl, "? %p idx %lu, folio_test_writeback %#x, "
1304	"folio_test_dirty %#x, folio_test_locked %#x\n",
1305	folio, folio->index, folio_test_writeback(folio),
1306	folio_test_dirty(folio),
1307	folio_test_locked(folio));
1308
1309	if (folio_test_writeback(folio) \|\|
1310	folio_test_private_2(folio) / [DEPRECATED] /) {
1311	doutc(cl, "waiting on writeback %p\n", folio);
1312	folio_wait_writeback(folio);
1313	folio_wait_private_2(folio); / [DEPRECATED] /
1314	continue;
1315	}
1316
1317	if (ceph_wbc->locked_pages == `0`)
1318	folio_lock(folio);
1319	else if (!folio_trylock(folio))
1320	break;
1321
1322	rc = ceph_check_page_before_write(mapping, wbc,
1323	ceph_wbc, folio);
1324	if (rc == -ENODATA) {
1325	rc = `0`;
1326	folio_unlock(folio);
1327	ceph_wbc->fbatch.folios[i] = NULL;
1328	continue;
1329	} else if (rc == -E2BIG) {
1330	rc = `0`;
1331	folio_unlock(folio);
1332	ceph_wbc->fbatch.folios[i] = NULL;
1333	break;
1334	}
1335
1336	if (!folio_clear_dirty_for_io(folio)) {
1337	doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
1338	folio_unlock(folio);
1339	ceph_wbc->fbatch.folios[i] = NULL;
1340	continue;
1341	}
1342
1343	/*
1344	* We have something to write. If this is
1345	* the first locked page this time through,
1346	* calculate max possible write size and
1347	* allocate a page array
1348	*/
1349	if (ceph_wbc->locked_pages == `0`) {
1350	ceph_allocate_page_array(mapping, ceph_wbc, folio);
1351	} else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
1352	if (is_num_ops_too_big(ceph_wbc)) {
1353	folio_redirty_for_writepage(wbc, folio);
1354	folio_unlock(folio);
1355	break;
1356	}
1357
1358	ceph_wbc->num_ops++;
1359	ceph_wbc->offset = (u64)folio_pos(folio);
1360	ceph_wbc->len = `0`;
1361	}
1362
1363	/ note position of first page in fbatch /
1364	doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
1365	ceph_vinop(inode), folio, folio->index);
1366
1367	fsc->write_congested = is_write_congestion_happened(fsc);
1368
1369	rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
1370	folio);
1371	if (rc) {
1372	folio_redirty_for_writepage(wbc, folio);
1373	folio_unlock(folio);
1374	break;
1375	}
1376
1377	ceph_wbc->fbatch.folios[i] = NULL;
1378	ceph_wbc->len += folio_size(folio);
1379	}
1380
1381	ceph_wbc->processed_in_fbatch = i;
1382
1383	return rc;
1384	}
1385
1386	static inline
1387	void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
1388	{
1389	unsigned j, n = `0`;
1390
1391	/ shift unused page to beginning of fbatch /
1392	for (j = `0`; j < folio_batch_count(fbatch); j++) {
1393	if (!fbatch->folios[j])
1394	continue;
1395
1396	if (n < j) {
1397	fbatch->folios[n] = fbatch->folios[j];
1398	}
1399
1400	n++;
1401	}
1402
1403	fbatch->nr = n;
1404	}
1405
1406	static
1407	int ceph_submit_write(struct address_space *mapping,
1408	struct writeback_control *wbc,
1409	struct ceph_writeback_ctl *ceph_wbc)
1410	{
1411	struct inode *inode = mapping->host;
1412	struct ceph_inode_info *ci = ceph_inode(inode);
1413	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1414	struct ceph_client *cl = fsc->client;
1415	struct ceph_vino vino = ceph_vino(inode);
1416	struct ceph_osd_request *req = NULL;
1417	struct page *page = NULL;
1418	bool caching = ceph_is_cache_enabled(inode);
1419	u64 offset;
1420	u64 len;
1421	unsigned i;
1422
1423	new_request:
1424	offset = ceph_fscrypt_page_offset(page: ceph_wbc->pages[`0`]);
1425	len = ceph_wbc->wsize;
1426
1427	req = ceph_osdc_new_request(&fsc->client->osdc,
1428	layout: &ci->i_layout, vino,
1429	offset, len: &len, which: `0`, num_ops: ceph_wbc->num_ops,
1430	opcode: CEPH_OSD_OP_WRITE, flags: CEPH_OSD_FLAG_WRITE,
1431	snapc: ceph_wbc->snapc, truncate_seq: ceph_wbc->truncate_seq,
1432	truncate_size: ceph_wbc->truncate_size, use_mempool: false);
1433	if (IS_ERR(ptr: req)) {
1434	req = ceph_osdc_new_request(&fsc->client->osdc,
1435	layout: &ci->i_layout, vino,
1436	offset, len: &len, which: `0`,
1437	min(ceph_wbc->num_ops,
1438	CEPH_OSD_SLAB_OPS),
1439	opcode: CEPH_OSD_OP_WRITE,
1440	flags: CEPH_OSD_FLAG_WRITE,
1441	snapc: ceph_wbc->snapc,
1442	truncate_seq: ceph_wbc->truncate_seq,
1443	truncate_size: ceph_wbc->truncate_size,
1444	use_mempool: true);
1445	BUG_ON(IS_ERR(req));
1446	}
1447
1448	page = ceph_wbc->pages[ceph_wbc->locked_pages - `1`];
1449	BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
1450
1451	if (!ceph_inc_osd_stopping_blocker(mdsc: fsc->mdsc)) {
1452	for (i = `0`; i < folio_batch_count(fbatch: &ceph_wbc->fbatch); i++) {
1453	struct folio *folio = ceph_wbc->fbatch.folios[i];
1454
1455	if (!folio)
1456	continue;
1457
1458	page = &folio->page;
1459	redirty_page_for_writepage(wbc, page);
1460	unlock_page(page);
1461	}
1462
1463	for (i = `0`; i < ceph_wbc->locked_pages; i++) {
1464	page = ceph_fscrypt_pagecache_page(page: ceph_wbc->pages[i]);
1465
1466	if (!page)
1467	continue;
1468
1469	redirty_page_for_writepage(wbc, page);
1470	unlock_page(page);
1471	}
1472
1473	ceph_osdc_put_request(req);
1474	return -EIO;
1475	}
1476
1477	req->r_callback = writepages_finish;
1478	req->r_inode = inode;
1479
1480	/ Format the osd request message and submit the write /
1481	len = `0`;
1482	ceph_wbc->data_pages = ceph_wbc->pages;
1483	ceph_wbc->op_idx = `0`;
1484	for (i = `0`; i < ceph_wbc->locked_pages; i++) {
1485	u64 cur_offset;
1486
1487	page = ceph_fscrypt_pagecache_page(page: ceph_wbc->pages[i]);
1488	cur_offset = page_offset(page);
1489
1490	/*
1491	* Discontinuity in page range? Ceph can handle that by just passing
1492	* multiple extents in the write op.
1493	*/
1494	if (offset + len != cur_offset) {
1495	/ If it's full, stop here /
1496	if (ceph_wbc->op_idx + `1` == req->r_num_ops)
1497	break;
1498
1499	/ Kick off an fscache write with what we have so far. /
1500	ceph_fscache_write_to_cache(inode, off: offset, len, caching);
1501
1502	/ Start a new extent /
1503	osd_req_op_extent_dup_last(osd_req: req, which: ceph_wbc->op_idx,
1504	offset_inc: cur_offset - offset);
1505
1506	doutc(cl, "got pages at %llu~%llu\n", offset, len);
1507
1508	osd_req_op_extent_osd_data_pages(req, which: ceph_wbc->op_idx,
1509	pages: ceph_wbc->data_pages,
1510	length: len, alignment: `0`,
1511	pages_from_pool: ceph_wbc->from_pool,
1512	own_pages: false);
1513	osd_req_op_extent_update(osd_req: req, which: ceph_wbc->op_idx, length: len);
1514
1515	len = `0`;
1516	offset = cur_offset;
1517	ceph_wbc->data_pages = ceph_wbc->pages + i;
1518	ceph_wbc->op_idx++;
1519	}
1520
1521	set_page_writeback(page);
1522
1523	if (caching)
1524	ceph_set_page_fscache(page);
1525
1526	len += thp_size(page);
1527	}
1528
1529	ceph_fscache_write_to_cache(inode, off: offset, len, caching);
1530
1531	if (ceph_wbc->size_stable) {
1532	len = min(len, ceph_wbc->i_size - offset);
1533	} else if (i == ceph_wbc->locked_pages) {
1534	/ writepages_finish() clears writeback pages*
1535	* according to the data length, so make sure
1536	* data length covers all locked pages */
1537	u64 min_len = len + `1` - thp_size(page);
1538	len = get_writepages_data_length(inode,
1539	page: ceph_wbc->pages[i - `1`],
1540	start: offset);
1541	len = max(len, min_len);
1542	}
1543
1544	if (IS_ENCRYPTED(inode))
1545	len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
1546
1547	doutc(cl, "got pages at %llu~%llu\n", offset, len);
1548
1549	if (IS_ENCRYPTED(inode) &&
1550	((offset \| len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
1551	pr_warn_client(cl,
1552	"bad encrypted write offset=%lld len=%llu\n",
1553	offset, len);
1554	}
1555
1556	osd_req_op_extent_osd_data_pages(req, which: ceph_wbc->op_idx,
1557	pages: ceph_wbc->data_pages, length: len,
1558	alignment: `0`, pages_from_pool: ceph_wbc->from_pool, own_pages: false);
1559	osd_req_op_extent_update(osd_req: req, which: ceph_wbc->op_idx, length: len);
1560
1561	BUG_ON(ceph_wbc->op_idx + `1` != req->r_num_ops);
1562
1563	ceph_wbc->from_pool = false;
1564	if (i < ceph_wbc->locked_pages) {
1565	BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
1566	ceph_wbc->num_ops -= req->r_num_ops;
1567	ceph_wbc->locked_pages -= i;
1568
1569	/ allocate new pages array for next request /
1570	ceph_wbc->data_pages = ceph_wbc->pages;
1571	__ceph_allocate_page_array(ceph_wbc, max_pages: ceph_wbc->locked_pages);
1572	memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
1573	ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
1574	memset(ceph_wbc->data_pages + i, `0`,
1575	ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
1576	} else {
1577	BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
1578	/ request message now owns the pages array /
1579	ceph_wbc->pages = NULL;
1580	}
1581
1582	req->r_mtime = inode_get_mtime(inode);
1583	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
1584	req = NULL;
1585
1586	wbc->nr_to_write -= i;
1587	if (ceph_wbc->pages)
1588	goto new_request;
1589
1590	return `0`;
1591	}
1592
1593	static
1594	void ceph_wait_until_current_writes_complete(struct address_space *mapping,
1595	struct writeback_control *wbc,
1596	struct ceph_writeback_ctl *ceph_wbc)
1597	{
1598	struct page *page;
1599	unsigned i, nr;
1600
1601	if (wbc->sync_mode != WB_SYNC_NONE &&
1602	ceph_wbc->start_index == `0` && / all dirty pages were checked /
1603	!ceph_wbc->head_snapc) {
1604	ceph_wbc->index = `0`;
1605
1606	while ((ceph_wbc->index <= ceph_wbc->end) &&
1607	(nr = filemap_get_folios_tag(mapping,
1608	start: &ceph_wbc->index,
1609	end: (pgoff_t)-`1`,
1610	PAGECACHE_TAG_WRITEBACK,
1611	fbatch: &ceph_wbc->fbatch))) {
1612	for (i = `0`; i < nr; i++) {
1613	page = &ceph_wbc->fbatch.folios[i]->page;
1614	if (page_snap_context(page) != ceph_wbc->snapc)
1615	continue;
1616	wait_on_page_writeback(page);
1617	}
1618
1619	folio_batch_release(fbatch: &ceph_wbc->fbatch);
1620	cond_resched();
1621	}
1622	}
1623	}
1624
1625	/*
1626	* initiate async writeback
1627	*/
1628	static int ceph_writepages_start(struct address_space *mapping,
1629	struct writeback_control *wbc)
1630	{
1631	struct inode *inode = mapping->host;
1632	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1633	struct ceph_client *cl = fsc->client;
1634	struct ceph_writeback_ctl ceph_wbc;
1635	int rc = `0`;
1636
1637	if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
1638	return `0`;
1639
1640	doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
1641	wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
1642	(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
1643
1644	if (is_forced_umount(mapping)) {
1645	/ we're in a forced umount, don't write! /
1646	return -EIO;
1647	}
1648
1649	ceph_init_writeback_ctl(mapping, wbc, ceph_wbc: &ceph_wbc);
1650
1651	if (!ceph_inc_osd_stopping_blocker(mdsc: fsc->mdsc)) {
1652	rc = -EIO;
1653	goto out;
1654	}
1655
1656	retry:
1657	rc = ceph_define_writeback_range(mapping, wbc, ceph_wbc: &ceph_wbc);
1658	if (rc == -ENODATA) {
1659	/ hmm, why does writepages get called when there*
1660	is no dirty data? /*
1661	rc = `0`;
1662	goto dec_osd_stopping_blocker;
1663	}
1664
1665	if (wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages)
1666	tag_pages_for_writeback(mapping, start: ceph_wbc.index, end: ceph_wbc.end);
1667
1668	while (!has_writeback_done(ceph_wbc: &ceph_wbc)) {
1669	ceph_wbc.locked_pages = `0`;
1670	ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
1671
1672	get_more_pages:
1673	ceph_folio_batch_reinit(ceph_wbc: &ceph_wbc);
1674
1675	ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
1676	start: &ceph_wbc.index,
1677	end: ceph_wbc.end,
1678	tag: ceph_wbc.tag,
1679	fbatch: &ceph_wbc.fbatch);
1680	doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
1681	ceph_wbc.tag, ceph_wbc.nr_folios);
1682
1683	if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
1684	break;
1685
1686	process_folio_batch:
1687	rc = ceph_process_folio_batch(mapping, wbc, ceph_wbc: &ceph_wbc);
1688	ceph_shift_unused_folios_left(fbatch: &ceph_wbc.fbatch);
1689	if (rc)
1690	goto release_folios;
1691
1692	/ did we get anything? /
1693	if (!ceph_wbc.locked_pages)
1694	goto release_folios;
1695
1696	if (ceph_wbc.processed_in_fbatch) {
1697	if (folio_batch_count(fbatch: &ceph_wbc.fbatch) == `0` &&
1698	ceph_wbc.locked_pages < ceph_wbc.max_pages) {
1699	doutc(cl, "reached end fbatch, trying for more\n");
1700	goto get_more_pages;
1701	}
1702	}
1703
1704	rc = ceph_submit_write(mapping, wbc, ceph_wbc: &ceph_wbc);
1705	if (rc)
1706	goto release_folios;
1707
1708	ceph_wbc.locked_pages = `0`;
1709	ceph_wbc.strip_unit_end = `0`;
1710
1711	if (folio_batch_count(fbatch: &ceph_wbc.fbatch) > `0`) {
1712	ceph_wbc.nr_folios =
1713	folio_batch_count(fbatch: &ceph_wbc.fbatch);
1714	goto process_folio_batch;
1715	}
1716
1717	/*
1718	* We stop writing back only if we are not doing
1719	* integrity sync. In case of integrity sync we have to
1720	* keep going until we have written all the pages
1721	* we tagged for writeback prior to entering this loop.
1722	*/
1723	if (wbc->nr_to_write <= `0` && wbc->sync_mode == WB_SYNC_NONE)
1724	ceph_wbc.done = true;
1725
1726	release_folios:
1727	doutc(cl, "folio_batch release on %d folios (%p)\n",
1728	(int)ceph_wbc.fbatch.nr,
1729	ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[`0`] : NULL);
1730	folio_batch_release(fbatch: &ceph_wbc.fbatch);
1731	}
1732
1733	if (ceph_wbc.should_loop && !ceph_wbc.done) {
1734	/ more to do; loop back to beginning of file /
1735	doutc(cl, "looping back to beginning of file\n");
1736	/ OK even when start_index == 0 /
1737	ceph_wbc.end = ceph_wbc.start_index - `1`;
1738
1739	/ to write dirty pages associated with next snapc,*
1740	* we need to wait until current writes complete */
1741	ceph_wait_until_current_writes_complete(mapping, wbc, ceph_wbc: &ceph_wbc);
1742
1743	ceph_wbc.start_index = `0`;
1744	ceph_wbc.index = `0`;
1745	goto retry;
1746	}
1747
1748	if (wbc->range_cyclic \|\| (ceph_wbc.range_whole && wbc->nr_to_write > `0`))
1749	mapping->writeback_index = ceph_wbc.index;
1750
1751	dec_osd_stopping_blocker:
1752	ceph_dec_osd_stopping_blocker(mdsc: fsc->mdsc);
1753
1754	out:
1755	ceph_put_snap_context(sc: ceph_wbc.last_snapc);
1756	doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
1757	rc);
1758
1759	return rc;
1760	}
1761
1762	/*
1763	* See if a given @snapc is either writeable, or already written.
1764	*/
1765	static int context_is_writeable_or_written(struct inode *inode,
1766	struct ceph_snap_context *snapc)
1767	{
1768	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
1769	int ret = !oldest \|\| snapc->seq <= oldest->seq;
1770
1771	ceph_put_snap_context(sc: oldest);
1772	return ret;
1773	}
1774
1775	/**
1776	* ceph_find_incompatible - find an incompatible context and return it
1777	* @folio: folio being dirtied
1778	*
1779	* We are only allowed to write into/dirty a folio if the folio is
1780	* clean, or already dirty within the same snap context. Returns a
1781	* conflicting context if there is one, NULL if there isn't, or a
1782	* negative error code on other errors.
1783	*
1784	* Must be called with folio lock held.
1785	*/
1786	static struct ceph_snap_context *
1787	ceph_find_incompatible(struct folio *folio)
1788	{
1789	struct inode *inode = folio->mapping->host;
1790	struct ceph_client *cl = ceph_inode_to_client(inode);
1791	struct ceph_inode_info *ci = ceph_inode(inode);
1792
1793	if (ceph_inode_is_shutdown(inode)) {
1794	doutc(cl, " %llx.%llx folio %p is shutdown\n",
1795	ceph_vinop(inode), folio);
1796	return ERR_PTR(error: -ESTALE);
1797	}
1798
1799	for (;;) {
1800	struct ceph_snap_context snapc, oldest;
1801
1802	folio_wait_writeback(folio);
1803
1804	snapc = page_snap_context(page: &folio->page);
1805	if (!snapc \|\| snapc == ci->i_head_snapc)
1806	break;
1807
1808	/*
1809	* this folio is already dirty in another (older) snap
1810	* context! is it writeable now?
1811	*/
1812	oldest = get_oldest_context(inode, NULL, NULL);
1813	if (snapc->seq > oldest->seq) {
1814	/ not writeable -- return it for the caller to deal with /
1815	ceph_put_snap_context(sc: oldest);
1816	doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
1817	ceph_vinop(inode), folio, snapc);
1818	return ceph_get_snap_context(sc: snapc);
1819	}
1820	ceph_put_snap_context(sc: oldest);
1821
1822	/ yay, writeable, do it now (without dropping folio lock) /
1823	doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
1824	ceph_vinop(inode), folio, snapc);
1825	if (folio_clear_dirty_for_io(folio)) {
1826	int r = write_folio_nounlock(folio, NULL);
1827	if (r < `0`)
1828	return ERR_PTR(error: r);
1829	}
1830	}
1831	return NULL;
1832	}
1833
1834	static int ceph_netfs_check_write_begin(struct file file, loff_t pos, unsigned* int len,
1835	struct folio *foliop, void* **_fsdata)
1836	{
1837	struct inode *inode = file_inode(f: file);
1838	struct ceph_inode_info *ci = ceph_inode(inode);
1839	struct ceph_snap_context *snapc;
1840
1841	snapc = ceph_find_incompatible(folio: *foliop);
1842	if (snapc) {
1843	int r;
1844
1845	folio_unlock(folio: *foliop);
1846	folio_put(folio: *foliop);
1847	*foliop = NULL;
1848	if (IS_ERR(ptr: snapc))
1849	return PTR_ERR(ptr: snapc);
1850
1851	ceph_queue_writeback(inode);
1852	r = wait_event_killable(ci->i_cap_wq,
1853	context_is_writeable_or_written(inode, snapc));
1854	ceph_put_snap_context(sc: snapc);
1855	return r == `0` ? -EAGAIN : r;
1856	}
1857	return `0`;
1858	}
1859
1860	/*
1861	* We are only allowed to write into/dirty the page if the page is
1862	* clean, or already dirty within the same snap context.
1863	*/
1864	static int ceph_write_begin(const struct kiocb *iocb,
1865	struct address_space *mapping,
1866	loff_t pos, unsigned len,
1867	struct folio *foliop, void* **fsdata)
1868	{
1869	struct file *file = iocb->ki_filp;
1870	struct inode *inode = file_inode(f: file);
1871	struct ceph_inode_info *ci = ceph_inode(inode);
1872	int r;
1873
1874	r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL);
1875	if (r < `0`)
1876	return r;
1877
1878	folio_wait_private_2(folio: foliop); /* [DEPRECATED] /
1879	WARN_ON_ONCE(!folio_test_locked(*foliop));
1880	return `0`;
1881	}
1882
1883	/*
1884	* we don't do anything in here that simple_write_end doesn't do
1885	* except adjust dirty page accounting
1886	*/
1887	static int ceph_write_end(const struct kiocb *iocb,
1888	struct address_space *mapping, loff_t pos,
1889	unsigned len, unsigned copied,
1890	struct folio folio, void* *fsdata)
1891	{
1892	struct file *file = iocb->ki_filp;
1893	struct inode *inode = file_inode(f: file);
1894	struct ceph_client *cl = ceph_inode_to_client(inode);
1895	bool check_cap = false;
1896
1897	doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
1898	file, folio, (int)pos, (int)copied, (int)len);
1899
1900	if (!folio_test_uptodate(folio)) {
1901	/ just return that nothing was copied on a short copy /
1902	if (copied < len) {
1903	copied = `0`;
1904	goto out;
1905	}
1906	folio_mark_uptodate(folio);
1907	}
1908
1909	/ did file size increase? /
1910	if (pos+copied > i_size_read(inode))
1911	check_cap = ceph_inode_set_size(inode, size: pos+copied);
1912
1913	folio_mark_dirty(folio);
1914
1915	out:
1916	folio_unlock(folio);
1917	folio_put(folio);
1918
1919	if (check_cap)
1920	ceph_check_caps(ci: ceph_inode(inode), CHECK_CAPS_AUTHONLY);
1921
1922	return copied;
1923	}
1924
1925	const struct address_space_operations ceph_aops = {
1926	.read_folio = netfs_read_folio,
1927	.readahead = netfs_readahead,
1928	.writepages = ceph_writepages_start,
1929	.write_begin = ceph_write_begin,
1930	.write_end = ceph_write_end,
1931	.dirty_folio = ceph_dirty_folio,
1932	.invalidate_folio = ceph_invalidate_folio,
1933	.release_folio = netfs_release_folio,
1934	.direct_IO = noop_direct_IO,
1935	.migrate_folio = filemap_migrate_folio,
1936	};
1937
1938	static void ceph_block_sigs(sigset_t *oldset)
1939	{
1940	sigset_t mask;
1941	siginitsetinv(set: &mask, sigmask(SIGKILL));
1942	sigprocmask(SIG_BLOCK, &mask, oldset);
1943	}
1944
1945	static void ceph_restore_sigs(sigset_t *oldset)
1946	{
1947	sigprocmask(SIG_SETMASK, oldset, NULL);
1948	}
1949
1950	/*
1951	* vm ops
1952	*/
1953	static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
1954	{
1955	struct vm_area_struct *vma = vmf->vma;
1956	struct inode *inode = file_inode(f: vma->vm_file);
1957	struct ceph_inode_info *ci = ceph_inode(inode);
1958	struct ceph_client *cl = ceph_inode_to_client(inode);
1959	struct ceph_file_info *fi = vma->vm_file->private_data;
1960	loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
1961	int want, got, err;
1962	sigset_t oldset;
1963	vm_fault_t ret = VM_FAULT_SIGBUS;
1964
1965	if (ceph_inode_is_shutdown(inode))
1966	return ret;
1967
1968	ceph_block_sigs(oldset: &oldset);
1969
1970	doutc(cl, "%llx.%llx %llu trying to get caps\n",
1971	ceph_vinop(inode), off);
1972	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1973	want = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
1974	else
1975	want = CEPH_CAP_FILE_CACHE;
1976
1977	got = `0`;
1978	err = ceph_get_caps(filp: vma->vm_file, CEPH_CAP_FILE_RD, want, endoff: -`1`, got: &got);
1979	if (err < `0`)
1980	goto out_restore;
1981
1982	doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
1983	off, ceph_cap_string(got));
1984
1985	if ((got & (CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO)) \|\|
1986	!ceph_has_inline_data(ci)) {
1987	CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
1988	ceph_add_rw_context(cf: fi, ctx: &rw_ctx);
1989	ret = filemap_fault(vmf);
1990	ceph_del_rw_context(cf: fi, ctx: &rw_ctx);
1991	doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
1992	ceph_vinop(inode), off, ceph_cap_string(got), ret);
1993	} else
1994	err = -EAGAIN;
1995
1996	ceph_put_cap_refs(ci, had: got);
1997
1998	if (err != -EAGAIN)
1999	goto out_restore;
2000
2001	/ read inline data /
2002	if (off >= PAGE_SIZE) {
2003	/ does not support inline data > PAGE_SIZE /
2004	ret = VM_FAULT_SIGBUS;
2005	} else {
2006	struct address_space *mapping = inode->i_mapping;
2007	struct page *page;
2008
2009	filemap_invalidate_lock_shared(mapping);
2010	page = find_or_create_page(mapping, index: `0`,
2011	gfp_mask: mapping_gfp_constraint(mapping, gfp_mask: ~__GFP_FS));
2012	if (!page) {
2013	ret = VM_FAULT_OOM;
2014	goto out_inline;
2015	}
2016	err = __ceph_do_getattr(inode, locked_page: page,
2017	CEPH_STAT_CAP_INLINE_DATA, force: true);
2018	if (err < `0` \|\| off >= i_size_read(inode)) {
2019	unlock_page(page);
2020	put_page(page);
2021	ret = vmf_error(err);
2022	goto out_inline;
2023	}
2024	if (err < PAGE_SIZE)
2025	zero_user_segment(page, start: err, PAGE_SIZE);
2026	else
2027	flush_dcache_page(page);
2028	SetPageUptodate(page);
2029	vmf->page = page;
2030	ret = VM_FAULT_MAJOR \| VM_FAULT_LOCKED;
2031	out_inline:
2032	filemap_invalidate_unlock_shared(mapping);
2033	doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
2034	ceph_vinop(inode), off, ret);
2035	}
2036	out_restore:
2037	ceph_restore_sigs(oldset: &oldset);
2038	if (err < `0`)
2039	ret = vmf_error(err);
2040
2041	return ret;
2042	}
2043
2044	static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
2045	{
2046	struct vm_area_struct *vma = vmf->vma;
2047	struct inode *inode = file_inode(f: vma->vm_file);
2048	struct ceph_client *cl = ceph_inode_to_client(inode);
2049	struct ceph_inode_info *ci = ceph_inode(inode);
2050	struct ceph_file_info *fi = vma->vm_file->private_data;
2051	struct ceph_cap_flush *prealloc_cf;
2052	struct folio *folio = page_folio(vmf->page);
2053	loff_t off = folio_pos(folio);
2054	loff_t size = i_size_read(inode);
2055	size_t len;
2056	int want, got, err;
2057	sigset_t oldset;
2058	vm_fault_t ret = VM_FAULT_SIGBUS;
2059
2060	if (ceph_inode_is_shutdown(inode))
2061	return ret;
2062
2063	prealloc_cf = ceph_alloc_cap_flush();
2064	if (!prealloc_cf)
2065	return VM_FAULT_OOM;
2066
2067	sb_start_pagefault(sb: inode->i_sb);
2068	ceph_block_sigs(oldset: &oldset);
2069
2070	if (off + folio_size(folio) <= size)
2071	len = folio_size(folio);
2072	else
2073	len = offset_in_folio(folio, size);
2074
2075	doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
2076	ceph_vinop(inode), off, len, size);
2077	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2078	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
2079	else
2080	want = CEPH_CAP_FILE_BUFFER;
2081
2082	got = `0`;
2083	err = ceph_get_caps(filp: vma->vm_file, CEPH_CAP_FILE_WR, want, endoff: off + len, got: &got);
2084	if (err < `0`)
2085	goto out_free;
2086
2087	doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
2088	off, len, ceph_cap_string(got));
2089
2090	/ Update time before taking folio lock /
2091	file_update_time(file: vma->vm_file);
2092	inode_inc_iversion_raw(inode);
2093
2094	do {
2095	struct ceph_snap_context *snapc;
2096
2097	folio_lock(folio);
2098
2099	if (folio_mkwrite_check_truncate(folio, inode) < `0`) {
2100	folio_unlock(folio);
2101	ret = VM_FAULT_NOPAGE;
2102	break;
2103	}
2104
2105	snapc = ceph_find_incompatible(folio);
2106	if (!snapc) {
2107	/ success. we'll keep the folio locked. /
2108	folio_mark_dirty(folio);
2109	ret = VM_FAULT_LOCKED;
2110	break;
2111	}
2112
2113	folio_unlock(folio);
2114
2115	if (IS_ERR(ptr: snapc)) {
2116	ret = VM_FAULT_SIGBUS;
2117	break;
2118	}
2119
2120	ceph_queue_writeback(inode);
2121	err = wait_event_killable(ci->i_cap_wq,
2122	context_is_writeable_or_written(inode, snapc));
2123	ceph_put_snap_context(sc: snapc);
2124	} while (err == `0`);
2125
2126	if (ret == VM_FAULT_LOCKED) {
2127	int dirty;
2128	spin_lock(lock: &ci->i_ceph_lock);
2129	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
2130	pcf: &prealloc_cf);
2131	spin_unlock(lock: &ci->i_ceph_lock);
2132	if (dirty)
2133	__mark_inode_dirty(inode, dirty);
2134	}
2135
2136	doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
2137	ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
2138	ceph_put_cap_refs_async(ci, had: got);
2139	out_free:
2140	ceph_restore_sigs(oldset: &oldset);
2141	sb_end_pagefault(sb: inode->i_sb);
2142	ceph_free_cap_flush(cf: prealloc_cf);
2143	if (err < `0`)
2144	ret = vmf_error(err);
2145	return ret;
2146	}
2147
2148	void ceph_fill_inline_data(struct inode inode, struct* page *locked_page,
2149	char *data, size_t len)
2150	{
2151	struct ceph_client *cl = ceph_inode_to_client(inode);
2152	struct address_space *mapping = inode->i_mapping;
2153	struct page *page;
2154
2155	if (locked_page) {
2156	page = locked_page;
2157	} else {
2158	if (i_size_read(inode) == `0`)
2159	return;
2160	page = find_or_create_page(mapping, index: `0`,
2161	gfp_mask: mapping_gfp_constraint(mapping,
2162	gfp_mask: ~__GFP_FS));
2163	if (!page)
2164	return;
2165	if (PageUptodate(page)) {
2166	unlock_page(page);
2167	put_page(page);
2168	return;
2169	}
2170	}
2171
2172	doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
2173	ceph_vinop(inode), len, locked_page);
2174
2175	if (len > `0`) {
2176	void *kaddr = kmap_atomic(page);
2177	memcpy(kaddr, data, len);
2178	kunmap_atomic(kaddr);
2179	}
2180
2181	if (page != locked_page) {
2182	if (len < PAGE_SIZE)
2183	zero_user_segment(page, start: len, PAGE_SIZE);
2184	else
2185	flush_dcache_page(page);
2186
2187	SetPageUptodate(page);
2188	unlock_page(page);
2189	put_page(page);
2190	}
2191	}
2192
2193	int ceph_uninline_data(struct file *file)
2194	{
2195	struct inode *inode = file_inode(f: file);
2196	struct ceph_inode_info *ci = ceph_inode(inode);
2197	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2198	struct ceph_client *cl = fsc->client;
2199	struct ceph_osd_request *req = NULL;
2200	struct ceph_cap_flush *prealloc_cf = NULL;
2201	struct folio *folio = NULL;
2202	u64 inline_version = CEPH_INLINE_NONE;
2203	struct page *pages[`1`];
2204	int err = `0`;
2205	u64 len;
2206
2207	spin_lock(lock: &ci->i_ceph_lock);
2208	inline_version = ci->i_inline_version;
2209	spin_unlock(lock: &ci->i_ceph_lock);
2210
2211	doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
2212	inline_version);
2213
2214	if (ceph_inode_is_shutdown(inode)) {
2215	err = -EIO;
2216	goto out;
2217	}
2218
2219	if (inline_version == CEPH_INLINE_NONE)
2220	return `0`;
2221
2222	prealloc_cf = ceph_alloc_cap_flush();
2223	if (!prealloc_cf)
2224	return -ENOMEM;
2225
2226	if (inline_version == `1`) / initial version, no data /
2227	goto out_uninline;
2228
2229	folio = read_mapping_folio(mapping: inode->i_mapping, index: `0`, file);
2230	if (IS_ERR(ptr: folio)) {
2231	err = PTR_ERR(ptr: folio);
2232	goto out;
2233	}
2234
2235	folio_lock(folio);
2236
2237	len = i_size_read(inode);
2238	if (len > folio_size(folio))
2239	len = folio_size(folio);
2240
2241	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
2242	vino: ceph_vino(inode), offset: `0`, len: &len, which: `0`, num_ops: `1`,
2243	opcode: CEPH_OSD_OP_CREATE, flags: CEPH_OSD_FLAG_WRITE,
2244	NULL, truncate_seq: `0`, truncate_size: `0`, use_mempool: false);
2245	if (IS_ERR(ptr: req)) {
2246	err = PTR_ERR(ptr: req);
2247	goto out_unlock;
2248	}
2249
2250	req->r_mtime = inode_get_mtime(inode);
2251	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
2252	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
2253	ceph_osdc_put_request(req);
2254	if (err < `0`)
2255	goto out_unlock;
2256
2257	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
2258	vino: ceph_vino(inode), offset: `0`, len: &len, which: `1`, num_ops: `3`,
2259	opcode: CEPH_OSD_OP_WRITE, flags: CEPH_OSD_FLAG_WRITE,
2260	NULL, truncate_seq: ci->i_truncate_seq,
2261	truncate_size: ci->i_truncate_size, use_mempool: false);
2262	if (IS_ERR(ptr: req)) {
2263	err = PTR_ERR(ptr: req);
2264	goto out_unlock;
2265	}
2266
2267	pages[`0`] = folio_page(folio, `0`);
2268	osd_req_op_extent_osd_data_pages(req, which: `1`, pages, length: len, alignment: `0`, pages_from_pool: false, own_pages: false);
2269
2270	{
2271	__le64 xattr_buf = cpu_to_le64(inline_version);
2272	err = osd_req_op_xattr_init(osd_req: req, which: `0`, opcode: CEPH_OSD_OP_CMPXATTR,
2273	name: "inline_version", value: &xattr_buf,
2274	size: sizeof(xattr_buf),
2275	cmp_op: CEPH_OSD_CMPXATTR_OP_GT,
2276	cmp_mode: CEPH_OSD_CMPXATTR_MODE_U64);
2277	if (err)
2278	goto out_put_req;
2279	}
2280
2281	{
2282	char xattr_buf[`32`];
2283	int xattr_len = snprintf(buf: xattr_buf, size: sizeof(xattr_buf),
2284	fmt: "%llu", inline_version);
2285	err = osd_req_op_xattr_init(osd_req: req, which: `2`, opcode: CEPH_OSD_OP_SETXATTR,
2286	name: "inline_version",
2287	value: xattr_buf, size: xattr_len, cmp_op: `0`, cmp_mode: `0`);
2288	if (err)
2289	goto out_put_req;
2290	}
2291
2292	req->r_mtime = inode_get_mtime(inode);
2293	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
2294	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
2295
2296	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
2297	r_end: req->r_end_latency, size: len, rc: err);
2298
2299	out_uninline:
2300	if (!err) {
2301	int dirty;
2302
2303	/ Set to CAP_INLINE_NONE and dirty the caps /
2304	down_read(sem: &fsc->mdsc->snap_rwsem);
2305	spin_lock(lock: &ci->i_ceph_lock);
2306	ci->i_inline_version = CEPH_INLINE_NONE;
2307	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, pcf: &prealloc_cf);
2308	spin_unlock(lock: &ci->i_ceph_lock);
2309	up_read(sem: &fsc->mdsc->snap_rwsem);
2310	if (dirty)
2311	__mark_inode_dirty(inode, dirty);
2312	}
2313	out_put_req:
2314	ceph_osdc_put_request(req);
2315	if (err == -ECANCELED)
2316	err = `0`;
2317	out_unlock:
2318	if (folio) {
2319	folio_unlock(folio);
2320	folio_put(folio);
2321	}
2322	out:
2323	ceph_free_cap_flush(cf: prealloc_cf);
2324	doutc(cl, "%llx.%llx inline_version %llu = %d\n",
2325	ceph_vinop(inode), inline_version, err);
2326	return err;
2327	}
2328
2329	static const struct vm_operations_struct ceph_vmops = {
2330	.fault = ceph_filemap_fault,
2331	.page_mkwrite = ceph_page_mkwrite,
2332	};
2333
2334	int ceph_mmap_prepare(struct vm_area_desc *desc)
2335	{
2336	struct address_space *mapping = desc->file->f_mapping;
2337
2338	if (!mapping->a_ops->read_folio)
2339	return -ENOEXEC;
2340	desc->vm_ops = &ceph_vmops;
2341	return `0`;
2342	}
2343
/* Pool permission bits, OR-ed together in the cached perm mask. */
enum {
	POOL_READ	= 1,
	POOL_WRITE	= 2,
};
2348
2349	static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
2350	s64 pool, struct ceph_string *pool_ns)
2351	{
2352	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode: &ci->netfs.inode);
2353	struct ceph_mds_client *mdsc = fsc->mdsc;
2354	struct ceph_client *cl = fsc->client;
2355	struct ceph_osd_request rd_req = NULL, wr_req = NULL;
2356	struct rb_node *p, parent;
2357	struct ceph_pool_perm *perm;
2358	struct page **pages;
2359	size_t pool_ns_len;
2360	int err = `0`, err2 = `0`, have = `0`;
2361
2362	down_read(sem: &mdsc->pool_perm_rwsem);
2363	p = &mdsc->pool_perm_tree.rb_node;
2364	while (*p) {
2365	perm = rb_entry(p, struct* ceph_pool_perm, node);
2366	if (pool < perm->pool)
2367	p = &(*p)->rb_left;
2368	else if (pool > perm->pool)
2369	p = &(*p)->rb_right;
2370	else {
2371	int ret = ceph_compare_string(cs: pool_ns,
2372	str: perm->pool_ns,
2373	len: perm->pool_ns_len);
2374	if (ret < `0`)
2375	p = &(*p)->rb_left;
2376	else if (ret > `0`)
2377	p = &(*p)->rb_right;
2378	else {
2379	have = perm->perm;
2380	break;
2381	}
2382	}
2383	}
2384	up_read(sem: &mdsc->pool_perm_rwsem);
2385	if (*p)
2386	goto out;
2387
2388	if (pool_ns)
2389	doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
2390	(int)pool_ns->len, pool_ns->str);
2391	else
2392	doutc(cl, "pool %lld no perm cached\n", pool);
2393
2394	down_write(sem: &mdsc->pool_perm_rwsem);
2395	p = &mdsc->pool_perm_tree.rb_node;
2396	parent = NULL;
2397	while (*p) {
2398	parent = *p;
2399	perm = rb_entry(parent, struct ceph_pool_perm, node);
2400	if (pool < perm->pool)
2401	p = &(*p)->rb_left;
2402	else if (pool > perm->pool)
2403	p = &(*p)->rb_right;
2404	else {
2405	int ret = ceph_compare_string(cs: pool_ns,
2406	str: perm->pool_ns,
2407	len: perm->pool_ns_len);
2408	if (ret < `0`)
2409	p = &(*p)->rb_left;
2410	else if (ret > `0`)
2411	p = &(*p)->rb_right;
2412	else {
2413	have = perm->perm;
2414	break;
2415	}
2416	}
2417	}
2418	if (*p) {
2419	up_write(sem: &mdsc->pool_perm_rwsem);
2420	goto out;
2421	}
2422
2423	rd_req = ceph_osdc_alloc_request(osdc: &fsc->client->osdc, NULL,
2424	num_ops: `1`, use_mempool: false, GFP_NOFS);
2425	if (!rd_req) {
2426	err = -ENOMEM;
2427	goto out_unlock;
2428	}
2429
2430	rd_req->r_flags = CEPH_OSD_FLAG_READ;
2431	osd_req_op_init(osd_req: rd_req, which: `0`, opcode: CEPH_OSD_OP_STAT, flags: `0`);
2432	rd_req->r_base_oloc.pool = pool;
2433	if (pool_ns)
2434	rd_req->r_base_oloc.pool_ns = ceph_get_string(str: pool_ns);
2435	ceph_oid_printf(oid: &rd_req->r_base_oid, fmt: "%llx.00000000", ci->i_vino.ino);
2436
2437	err = ceph_osdc_alloc_messages(req: rd_req, GFP_NOFS);
2438	if (err)
2439	goto out_unlock;
2440
2441	wr_req = ceph_osdc_alloc_request(osdc: &fsc->client->osdc, NULL,
2442	num_ops: `1`, use_mempool: false, GFP_NOFS);
2443	if (!wr_req) {
2444	err = -ENOMEM;
2445	goto out_unlock;
2446	}
2447
2448	wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
2449	osd_req_op_init(osd_req: wr_req, which: `0`, opcode: CEPH_OSD_OP_CREATE, flags: CEPH_OSD_OP_FLAG_EXCL);
2450	ceph_oloc_copy(dest: &wr_req->r_base_oloc, src: &rd_req->r_base_oloc);
2451	ceph_oid_copy(dest: &wr_req->r_base_oid, src: &rd_req->r_base_oid);
2452
2453	err = ceph_osdc_alloc_messages(req: wr_req, GFP_NOFS);
2454	if (err)
2455	goto out_unlock;
2456
2457	/ one page should be large enough for STAT data /
2458	pages = ceph_alloc_page_vector(num_pages: `1`, GFP_KERNEL);
2459	if (IS_ERR(ptr: pages)) {
2460	err = PTR_ERR(ptr: pages);
2461	goto out_unlock;
2462	}
2463
2464	osd_req_op_raw_data_in_pages(rd_req, which: `0`, pages, PAGE_SIZE,
2465	alignment: `0`, pages_from_pool: false, own_pages: true);
2466	ceph_osdc_start_request(osdc: &fsc->client->osdc, req: rd_req);
2467
2468	wr_req->r_mtime = inode_get_mtime(inode: &ci->netfs.inode);
2469	ceph_osdc_start_request(osdc: &fsc->client->osdc, req: wr_req);
2470
2471	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req: rd_req);
2472	err2 = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req: wr_req);
2473
2474	if (err >= `0` \|\| err == -ENOENT)
2475	have \|= POOL_READ;
2476	else if (err != -EPERM) {
2477	if (err == -EBLOCKLISTED)
2478	fsc->blocklisted = true;
2479	goto out_unlock;
2480	}
2481
2482	if (err2 == `0` \|\| err2 == -EEXIST)
2483	have \|= POOL_WRITE;
2484	else if (err2 != -EPERM) {
2485	if (err2 == -EBLOCKLISTED)
2486	fsc->blocklisted = true;
2487	err = err2;
2488	goto out_unlock;
2489	}
2490
2491	pool_ns_len = pool_ns ? pool_ns->len : `0`;
2492	perm = kmalloc(struct_size(perm, pool_ns, pool_ns_len + `1`), GFP_NOFS);
2493	if (!perm) {
2494	err = -ENOMEM;
2495	goto out_unlock;
2496	}
2497
2498	perm->pool = pool;
2499	perm->perm = have;
2500	perm->pool_ns_len = pool_ns_len;
2501	if (pool_ns_len > `0`)
2502	memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
2503	perm->pool_ns[pool_ns_len] = `0`;
2504
2505	rb_link_node(node: &perm->node, parent, rb_link: p);
2506	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
2507	err = `0`;
2508	out_unlock:
2509	up_write(sem: &mdsc->pool_perm_rwsem);
2510
2511	ceph_osdc_put_request(req: rd_req);
2512	ceph_osdc_put_request(req: wr_req);
2513	out:
2514	if (!err)
2515	err = have;
2516	if (pool_ns)
2517	doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
2518	(int)pool_ns->len, pool_ns->str, err);
2519	else
2520	doutc(cl, "pool %lld result = %d\n", pool, err);
2521	return err;
2522	}
2523
2524	int ceph_pool_perm_check(struct inode inode, int* need)
2525	{
2526	struct ceph_client *cl = ceph_inode_to_client(inode);
2527	struct ceph_inode_info *ci = ceph_inode(inode);
2528	struct ceph_string *pool_ns;
2529	s64 pool;
2530	int ret, flags;
2531
2532	/ Only need to do this for regular files /
2533	if (!S_ISREG(inode->i_mode))
2534	return `0`;
2535
2536	if (ci->i_vino.snap != CEPH_NOSNAP) {
2537	/*
2538	* Pool permission check needs to write to the first object.
2539	* But for snapshot, head of the first object may have already
2540	* been deleted. Skip check to avoid creating orphan object.
2541	*/
2542	return `0`;
2543	}
2544
2545	if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
2546	NOPOOLPERM))
2547	return `0`;
2548
2549	spin_lock(lock: &ci->i_ceph_lock);
2550	flags = ci->i_ceph_flags;
2551	pool = ci->i_layout.pool_id;
2552	spin_unlock(lock: &ci->i_ceph_lock);
2553	check:
2554	if (flags & CEPH_I_POOL_PERM) {
2555	if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
2556	doutc(cl, "pool %lld no read perm\n", pool);
2557	return -EPERM;
2558	}
2559	if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
2560	doutc(cl, "pool %lld no write perm\n", pool);
2561	return -EPERM;
2562	}
2563	return `0`;
2564	}
2565
2566	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
2567	ret = __ceph_pool_perm_get(ci, pool, pool_ns);
2568	ceph_put_string(str: pool_ns);
2569	if (ret < `0`)
2570	return ret;
2571
2572	flags = CEPH_I_POOL_PERM;
2573	if (ret & POOL_READ)
2574	flags \|= CEPH_I_POOL_RD;
2575	if (ret & POOL_WRITE)
2576	flags \|= CEPH_I_POOL_WR;
2577
2578	spin_lock(lock: &ci->i_ceph_lock);
2579	if (pool == ci->i_layout.pool_id &&
2580	pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
2581	ci->i_ceph_flags \|= flags;
2582	} else {
2583	pool = ci->i_layout.pool_id;
2584	flags = ci->i_ceph_flags;
2585	}
2586	spin_unlock(lock: &ci->i_ceph_lock);
2587	goto check;
2588	}
2589
2590	void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
2591	{
2592	struct ceph_pool_perm *perm;
2593	struct rb_node *n;
2594
2595	while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
2596	n = rb_first(root: &mdsc->pool_perm_tree);
2597	perm = rb_entry(n, struct ceph_pool_perm, node);
2598	rb_erase(n, &mdsc->pool_perm_tree);
2599	kfree(objp: perm);
2600	}
2601	}
2602

source code of linux/fs/ceph/addr.c