file.c source code [linux/fs/zonefs/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Simple file system for zoned block devices exposing zones as files.
4	*
5	* Copyright (C) 2022 Western Digital Corporation or its affiliates.
6	*/
7	#include <linux/module.h>
8	#include <linux/pagemap.h>
9	#include <linux/iomap.h>
10	#include <linux/init.h>
11	#include <linux/slab.h>
12	#include <linux/blkdev.h>
13	#include <linux/statfs.h>
14	#include <linux/writeback.h>
15	#include <linux/quotaops.h>
16	#include <linux/seq_file.h>
17	#include <linux/parser.h>
18	#include <linux/uio.h>
19	#include <linux/mman.h>
20	#include <linux/sched/mm.h>
21	#include <linux/task_io_accounting_ops.h>
22
23	#include "zonefs.h"
24
25	#include "trace.h"
26
27	static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
28	loff_t length, unsigned int flags,
29	struct iomap iomap, struct* iomap *srcmap)
30	{
31	struct zonefs_inode_info *zi = ZONEFS_I(inode);
32	struct zonefs_zone *z = zonefs_inode_zone(inode);
33	struct super_block *sb = inode->i_sb;
34	loff_t isize;
35
36	/*
37	* All blocks are always mapped below EOF. If reading past EOF,
38	* act as if there is a hole up to the file maximum size.
39	*/
40	mutex_lock(&zi->i_truncate_mutex);
41	iomap->bdev = inode->i_sb->s_bdev;
42	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
43	isize = i_size_read(inode);
44	if (iomap->offset >= isize) {
45	iomap->type = IOMAP_HOLE;
46	iomap->addr = IOMAP_NULL_ADDR;
47	iomap->length = length;
48	} else {
49	iomap->type = IOMAP_MAPPED;
50	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
51	iomap->length = isize - iomap->offset;
52	}
53	mutex_unlock(lock: &zi->i_truncate_mutex);
54
55	trace_zonefs_iomap_begin(inode, iomap);
56
57	return `0`;
58	}
59
60	static const struct iomap_ops zonefs_read_iomap_ops = {
61	.iomap_begin = zonefs_read_iomap_begin,
62	};
63
64	static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
65	loff_t length, unsigned int flags,
66	struct iomap iomap, struct* iomap *srcmap)
67	{
68	struct zonefs_inode_info *zi = ZONEFS_I(inode);
69	struct zonefs_zone *z = zonefs_inode_zone(inode);
70	struct super_block *sb = inode->i_sb;
71	loff_t isize;
72
73	/ All write I/Os should always be within the file maximum size /
74	if (WARN_ON_ONCE(offset + length > z->z_capacity))
75	return -EIO;
76
77	/*
78	* Sequential zones can only accept direct writes. This is already
79	* checked when writes are issued, so warn if we see a page writeback
80	* operation.
81	*/
82	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
83	return -EIO;
84
85	/*
86	* For conventional zones, all blocks are always mapped. For sequential
87	* zones, all blocks after always mapped below the inode size (zone
88	* write pointer) and unwritten beyond.
89	*/
90	mutex_lock(&zi->i_truncate_mutex);
91	iomap->bdev = inode->i_sb->s_bdev;
92	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
93	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
94	isize = i_size_read(inode);
95	if (iomap->offset >= isize) {
96	iomap->type = IOMAP_UNWRITTEN;
97	iomap->length = z->z_capacity - iomap->offset;
98	} else {
99	iomap->type = IOMAP_MAPPED;
100	iomap->length = isize - iomap->offset;
101	}
102	mutex_unlock(lock: &zi->i_truncate_mutex);
103
104	trace_zonefs_iomap_begin(inode, iomap);
105
106	return `0`;
107	}
108
109	static const struct iomap_ops zonefs_write_iomap_ops = {
110	.iomap_begin = zonefs_write_iomap_begin,
111	};
112
113	static int zonefs_read_folio(struct file unused, struct* folio *folio)
114	{
115	iomap_bio_read_folio(folio, ops: &zonefs_read_iomap_ops);
116	return `0`;
117	}
118
119	static void zonefs_readahead(struct readahead_control *rac)
120	{
121	iomap_bio_readahead(rac, ops: &zonefs_read_iomap_ops);
122	}
123
124	/*
125	* Map blocks for page writeback. This is used only on conventional zone files,
126	* which implies that the page range can only be within the fixed inode size.
127	*/
128	static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
129	struct folio folio, u64 offset, unsigned* len, u64 end_pos)
130	{
131	struct zonefs_zone *z = zonefs_inode_zone(inode: wpc->inode);
132
133	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
134	return -EIO;
135	if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
136	return -EIO;
137
138	/ If the mapping is already OK, nothing needs to be done /
139	if (offset < wpc->iomap.offset \|\|
140	offset >= wpc->iomap.offset + wpc->iomap.length) {
141	int error;
142
143	error = zonefs_write_iomap_begin(inode: wpc->inode, offset,
144	length: z->z_capacity - offset, IOMAP_WRITE,
145	iomap: &wpc->iomap, NULL);
146	if (error)
147	return error;
148	}
149
150	return iomap_add_to_ioend(wpc, folio, pos: offset, end_pos, dirty_len: len);
151	}
152
153	static const struct iomap_writeback_ops zonefs_writeback_ops = {
154	.writeback_range = zonefs_writeback_range,
155	.writeback_submit = iomap_ioend_writeback_submit,
156	};
157
158	static int zonefs_writepages(struct address_space *mapping,
159	struct writeback_control *wbc)
160	{
161	struct iomap_writepage_ctx wpc = {
162	.inode = mapping->host,
163	.wbc = wbc,
164	.ops = &zonefs_writeback_ops,
165	};
166
167	return iomap_writepages(wpc: &wpc);
168	}
169
170	static int zonefs_swap_activate(struct swap_info_struct *sis,
171	struct file swap_file, sector_t span)
172	{
173	struct inode *inode = file_inode(f: swap_file);
174
175	if (zonefs_inode_is_seq(inode)) {
176	zonefs_err(inode->i_sb,
177	"swap file: not a conventional zone file\n");
178	return -EINVAL;
179	}
180
181	return iomap_swapfile_activate(sis, swap_file, pagespan: span,
182	ops: &zonefs_read_iomap_ops);
183	}
184
185	const struct address_space_operations zonefs_file_aops = {
186	.read_folio = zonefs_read_folio,
187	.readahead = zonefs_readahead,
188	.writepages = zonefs_writepages,
189	.dirty_folio = iomap_dirty_folio,
190	.release_folio = iomap_release_folio,
191	.invalidate_folio = iomap_invalidate_folio,
192	.migrate_folio = filemap_migrate_folio,
193	.is_partially_uptodate = iomap_is_partially_uptodate,
194	.error_remove_folio = generic_error_remove_folio,
195	.swap_activate = zonefs_swap_activate,
196	};
197
198	int zonefs_file_truncate(struct inode *inode, loff_t isize)
199	{
200	struct zonefs_inode_info *zi = ZONEFS_I(inode);
201	struct zonefs_zone *z = zonefs_inode_zone(inode);
202	loff_t old_isize;
203	enum req_op op;
204	int ret = `0`;
205
206	/*
207	* Only sequential zone files can be truncated and truncation is allowed
208	* only down to a 0 size, which is equivalent to a zone reset, and to
209	* the maximum file size, which is equivalent to a zone finish.
210	*/
211	if (!zonefs_zone_is_seq(z))
212	return -EPERM;
213
214	if (!isize)
215	op = REQ_OP_ZONE_RESET;
216	else if (isize == z->z_capacity)
217	op = REQ_OP_ZONE_FINISH;
218	else
219	return -EPERM;
220
221	inode_dio_wait(inode);
222
223	/ Serialize against page faults /
224	filemap_invalidate_lock(mapping: inode->i_mapping);
225
226	/ Serialize against zonefs_iomap_begin() /
227	mutex_lock(&zi->i_truncate_mutex);
228
229	old_isize = i_size_read(inode);
230	if (isize == old_isize)
231	goto unlock;
232
233	ret = zonefs_inode_zone_mgmt(inode, op);
234	if (ret)
235	goto unlock;
236
237	/*
238	* If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
239	* take care of open zones.
240	*/
241	if (z->z_flags & ZONEFS_ZONE_OPEN) {
242	/*
243	* Truncating a zone to EMPTY or FULL is the equivalent of
244	* closing the zone. For a truncation to 0, we need to
245	* re-open the zone to ensure new writes can be processed.
246	* For a truncation to the maximum file size, the zone is
247	* closed and writes cannot be accepted anymore, so clear
248	* the open flag.
249	*/
250	if (!isize)
251	ret = zonefs_inode_zone_mgmt(inode, op: REQ_OP_ZONE_OPEN);
252	else
253	z->z_flags &= ~ZONEFS_ZONE_OPEN;
254	}
255
256	zonefs_update_stats(inode, new_isize: isize);
257	truncate_setsize(inode, newsize: isize);
258	z->z_wpoffset = isize;
259	zonefs_inode_account_active(inode);
260
261	unlock:
262	mutex_unlock(lock: &zi->i_truncate_mutex);
263	filemap_invalidate_unlock(mapping: inode->i_mapping);
264
265	return ret;
266	}
267
268	static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
269	int datasync)
270	{
271	struct inode *inode = file_inode(f: file);
272	int ret = `0`;
273
274	if (unlikely(IS_IMMUTABLE(inode)))
275	return -EPERM;
276
277	/*
278	* Since only direct writes are allowed in sequential files, page cache
279	* flush is needed only for conventional zone files.
280	*/
281	if (zonefs_inode_is_cnv(inode))
282	ret = file_write_and_wait_range(file, start, end);
283	if (!ret)
284	ret = blkdev_issue_flush(bdev: inode->i_sb->s_bdev);
285
286	if (ret)
287	zonefs_io_error(inode, write: true);
288
289	return ret;
290	}
291
292	static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
293	{
294	struct inode *inode = file_inode(f: vmf->vma->vm_file);
295	vm_fault_t ret;
296
297	if (unlikely(IS_IMMUTABLE(inode)))
298	return VM_FAULT_SIGBUS;
299
300	/*
301	* Sanity check: only conventional zone files can have shared
302	* writeable mappings.
303	*/
304	if (zonefs_inode_is_seq(inode))
305	return VM_FAULT_NOPAGE;
306
307	sb_start_pagefault(sb: inode->i_sb);
308	file_update_time(file: vmf->vma->vm_file);
309
310	/ Serialize against truncates /
311	filemap_invalidate_lock_shared(mapping: inode->i_mapping);
312	ret = iomap_page_mkwrite(vmf, ops: &zonefs_write_iomap_ops, NULL);
313	filemap_invalidate_unlock_shared(mapping: inode->i_mapping);
314
315	sb_end_pagefault(sb: inode->i_sb);
316	return ret;
317	}
318
319	static const struct vm_operations_struct zonefs_file_vm_ops = {
320	.fault = filemap_fault,
321	.map_pages = filemap_map_pages,
322	.page_mkwrite = zonefs_filemap_page_mkwrite,
323	};
324
325	static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
326	{
327	struct file *file = desc->file;
328
329	/*
330	* Conventional zones accept random writes, so their files can support
331	* shared writable mappings. For sequential zone files, only read
332	* mappings are possible since there are no guarantees for write
333	* ordering between msync() and page cache writeback.
334	*/
335	if (zonefs_inode_is_seq(inode: file_inode(f: file)) &&
336	(desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
337	return -EINVAL;
338
339	file_accessed(file);
340	desc->vm_ops = &zonefs_file_vm_ops;
341
342	return `0`;
343	}
344
345	static loff_t zonefs_file_llseek(struct file file, loff_t offset, int* whence)
346	{
347	loff_t isize = i_size_read(inode: file_inode(f: file));
348
349	/*
350	* Seeks are limited to below the zone size for conventional zones
351	* and below the zone write pointer for sequential zones. In both
352	* cases, this limit is the inode size.
353	*/
354	return generic_file_llseek_size(file, offset, whence, maxsize: isize, eof: isize);
355	}
356
357	static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
358	int error, unsigned int flags)
359	{
360	struct inode *inode = file_inode(f: iocb->ki_filp);
361	struct zonefs_inode_info *zi = ZONEFS_I(inode);
362
363	if (error) {
364	/*
365	* For Sync IOs, error recovery is called from
366	* zonefs_file_dio_write().
367	*/
368	if (!is_sync_kiocb(kiocb: iocb))
369	zonefs_io_error(inode, write: true);
370	return error;
371	}
372
373	if (size && zonefs_inode_is_seq(inode)) {
374	/*
375	* Note that we may be seeing completions out of order,
376	* but that is not a problem since a write completed
377	* successfully necessarily means that all preceding writes
378	* were also successful. So we can safely increase the inode
379	* size to the write end location.
380	*/
381	mutex_lock(&zi->i_truncate_mutex);
382	if (i_size_read(inode) < iocb->ki_pos + size) {
383	zonefs_update_stats(inode, new_isize: iocb->ki_pos + size);
384	zonefs_i_size_write(inode, isize: iocb->ki_pos + size);
385	}
386	mutex_unlock(lock: &zi->i_truncate_mutex);
387	}
388
389	return `0`;
390	}
391
392	static const struct iomap_dio_ops zonefs_write_dio_ops = {
393	.end_io = zonefs_file_write_dio_end_io,
394	};
395
396	/*
397	* Do not exceed the LFS limits nor the file zone size. If pos is under the
398	* limit it becomes a short access. If it exceeds the limit, return -EFBIG.
399	*/
400	static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
401	loff_t count)
402	{
403	struct inode *inode = file_inode(f: file);
404	struct zonefs_zone *z = zonefs_inode_zone(inode);
405	loff_t limit = rlimit(RLIMIT_FSIZE);
406	loff_t max_size = z->z_capacity;
407
408	if (limit != RLIM_INFINITY) {
409	if (pos >= limit) {
410	send_sig(SIGXFSZ, current, `0`);
411	return -EFBIG;
412	}
413	count = min(count, limit - pos);
414	}
415
416	if (!(file->f_flags & O_LARGEFILE))
417	max_size = min_t(loff_t, MAX_NON_LFS, max_size);
418
419	if (unlikely(pos >= max_size))
420	return -EFBIG;
421
422	return min(count, max_size - pos);
423	}
424
425	static ssize_t zonefs_write_checks(struct kiocb iocb, struct* iov_iter *from)
426	{
427	struct file *file = iocb->ki_filp;
428	struct inode *inode = file_inode(f: file);
429	struct zonefs_inode_info *zi = ZONEFS_I(inode);
430	struct zonefs_zone *z = zonefs_inode_zone(inode);
431	loff_t count;
432
433	if (IS_SWAPFILE(inode))
434	return -ETXTBSY;
435
436	if (!iov_iter_count(i: from))
437	return `0`;
438
439	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
440	return -EINVAL;
441
442	if (iocb->ki_flags & IOCB_APPEND) {
443	if (zonefs_zone_is_cnv(z))
444	return -EINVAL;
445	mutex_lock(&zi->i_truncate_mutex);
446	iocb->ki_pos = z->z_wpoffset;
447	mutex_unlock(lock: &zi->i_truncate_mutex);
448	}
449
450	count = zonefs_write_check_limits(file, pos: iocb->ki_pos,
451	count: iov_iter_count(i: from));
452	if (count < `0`)
453	return count;
454
455	iov_iter_truncate(i: from, count);
456	return iov_iter_count(i: from);
457	}
458
459	/*
460	* Handle direct writes. For sequential zone files, this is the only possible
461	* write path. For these files, check that the user is issuing writes
462	* sequentially from the end of the file. This code assumes that the block layer
463	* delivers write requests to the device in sequential order. This is always the
464	* case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
465	* elevator feature is being used (e.g. mq-deadline). The block layer always
466	* automatically select such an elevator for zoned block devices during the
467	* device initialization.
468	*/
469	static ssize_t zonefs_file_dio_write(struct kiocb iocb, struct* iov_iter *from)
470	{
471	struct inode *inode = file_inode(f: iocb->ki_filp);
472	struct zonefs_inode_info *zi = ZONEFS_I(inode);
473	struct zonefs_zone *z = zonefs_inode_zone(inode);
474	struct super_block *sb = inode->i_sb;
475	ssize_t ret, count;
476
477	/*
478	* For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
479	* as this can cause write reordering (e.g. the first aio gets EAGAIN
480	* on the inode lock but the second goes through but is now unaligned).
481	*/
482	if (zonefs_zone_is_seq(z) && !is_sync_kiocb(kiocb: iocb) &&
483	(iocb->ki_flags & IOCB_NOWAIT))
484	return -EOPNOTSUPP;
485
486	if (iocb->ki_flags & IOCB_NOWAIT) {
487	if (!inode_trylock(inode))
488	return -EAGAIN;
489	} else {
490	inode_lock(inode);
491	}
492
493	count = zonefs_write_checks(iocb, from);
494	if (count <= `0`) {
495	ret = count;
496	goto inode_unlock;
497	}
498
499	if ((iocb->ki_pos \| count) & (sb->s_blocksize - `1`)) {
500	ret = -EINVAL;
501	goto inode_unlock;
502	}
503
504	/ Enforce sequential writes (append only) in sequential zones /
505	if (zonefs_zone_is_seq(z)) {
506	mutex_lock(&zi->i_truncate_mutex);
507	if (iocb->ki_pos != z->z_wpoffset) {
508	mutex_unlock(lock: &zi->i_truncate_mutex);
509	ret = -EINVAL;
510	goto inode_unlock;
511	}
512	/*
513	* Advance the zone write pointer offset. This assumes that the
514	* IO will succeed, which is OK to do because we do not allow
515	* partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
516	* fails, the error path will correct the write pointer offset.
517	*/
518	z->z_wpoffset += count;
519	zonefs_inode_account_active(inode);
520	mutex_unlock(lock: &zi->i_truncate_mutex);
521	}
522
523	/*
524	* iomap_dio_rw() may return ENOTBLK if there was an issue with
525	* page invalidation. Overwrite that error code with EBUSY so that
526	* the user can make sense of the error.
527	*/
528	ret = iomap_dio_rw(iocb, iter: from, ops: &zonefs_write_iomap_ops,
529	dops: &zonefs_write_dio_ops, dio_flags: `0`, NULL, done_before: `0`);
530	if (ret == -ENOTBLK)
531	ret = -EBUSY;
532
533	/*
534	* For a failed IO or partial completion, trigger error recovery
535	* to update the zone write pointer offset to a correct value.
536	* For asynchronous IOs, zonefs_file_write_dio_end_io() may already
537	* have executed error recovery if the IO already completed when we
538	* reach here. However, we cannot know that and execute error recovery
539	* again (that will not change anything).
540	*/
541	if (zonefs_zone_is_seq(z)) {
542	if (ret > `0` && ret != count)
543	ret = -EIO;
544	if (ret < `0` && ret != -EIOCBQUEUED)
545	zonefs_io_error(inode, write: true);
546	}
547
548	inode_unlock:
549	inode_unlock(inode);
550
551	return ret;
552	}
553
554	static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
555	struct iov_iter *from)
556	{
557	struct inode *inode = file_inode(f: iocb->ki_filp);
558	ssize_t ret;
559
560	/*
561	* Direct IO writes are mandatory for sequential zone files so that the
562	* write IO issuing order is preserved.
563	*/
564	if (zonefs_inode_is_seq(inode))
565	return -EIO;
566
567	if (iocb->ki_flags & IOCB_NOWAIT) {
568	if (!inode_trylock(inode))
569	return -EAGAIN;
570	} else {
571	inode_lock(inode);
572	}
573
574	ret = zonefs_write_checks(iocb, from);
575	if (ret <= `0`)
576	goto inode_unlock;
577
578	ret = iomap_file_buffered_write(iocb, from, ops: &zonefs_write_iomap_ops,
579	NULL, NULL);
580	if (ret == -EIO)
581	zonefs_io_error(inode, write: true);
582
583	inode_unlock:
584	inode_unlock(inode);
585	if (ret > `0`)
586	ret = generic_write_sync(iocb, count: ret);
587
588	return ret;
589	}
590
591	static ssize_t zonefs_file_write_iter(struct kiocb iocb, struct* iov_iter *from)
592	{
593	struct inode *inode = file_inode(f: iocb->ki_filp);
594	struct zonefs_zone *z = zonefs_inode_zone(inode);
595
596	if (unlikely(IS_IMMUTABLE(inode)))
597	return -EPERM;
598
599	if (sb_rdonly(sb: inode->i_sb))
600	return -EROFS;
601
602	/ Write operations beyond the zone capacity are not allowed /
603	if (iocb->ki_pos >= z->z_capacity)
604	return -EFBIG;
605
606	if (iocb->ki_flags & IOCB_DIRECT) {
607	ssize_t ret = zonefs_file_dio_write(iocb, from);
608
609	if (ret != -ENOTBLK)
610	return ret;
611	}
612
613	return zonefs_file_buffered_write(iocb, from);
614	}
615
616	static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
617	int error, unsigned int flags)
618	{
619	if (error) {
620	zonefs_io_error(inode: file_inode(f: iocb->ki_filp), write: false);
621	return error;
622	}
623
624	return `0`;
625	}
626
627	static const struct iomap_dio_ops zonefs_read_dio_ops = {
628	.end_io = zonefs_file_read_dio_end_io,
629	};
630
631	static ssize_t zonefs_file_read_iter(struct kiocb iocb, struct* iov_iter *to)
632	{
633	struct inode *inode = file_inode(f: iocb->ki_filp);
634	struct zonefs_inode_info *zi = ZONEFS_I(inode);
635	struct zonefs_zone *z = zonefs_inode_zone(inode);
636	struct super_block *sb = inode->i_sb;
637	loff_t isize;
638	ssize_t ret;
639
640	/ Offline zones cannot be read /
641	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & `0777`)))
642	return -EPERM;
643
644	if (iocb->ki_pos >= z->z_capacity)
645	return `0`;
646
647	if (iocb->ki_flags & IOCB_NOWAIT) {
648	if (!inode_trylock_shared(inode))
649	return -EAGAIN;
650	} else {
651	inode_lock_shared(inode);
652	}
653
654	/ Limit read operations to written data /
655	mutex_lock(&zi->i_truncate_mutex);
656	isize = i_size_read(inode);
657	if (iocb->ki_pos >= isize) {
658	mutex_unlock(lock: &zi->i_truncate_mutex);
659	ret = `0`;
660	goto inode_unlock;
661	}
662	iov_iter_truncate(i: to, count: isize - iocb->ki_pos);
663	mutex_unlock(lock: &zi->i_truncate_mutex);
664
665	if (iocb->ki_flags & IOCB_DIRECT) {
666	size_t count = iov_iter_count(i: to);
667
668	if ((iocb->ki_pos \| count) & (sb->s_blocksize - `1`)) {
669	ret = -EINVAL;
670	goto inode_unlock;
671	}
672	file_accessed(file: iocb->ki_filp);
673	ret = iomap_dio_rw(iocb, iter: to, ops: &zonefs_read_iomap_ops,
674	dops: &zonefs_read_dio_ops, dio_flags: `0`, NULL, done_before: `0`);
675	} else {
676	ret = generic_file_read_iter(iocb, to);
677	if (ret == -EIO)
678	zonefs_io_error(inode, write: false);
679	}
680
681	inode_unlock:
682	inode_unlock_shared(inode);
683
684	return ret;
685	}
686
687	static ssize_t zonefs_file_splice_read(struct file in, loff_t ppos,
688	struct pipe_inode_info *pipe,
689	size_t len, unsigned int flags)
690	{
691	struct inode *inode = file_inode(f: in);
692	struct zonefs_inode_info *zi = ZONEFS_I(inode);
693	struct zonefs_zone *z = zonefs_inode_zone(inode);
694	loff_t isize;
695	ssize_t ret = `0`;
696
697	/ Offline zones cannot be read /
698	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & `0777`)))
699	return -EPERM;
700
701	if (*ppos >= z->z_capacity)
702	return `0`;
703
704	inode_lock_shared(inode);
705
706	/ Limit read operations to written data /
707	mutex_lock(&zi->i_truncate_mutex);
708	isize = i_size_read(inode);
709	if (*ppos >= isize)
710	len = `0`;
711	else
712	len = min_t(loff_t, len, isize - *ppos);
713	mutex_unlock(lock: &zi->i_truncate_mutex);
714
715	if (len > `0`) {
716	ret = filemap_splice_read(in, ppos, pipe, len, flags);
717	if (ret == -EIO)
718	zonefs_io_error(inode, write: false);
719	}
720
721	inode_unlock_shared(inode);
722	return ret;
723	}
724
725	/*
726	* Write open accounting is done only for sequential files.
727	*/
728	static inline bool zonefs_seq_file_need_wro(struct inode *inode,
729	struct file *file)
730	{
731	if (zonefs_inode_is_cnv(inode))
732	return false;
733
734	if (!(file->f_mode & FMODE_WRITE))
735	return false;
736
737	return true;
738	}
739
740	static int zonefs_seq_file_write_open(struct inode *inode)
741	{
742	struct zonefs_inode_info *zi = ZONEFS_I(inode);
743	struct zonefs_zone *z = zonefs_inode_zone(inode);
744	int ret = `0`;
745
746	mutex_lock(&zi->i_truncate_mutex);
747
748	if (!zi->i_wr_refcnt) {
749	struct zonefs_sb_info *sbi = ZONEFS_SB(sb: inode->i_sb);
750	unsigned int wro = atomic_inc_return(v: &sbi->s_wro_seq_files);
751
752	if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
753
754	if (sbi->s_max_wro_seq_files
755	&& wro > sbi->s_max_wro_seq_files) {
756	atomic_dec(v: &sbi->s_wro_seq_files);
757	ret = -EBUSY;
758	goto unlock;
759	}
760
761	if (i_size_read(inode) < z->z_capacity) {
762	ret = zonefs_inode_zone_mgmt(inode,
763	op: REQ_OP_ZONE_OPEN);
764	if (ret) {
765	atomic_dec(v: &sbi->s_wro_seq_files);
766	goto unlock;
767	}
768	z->z_flags \|= ZONEFS_ZONE_OPEN;
769	zonefs_inode_account_active(inode);
770	}
771	}
772	}
773
774	zi->i_wr_refcnt++;
775
776	unlock:
777	mutex_unlock(lock: &zi->i_truncate_mutex);
778
779	return ret;
780	}
781
782	static int zonefs_file_open(struct inode inode, struct* file *file)
783	{
784	int ret;
785
786	file->f_mode \|= FMODE_CAN_ODIRECT;
787	ret = generic_file_open(inode, filp: file);
788	if (ret)
789	return ret;
790
791	if (zonefs_seq_file_need_wro(inode, file))
792	return zonefs_seq_file_write_open(inode);
793
794	return `0`;
795	}
796
797	static void zonefs_seq_file_write_close(struct inode *inode)
798	{
799	struct zonefs_inode_info *zi = ZONEFS_I(inode);
800	struct zonefs_zone *z = zonefs_inode_zone(inode);
801	struct super_block *sb = inode->i_sb;
802	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
803	int ret = `0`;
804
805	mutex_lock(&zi->i_truncate_mutex);
806
807	zi->i_wr_refcnt--;
808	if (zi->i_wr_refcnt)
809	goto unlock;
810
811	/*
812	* The file zone may not be open anymore (e.g. the file was truncated to
813	* its maximum size or it was fully written). For this case, we only
814	* need to decrement the write open count.
815	*/
816	if (z->z_flags & ZONEFS_ZONE_OPEN) {
817	ret = zonefs_inode_zone_mgmt(inode, op: REQ_OP_ZONE_CLOSE);
818	if (ret) {
819	__zonefs_io_error(inode, write: false);
820	/*
821	* Leaving zones explicitly open may lead to a state
822	* where most zones cannot be written (zone resources
823	* exhausted). So take preventive action by remounting
824	* read-only.
825	*/
826	if (z->z_flags & ZONEFS_ZONE_OPEN &&
827	!(sb->s_flags & SB_RDONLY)) {
828	zonefs_warn(sb,
829	"closing zone at %llu failed %d\n",
830	z->z_sector, ret);
831	zonefs_warn(sb,
832	"remounting filesystem read-only\n");
833	sb->s_flags \|= SB_RDONLY;
834	}
835	goto unlock;
836	}
837
838	z->z_flags &= ~ZONEFS_ZONE_OPEN;
839	zonefs_inode_account_active(inode);
840	}
841
842	atomic_dec(v: &sbi->s_wro_seq_files);
843
844	unlock:
845	mutex_unlock(lock: &zi->i_truncate_mutex);
846	}
847
/*
 * ->release handler: undo the write open accounting when it applies to this
 * file. Always returns 0.
 */
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_seq_file_need_wro(inode, file))
		zonefs_seq_file_write_close(inode);

	return 0;
}
861
862	const struct file_operations zonefs_file_operations = {
863	.open = zonefs_file_open,
864	.release = zonefs_file_release,
865	.fsync = zonefs_file_fsync,
866	.mmap_prepare = zonefs_file_mmap_prepare,
867	.llseek = zonefs_file_llseek,
868	.read_iter = zonefs_file_read_iter,
869	.write_iter = zonefs_file_write_iter,
870	.splice_read = zonefs_file_splice_read,
871	.splice_write = iter_file_splice_write,
872	.iopoll = iocb_bio_iopoll,
873	};
874

source code of linux/fs/zonefs/file.c