md.h source code [linux/drivers/md/md.h]

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
   md.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

*/
7
8	#ifndef _MD_MD_H
9	#define _MD_MD_H
10
11	#include <linux/blkdev.h>
12	#include <linux/backing-dev.h>
13	#include <linux/badblocks.h>
14	#include <linux/kobject.h>
15	#include <linux/list.h>
16	#include <linux/mm.h>
17	#include <linux/mutex.h>
18	#include <linux/timer.h>
19	#include <linux/wait.h>
20	#include <linux/workqueue.h>
21	#include <linux/raid/md_u.h>
22	#include <trace/events/block.h>
23
24	#define MaxSector (~(sector_t)0)
25
/* Kind of submodule described by a struct md_submodule_head. */
enum md_submodule_type {
	MD_PERSONALITY = 0,
	MD_CLUSTER,
	MD_BITMAP,
};
31
32	enum md_submodule_id {
33	ID_LINEAR = LEVEL_LINEAR,
34	ID_RAID0 = `0`,
35	ID_RAID1 = `1`,
36	ID_RAID4 = `4`,
37	ID_RAID5 = `5`,
38	ID_RAID6 = `6`,
39	ID_RAID10 = `10`,
40	ID_CLUSTER,
41	ID_BITMAP,
42	ID_LLBITMAP,
43	ID_BITMAP_NONE,
44	};
45
46	struct md_submodule_head {
47	enum md_submodule_type type;
48	enum md_submodule_id id;
49	const char *name;
50	struct module *owner;
51	};
52
/*
 * These flags should really be called "NO_RETRY" rather than
 * "FAILFAST" because they don't make any promise about time lapse,
 * only about the number of retries, which will be zero.
 * REQ_FAILFAST_DRIVER is not included because
 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
 * seems to suggest that the errors it avoids retrying should usually
 * be retried.
 */
#define	MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
63
/* Status of sync thread. */
enum sync_action {
	/*
	 * Represented by MD_RECOVERY_SYNC, started when:
	 * 1) after assemble, sync data from first rdev to other copies, this
	 * must be done first before other sync actions and will only execute
	 * once;
	 * 2) resize the array (notice that this is not reshape), sync data for
	 * the new range;
	 */
	ACTION_RESYNC,
	/*
	 * Represented by MD_RECOVERY_RECOVER, started when:
	 * 1) for new replacement, sync data based on the replace rdev or
	 * available copies from other rdev;
	 * 2) for new member disk while the array is degraded, sync data from
	 * other rdev;
	 * 3) reassemble after power failure or re-add a hot removed rdev, sync
	 * data from first rdev to other copies based on bitmap;
	 */
	ACTION_RECOVER,
	/*
	 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED |
	 * MD_RECOVERY_CHECK, started when user echoes "check" to sysfs api
	 * sync_action, used to check if data copies from different rdevs are
	 * the same. The number of mismatched sectors will be exported to user
	 * by sysfs api mismatch_cnt;
	 */
	ACTION_CHECK,
	/*
	 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, started when
	 * user echoes "repair" to sysfs api sync_action, usually paired with
	 * ACTION_CHECK, used to force syncing data once user found that there
	 * is inconsistent data;
	 */
	ACTION_REPAIR,
	/*
	 * Represented by MD_RECOVERY_RESHAPE, started when new member disk is
	 * added to the conf, notice that this is different from spares or
	 * replacement;
	 */
	ACTION_RESHAPE,
	/*
	 * Represented by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action
	 * or internal usage like setting the array read-only, will forbid above
	 * actions.
	 */
	ACTION_FROZEN,
	/*
	 * All above actions don't match.
	 */
	ACTION_IDLE,
	NR_SYNC_ACTIONS,
};
118
119	/*
120	* The struct embedded in rdev is used to serialize IO.
121	*/
122	struct serial_in_rdev {
123	struct rb_root_cached serial_rb;
124	spinlock_t serial_lock;
125	wait_queue_head_t serial_io_wait;
126	};
127
128	/*
129	* MD's 'extended' device
130	*/
131	struct md_rdev {
132	struct list_head same_set; / RAID devices within the same set /
133
134	sector_t sectors; / Device size (in 512bytes sectors) /
135	struct mddev mddev; /* RAID array if running /
136	unsigned long last_events; / IO event timestamp /
137
138	/*
139	* If meta_bdev is non-NULL, it means that a separate device is
140	* being used to store the metadata (superblock/bitmap) which
141	* would otherwise be contained on the same device as the data (bdev).
142	*/
143	struct block_device *meta_bdev;
144	struct block_device bdev; /* block device handle /
145	struct file bdev_file; /* Handle from open for bdev /
146
147	struct page sb_page, bb_page;
148	int sb_loaded;
149	__u64 sb_events;
150	sector_t data_offset; / start of data in array /
151	sector_t new_data_offset;/ only relevant while reshaping /
152	sector_t sb_start; / offset of the super block (in 512byte sectors) /
153	int sb_size; / bytes in the superblock /
154	int preferred_minor; / autorun support /
155
156	struct kobject kobj;
157
158	/ A device can be in one of three states based on two flags:*
159	* Not working: faulty==1 in_sync==0
160	* Fully working: faulty==0 in_sync==1
161	* Working, but not
162	* in sync with array
163	* faulty==0 in_sync==0
164	*
165	* It can never have faulty==1, in_sync==1
166	* This reduces the burden of testing multiple flags in many cases
167	*/
168
169	unsigned long flags; / bit set of 'enum flag_bits' bits. /
170	wait_queue_head_t blocked_wait;
171
172	int desc_nr; / descriptor index in the superblock /
173	int raid_disk; / role of device in array /
174	int new_raid_disk; / role that the device will have in*
175	* the array after a level-change completes.
176	*/
177	int saved_raid_disk; / role that device used to have in the*
178	* array and could again if we did a partial
179	* resync from the bitmap
180	*/
181	union {
182	sector_t recovery_offset;/ If this device has been partially*
183	* recovered, this is where we were
184	* up to.
185	*/
186	sector_t journal_tail; / If this device is a journal device,*
187	* this is the journal tail (journal
188	* recovery start point)
189	*/
190	};
191
192	atomic_t nr_pending; / number of pending requests.*
193	* only maintained for arrays that
194	* support hot removal
195	*/
196	atomic_t read_errors; / number of consecutive read errors that*
197	* we have tried to ignore.
198	*/
199	time64_t last_read_error; / monotonic time since our*
200	* last read error
201	*/
202	atomic_t corrected_errors; / number of corrected read errors,*
203	* for reporting to userspace and storing
204	* in superblock.
205	*/
206
207	struct serial_in_rdev serial; /* used for raid1 io serialization /
208
209	struct kernfs_node sysfs_state; /* handle for 'state'*
210	* sysfs entry */
211	/ handle for 'unacknowledged_bad_blocks' sysfs dentry /
212	struct kernfs_node *sysfs_unack_badblocks;
213	/ handle for 'bad_blocks' sysfs dentry /
214	struct kernfs_node *sysfs_badblocks;
215	struct badblocks badblocks;
216
217	struct {
218	short offset; / Offset from superblock to start of PPL.*
219	* Not used by external metadata. */
220	unsigned int size; / Size in sectors of the PPL space /
221	sector_t sector; / First sector of the PPL space /
222	} ppl;
223	};
/* Bit numbers stored in md_rdev.flags. */
enum flag_bits {
	Faulty,			/* device is known to have a fault */
	In_sync,		/* device is in_sync with rest of array */
	Bitmap_sync,		/* ..actually, not quite In_sync.  Need a
				 * bitmap-based recovery to get fully in sync.
				 * The bit is only meaningful before device
				 * has been passed to pers->hot_add_disk.
				 */
	WriteMostly,		/* Avoid reading if at all possible */
	AutoDetected,		/* added by auto-detect */
	Blocked,		/* An error occurred but has not yet
				 * been acknowledged by the metadata
				 * handler, so don't allow writes
				 * until it is cleared */
	WriteErrorSeen,		/* A write error has been seen on this
				 * device
				 */
	FaultRecorded,		/* Intermediate state for clearing
				 * Blocked.  The Fault is/will-be
				 * recorded in the metadata, but that
				 * metadata hasn't been stored safely
				 * on disk yet.
				 */
	BlockedBadBlocks,	/* A writer is blocked because they
				 * found an unacknowledged bad-block.
				 * This can safely be cleared at any
				 * time, and the writer will re-check.
				 * It may be set at any time, and at
				 * worst the writer will timeout and
				 * re-check.  So setting it as
				 * accurately as possible is good, but
				 * not absolutely critical.
				 */
	WantReplacement,	/* This device is a candidate to be
				 * hot-replaced, either because it has
				 * reported some faults, or because
				 * of explicit request.
				 */
	Replacement,		/* This device is a replacement for
				 * a want_replacement device with same
				 * raid_disk number.
				 */
	Candidate,		/* For clustered environments only:
				 * This device is seen locally but not
				 * by the whole cluster
				 */
	Journal,		/* This device is used as journal for
				 * raid-5/6.
				 * Usually, this device should be faster
				 * than other devices in the array
				 */
	ClusterRemove,
	ExternalBbl,		/* External metadata provides bad
				 * block management for a disk
				 */
	FailFast,		/* Minimal retries should be attempted on
				 * this device, so use REQ_FAILFAST_DEV.
				 * Also don't try to repair failed reads.
				 * It is expected that no bad block log
				 * is present.
				 */
	LastDev,		/* Seems to be the last working dev as
				 * it didn't fail, so don't use FailFast
				 * any more for metadata
				 */
	CollisionCheck,		/*
				 * check if there is collision between raid1
				 * serial bios.
				 */
	Nonrot,			/* non-rotational device (SSD) */
};
295
296	static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
297	sector_t first_bad, sector_t bad_sectors)
298	{
299	if (unlikely(rdev->badblocks.count)) {
300	int rv = badblocks_check(bb: &rdev->badblocks, s: rdev->data_offset + s,
301	sectors,
302	first_bad, bad_sectors);
303	if (rv)
304	*first_bad -= rdev->data_offset;
305	return rv;
306	}
307	return `0`;
308	}
309
310	static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
311	int sectors)
312	{
313	sector_t first_bad;
314	sector_t bad_sectors;
315
316	return is_badblock(rdev, s, sectors, first_bad: &first_bad, bad_sectors: &bad_sectors);
317	}
318
319	extern bool rdev_set_badblocks(struct md_rdev rdev, sector_t s, int* sectors,
320	int is_new);
321	extern void rdev_clear_badblocks(struct md_rdev rdev, sector_t s, int* sectors,
322	int is_new);
323	struct md_cluster_info;
324	struct md_cluster_operations;
325
/**
 * enum mddev_flags - md device flags.
 * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
 * @MD_CLOSING: If set, we are closing the array, do not open it then.
 * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
 * @MD_HAS_JOURNAL: The raid array has journal feature set.
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took
 *			       resync lock, need to release the lock.
 * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
 *			    calls to md_error() will never cause the array to
 *			    become failed.
 * @MD_HAS_PPL: The raid array has PPL feature set.
 * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
 * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
 *		   array is ready yet.
 * @MD_BROKEN: This is used to stop writes and mark array as failed.
 * @MD_DO_DELETE: NOTE(review): not described in this header; presumably
 *		   requests deletion of the device - confirm against md.c.
 * @MD_DELETED: This device is being deleted
 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
 */
enum mddev_flags {
	MD_ARRAY_FIRST_USE,
	MD_CLOSING,
	MD_JOURNAL_CLEAN,
	MD_HAS_JOURNAL,
	MD_CLUSTER_RESYNC_LOCKED,
	MD_FAILFAST_SUPPORTED,
	MD_HAS_PPL,
	MD_HAS_MULTIPLE_PPLS,
	MD_NOT_READY,
	MD_BROKEN,
	MD_DO_DELETE,
	MD_DELETED,
};
360
/* Bit numbers describing pending superblock work. */
enum mddev_sb_flags {
	MD_SB_CHANGE_DEVS,	/* Some device status has changed */
	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
};
367
368	#define NR_SERIAL_INFOS 8
369	/ record current range of serialize IOs /
370	struct serial_info {
371	struct rb_node node;
372	sector_t start; / start sector of rb node /
373	sector_t last; / end sector of rb node /
374	sector_t _subtree_last; / highest sector in subtree of rb node /
375	};
376
/*
 * mddev->curr_resync stores the current sector of the resync but
 * also has some overloaded values.
 */
enum {
	/* No resync in progress */
	MD_RESYNC_NONE = 0,
	/* Yielded to allow another conflicting resync to commence */
	MD_RESYNC_YIELDED = 1,
	/* Delayed to check that there is no conflict with another sync */
	MD_RESYNC_DELAYED = 2,
	/* Any value greater than or equal to this is in an active resync */
	MD_RESYNC_ACTIVE = 3,
};
391
392	struct mddev {
393	void *private;
394	struct md_personality *pers;
395	dev_t unit;
396	int md_minor;
397	struct list_head disks;
398	unsigned long flags;
399	unsigned long sb_flags;
400
401	int suspended;
402	struct mutex suspend_mutex;
403	struct percpu_ref active_io;
404	int ro;
405	int sysfs_active; / set when sysfs deletes*
406	* are happening, so run/
407	* takeover/stop are not safe
408	*/
409	struct gendisk gendisk; /* mdraid gendisk /
410	struct gendisk dm_gendisk; /* dm-raid gendisk /
411
412	struct kobject kobj;
413	int hold_active;
414	#define UNTIL_IOCTL 1
415	#define UNTIL_STOP 2
416
417	/ Superblock information /
418	int major_version,
419	minor_version,
420	patch_version;
421	int persistent;
422	int external; / metadata is*
423	* managed externally */
424	char metadata_type[`17`]; / externally set/
425	int chunk_sectors;
426	time64_t ctime, utime;
427	int level, layout;
428	char clevel[`16`];
429	int raid_disks;
430	int max_disks;
431	sector_t dev_sectors; / used size of*
432	* component devices */
433	sector_t array_sectors; / exported array size /
434	int external_size; / size managed*
435	* externally */
436	unsigned int logical_block_size;
437	__u64 events;
438	/ If the last 'event' was simply a clean->dirty transition, and*
439	* we didn't write it to the spares, then it is safe and simple
440	* to just decrement the event count on a dirty->clean transition.
441	* So we record that possibility here.
442	*/
443	int can_decrease_events;
444
445	char uuid[`16`];
446
447	/ If the array is being reshaped, we need to record the*
448	* new shape and an indication of where we are up to.
449	* This is written to the superblock.
450	* If reshape_position is MaxSector, then no reshape is happening (yet).
451	*/
452	sector_t reshape_position;
453	int delta_disks, new_level, new_layout;
454	int new_chunk_sectors;
455	int reshape_backwards;
456
457	struct md_thread __rcu thread; /* management thread /
458	struct md_thread __rcu sync_thread; /* doing resync or reconstruct /
459
460	/*
461	* Set when a sync operation is started. It holds this value even
462	* when the sync thread is "frozen" (interrupted) or "idle" (stopped
463	* or finished). It is overwritten when a new sync operation is begun.
464	*/
465	enum sync_action last_sync_action;
466	sector_t curr_resync; / last block scheduled /
467	/ As resync requests can complete out of order, we cannot easily track*
468	* how much resync has been completed. So we occasionally pause until
469	* everything completes, then set curr_resync_completed to curr_resync.
470	* As such it may be well behind the real resync mark, but it is a value
471	* we are certain of.
472	*/
473	sector_t curr_resync_completed;
474	unsigned long resync_mark; / a recent timestamp /
475	sector_t resync_mark_cnt;/ blocks written at resync_mark /
476	sector_t curr_mark_cnt; / blocks scheduled now /
477
478	sector_t resync_max_sectors; / may be set by personality /
479
480	atomic64_t resync_mismatches; / count of sectors where*
481	* parity/replica mismatch found
482	*/
483
484	/ allow user-space to request suspension of IO to regions of the array /
485	sector_t suspend_lo;
486	sector_t suspend_hi;
487	/ if zero, use the system-wide default /
488	int sync_speed_min;
489	int sync_speed_max;
490	int sync_io_depth;
491
492	/ resync even though the same disks are shared among md-devices /
493	int parallel_resync;
494
495	int ok_start_degraded;
496
497	unsigned long recovery;
498	/ If a RAID personality determines that recovery (of a particular*
499	* device) will fail due to a read error on the source device, it
500	* takes a copy of this number and does not attempt recovery again
501	* until this number changes.
502	*/
503	int recovery_disabled;
504
505	int in_sync; / know to not need resync /
506	/ 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so*
507	* that we are never stopping an array while it is open.
508	* 'reconfig_mutex' protects all other reconfiguration.
509	* These locks are separate due to conflicting interactions
510	* with disk->open_mutex.
511	* Lock ordering is:
512	* reconfig_mutex -> disk->open_mutex
513	* disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open
514	*/
515	struct mutex open_mutex;
516	struct mutex reconfig_mutex;
517	atomic_t active; / general refcount /
518	atomic_t openers; / number of active opens /
519
520	int changed; / True if we might need to*
521	* reread partition info */
522	int degraded; / whether md should consider*
523	* adding a spare
524	*/
525
526	unsigned long normal_io_events; / IO event timestamp /
527	atomic_t recovery_active; / blocks scheduled, but not written /
528	wait_queue_head_t recovery_wait;
529	sector_t resync_offset;
530	sector_t resync_min; / user requested sync*
531	* starts here */
532	sector_t resync_max; / resync should pause*
533	* when it gets here */
534
535	struct kernfs_node sysfs_state; /* handle for 'array_state'*
536	* file in sysfs.
537	*/
538	struct kernfs_node sysfs_action; /* handle for 'sync_action' /
539	struct kernfs_node sysfs_completed; /handle for 'sync_completed' /*
540	struct kernfs_node sysfs_degraded; /handle for 'degraded' /*
541	struct kernfs_node sysfs_level; /handle for 'level' /*
542
543	/ used for delayed sysfs removal /
544	struct work_struct del_work;
545	/ used for register new sync thread /
546	struct work_struct sync_work;
547
548	/ "lock" protects:*
549	* flush_bio transition from NULL to !NULL
550	* rdev superblocks, events
551	* clearing MD_CHANGE_*
552	* in_sync - and related safemode and MD_CHANGE changes
553	* pers (also protected by reconfig_mutex and pending IO).
554	* clearing ->bitmap
555	* clearing ->bitmap_info.file
556	* changing ->resync_{min,max}
557	* setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
558	*/
559	spinlock_t lock;
560	wait_queue_head_t sb_wait; / for waiting on superblock updates /
561	atomic_t pending_writes; / number of active superblock writes /
562
563	unsigned int safemode; / if set, update "clean" superblock*
564	* when no writes pending.
565	*/
566	unsigned int safemode_delay;
567	struct timer_list safemode_timer;
568	struct percpu_ref writes_pending;
569	int sync_checkers; / # of threads checking writes_pending /
570
571	enum md_submodule_id bitmap_id;
572	void bitmap; /* the bitmap for the device /
573	struct bitmap_operations *bitmap_ops;
574	struct {
575	struct file file; /* the bitmap file /
576	loff_t offset; / offset from superblock of*
577	* start of bitmap. May be
578	* negative, but not '0'
579	* For external metadata, offset
580	* from start of device.
581	*/
582	unsigned long space; / space available at this offset /
583	loff_t default_offset; / this is the offset to use when*
584	* hot-adding a bitmap. It should
585	* eventually be settable by sysfs.
586	*/
587	unsigned long default_space; / space available at*
588	* default offset */
589	struct mutex mutex;
590	unsigned long chunksize;
591	unsigned long daemon_sleep; / how many jiffies between updates? /
592	unsigned long max_write_behind; / write-behind mode /
593	int external;
594	int nodes; / Maximum number of nodes in the cluster /
595	char cluster_name[`64`]; / Name of the cluster /
596	} bitmap_info;
597
598	atomic_t max_corr_read_errors; / max read retries /
599	struct list_head all_mddevs;
600
601	const struct attribute_group *to_remove;
602
603	struct bio_set bio_set;
604	struct bio_set sync_set; / for sync operations like*
605	* metadata and bitmap writes
606	*/
607	struct bio_set io_clone_set;
608
609	struct work_struct event_work; / used by dm to report failure event /
610	mempool_t *serial_info_pool;
611	void (sync_super)(struct* mddev mddev, struct* md_rdev *rdev);
612	struct md_cluster_info *cluster_info;
613	struct md_cluster_operations *cluster_ops;
614	unsigned int good_device_nr; / good device num within cluster raid /
615	unsigned int noio_flag; / for memalloc scope API /
616
617	/*
618	* Temporarily store rdev that will be finally removed when
619	* reconfig_mutex is unlocked, protected by reconfig_mutex.
620	*/
621	struct list_head deleting;
622
623	/ The sequence number for sync thread /
624	atomic_t sync_seq;
625
626	bool has_superblocks:`1`;
627	bool fail_last_dev:`1`;
628	bool serialize_policy:`1`;
629	};
630
/* Bit numbers stored in mddev->recovery. */
enum recovery_flags {
	/* flags for sync thread running status */

	/*
	 * set when one of sync action is set and new sync thread need to be
	 * registered, or just add/remove spares from conf.
	 */
	MD_RECOVERY_NEEDED,
	/* sync thread is running, or about to be started */
	MD_RECOVERY_RUNNING,
	/* sync thread needs to be aborted for some reason */
	MD_RECOVERY_INTR,
	/* sync thread is done and is waiting to be unregistered */
	MD_RECOVERY_DONE,
	/* running sync thread must abort immediately, and not restart */
	MD_RECOVERY_FROZEN,
	/* waiting for pers->start() to finish */
	MD_RECOVERY_WAIT,
	/* interrupted because io-error */
	MD_RECOVERY_ERROR,

	/* flags determines sync action, see details in enum sync_action */

	/* if just this flag is set, action is resync. */
	MD_RECOVERY_SYNC,
	/*
	 * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set,
	 * action is repair, means user requested resync.
	 */
	MD_RECOVERY_REQUESTED,
	/*
	 * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is
	 * check.
	 */
	MD_RECOVERY_CHECK,
	/* recovery, or need to try it */
	MD_RECOVERY_RECOVER,
	/* reshape */
	MD_RECOVERY_RESHAPE,
	/* remote node is running resync thread */
	MD_RESYNCING_REMOTE,
	/* raid456 lazy initial recover */
	MD_RECOVERY_LAZY_RECOVER,
};
675
/* Values compared against mddev->ro (see md_is_rdwr()). */
enum md_ro_state {
	MD_RDWR,
	MD_RDONLY,
	MD_AUTO_READ,
	MD_MAX_STATE
};
682
683	static inline bool md_is_rdwr(struct mddev *mddev)
684	{
685	return (mddev->ro == MD_RDWR);
686	}
687
688	static inline bool reshape_interrupted(struct mddev *mddev)
689	{
690	/ reshape never start /
691	if (mddev->reshape_position == MaxSector)
692	return false;
693
694	/ interrupted /
695	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
696	return true;
697
698	/ running reshape will be interrupted soon. /
699	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) \|\|
700	test_bit(MD_RECOVERY_INTR, &mddev->recovery) \|\|
701	test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
702	return true;
703
704	return false;
705	}
706
707	static inline int __must_check mddev_lock(struct mddev *mddev)
708	{
709	int ret;
710
711	ret = mutex_lock_interruptible(&mddev->reconfig_mutex);
712
713	/ MD_DELETED is set in do_md_stop with reconfig_mutex.*
714	* So check it here.
715	*/
716	if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
717	ret = -ENODEV;
718	mutex_unlock(lock: &mddev->reconfig_mutex);
719	}
720
721	return ret;
722	}
723
724	/ Sometimes we need to take the lock in a situation where*
725	* failure due to interrupts is not acceptable.
726	* It doesn't need to check MD_DELETED here, the owner which
727	* holds the lock here can't be stopped. And all paths can't
728	* call this function after do_md_stop.
729	*/
730	static inline void mddev_lock_nointr(struct mddev *mddev)
731	{
732	mutex_lock(&mddev->reconfig_mutex);
733	}
734
735	static inline int mddev_trylock(struct mddev *mddev)
736	{
737	int ret;
738
739	ret = mutex_trylock(&mddev->reconfig_mutex);
740	if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
741	ret = -ENODEV;
742	mutex_unlock(lock: &mddev->reconfig_mutex);
743	}
744	return ret;
745	}
/* Defined outside this header. */
extern void mddev_unlock(struct mddev *mddev);
747
748	struct md_personality
749	{
750	struct md_submodule_head head;
751
752	bool __must_check (make_request)(struct* mddev mddev, struct* bio *bio);
753	/*
754	* start up works that do NOT require md_thread. tasks that
755	* requires md_thread should go into start()
756	*/
757	int (run)(struct* mddev *mddev);
758	/ start up works that require md threads /
759	int (start)(struct* mddev *mddev);
760	void (free)(struct* mddev mddev, void* *priv);
761	void (status)(struct* seq_file seq, struct* mddev *mddev);
762	/ error_handler must set ->faulty and clear ->in_sync*
763	* if appropriate, and should abort recovery if needed
764	*/
765	void (error_handler)(struct* mddev mddev, struct* md_rdev *rdev);
766	int (hot_add_disk) (struct* mddev mddev, struct* md_rdev *rdev);
767	int (hot_remove_disk) (struct* mddev mddev, struct* md_rdev *rdev);
768	int (spare_active) (struct* mddev *mddev);
769	sector_t (sync_request)(struct* mddev *mddev, sector_t sector_nr,
770	sector_t max_sector, int *skipped);
771	int (resize) (struct* mddev *mddev, sector_t sectors);
772	sector_t (size) (struct* mddev mddev, sector_t sectors, int* raid_disks);
773	int (check_reshape) (struct* mddev *mddev);
774	int (start_reshape) (struct* mddev *mddev);
775	void (finish_reshape) (struct* mddev *mddev);
776	void (update_reshape_pos) (struct* mddev *mddev);
777	void (prepare_suspend) (struct* mddev *mddev);
778	/ quiesce suspends or resumes internal processing.*
779	* 1 - stop new actions and wait for action io to complete
780	* 0 - return to normal behaviour
781	*/
782	void (quiesce) (struct* mddev mddev, int* quiesce);
783	/ takeover is used to transition an array from one*
784	* personality to another. The new personality must be able
785	* to handle the data in the current layout.
786	* e.g. 2drive raid1 -> 2drive raid5
787	* ndrive raid5 -> degraded n+1drive raid6 with special layout
788	* If the takeover succeeds, a new 'private' structure is returned.
789	* This needs to be installed and then ->run used to activate the
790	* array.
791	*/
792	void (takeover) (struct mddev *mddev);
793	/ Changes the consistency policy of an active array. /
794	int (change_consistency_policy)(struct* mddev mddev, const* char *buf);
795	/ convert io ranges from array to bitmap /
796	void (bitmap_sector)(struct* mddev mddev, sector_t offset,
797	unsigned long *sectors);
798	};
799
800	struct md_sysfs_entry {
801	struct attribute attr;
802	ssize_t (show)(struct* mddev , char* *);
803	ssize_t (store)(struct* mddev , const* char *, size_t);
804	};
805
/*
 * NULL-tolerant sysfs_get_dirent(): look up @name under @sd, or hand
 * back @sd unchanged (i.e. NULL) when there is no parent node.
 */
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
	if (sd)
		return sysfs_get_dirent(sd, name);
	return sd;
}
/* NULL-tolerant sysfs_notify_dirent(): does nothing when @sd is NULL. */
static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
{
	if (sd)
		sysfs_notify_dirent(sd);
}
817
818	static inline char * mdname (struct mddev * mddev)
819	{
820	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
821	}
822
823	static inline int sysfs_link_rdev(struct mddev mddev, struct* md_rdev *rdev)
824	{
825	char nm[`20`];
826	if (!test_bit(Replacement, &rdev->flags) &&
827	!test_bit(Journal, &rdev->flags) &&
828	mddev->kobj.sd) {
829	sprintf(buf: nm, fmt: "rd%d", rdev->raid_disk);
830	return sysfs_create_link(kobj: &mddev->kobj, target: &rdev->kobj, name: nm);
831	} else
832	return `0`;
833	}
834
835	static inline void sysfs_unlink_rdev(struct mddev mddev, struct* md_rdev *rdev)
836	{
837	char nm[`20`];
838	if (!test_bit(Replacement, &rdev->flags) &&
839	!test_bit(Journal, &rdev->flags) &&
840	mddev->kobj.sd) {
841	sprintf(buf: nm, fmt: "rd%d", rdev->raid_disk);
842	sysfs_remove_link(kobj: &mddev->kobj, name: nm);
843	}
844	}
845
/*
 * Iterate over an arbitrary rdev list linked through ->same_set.
 * Removing the current 'rdev' is safe; don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)				\
	list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * Iterate over the member devices of 'mddev' (its ->disks list).
 */
#define rdev_for_each(rdev, mddev)				\
	list_for_each_entry(rdev, &((mddev)->disks), same_set)

/* As rdev_for_each(), using the _safe list iterator with cursor 'tmp'. */
#define rdev_for_each_safe(rdev, tmp, mddev)				\
	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

/* As rdev_for_each(), using the RCU list iterator. */
#define rdev_for_each_rcu(rdev, mddev)				\
	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
864
865	struct md_thread {
866	void (run) (struct* md_thread *thread);
867	struct mddev *mddev;
868	wait_queue_head_t wqueue;
869	unsigned long flags;
870	struct task_struct *tsk;
871	unsigned long timeout;
872	void *private;
873	};
874
875	struct md_io_clone {
876	struct mddev *mddev;
877	struct bio *orig_bio;
878	unsigned long start_time;
879	sector_t offset;
880	unsigned long sectors;
881	enum stat_group rw;
882	struct bio bio_clone;
883	};
884
#define THREAD_WAKEUP  0

/* Call __md_wakeup_thread() inside an RCU read-side critical section. */
#define md_wakeup_thread(thread) do {   \
	rcu_read_lock();                    \
	__md_wakeup_thread(thread);         \
	rcu_read_unlock();                  \
} while (0)
892
/* NULL-tolerant put_page(). */
static inline void safe_put_page(struct page *p)
{
	if (p)
		put_page(p);
}
897
898	int register_md_submodule(struct md_submodule_head *msh);
899	void unregister_md_submodule(struct md_submodule_head *msh);
900
901	extern struct md_thread *md_register_thread(
902	void (run)(struct* md_thread *thread),
903	struct mddev *mddev,
904	const char *name);
905	extern void md_unregister_thread(struct mddev mddev, struct* md_thread __rcu **threadp);
906	extern void __md_wakeup_thread(struct md_thread __rcu *thread);
907	extern void md_check_recovery(struct mddev *mddev);
908	extern void md_reap_sync_thread(struct mddev *mddev);
909	extern enum sync_action md_sync_action(struct mddev *mddev);
910	extern enum sync_action md_sync_action_by_name(const char *page);
911	extern const char md_sync_action_name(enum* sync_action action);
912	extern void md_write_start(struct mddev mddev, struct* bio *bi);
913	extern void md_write_inc(struct mddev mddev, struct* bio *bi);
914	extern void md_write_end(struct mddev *mddev);
915	extern void md_done_sync(struct mddev mddev, int* blocks, int ok);
916	extern void md_error(struct mddev mddev, struct* md_rdev *rdev);
917	extern void md_finish_reshape(struct mddev *mddev);
918	void md_submit_discard_bio(struct mddev mddev, struct* md_rdev *rdev,
919	struct bio *bio, sector_t start, sector_t size);
920	void md_account_bio(struct mddev mddev, struct* bio **bio);
921	void md_free_cloned_bio(struct bio *bio);
922
923	extern bool __must_check md_flush_request(struct mddev mddev, struct* bio *bio);
924	void md_write_metadata(struct mddev mddev, struct* md_rdev *rdev,
925	sector_t sector, int size, struct page *page,
926	unsigned int offset);
927	extern int md_super_wait(struct mddev *mddev);
928	extern int sync_page_io(struct md_rdev rdev, sector_t sector, int* size,
929	struct page *page, blk_opf_t opf, bool metadata_op);
930	extern void md_do_sync(struct md_thread *thread);
931	extern void md_new_event(void);
932	extern void md_allow_write(struct mddev *mddev);
933	extern void md_wait_for_blocked_rdev(struct md_rdev rdev, struct* mddev *mddev);
934	extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
935	extern int md_check_no_bitmap(struct mddev *mddev);
936	extern int md_integrity_register(struct mddev *mddev);
937	extern int strict_strtoul_scaled(const char cp, unsigned* long res, int* scale);
938
939	extern int mddev_init(struct mddev *mddev);
940	extern void mddev_destroy(struct mddev *mddev);
941	void md_init_stacking_limits(struct queue_limits *lim);
942	struct mddev md_alloc(dev_t dev, char* *name);
943	void mddev_put(struct mddev *mddev);
944	extern int md_run(struct mddev *mddev);
945	extern int md_start(struct mddev *mddev);
946	extern void md_stop(struct mddev *mddev);
947	extern void md_stop_writes(struct mddev *mddev);
948	extern int md_rdev_init(struct md_rdev *rdev);
949	extern void md_rdev_clear(struct md_rdev *rdev);
950
/* Request handling, suspend/resume and sync-thread control for an array. */
extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
957
958	extern void md_update_sb(struct mddev mddev, int* force);
959	extern void mddev_create_serial_pool(struct mddev mddev, struct* md_rdev *rdev);
960	extern void mddev_destroy_serial_pool(struct mddev *mddev,
961	struct md_rdev *rdev);
962	struct md_rdev md_find_rdev_nr_rcu(struct* mddev mddev, int* nr);
963	struct md_rdev md_find_rdev_rcu(struct* mddev *mddev, dev_t dev);
964
965	static inline bool is_rdev_broken(struct md_rdev *rdev)
966	{
967	return !disk_live(disk: rdev->bdev->bd_disk);
968	}
969
970	static inline void rdev_dec_pending(struct md_rdev rdev, struct* mddev *mddev)
971	{
972	int faulty = test_bit(Faulty, &rdev->flags);
973	if (atomic_dec_and_test(v: &rdev->nr_pending) && faulty) {
974	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
975	md_wakeup_thread(mddev->thread);
976	}
977	}
978
979	static inline int mddev_is_clustered(struct mddev *mddev)
980	{
981	return mddev->cluster_info && mddev->bitmap_info.nodes > `1`;
982	}
983
984	/ clear unsupported mddev_flags /
985	static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
986	unsigned long unsupported_flags)
987	{
988	mddev->flags &= ~unsupported_flags;
989	}
990
991	static inline void mddev_check_write_zeroes(struct mddev mddev, struct* bio *bio)
992	{
993	if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
994	!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
995	mddev->gendisk->queue->limits.max_write_zeroes_sectors = `0`;
996	}
997
/*
 * mddev_suspend_and_lock() - interruptibly suspend @mddev, then lock it.
 *
 * Return: 0 on success. If mddev_suspend() fails its error is returned and
 * nothing is held. If mddev_lock() fails the array is resumed again before
 * the error is returned.
 */
static inline int mddev_suspend_and_lock(struct mddev *mddev)
{
	int ret;

	ret = mddev_suspend(mddev, true);
	if (ret)
		return ret;

	ret = mddev_lock(mddev);
	if (ret)
		mddev_resume(mddev);	/* undo the suspend on lock failure */

	return ret;
}
1012
1013	static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
1014	{
1015	mddev_suspend(mddev, interruptible: false);
1016	mutex_lock(&mddev->reconfig_mutex);
1017	}
1018
/*
 * mddev_unlock_and_resume() - unlock @mddev, then resume it.
 *
 * The two steps run in the opposite order to the suspend-then-lock
 * sequence used by mddev_suspend_and_lock().
 */
static inline void mddev_unlock_and_resume(struct mddev *mddev)
{
	mddev_unlock(mddev);
	mddev_resume(mddev);
}
1024
/* Forward declarations: only pointers to these are used below. */
struct mdu_array_info_s;
struct mdu_disk_info_s;

extern int mdp_major;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);

/* Flag bit for the @flags argument of mddev_stack_rdev_limits(). */
#define MDDEV_STACK_INTEGRITY	(1u << 0)
int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
		unsigned int flags);
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
1038
/* Block device operations table for MD; defined outside this header. */
extern const struct block_device_operations md_fops;
1040
1041	/*
1042	* MD devices can be used undeneath by DM, in which case ->gendisk is NULL.
1043	*/
1044	static inline bool mddev_is_dm(struct mddev *mddev)
1045	{
1046	return !mddev->gendisk;
1047	}
1048
1049	static inline bool raid_is_456(struct mddev *mddev)
1050	{
1051	return mddev->level == ID_RAID4 \|\| mddev->level == ID_RAID5 \|\|
1052	mddev->level == ID_RAID6;
1053	}
1054
1055	static inline void mddev_trace_remap(struct mddev mddev, struct* bio *bio,
1056	sector_t sector)
1057	{
1058	if (!mddev_is_dm(mddev))
1059	trace_block_bio_remap(bio, dev: disk_devt(disk: mddev->gendisk), from: sector);
1060	}
1061
1062	static inline bool rdev_blocked(struct md_rdev *rdev)
1063	{
1064	/*
1065	* Blocked will be set by error handler and cleared by daemon after
1066	* updating superblock, meanwhile write IO should be blocked to prevent
1067	* reading old data after power failure.
1068	*/
1069	if (test_bit(Blocked, &rdev->flags))
1070	return true;
1071
1072	/*
1073	* Faulty device should not be accessed anymore, there is no need to
1074	* wait for bad block to be acknowledged.
1075	*/
1076	if (test_bit(Faulty, &rdev->flags))
1077	return false;
1078
1079	/ rdev is blocked by badblocks. /
1080	if (test_bit(BlockedBadBlocks, &rdev->flags))
1081	return true;
1082
1083	return false;
1084	}
1085
/*
 * Add a blktrace message on the array's request queue. Arrays without a
 * gendisk (see mddev_is_dm()) are skipped.
 *
 * Note: @mddev is expanded twice, so the argument must not have side effects.
 */
#define mddev_add_trace_msg(mddev, fmt, args...)			\
do {									\
	if (!mddev_is_dm(mddev))					\
		blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
} while (0)
1091
1092	#endif /* _MD_MD_H */
1093

/* source code of linux/drivers/md/md.h */