Skip to content

Commit 2d78f8c

Browse files
committed
md: create externally visible flags for supporting hot-replace.
hot-replace is a feature being added to md which will allow a device to be replaced without removing it from the array first. With hot-replace a spare can be activated and recovery can start while the original device is still in place, thus allowing a transition from an unreliable device to a reliable device without leaving the array degraded during the transition. It can also be used when the original device is still reliable but is not wanted for some reason. This will eventually be supported in RAID4/5/6 and RAID10. This patch adds a super-block flag to distinguish the replacement device. If an old kernel sees this flag it will reject the device. It also adds two per-device flags which are viewable and settable via sysfs. "want_replacement" can be set to request that a device be replaced. "replacement" is set to show that this device is replacing another device. The "rd%d" links in /sys/block/mdXx/md only apply to the original device, not the replacement. We currently don't make links for the replacement - there doesn't seem to be a need. Signed-off-by: NeilBrown <neilb@suse.de>
1 parent b8321b6 commit 2d78f8c

4 files changed

Lines changed: 125 additions & 39 deletions

File tree

Documentation/md.txt

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,14 +357,14 @@ Each directory contains:
357357
written to, that device.
358358

359359
state
360-
A file recording the current state of the device in the array
360+
A file recording the current state of the device in the array
361361
which can be a comma separated list of
362362
faulty - device has been kicked from active use due to
363-
a detected fault or it has unacknowledged bad
364-
blocks
363+
a detected fault, or it has unacknowledged bad
364+
blocks
365365
in_sync - device is a fully in-sync member of the array
366366
writemostly - device will only be subject to read
367-
requests if there are no other options.
367+
requests if there are no other options.
368368
This applies only to raid1 arrays.
369369
blocked - device has failed, and the failure hasn't been
370370
acknowledged yet by the metadata handler.
@@ -374,6 +374,13 @@ Each directory contains:
374374
This includes spares that are in the process
375375
of being recovered to
376376
write_error - device has ever seen a write error.
377+
want_replacement - device is (mostly) working but probably
378+
should be replaced, either due to errors or
379+
due to user request.
380+
replacement - device is a replacement for another active
381+
device with the same raid_disk.
382+
383+
377384
This list may grow in future.
378385
This can be written to.
379386
Writing "faulty" simulates a failure on the device.
@@ -386,6 +393,13 @@ Each directory contains:
386393
Writing "in_sync" sets the in_sync flag.
387394
Writing "write_error" sets writeerrorseen flag.
388395
Writing "-write_error" clears writeerrorseen flag.
396+
Writing "want_replacement" is allowed at any time except to a
397+
replacement device or a spare. It sets the flag.
398+
Writing "-want_replacement" is allowed at any time. It clears
399+
the flag.
400+
Writing "replacement" or "-replacement" is only allowed before
401+
starting the array. It sets or clears the flag.
402+
389403

390404
This file responds to select/poll. Any change to 'faulty'
391405
or 'blocked' causes an event.

drivers/md/md.c

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1714,6 +1714,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
17141714
}
17151715
if (sb->devflags & WriteMostly1)
17161716
set_bit(WriteMostly, &rdev->flags);
1717+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1718+
set_bit(Replacement, &rdev->flags);
17171719
} else /* MULTIPATH are always insync */
17181720
set_bit(In_sync, &rdev->flags);
17191721

@@ -1767,6 +1769,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
17671769
sb->recovery_offset =
17681770
cpu_to_le64(rdev->recovery_offset);
17691771
}
1772+
if (test_bit(Replacement, &rdev->flags))
1773+
sb->feature_map |=
1774+
cpu_to_le32(MD_FEATURE_REPLACEMENT);
17701775

17711776
if (mddev->reshape_position != MaxSector) {
17721777
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2560,6 +2565,15 @@ state_show(struct md_rdev *rdev, char *page)
25602565
len += sprintf(page+len, "%swrite_error", sep);
25612566
sep = ",";
25622567
}
2568+
if (test_bit(WantReplacement, &rdev->flags)) {
2569+
len += sprintf(page+len, "%swant_replacement", sep);
2570+
sep = ",";
2571+
}
2572+
if (test_bit(Replacement, &rdev->flags)) {
2573+
len += sprintf(page+len, "%sreplacement", sep);
2574+
sep = ",";
2575+
}
2576+
25632577
return len+sprintf(page+len, "\n");
25642578
}
25652579

@@ -2628,6 +2642,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
26282642
} else if (cmd_match(buf, "-write_error")) {
26292643
clear_bit(WriteErrorSeen, &rdev->flags);
26302644
err = 0;
2645+
} else if (cmd_match(buf, "want_replacement")) {
2646+
/* Any non-spare device that is not a replacement can
2647+
* become want_replacement at any time, but we then need to
2648+
* check if recovery is needed.
2649+
*/
2650+
if (rdev->raid_disk >= 0 &&
2651+
!test_bit(Replacement, &rdev->flags))
2652+
set_bit(WantReplacement, &rdev->flags);
2653+
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2654+
md_wakeup_thread(rdev->mddev->thread);
2655+
err = 0;
2656+
} else if (cmd_match(buf, "-want_replacement")) {
2657+
/* Clearing 'want_replacement' is always allowed.
2658+
* Once replacement starts it is too late though.
2659+
*/
2660+
err = 0;
2661+
clear_bit(WantReplacement, &rdev->flags);
2662+
} else if (cmd_match(buf, "replacement")) {
2663+
/* Can only set a device as a replacement when array has not
2664+
* yet been started. Once running, replacement is automatic
2665+
* from spares, or by assigning 'slot'.
2666+
*/
2667+
if (rdev->mddev->pers)
2668+
err = -EBUSY;
2669+
else {
2670+
set_bit(Replacement, &rdev->flags);
2671+
err = 0;
2672+
}
2673+
} else if (cmd_match(buf, "-replacement")) {
2674+
/* Similarly, can only clear Replacement before start */
2675+
if (rdev->mddev->pers)
2676+
err = -EBUSY;
2677+
else {
2678+
clear_bit(Replacement, &rdev->flags);
2679+
err = 0;
2680+
}
26312681
}
26322682
if (!err)
26332683
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -6717,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
67176767
if (test_bit(Faulty, &rdev->flags)) {
67186768
seq_printf(seq, "(F)");
67196769
continue;
6720-
} else if (rdev->raid_disk < 0)
6770+
}
6771+
if (rdev->raid_disk < 0)
67216772
seq_printf(seq, "(S)"); /* spare */
6773+
if (test_bit(Replacement, &rdev->flags))
6774+
seq_printf(seq, "(R)");
67226775
sectors += rdev->sectors;
67236776
}
67246777

drivers/md/md.h

Lines changed: 48 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -72,34 +72,7 @@ struct md_rdev {
7272
* This reduces the burden of testing multiple flags in many cases
7373
*/
7474

75-
unsigned long flags;
76-
#define Faulty 1 /* device is known to have a fault */
77-
#define In_sync 2 /* device is in_sync with rest of array */
78-
#define WriteMostly 4 /* Avoid reading if at all possible */
79-
#define AutoDetected 7 /* added by auto-detect */
80-
#define Blocked 8 /* An error occurred but has not yet
81-
* been acknowledged by the metadata
82-
* handler, so don't allow writes
83-
* until it is cleared */
84-
#define WriteErrorSeen 9 /* A write error has been seen on this
85-
* device
86-
*/
87-
#define FaultRecorded 10 /* Intermediate state for clearing
88-
* Blocked. The Fault is/will-be
89-
* recorded in the metadata, but that
90-
* metadata hasn't been stored safely
91-
* on disk yet.
92-
*/
93-
#define BlockedBadBlocks 11 /* A writer is blocked because they
94-
* found an unacknowledged bad-block.
95-
* This can safely be cleared at any
96-
* time, and the writer will re-check.
97-
* It may be set at any time, and at
98-
* worst the writer will timeout and
99-
* re-check. So setting it as
100-
* accurately as possible is good, but
101-
* not absolutely critical.
102-
*/
75+
unsigned long flags; /* bit set of 'enum flag_bits' bits. */
10376
wait_queue_head_t blocked_wait;
10477

10578
int desc_nr; /* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
152125
sector_t size; /* in sectors */
153126
} badblocks;
154127
};
128+
enum flag_bits {
129+
Faulty, /* device is known to have a fault */
130+
In_sync, /* device is in_sync with rest of array */
131+
WriteMostly, /* Avoid reading if at all possible */
132+
AutoDetected, /* added by auto-detect */
133+
Blocked, /* An error occurred but has not yet
134+
* been acknowledged by the metadata
135+
* handler, so don't allow writes
136+
* until it is cleared */
137+
WriteErrorSeen, /* A write error has been seen on this
138+
* device
139+
*/
140+
FaultRecorded, /* Intermediate state for clearing
141+
* Blocked. The Fault is/will-be
142+
* recorded in the metadata, but that
143+
* metadata hasn't been stored safely
144+
* on disk yet.
145+
*/
146+
BlockedBadBlocks, /* A writer is blocked because they
147+
* found an unacknowledged bad-block.
148+
* This can safely be cleared at any
149+
* time, and the writer will re-check.
150+
* It may be set at any time, and at
151+
* worst the writer will timeout and
152+
* re-check. So setting it as
153+
* accurately as possible is good, but
154+
* not absolutely critical.
155+
*/
156+
WantReplacement, /* This device is a candidate to be
157+
* hot-replaced, either because it has
158+
* reported some faults, or because
159+
* of explicit request.
160+
*/
161+
Replacement, /* This device is a replacement for
162+
* a want_replacement device with same
163+
* raid_disk number.
164+
*/
165+
};
155166

156167
#define BB_LEN_MASK (0x00000000000001FFULL)
157168
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
482493
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
483494
{
484495
char nm[20];
485-
sprintf(nm, "rd%d", rdev->raid_disk);
486-
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
496+
if (!test_bit(Replacement, &rdev->flags)) {
497+
sprintf(nm, "rd%d", rdev->raid_disk);
498+
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
499+
} else
500+
return 0;
487501
}
488502

489503
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
490504
{
491505
char nm[20];
492-
sprintf(nm, "rd%d", rdev->raid_disk);
493-
sysfs_remove_link(&mddev->kobj, nm);
506+
if (!test_bit(Replacement, &rdev->flags)) {
507+
sprintf(nm, "rd%d", rdev->raid_disk);
508+
sysfs_remove_link(&mddev->kobj, nm);
509+
}
494510
}
495511

496512
/*

include/linux/raid/md_p.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
277277
*/
278278
#define MD_FEATURE_RESHAPE_ACTIVE 4
279279
#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
280-
281-
#define MD_FEATURE_ALL (1|2|4|8)
280+
#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
281+
* active device with same 'role'.
282+
* 'recovery_offset' is also set.
283+
*/
284+
#define MD_FEATURE_ALL (1|2|4|8|16)
282285

283286
#endif

0 commit comments

Comments
 (0)