Skip to content

Commit d337482

Browse files
committed
md: make devices disappear when they are no longer needed.
Currently md devices, once created, never disappear until the module is unloaded. This is essentially because the gendisk holds a reference to the mddev, and the mddev holds a reference to the gendisk, this a circular reference. If we drop the reference from mddev to gendisk, then we need to ensure that the mddev is destroyed when the gendisk is destroyed. However it is not possible to hook into the gendisk destruction process to enable this. So we drop the reference from the gendisk to the mddev and destroy the gendisk when the mddev gets destroyed. However this has a complication. Between the call __blkdev_get->get_gendisk->kobj_lookup->md_probe and the call __blkdev_get->md_open there is no obvious way to hold a reference on the mddev any more, so unless something is done, it will disappear and gendisk will be destroyed prematurely. Also, once we decide to destroy the mddev, there will be an unlockable moment before the gendisk is unlinked (blk_unregister_region) during which a new reference to the gendisk can be created. We need to ensure that this reference can not be used. i.e. the ->open must fail. So: 1/ in md_probe we set a flag in the mddev (hold_active) which indicates that the array should be treated as active, even though there are no references, and no appearance of activity. This is cleared by md_release when the device is closed if it is no longer needed. This ensures that the gendisk will survive between md_probe and md_open. 2/ In md_open we check if the mddev we expect to open matches the gendisk that we did open. If there is a mismatch we return -ERESTARTSYS and modify __blkdev_get to retry from the top in that case. In the -ERESTARTSYS sys case we make sure to wait until the old gendisk (that we succeeded in opening) is really gone so we loop at most once. Some udev configurations will always open an md device when it first appears. If we allow an md device that was just created by an open to disappear on an immediate close, then this can race with such udev configurations and result in an infinite loop the device being opened and closed, then re-open due to the 'ADD' even from the first open, and then close and so on. So we make sure an md device, once created by an open, remains active at least until some md 'ioctl' has been made on it. This means that all normal usage of md devices will allow them to disappear promptly when not needed, but the worst that an incorrect usage will do it cause an inactive md device to be left in existence (it can easily be removed). As an array can be stopped by writing to a sysfs attribute echo clear > /sys/block/mdXXX/md/array_state we need to use scheduled work for deleting the gendisk and other kobjects. This allows us to wait for any pending gendisk deletion to complete by simply calling flush_scheduled_work(). Signed-off-by: NeilBrown <neilb@suse.de>
1 parent a21d150 commit d337482

3 files changed

Lines changed: 67 additions & 12 deletions

File tree

drivers/md/md.c

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -214,16 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
214214
return mddev;
215215
}
216216

217+
static void mddev_delayed_delete(struct work_struct *ws)
218+
{
219+
mddev_t *mddev = container_of(ws, mddev_t, del_work);
220+
kobject_del(&mddev->kobj);
221+
kobject_put(&mddev->kobj);
222+
}
223+
217224
static void mddev_put(mddev_t *mddev)
218225
{
219226
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
220227
return;
221-
if (!mddev->raid_disks && list_empty(&mddev->disks)) {
228+
if (!mddev->raid_disks && list_empty(&mddev->disks) &&
229+
!mddev->hold_active) {
222230
list_del(&mddev->all_mddevs);
223-
spin_unlock(&all_mddevs_lock);
224-
kobject_put(&mddev->kobj);
225-
} else
226-
spin_unlock(&all_mddevs_lock);
231+
if (mddev->gendisk) {
232+
/* we did a probe so need to clean up.
233+
* Call schedule_work inside the spinlock
234+
* so that flush_scheduled_work() after
235+
* mddev_find will succeed in waiting for the
236+
* work to be done.
237+
*/
238+
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
239+
schedule_work(&mddev->del_work);
240+
} else
241+
kfree(mddev);
242+
}
243+
spin_unlock(&all_mddevs_lock);
227244
}
228245

229246
static mddev_t * mddev_find(dev_t unit)
@@ -242,6 +259,7 @@ static mddev_t * mddev_find(dev_t unit)
242259

243260
if (new) {
244261
list_add(&new->all_mddevs, &all_mddevs);
262+
mddev->hold_active = UNTIL_IOCTL;
245263
spin_unlock(&all_mddevs_lock);
246264
return new;
247265
}
@@ -3435,6 +3453,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
34353453
if (!capable(CAP_SYS_ADMIN))
34363454
return -EACCES;
34373455
rv = mddev_lock(mddev);
3456+
if (mddev->hold_active == UNTIL_IOCTL)
3457+
mddev->hold_active = 0;
34383458
if (!rv) {
34393459
rv = entry->store(mddev, page, length);
34403460
mddev_unlock(mddev);
@@ -3484,6 +3504,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
34843504
if (!mddev)
34853505
return NULL;
34863506

3507+
/* wait for any previous instance if this device
3508+
* to be completed removed (mddev_delayed_delete).
3509+
*/
3510+
flush_scheduled_work();
3511+
34873512
mutex_lock(&disks_mutex);
34883513
if (mddev->gendisk) {
34893514
mutex_unlock(&disks_mutex);
@@ -3520,7 +3545,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
35203545
disk->private_data = mddev;
35213546
disk->queue = mddev->queue;
35223547
/* Allow extended partitions. This makes the
3523-
* 'mdp' device redundant, but we can really
3548+
* 'mdp' device redundant, but we can't really
35243549
* remove it now.
35253550
*/
35263551
disk->flags |= GENHD_FL_EXT_DEVT;
@@ -3536,6 +3561,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
35363561
kobject_uevent(&mddev->kobj, KOBJ_ADD);
35373562
mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
35383563
}
3564+
mddev_put(mddev);
35393565
return NULL;
35403566
}
35413567

@@ -5054,6 +5080,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
50545080

50555081
done_unlock:
50565082
abort_unlock:
5083+
if (mddev->hold_active == UNTIL_IOCTL &&
5084+
err != -EINVAL)
5085+
mddev->hold_active = 0;
50575086
mddev_unlock(mddev);
50585087

50595088
return err;
@@ -5070,14 +5099,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
50705099
* Succeed if we can lock the mddev, which confirms that
50715100
* it isn't being stopped right now.
50725101
*/
5073-
mddev_t *mddev = bdev->bd_disk->private_data;
5102+
mddev_t *mddev = mddev_find(bdev->bd_dev);
50745103
int err;
50755104

5105+
if (mddev->gendisk != bdev->bd_disk) {
5106+
/* we are racing with mddev_put which is discarding this
5107+
* bd_disk.
5108+
*/
5109+
mddev_put(mddev);
5110+
/* Wait until bdev->bd_disk is definitely gone */
5111+
flush_scheduled_work();
5112+
/* Then retry the open from the top */
5113+
return -ERESTARTSYS;
5114+
}
5115+
BUG_ON(mddev != bdev->bd_disk->private_data);
5116+
50765117
if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
50775118
goto out;
50785119

50795120
err = 0;
5080-
mddev_get(mddev);
50815121
atomic_inc(&mddev->openers);
50825122
mddev_unlock(mddev);
50835123

@@ -6436,11 +6476,8 @@ static __exit void md_exit(void)
64366476
unregister_sysctl_table(raid_table_header);
64376477
remove_proc_entry("mdstat", NULL);
64386478
for_each_mddev(mddev, tmp) {
6439-
struct gendisk *disk = mddev->gendisk;
6440-
if (!disk)
6441-
continue;
64426479
export_array(mddev);
6443-
mddev_put(mddev);
6480+
mddev->hold_active = 0;
64446481
}
64456482
}
64466483

fs/block_dev.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
10051005
}
10061006

10071007
lock_kernel();
1008+
restart:
10081009

10091010
ret = -ENXIO;
10101011
disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
10251026

10261027
if (disk->fops->open) {
10271028
ret = disk->fops->open(bdev, mode);
1029+
if (ret == -ERESTARTSYS) {
1030+
/* Lost a race with 'disk' being
1031+
* deleted, try again.
1032+
* See md.c
1033+
*/
1034+
disk_put_part(bdev->bd_part);
1035+
bdev->bd_part = NULL;
1036+
module_put(disk->fops->owner);
1037+
put_disk(disk);
1038+
bdev->bd_disk = NULL;
1039+
mutex_unlock(&bdev->bd_mutex);
1040+
goto restart;
1041+
}
10281042
if (ret)
10291043
goto out_clear;
10301044
}

include/linux/raid/md_k.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ struct mddev_s
137137
struct gendisk *gendisk;
138138

139139
struct kobject kobj;
140+
int hold_active;
141+
#define UNTIL_IOCTL 1
140142

141143
/* Superblock information */
142144
int major_version,
@@ -246,6 +248,8 @@ struct mddev_s
246248
*/
247249
struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
248250

251+
struct work_struct del_work; /* used for delayed sysfs removal */
252+
249253
spinlock_t write_lock;
250254
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
251255
atomic_t pending_writes; /* number of active superblock writes */

0 commit comments

Comments
 (0)