Commit 4b80991c authored by NeilBrown's avatar NeilBrown

md: Protect access to mddev->disks list using RCU

All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock.  However there are three places where
the list is walked without any locking.  If a reconfig happens at this
time, havoc (and oops) can ensue.

So use RCU to protect these accesses:
  - wrap them in rcu_read_{,un}lock()
  - use list_for_each_entry_rcu
  - add to the list with list_add_rcu
  - delete from the list with list_del_rcu
  - delay the 'free' with call_rcu rather than schedule_work

Note that export_rdev did a list_del_init on this list.  In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe.  It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
  - after unbind_rdev_from_array, in which case the delete has
     already been done,
  - after bind_rdev_to_array fails, in which case the delete isn't needed.
  - before the device has been put on a list at all (e.g. in
      add_new_disk where reading the superblock fails).
  - and in autorun devices after a failure when the device is on a
      different list.

So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.

Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates).  In these cases rcu is not needed.
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
parent f2ea68cf
...@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde ...@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct list_head *tmp;
mddev_t *mddev = bitmap->mddev; mddev_t *mddev = bitmap->mddev;
rdev_for_each(rdev, tmp, mddev) rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (test_bit(In_sync, &rdev->flags) if (test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) { && !test_bit(Faulty, &rdev->flags)) {
int size = PAGE_SIZE; int size = PAGE_SIZE;
...@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ (long)(page->index * (PAGE_SIZE/512)) + (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0) + size/512 > 0)
/* bitmap runs in to metadata */ /* bitmap runs in to metadata */
return -EINVAL; goto bad_alignment;
if (rdev->data_offset + mddev->size*2 if (rdev->data_offset + mddev->size*2
> rdev->sb_start + bitmap->offset) > rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */ /* data runs in to bitmap */
return -EINVAL; goto bad_alignment;
} else if (rdev->sb_start < rdev->data_offset) { } else if (rdev->sb_start < rdev->data_offset) {
/* METADATA BITMAP DATA */ /* METADATA BITMAP DATA */
if (rdev->sb_start if (rdev->sb_start
...@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ page->index*(PAGE_SIZE/512) + size/512 + page->index*(PAGE_SIZE/512) + size/512
> rdev->data_offset) > rdev->data_offset)
/* bitmap runs in to data */ /* bitmap runs in to data */
return -EINVAL; goto bad_alignment;
} else { } else {
/* DATA METADATA BITMAP - no problems */ /* DATA METADATA BITMAP - no problems */
} }
...@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
size, size,
page); page);
} }
rcu_read_unlock();
if (wait) if (wait)
md_super_wait(mddev); md_super_wait(mddev);
return 0; return 0;
bad_alignment:
rcu_read_unlock();
return -EINVAL;
} }
static void bitmap_file_kick(struct bitmap *bitmap); static void bitmap_file_kick(struct bitmap *bitmap);
......
...@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = { ...@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = {
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{ {
struct list_head *tmp, *tmp2;
mdk_rdev_t *rdev, *rdev2; mdk_rdev_t *rdev, *rdev2;
rdev_for_each(rdev, tmp, mddev1) rcu_read_lock();
rdev_for_each(rdev2, tmp2, mddev2) rdev_for_each_rcu(rdev, mddev1)
rdev_for_each_rcu(rdev2, mddev2)
if (rdev->bdev->bd_contains == if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1; return 1;
}
rcu_read_unlock();
return 0; return 0;
} }
...@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) ...@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
kobject_del(&rdev->kobj); kobject_del(&rdev->kobj);
goto fail; goto fail;
} }
list_add(&rdev->same_set, &mddev->disks); list_add_rcu(&rdev->same_set, &mddev->disks);
bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
return 0; return 0;
...@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) ...@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
return; return;
} }
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
list_del_init(&rdev->same_set); list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL; rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block"); sysfs_remove_link(&rdev->kobj, "block");
/* We need to delay this, otherwise we can deadlock when /* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state" * writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
*/ */
synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete); INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj); kobject_get(&rdev->kobj);
schedule_work(&rdev->del_work); schedule_work(&rdev->del_work);
...@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev) ...@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
if (rdev->mddev) if (rdev->mddev)
MD_BUG(); MD_BUG();
free_disk_sb(rdev); free_disk_sb(rdev);
list_del_init(&rdev->same_set);
#ifndef MODULE #ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags)) if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev); md_autodetect_dev(rdev->bdev->bd_dev);
...@@ -4062,8 +4065,10 @@ static void autorun_devices(int part) ...@@ -4062,8 +4065,10 @@ static void autorun_devices(int part)
/* on success, candidates will be empty, on error /* on success, candidates will be empty, on error
* it won't... * it won't...
*/ */
rdev_for_each_list(rdev, tmp, candidates) rdev_for_each_list(rdev, tmp, candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev); export_rdev(rdev);
}
mddev_put(mddev); mddev_put(mddev);
} }
printk(KERN_INFO "md: ... autorun DONE.\n"); printk(KERN_INFO "md: ... autorun DONE.\n");
...@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) ...@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
static int is_mddev_idle(mddev_t *mddev) static int is_mddev_idle(mddev_t *mddev)
{ {
mdk_rdev_t * rdev; mdk_rdev_t * rdev;
struct list_head *tmp;
int idle; int idle;
long curr_events; long curr_events;
idle = 1; idle = 1;
rdev_for_each(rdev, tmp, mddev) { rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) + curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) - disk_stat_read(disk, sectors[1]) -
...@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) ...@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
idle = 0; idle = 0;
} }
} }
rcu_read_unlock();
return idle; return idle;
} }
......
...@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) ...@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
#define rdev_for_each(rdev, tmp, mddev) \ #define rdev_for_each(rdev, tmp, mddev) \
rdev_for_each_list(rdev, tmp, (mddev)->disks) rdev_for_each_list(rdev, tmp, (mddev)->disks)
#define rdev_for_each_rcu(rdev, mddev) \
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
typedef struct mdk_thread_s { typedef struct mdk_thread_s {
void (*run) (mddev_t *mddev); void (*run) (mddev_t *mddev);
mddev_t *mddev; mddev_t *mddev;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment