Commit 29269553 authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: Final stages of raid5 expand code

This patch adds raid5_reshape and end_reshape, which start and finish the
reshape process.

raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage
accidental use.

Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry.

And make sure that you have backups, just in case.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent ccfcc3c1
@@ -127,6 +127,32 @@ config MD_RAID5
If unsure, say Y.
config MD_RAID5_RESHAPE
bool "Support adding drives to a raid-5 array (experimental)"
depends on MD_RAID5 && EXPERIMENTAL
---help---
A RAID-5 set can be expanded by adding extra drives. This
requires "restriping" the array which means (almost) every
block must be written to a different place.
This option allows such restriping to be done while the array
is online. However it is still EXPERIMENTAL code. It should
work, but please be sure that you have backups.
You will need a version of mdadm newer than 2.3.1. During the
early stage of reshape there is a critical section where live data
is being over-written. A crash during this time needs extra care
for recovery. The newer mdadm takes a copy of the data in the
critical section and will restore it, if necessary, after a crash.
The mdadm usage is e.g.
mdadm --grow /dev/md1 --raid-disks=6
to grow '/dev/md1' to having 6 disks.
Note: The array can only be expanded, not contracted.
There should be enough spares already present to make the new
array workable.
config MD_RAID6
tristate "RAID-6 mode"
depends on BLK_DEV_MD
...
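As a rough usage sketch of the procedure the help text describes, assuming a five-disk /dev/md1 being grown to six disks (the device names are illustrative, not taken from the patch):

    # add a spare first so the enlarged array has a disk to grow onto
    mdadm /dev/md1 --add /dev/sdf1
    # restripe onto 6 disks; the reshape then runs as a background sync
    mdadm --grow /dev/md1 --raid-disks=6
    # progress is reported in /proc/mdstat
    cat /proc/mdstat

A sufficiently new mdadm (newer than 2.3.1, per the help text) keeps a copy of the data in the critical section at the start of the reshape so it can restore it after a crash.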
@@ -158,11 +158,12 @@ static int start_readonly;
*/
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
-static void md_new_event(mddev_t *mddev)
+void md_new_event(mddev_t *mddev)
{
atomic_inc(&md_event_count);
wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
/*
* Enables to iterate over all existing md arrays
@@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
-static void md_do_sync(mddev_t *mddev)
+void md_do_sync(mddev_t *mddev)
{
mddev_t *mddev2;
unsigned int currspeed = 0,
@@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev)
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL_GPL(md_do_sync);
/*
...
@@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num)
}
return 0;
}
#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
/* Make all the stripes able to hold 'newsize' devices.
@@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
conf->pool_size = newsize;
return err;
}
#endif
static int drop_one_stripe(raid5_conf_t *conf)
{
@@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
return 0;
}
static void end_reshape(raid5_conf_t *conf);
static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
return 0;
}
if (mddev->curr_resync < max_sector) /* aborted */
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
#ifdef CONFIG_MD_RAID5_RESHAPE
static int raid5_reshape(mddev_t *mddev, int raid_disks)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
int err;
mdk_rdev_t *rdev;
struct list_head *rtmp;
int spares = 0;
int added_devices = 0;
if (mddev->degraded ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (conf->raid_disks > raid_disks)
return -EINVAL; /* Cannot shrink array yet */
if (conf->raid_disks == raid_disks)
return 0; /* nothing to do */
/* Can only proceed if there are plenty of stripe_heads.
* We need a minimum of one full stripe, and for sensible progress
* it is best to have about 4 times that.
* If we require 4 times, then the default 256 4K stripe_heads will
* allow for chunk sizes up to 256K, which is probably OK.
* If the chunk size is greater, user-space should request more
* stripe_heads first.
*/
if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
(mddev->chunk_size / STRIPE_SIZE)*4);
return -ENOSPC;
}
ITERATE_RDEV(mddev, rdev, rtmp)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (conf->raid_disks + spares < raid_disks-1)
/* Not enough devices even to make a degraded array
* of that size
*/
return -EINVAL;
err = resize_stripes(conf, raid_disks);
if (err)
return err;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
mddev->raid_disks = conf->raid_disks = raid_disks;
conf->expand_progress = 0;
spin_unlock_irq(&conf->device_lock);
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
*/
ITERATE_RDEV(mddev, rdev, rtmp)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev)) {
char nm[20];
set_bit(In_sync, &rdev->flags);
conf->working_disks++;
added_devices++;
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
break;
}
mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"%s_reshape");
if (!mddev->sync_thread) {
mddev->recovery = 0;
spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
conf->expand_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}
md_wakeup_thread(mddev->sync_thread);
md_new_event(mddev);
return 0;
}
#endif
static void end_reshape(raid5_conf_t *conf)
{
struct block_device *bdev;
conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
spin_lock_irq(&conf->device_lock);
conf->expand_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
}
static void raid5_quiesce(mddev_t *mddev, int state)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
#ifdef CONFIG_MD_RAID5_RESHAPE
.reshape = raid5_reshape,
#endif
.quiesce = raid5_quiesce,
};
...
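A note on the stripe_heads check near the top of raid5_reshape above: with STRIPE_SIZE at 4K and the default 256 stripe_heads, chunk sizes up to (256 / 4) * 4K = 256K pass the check, and anything larger returns -ENOSPC until user space grows the stripe cache first. A minimal sketch of doing that, assuming the usual raid5 stripe_cache_size sysfs attribute (the path is illustrative; verify it on the running kernel):

    # a 1024K chunk needs (1024K / 4K) * 4 = 1024 stripe_heads, four times the default
    echo 1024 > /sys/block/md1/md/stripe_cache_size
    mdadm --grow /dev/md1 --raid-disks=6

Each stripe_head holds a page per device, so raising the value costs memory; the check only asks for about four chunks' worth so that the reshape can make sensible progress.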
@@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
...