Commit 5fd6c1dc authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: allow checkpoint of recovery with version-1 superblock

For a while we have had checkpointing of resync.  The version-1 superblock
allows recovery to be checkpointed as well, and this patch implements that.

Due to early carelessness we need to add a feature flag to signal that the
recovery_offset field is in use, otherwise older kernels would assume that a
partially recovered array is in fact fully recovered.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent a8a55c38
...@@ -1175,6 +1175,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1175,6 +1175,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(Faulty, &rdev->flags); set_bit(Faulty, &rdev->flags);
break; break;
default: default:
if ((le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_OFFSET))
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
else
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
rdev->raid_disk = role; rdev->raid_disk = role;
break; break;
...@@ -1199,6 +1203,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1199,6 +1203,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->feature_map = 0; sb->feature_map = 0;
sb->pad0 = 0; sb->pad0 = 0;
sb->recovery_offset = cpu_to_le64(0);
memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad1, 0, sizeof(sb->pad1));
memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad2, 0, sizeof(sb->pad2));
memset(sb->pad3, 0, sizeof(sb->pad3)); memset(sb->pad3, 0, sizeof(sb->pad3));
...@@ -1219,6 +1224,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1219,6 +1224,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
} }
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset > 0) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
}
if (mddev->reshape_position != MaxSector) { if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
sb->reshape_position = cpu_to_le64(mddev->reshape_position); sb->reshape_position = cpu_to_le64(mddev->reshape_position);
...@@ -1243,11 +1256,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1243,11 +1256,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->dev_roles[i] = cpu_to_le16(0xfffe); sb->dev_roles[i] = cpu_to_le16(0xfffe);
else if (test_bit(In_sync, &rdev2->flags)) else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else else
sb->dev_roles[i] = cpu_to_le16(0xffff); sb->dev_roles[i] = cpu_to_le16(0xffff);
} }
sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
sb->sb_csum = calc_sb_1_csum(sb); sb->sb_csum = calc_sb_1_csum(sb);
} }
...@@ -2603,8 +2617,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) ...@@ -2603,8 +2617,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
return NULL; return NULL;
} }
void md_wakeup_thread(mdk_thread_t *thread);
static void md_safemode_timeout(unsigned long data) static void md_safemode_timeout(unsigned long data)
{ {
mddev_t *mddev = (mddev_t *) data; mddev_t *mddev = (mddev_t *) data;
...@@ -2786,6 +2798,36 @@ static int do_md_run(mddev_t * mddev) ...@@ -2786,6 +2798,36 @@ static int do_md_run(mddev_t * mddev)
mddev->queue->queuedata = mddev; mddev->queue->queuedata = mddev;
mddev->queue->make_request_fn = mddev->pers->make_request; mddev->queue->make_request_fn = mddev->pers->make_request;
/* If there is a partially-recovered drive we need to
* start recovery here. If we leave it to md_check_recovery,
* it will remove the drives and not do the right thing
*/
if (mddev->degraded) {
struct list_head *rtmp;
int spares = 0;
ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
/* complete an interrupted recovery */
spares++;
if (spares && mddev->pers->sync_request) {
mddev->recovery = 0;
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"%s_resync");
if (!mddev->sync_thread) {
printk(KERN_ERR "%s: could not start resync"
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
mddev->recovery = 0;
} else
md_wakeup_thread(mddev->sync_thread);
}
}
mddev->changed = 1; mddev->changed = 1;
md_new_event(mddev); md_new_event(mddev);
return 0; return 0;
...@@ -2819,6 +2861,7 @@ static int restart_array(mddev_t *mddev) ...@@ -2819,6 +2861,7 @@ static int restart_array(mddev_t *mddev)
*/ */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
err = 0; err = 0;
} else { } else {
printk(KERN_ERR "md: %s has no personality assigned.\n", printk(KERN_ERR "md: %s has no personality assigned.\n",
...@@ -2842,6 +2885,7 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -2842,6 +2885,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
} }
if (mddev->sync_thread) { if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(mddev->sync_thread); md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL; mddev->sync_thread = NULL;
...@@ -2871,13 +2915,14 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -2871,13 +2915,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (mddev->ro) if (mddev->ro)
mddev->ro = 0; mddev->ro = 0;
} }
if (!mddev->in_sync) { if (!mddev->in_sync || mddev->sb_dirty) {
/* mark array as shutdown cleanly */ /* mark array as shutdown cleanly */
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev); md_update_sb(mddev);
} }
if (ro) if (ro)
set_disk_ro(disk, 1); set_disk_ro(disk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
} }
/* /*
...@@ -4665,10 +4710,14 @@ void md_do_sync(mddev_t *mddev) ...@@ -4665,10 +4710,14 @@ void md_do_sync(mddev_t *mddev)
struct list_head *tmp; struct list_head *tmp;
sector_t last_check; sector_t last_check;
int skipped = 0; int skipped = 0;
struct list_head *rtmp;
mdk_rdev_t *rdev;
/* just incase thread restarts... */ /* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return; return;
if (mddev->ro) /* never try to sync a read-only array */
return;
/* we overload curr_resync somewhat here. /* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all * 0 == not engaged in resync at all
...@@ -4727,17 +4776,30 @@ void md_do_sync(mddev_t *mddev) ...@@ -4727,17 +4776,30 @@ void md_do_sync(mddev_t *mddev)
} }
} while (mddev->curr_resync < 2); } while (mddev->curr_resync < 2);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* resync follows the size requested by the personality, /* resync follows the size requested by the personality,
* which defaults to physical size, but can be virtual size * which defaults to physical size, but can be virtual size
*/ */
max_sectors = mddev->resync_max_sectors; max_sectors = mddev->resync_max_sectors;
mddev->resync_mismatches = 0; mddev->resync_mismatches = 0;
/* we don't use the checkpoint if there's a bitmap */
if (!mddev->bitmap &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->size << 1; max_sectors = mddev->size << 1;
else else {
/* recovery follows the physical size of devices */ /* recovery follows the physical size of devices */
max_sectors = mddev->size << 1; max_sectors = mddev->size << 1;
j = MaxSector;
ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
j = rdev->recovery_offset;
}
printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
...@@ -4747,12 +4809,7 @@ void md_do_sync(mddev_t *mddev) ...@@ -4747,12 +4809,7 @@ void md_do_sync(mddev_t *mddev)
speed_max(mddev)); speed_max(mddev));
is_mddev_idle(mddev); /* this also initializes IO event counters */ is_mddev_idle(mddev); /* this also initializes IO event counters */
/* we don't use the checkpoint if there's a bitmap */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
&& ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
j = mddev->recovery_cp;
else
j = 0;
io_sectors = 0; io_sectors = 0;
for (m = 0; m < SYNC_MARKS; m++) { for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies; mark[m] = jiffies;
...@@ -4873,15 +4930,28 @@ void md_do_sync(mddev_t *mddev) ...@@ -4873,15 +4930,28 @@ void md_do_sync(mddev_t *mddev)
if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2 && mddev->curr_resync > 2) {
mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
printk(KERN_INFO printk(KERN_INFO
"md: checkpointing recovery of %s.\n", "md: checkpointing recovery of %s.\n",
mdname(mddev)); mdname(mddev));
mddev->recovery_cp = mddev->curr_resync; mddev->recovery_cp = mddev->curr_resync;
}
} else } else
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
rdev->recovery_offset = mddev->curr_resync;
mddev->sb_dirty = 1;
}
} }
skip: skip:
...@@ -5002,6 +5072,8 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5002,6 +5072,8 @@ void md_check_recovery(mddev_t *mddev)
clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
goto unlock;
/* no recovery is running. /* no recovery is running.
* remove any failed drives, then * remove any failed drives, then
* add spares if possible. * add spares if possible.
...@@ -5024,6 +5096,7 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5024,6 +5096,7 @@ void md_check_recovery(mddev_t *mddev)
ITERATE_RDEV(mddev,rdev,rtmp) ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk < 0 if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) { && !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->hot_add_disk(mddev,rdev)) { if (mddev->pers->hot_add_disk(mddev,rdev)) {
char nm[20]; char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk); sprintf(nm, "rd%d", rdev->raid_disk);
......
...@@ -1888,7 +1888,8 @@ static int run(mddev_t *mddev) ...@@ -1888,7 +1888,8 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i; disk = conf->mirrors + i;
if (!disk->rdev) { if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0; disk->head_position = 0;
mddev->degraded++; mddev->degraded++;
} }
......
...@@ -2015,7 +2015,8 @@ static int run(mddev_t *mddev) ...@@ -2015,7 +2015,8 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i; disk = conf->mirrors + i;
if (!disk->rdev) { if (!disk->rdev ||
!test_bit(In_sync, &rdev->flags)) {
disk->head_position = 0; disk->head_position = 0;
mddev->degraded++; mddev->degraded++;
} }
......
...@@ -3555,6 +3555,7 @@ static int raid5_start_reshape(mddev_t *mddev) ...@@ -3555,6 +3555,7 @@ static int raid5_start_reshape(mddev_t *mddev)
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
conf->working_disks++; conf->working_disks++;
added_devices++; added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk); sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else } else
......
...@@ -88,6 +88,10 @@ struct mdk_rdev_s ...@@ -88,6 +88,10 @@ struct mdk_rdev_s
* array and could again if we did a partial * array and could again if we did a partial
* resync from the bitmap * resync from the bitmap
*/ */
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
atomic_t nr_pending; /* number of pending requests. atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that * only maintained for arrays that
...@@ -183,6 +187,8 @@ struct mddev_s ...@@ -183,6 +187,8 @@ struct mddev_s
#define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_REQUESTED 6
#define MD_RECOVERY_CHECK 7 #define MD_RECOVERY_CHECK 7
#define MD_RECOVERY_RESHAPE 8 #define MD_RECOVERY_RESHAPE 8
#define MD_RECOVERY_FROZEN 9
unsigned long recovery; unsigned long recovery;
int in_sync; /* know to not need resync */ int in_sync; /* know to not need resync */
......
...@@ -265,9 +265,12 @@ struct mdp_superblock_1 { ...@@ -265,9 +265,12 @@ struct mdp_superblock_1 {
/* feature_map bits */ /* feature_map bits */
#define MD_FEATURE_BITMAP_OFFSET 1 #define MD_FEATURE_BITMAP_OFFSET 1
#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and
* must be honoured
*/
#define MD_FEATURE_RESHAPE_ACTIVE 4 #define MD_FEATURE_RESHAPE_ACTIVE 4
#define MD_FEATURE_ALL 5 #define MD_FEATURE_ALL (1|2|4)
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment