Commit 8ddf9efe authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: support write-mostly device in raid1

This allows a device in a raid1 to be marked as "write mostly".  Read requests
will only be sent to such a device if there is no other option.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 36fa3063
...@@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if (mddev->level != LEVEL_MULTIPATH) { if (mddev->level != LEVEL_MULTIPATH) {
rdev->faulty = 0; rdev->faulty = 0;
rdev->flags = 0;
desc = sb->disks + rdev->desc_nr; desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY)) if (desc->state & (1<<MD_DISK_FAULTY))
...@@ -679,6 +680,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -679,6 +680,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 1; rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk; rdev->raid_disk = desc->raid_disk;
} }
if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */ } else /* MULTIPATH are always insync */
rdev->in_sync = 1; rdev->in_sync = 1;
return 0; return 0;
...@@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
spare++; spare++;
working++; working++;
} }
if (test_bit(WriteMostly, &rdev2->flags))
d->state |= (1<<MD_DISK_WRITEMOSTLY);
} }
/* now set the "removed" and "faulty" bits on any missing devices */ /* now set the "removed" and "faulty" bits on any missing devices */
...@@ -990,6 +995,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -990,6 +995,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = role; rdev->raid_disk = role;
break; break;
} }
rdev->flags = 0;
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */ } else /* MULTIPATH are always insync */
rdev->in_sync = 1; rdev->in_sync = 1;
...@@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) ...@@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC); info.state |= (1<<MD_DISK_SYNC);
} }
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else { } else {
info.major = info.minor = 0; info.major = info.minor = 0;
info.raid_disk = -1; info.raid_disk = -1;
...@@ -2237,6 +2247,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -2237,6 +2247,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
rdev->saved_raid_disk = rdev->raid_disk; rdev->saved_raid_disk = rdev->raid_disk;
rdev->in_sync = 0; /* just to be sure */ rdev->in_sync = 0; /* just to be sure */
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
rdev->raid_disk = -1; rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) if (err)
...@@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
else else
rdev->in_sync = 0; rdev->in_sync = 0;
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) { if (err) {
export_rdev(rdev); export_rdev(rdev);
...@@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]", seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr); bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
if (rdev->faulty) { if (rdev->faulty) {
seq_printf(seq, "(F)"); seq_printf(seq, "(F)");
continue; continue;
......
...@@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) ...@@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{ {
const unsigned long this_sector = r1_bio->sector; const unsigned long this_sector = r1_bio->sector;
int new_disk = conf->last_used, disk = new_disk; int new_disk = conf->last_used, disk = new_disk;
int wonly_disk = -1;
const int sectors = r1_bio->sectors; const int sectors = r1_bio->sectors;
sector_t new_distance, current_distance; sector_t new_distance, current_distance;
mdk_rdev_t *new_rdev, *rdev; mdk_rdev_t *rdev;
rcu_read_lock(); rcu_read_lock();
/* /*
* Check if it if we can balance. We can balance on the whole * Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window. * device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window. * We take the first readable disk when above the resync window.
*/ */
...@@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) ...@@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* Choose the first operation device, for consistancy */ /* Choose the first operation device, for consistancy */
new_disk = 0; new_disk = 0;
while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || for (rdev = conf->mirrors[new_disk].rdev;
!new_rdev->in_sync) { !rdev || !rdev->in_sync
new_disk++; || test_bit(WriteMostly, &rdev->flags);
if (new_disk == conf->raid_disks) { rdev = conf->mirrors[++new_disk].rdev) {
new_disk = -1;
if (rdev && rdev->in_sync)
wonly_disk = new_disk;
if (new_disk == conf->raid_disks - 1) {
new_disk = wonly_disk;
break; break;
} }
} }
...@@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) ...@@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* make sure the disk is operational */ /* make sure the disk is operational */
while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || for (rdev = conf->mirrors[new_disk].rdev;
!new_rdev->in_sync) { !rdev || !rdev->in_sync ||
test_bit(WriteMostly, &rdev->flags);
rdev = conf->mirrors[new_disk].rdev) {
if (rdev && rdev->in_sync)
wonly_disk = new_disk;
if (new_disk <= 0) if (new_disk <= 0)
new_disk = conf->raid_disks; new_disk = conf->raid_disks;
new_disk--; new_disk--;
if (new_disk == disk) { if (new_disk == disk) {
new_disk = -1; new_disk = wonly_disk;
goto rb_out; break;
} }
} }
if (new_disk < 0)
goto rb_out;
disk = new_disk; disk = new_disk;
/* now disk == new_disk == starting point for search */ /* now disk == new_disk == starting point for search */
...@@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) ...@@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
disk = conf->raid_disks; disk = conf->raid_disks;
disk--; disk--;
if ((rdev=conf->mirrors[disk].rdev) == NULL || rdev = conf->mirrors[disk].rdev;
!rdev->in_sync)
if (!rdev ||
!rdev->in_sync ||
test_bit(WriteMostly, &rdev->flags))
continue; continue;
if (!atomic_read(&rdev->nr_pending)) { if (!atomic_read(&rdev->nr_pending)) {
new_disk = disk; new_disk = disk;
new_rdev = rdev;
break; break;
} }
new_distance = abs(this_sector - conf->mirrors[disk].head_position); new_distance = abs(this_sector - conf->mirrors[disk].head_position);
if (new_distance < current_distance) { if (new_distance < current_distance) {
current_distance = new_distance; current_distance = new_distance;
new_disk = disk; new_disk = disk;
new_rdev = rdev;
} }
} while (disk != conf->last_used); } while (disk != conf->last_used);
rb_out: rb_out:
if (new_disk >= 0) { if (new_disk >= 0) {
conf->next_seq_sect = this_sector + sectors; rdev = conf->mirrors[new_disk].rdev;
conf->last_used = new_disk; if (!rdev)
atomic_inc(&new_rdev->nr_pending); goto retry;
if (!new_rdev->in_sync) { atomic_inc(&rdev->nr_pending);
if (!rdev->in_sync) {
/* cannot risk returning a device that failed /* cannot risk returning a device that failed
* before we inc'ed nr_pending * before we inc'ed nr_pending
*/ */
atomic_dec(&new_rdev->nr_pending); atomic_dec(&rdev->nr_pending);
goto retry; goto retry;
} }
conf->next_seq_sect = this_sector + sectors;
conf->last_used = new_disk;
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t max_sector, nr_sectors; sector_t max_sector, nr_sectors;
int disk; int disk;
int i; int i;
int wonly;
int write_targets = 0; int write_targets = 0;
int sync_blocks; int sync_blocks;
int still_degraded = 0; int still_degraded = 0;
...@@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*/ */
disk = conf->last_used; disk = conf->last_used;
/* make sure disk is operational */ /* make sure disk is operational */
wonly = disk;
while (conf->mirrors[disk].rdev == NULL || while (conf->mirrors[disk].rdev == NULL ||
!conf->mirrors[disk].rdev->in_sync) { !conf->mirrors[disk].rdev->in_sync ||
test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
) {
if (conf->mirrors[disk].rdev &&
conf->mirrors[disk].rdev->in_sync)
wonly = disk;
if (disk <= 0) if (disk <= 0)
disk = conf->raid_disks; disk = conf->raid_disks;
disk--; disk--;
if (disk == conf->last_used) if (disk == conf->last_used) {
disk = wonly;
break; break;
}
} }
conf->last_used = disk; conf->last_used = disk;
atomic_inc(&conf->mirrors[disk].rdev->nr_pending); atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
......
...@@ -181,6 +181,9 @@ struct mdk_rdev_s ...@@ -181,6 +181,9 @@ struct mdk_rdev_s
int faulty; /* if faulty do not issue IO requests */ int faulty; /* if faulty do not issue IO requests */
int in_sync; /* device is a full member of the array */ int in_sync; /* device is a full member of the array */
unsigned long flags; /* Should include faulty and in_sync here. */
#define WriteMostly 4 /* Avoid reading if at all possible */
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */ int raid_disk; /* role of device in array */
int saved_raid_disk; /* role that device used to have in the int saved_raid_disk; /* role that device used to have in the
......
...@@ -79,6 +79,11 @@ ...@@ -79,6 +79,11 @@
#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config.
* read requests will only be sent here in
* dire need
*/
typedef struct mdp_device_descriptor_s { typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */ __u32 number; /* 0 Device number in the entire set */
__u32 major; /* 1 Device major number */ __u32 major; /* 1 Device major number */
...@@ -193,7 +198,7 @@ struct mdp_superblock_1 { ...@@ -193,7 +198,7 @@ struct mdp_superblock_1 {
__u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
__u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
__u32 layout; /* only for raid5 currently */ __u32 layout; /* only for raid5 and raid10 currently */
__u64 size; /* used size of component devices, in 512byte sectors */ __u64 size; /* used size of component devices, in 512byte sectors */
__u32 chunksize; /* in 512byte sectors */ __u32 chunksize; /* in 512byte sectors */
...@@ -212,7 +217,9 @@ struct mdp_superblock_1 { ...@@ -212,7 +217,9 @@ struct mdp_superblock_1 {
__u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 dev_number; /* permanent identifier of this device - not role in raid */
__u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
__u8 pad2[64-56]; /* set to 0 when writing */ __u8 devflags; /* per-device flags. Only one defined...*/
#define WriteMostly1 1 /* mask for writemostly flag in above */
__u8 pad2[64-57]; /* set to 0 when writing */
/* array state information - 64 bytes */ /* array state information - 64 bytes */
__u64 utime; /* 40 bits second, 24 btes microseconds */ __u64 utime; /* 40 bits second, 24 btes microseconds */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment