Commit 6cce3b23 authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: write intent bitmap support for raid10

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent b15c2e57
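
For context, a write-intent bitmap records, before a data write is issued, which regions of the array have writes in flight; a bit is cleared only once every mirror copy of that region has completed successfully. After a crash or a transient device failure, only regions whose bits are still set need to be resynchronised. The userspace sketch below is a toy model of just that bookkeeping; the chunk size, the helper names (intent_set, start_write, end_write) and the main() scenario are invented for illustration and are not the md bitmap API.

    /*
     * Toy userspace model of write-intent bitmap bookkeeping, for
     * illustration only.  Chunk size, names and granularity are made up;
     * this is not the md/bitmap kernel API.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define CHUNK_SHIFT 3                  /* 8 sectors per bitmap bit (arbitrary) */
    #define NR_CHUNKS   64

    static uint8_t intent[NR_CHUNKS / 8];  /* one bit per chunk */

    static void intent_set(unsigned c)   { intent[c / 8] |=  (uint8_t)(1u << (c % 8)); }
    static void intent_clear(unsigned c) { intent[c / 8] &= (uint8_t)~(1u << (c % 8)); }
    static int  intent_test(unsigned c)  { return !!(intent[c / 8] & (1u << (c % 8))); }

    /* Before issuing the data write: mark the chunk dirty.  The real driver
     * also ensures the on-disk bitmap reflects this before the write goes out. */
    static void start_write(uint64_t sector)
    {
        intent_set((unsigned)(sector >> CHUNK_SHIFT));
    }

    /* After the last mirror copy finishes: clear the bit only if every copy
     * succeeded, analogous to the !R10BIO_Degraded test in the patch. */
    static void end_write(uint64_t sector, int all_copies_ok)
    {
        if (all_copies_ok)
            intent_clear((unsigned)(sector >> CHUNK_SHIFT));
    }

    int main(void)
    {
        start_write(42);   end_write(42, 1);    /* clean write: bit cleared       */
        start_write(100);  end_write(100, 0);   /* one copy failed: bit stays set */

        printf("chunk 5 dirty: %d, chunk 12 dirty: %d\n",
               intent_test(42 >> CHUNK_SHIFT), intent_test(100 >> CHUNK_SHIFT));
        return 0;
    }
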
@@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 	    mddev->bitmap_file == NULL) {
-		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
+		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+		    && mddev->level != 10) {
 			/* FIXME use a better test */
-			printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+			printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 			return -EINVAL;
 		}
 		mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
 	    mddev->bitmap_file == NULL ) {
-		if (mddev->level != 1) {
-			printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+		    && mddev->level != 10) {
+			printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 			return -EINVAL;
 		}
 		mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
...
@@ -18,7 +18,9 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

+#include "dm-bio-list.h"
 #include <linux/raid/raid10.h>
+#include <linux/raid/bitmap.h>

 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R10BIO_Degraded, &r10_bio->state);
+	} else
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(r10_bio->mddev);
 		raid_end_bio_io(r10_bio);
 	}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	rcu_read_lock();
 	/*
 	 * Check if we can balance. We can balance on the whole
-	 * device if no resync is going on, or below the resync window.
-	 * We take the first readable disk when above the resync window.
+	 * device if no resync is going on (recovery is ok), or below
+	 * the resync window. We take the first readable disk when
+	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
 	    && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)

 static void raid10_unplug(request_queue_t *q)
 {
+	mddev_t *mddev = q->queuedata;
+
 	unplug_slaves(q->queuedata);
+	md_wakeup_thread(mddev->thread);
 }

 static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
  */
 #define RESYNC_DEPTH 32

-static void raise_barrier(conf_t *conf)
+static void raise_barrier(conf_t *conf, int force)
 {
+	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);

-	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	/* Wait until no block IO is waiting (unless 'force') */
+	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
 			    conf->resync_lock,
 			    raid10_unplug(conf->mddev->queue));
@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
+	struct bio_list bl;
+	unsigned long flags;

 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)

 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
+	r10_bio->state = 0;

 	if (rw == READ) {
 		/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			r10_bio->devs[i].bio = bio;
-		} else
+		} else {
 			r10_bio->devs[i].bio = NULL;
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+		}
 	}
 	rcu_read_unlock();

-	atomic_set(&r10_bio->remaining, 1);
+	atomic_set(&r10_bio->remaining, 0);

+	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_private = r10_bio;

 		atomic_inc(&r10_bio->remaining);
-		generic_make_request(mbio);
+		bio_list_add(&bl, mbio);
 	}

-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	bio_list_merge(&conf->pending_bio_list, &bl);
+	blk_plug_device(mddev->queue);
+	spin_unlock_irqrestore(&conf->device_lock, flags);

 	return 0;
 }
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return 0;

-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->saved_raid_disk >= 0 &&
+	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+		mirror = rdev->saved_raid_disk;
+	else
+		mirror = 0;
+	for ( ; mirror < mddev->raid_disks; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {

 			blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
 			found = 1;
+			if (rdev->saved_raid_disk != mirror)
+				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 		spin_lock_irqsave(&conf->device_lock, flags);
+
+		if (conf->pending_bio_list.head) {
+			bio = bio_list_get(&conf->pending_bio_list);
+			blk_remove_plug(mddev->queue);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+			if (bitmap_unplug(mddev->bitmap) != 0)
+				printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+			while (bio) { /* submit pending writes */
+				struct bio *next = bio->bi_next;
+				bio->bi_next = NULL;
+				generic_make_request(bio);
+				bio = next;
+			}
+			unplug = 1;
+
+			continue;
+		}
+
 		if (list_empty(head))
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int max_sync;
+	int sync_blocks;

 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		/* If we aborted, we need to abort the
+		 * sync on the 'current' bitmap chunks (there can
+		 * be several when recovering multiple devices),
+		 * as we may have started syncing it but not finished.
+		 * We can find the current address in
+		 * mddev->curr_resync, but for recovery,
+		 * we need to convert that to several
+		 * virtual addresses.
+		 */
+		if (mddev->curr_resync < max_sector) { /* aborted */
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+						&sync_blocks, 1);
+			else for (i=0; i<conf->raid_disks; i++) {
+				sector_t sect =
+					raid10_find_virt(conf, mddev->curr_resync, i);
+				bitmap_end_sync(mddev->bitmap, sect,
+						&sync_blocks, 1);
+			}
+		} else /* completed sync */
+			conf->fullsync = 0;
+
+		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
 		return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 */
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	raise_barrier(conf);
-	conf->next_resync = sector_nr;

 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 * end_sync_write if we will want to write.
 	 */

+	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int i, j, k;
@@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		for (i=0 ; i<conf->raid_disks; i++)
 			if (conf->mirrors[i].rdev &&
 			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
+				int still_degraded = 0;
 				/* want to reconstruct this device */
 				r10bio_t *rb2 = r10_bio;
+				sector_t sect = raid10_find_virt(conf, sector_nr, i);
+				int must_sync;
+				/* Unless we are doing a full sync, we only need
+				 * to recover the block if it is set in the bitmap
+				 */
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, 1);
+				if (sync_blocks < max_sync)
+					max_sync = sync_blocks;
+				if (!must_sync &&
+				    !conf->fullsync) {
+					/* yep, skip the sync_blocks here, but don't assume
+					 * that there will never be anything to do here
+					 */
+					chunks_skipped = -1;
+					continue;
+				}

 				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				spin_lock_irq(&conf->resync_lock);
-				if (rb2) conf->barrier++;
-				spin_unlock_irq(&conf->resync_lock);
+				raise_barrier(conf, rb2 != NULL);
 				atomic_set(&r10_bio->remaining, 0);

 				r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 				atomic_inc(&rb2->remaining);
 				r10_bio->mddev = mddev;
 				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
+				r10_bio->sector = sect;
+
 				raid10_find_phys(conf, r10_bio);
+				/* Need to check if this section will still be
+				 * degraded
+				 */
+				for (j=0; j<conf->copies;j++) {
+					int d = r10_bio->devs[j].devnum;
+					if (conf->mirrors[d].rdev == NULL ||
+					    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+						still_degraded = 1;
+				}
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, still_degraded);
+
 				for (j=0; j<conf->copies;j++) {
 					int d = r10_bio->devs[j].devnum;
 					if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	} else {
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;

+		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+				       &sync_blocks, mddev->degraded) &&
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/* We can skip this block */
+			*skipped = 1;
+			return sync_blocks + sectors_skipped;
+		}
+		if (sync_blocks < max_sync)
+			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
+		raise_barrier(conf, 0);
+		conf->next_resync = sector_nr;

 		r10_bio->master_bio = NULL;
 		r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	}

 	nr_sectors = 0;
+	if (sector_nr + max_sync < max_sector)
+		max_sector = sector_nr + max_sync;
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
 	return 0;
 }

+static void raid10_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		raise_barrier(conf, 0);
+		break;
+	case 0:
+		lower_barrier(conf);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
+
 static mdk_personality_t raid10_personality =
 {
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
 	.hot_remove_disk= raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
+	.quiesce	= raid10_quiesce,
 };

 static int __init raid_init(void)
...
@@ -35,13 +35,19 @@ struct r10_private_data_s {
 	sector_t		chunk_mask;

 	struct list_head	retry_list;
-	/* for use when syncing mirrors: */
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;

 	spinlock_t		resync_lock;
 	int			nr_pending;
 	int			nr_waiting;
 	int			barrier;
 	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */

 	wait_queue_head_t	wait_barrier;
@@ -100,4 +106,5 @@ struct r10bio_s {
 #define R10BIO_Uptodate	0
 #define R10BIO_IsSync	1
 #define R10BIO_IsRecover	2
+#define R10BIO_Degraded	3
 #endif
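
One detail worth calling out from the raid10.c changes above: make_request no longer calls generic_make_request() directly for write bios. It queues them on conf->pending_bio_list and plugs the queue, and raid10d later calls bitmap_unplug() before submitting the queued bios, so the dirty bitmap bits are persisted before the data writes they cover can reach the disks. The userspace sketch below models only that ordering; the types and helpers (pending_write, bitmap_flush, submit_pending) are invented for illustration and are not kernel interfaces.

    /*
     * Rough userspace model of the ordering enforced in raid10d:
     * queued data writes are only submitted after the dirty bitmap
     * state has been flushed.  All names here are made up for the sketch.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct pending_write {
        unsigned long sector;
        struct pending_write *next;
    };

    static struct pending_write *pending_head;   /* models conf->pending_bio_list */

    static void queue_write(unsigned long sector)
    {
        struct pending_write *w = malloc(sizeof(*w));
        if (!w)
            return;
        w->sector = sector;
        w->next = pending_head;                  /* submission order is not the point here */
        pending_head = w;
        printf("queued write to sector %lu (intent bit set in memory)\n", sector);
    }

    static void bitmap_flush(void)
    {
        /* stands in for bitmap_unplug(): persist dirty bits before data I/O */
        printf("bitmap flushed to disk\n");
    }

    static void submit_pending(void)
    {
        bitmap_flush();                          /* must happen before the data writes */
        while (pending_head) {
            struct pending_write *w = pending_head;
            pending_head = w->next;
            printf("submitting data write to sector %lu\n", w->sector);
            free(w);
        }
    }

    int main(void)
    {
        queue_write(128);
        queue_write(256);
        submit_pending();                        /* flush bitmap, then submit data */
        return 0;
    }
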