Commit 06386bbf authored by Jonathan Brassow's avatar Jonathan Brassow Committed by Alasdair G Kergon

dm raid1: handle read failures

This patch gives the ability to respond-to/record device failures
that happen during read operations.  It also adds the ability to
read from mirror devices that are not the primary if they are
in-sync.

There are essentially two read paths in mirroring; the direct path
and the queued path.  When a read request is mapped, if the region
is 'in-sync' the direct path is taken; otherwise the queued path
is taken.

If the direct path is taken, we must record bio information so that
if the read fails we can retry it.  We then discover the status of
a direct read through mirror_end_io.  If the read has failed, we will
mark the device from which the read was attempted as failed (so we
don't try to read from it again), restore the bio and try again.

If the queued path is taken, we discover the results of the read
from 'read_callback'.  If the device failed, we will mark the device
as failed and attempt the read again if there is another device
where this region is known to be 'in-sync'.
Signed-off-by: default avatarJonathan Brassow <jbrassow@redhat.com>
Signed-off-by: default avatarAlasdair G Kergon <agk@redhat.com>
parent b80aa7a0
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "dm.h" #include "dm.h"
#include "dm-bio-list.h" #include "dm-bio-list.h"
#include "dm-bio-record.h"
#include "dm-io.h" #include "dm-io.h"
#include "dm-log.h" #include "dm-log.h"
#include "kcopyd.h" #include "kcopyd.h"
...@@ -141,6 +142,7 @@ struct mirror_set { ...@@ -141,6 +142,7 @@ struct mirror_set {
struct bio_list failures; struct bio_list failures;
struct dm_io_client *io_client; struct dm_io_client *io_client;
mempool_t *read_record_pool;
/* recovery */ /* recovery */
region_t nr_regions; region_t nr_regions;
...@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh) ...@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh)
wake(rh->ms); wake(rh->ms);
} }
#define MIN_READ_RECORDS 20
struct dm_raid1_read_record {
struct mirror *m;
struct dm_bio_details details;
};
/* /*
* Every mirror should look like this one. * Every mirror should look like this one.
*/ */
#define DEFAULT_MIRROR 0 #define DEFAULT_MIRROR 0
/* /*
* This is yucky. We squirrel the mirror_set struct away inside * This is yucky. We squirrel the mirror struct away inside
* bi_next for write buffers. This is safe since the bh * bi_next for read/write buffers. This is safe since the bh
* doesn't get submitted to the lower levels of block layer. * doesn't get submitted to the lower levels of block layer.
*/ */
static struct mirror_set *bio_get_ms(struct bio *bio) static struct mirror *bio_get_m(struct bio *bio)
{ {
return (struct mirror_set *) bio->bi_next; return (struct mirror *) bio->bi_next;
} }
static void bio_set_ms(struct bio *bio, struct mirror_set *ms) static void bio_set_m(struct bio *bio, struct mirror *m)
{ {
bio->bi_next = (struct bio *) ms; bio->bi_next = (struct bio *) m;
} }
static struct mirror *get_default_mirror(struct mirror_set *ms) static struct mirror *get_default_mirror(struct mirror_set *ms)
...@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms) ...@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{ {
/* FIXME: add read balancing */ struct mirror *m = get_default_mirror(ms);
return get_default_mirror(ms);
do {
if (likely(!atomic_read(&m->error_count)))
return m;
if (m-- == ms->mirror)
m += ms->nr_mirrors;
} while (m != get_default_mirror(ms));
return NULL;
}
static int default_ok(struct mirror *m)
{
struct mirror *default_mirror = get_default_mirror(m->ms);
return !atomic_read(&default_mirror->error_count);
}
static int mirror_available(struct mirror_set *ms, struct bio *bio)
{
region_t region = bio_to_region(&ms->rh, bio);
if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
return 0;
} }
/* /*
* remap a buffer to a particular mirror. * remap a buffer to a particular mirror.
*/ */
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) static sector_t map_sector(struct mirror *m, struct bio *bio)
{
return m->offset + (bio->bi_sector - m->ms->ti->begin);
}
static void map_bio(struct mirror *m, struct bio *bio)
{ {
bio->bi_bdev = m->dev->bdev; bio->bi_bdev = m->dev->bdev;
bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); bio->bi_sector = map_sector(m, bio);
}
static void map_region(struct io_region *io, struct mirror *m,
struct bio *bio)
{
io->bdev = m->dev->bdev;
io->sector = map_sector(m, bio);
io->count = bio->bi_size >> 9;
}
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
static void read_callback(unsigned long error, void *context)
{
struct bio *bio = context;
struct mirror *m;
m = bio_get_m(bio);
bio_set_m(bio, NULL);
if (likely(!error)) {
bio_endio(bio, 0);
return;
}
fail_mirror(m, DM_RAID1_READ_ERROR);
if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
DMWARN_LIMIT("Read failure on mirror device %s. "
"Trying alternative device.",
m->dev->name);
queue_bio(m->ms, bio, bio_rw(bio));
return;
}
DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
m->dev->name);
bio_endio(bio, -EIO);
}
/* Asynchronous read. */
static void read_async_bio(struct mirror *m, struct bio *bio)
{
struct io_region io;
struct dm_io_request io_req = {
.bi_rw = READ,
.mem.type = DM_IO_BVEC,
.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
.notify.fn = read_callback,
.notify.context = bio,
.client = m->ms->io_client,
};
map_region(&io, m, bio);
bio_set_m(bio, m);
(void) dm_io(&io_req, 1, &io, NULL);
} }
static void do_reads(struct mirror_set *ms, struct bio_list *reads) static void do_reads(struct mirror_set *ms, struct bio_list *reads)
...@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) ...@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
while ((bio = bio_list_pop(reads))) { while ((bio = bio_list_pop(reads))) {
region = bio_to_region(&ms->rh, bio); region = bio_to_region(&ms->rh, bio);
m = get_default_mirror(ms);
/* /*
* We can only read balance if the region is in sync. * We can only read balance if the region is in sync.
*/ */
if (rh_in_sync(&ms->rh, region, 1)) if (likely(rh_in_sync(&ms->rh, region, 1)))
m = choose_mirror(ms, bio->bi_sector); m = choose_mirror(ms, bio->bi_sector);
else else if (m && atomic_read(&m->error_count))
m = get_default_mirror(ms); m = NULL;
map_bio(ms, m, bio); if (likely(m))
generic_make_request(bio); read_async_bio(m, bio);
else
bio_endio(bio, -EIO);
} }
} }
...@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context) ...@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context)
int should_wake = 0; int should_wake = 0;
unsigned long flags; unsigned long flags;
ms = bio_get_ms(bio); ms = bio_get_m(bio)->ms;
bio_set_ms(bio, NULL); bio_set_m(bio, NULL);
/* /*
* NOTE: We don't decrement the pending count here, * NOTE: We don't decrement the pending count here,
...@@ -1008,7 +1107,7 @@ out: ...@@ -1008,7 +1107,7 @@ out:
static void do_write(struct mirror_set *ms, struct bio *bio) static void do_write(struct mirror_set *ms, struct bio *bio)
{ {
unsigned int i; unsigned int i;
struct io_region io[KCOPYD_MAX_REGIONS+1]; struct io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m; struct mirror *m;
struct dm_io_request io_req = { struct dm_io_request io_req = {
.bi_rw = WRITE, .bi_rw = WRITE,
...@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) ...@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
.client = ms->io_client, .client = ms->io_client,
}; };
for (i = 0; i < ms->nr_mirrors; i++) { for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
m = ms->mirror + i; map_region(dest++, m, bio);
io[i].bdev = m->dev->bdev; /*
io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); * Use default mirror because we only need it to retrieve the reference
io[i].count = bio->bi_size >> 9; * to the mirror set in write_callback().
} */
bio_set_m(bio, get_default_mirror(ms));
bio_set_ms(bio, ms);
(void) dm_io(&io_req, ms->nr_mirrors, io, NULL); (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
} }
...@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) ...@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
rh_delay(&ms->rh, bio); rh_delay(&ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) { while ((bio = bio_list_pop(&nosync))) {
map_bio(ms, get_default_mirror(ms), bio); map_bio(get_default_mirror(ms), bio);
generic_make_request(bio); generic_make_request(bio);
} }
} }
...@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ...@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
atomic_set(&ms->suspend, 0); atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR); atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
len = sizeof(struct dm_raid1_read_record);
ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
len);
if (!ms->read_record_pool) {
ti->error = "Error creating mirror read_record_pool";
kfree(ms);
return NULL;
}
ms->io_client = dm_io_client_create(DM_IO_PAGES); ms->io_client = dm_io_client_create(DM_IO_PAGES);
if (IS_ERR(ms->io_client)) { if (IS_ERR(ms->io_client)) {
ti->error = "Error creating dm_io client"; ti->error = "Error creating dm_io client";
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
return NULL; return NULL;
} }
...@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ...@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "Error creating dirty region hash"; ti->error = "Error creating dirty region hash";
dm_io_client_destroy(ms->io_client); dm_io_client_destroy(ms->io_client);
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
return NULL; return NULL;
} }
...@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, ...@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
dm_io_client_destroy(ms->io_client); dm_io_client_destroy(ms->io_client);
rh_exit(&ms->rh); rh_exit(&ms->rh);
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
} }
...@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, ...@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
int r, rw = bio_rw(bio); int r, rw = bio_rw(bio);
struct mirror *m; struct mirror *m;
struct mirror_set *ms = ti->private; struct mirror_set *ms = ti->private;
struct dm_raid1_read_record *read_record = NULL;
map_context->ll = bio_to_region(&ms->rh, bio);
if (rw == WRITE) { if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
map_context->ll = bio_to_region(&ms->rh, bio);
queue_bio(ms, bio, rw); queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
...@@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, ...@@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (r < 0 && r != -EWOULDBLOCK) if (r < 0 && r != -EWOULDBLOCK)
return r; return r;
if (r == -EWOULDBLOCK) /* FIXME: ugly */
r = DM_MAPIO_SUBMITTED;
/* /*
* We don't want to fast track a recovery just for a read * If region is not in-sync queue the bio.
* ahead. So we just let it silently fail.
* FIXME: get rid of this.
*/ */
if (!r && rw == READA) if (!r || (r == -EWOULDBLOCK)) {
return -EIO; if (rw == READA)
return -EWOULDBLOCK;
if (!r) {
/* Pass this io over to the daemon */
queue_bio(ms, bio, rw); queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
/*
* The region is in-sync and we can perform reads directly.
* Store enough information so we can retry if it fails.
*/
m = choose_mirror(ms, bio->bi_sector); m = choose_mirror(ms, bio->bi_sector);
if (!m) if (unlikely(!m))
return -EIO; return -EIO;
map_bio(ms, m, bio); read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
if (likely(read_record)) {
dm_bio_record(&read_record->details, bio);
map_context->ptr = read_record;
read_record->m = m;
}
map_bio(m, bio);
return DM_MAPIO_REMAPPED; return DM_MAPIO_REMAPPED;
} }
...@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, ...@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
{ {
int rw = bio_rw(bio); int rw = bio_rw(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror_set *ms = (struct mirror_set *) ti->private;
region_t region = map_context->ll; struct mirror *m = NULL;
struct dm_bio_details *bd = NULL;
struct dm_raid1_read_record *read_record = map_context->ptr;
/* /*
* We need to dec pending if this was a write. * We need to dec pending if this was a write.
*/ */
if (rw == WRITE) if (rw == WRITE) {
rh_dec(&ms->rh, region); rh_dec(&ms->rh, map_context->ll);
return error;
}
return 0; if (error == -EOPNOTSUPP)
goto out;
if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
goto out;
if (unlikely(error)) {
if (!read_record) {
/*
* There wasn't enough memory to record necessary
* information for a retry or there was no other
* mirror in-sync.
*/
DMERR_LIMIT("Mirror read failed from %s.",
m->dev->name);
return -EIO;
}
DMERR("Mirror read failed from %s. Trying alternative device.",
m->dev->name);
m = read_record->m;
fail_mirror(m, DM_RAID1_READ_ERROR);
/*
* A failed read is requeued for another attempt using an intact
* mirror.
*/
if (default_ok(m) || mirror_available(ms, bio)) {
bd = &read_record->details;
dm_bio_restore(bd, bio);
mempool_free(read_record, ms->read_record_pool);
map_context->ptr = NULL;
queue_bio(ms, bio, rw);
return 1;
}
DMERR("All replicated volumes dead, failing I/O");
}
out:
if (read_record) {
mempool_free(read_record, ms->read_record_pool);
map_context->ptr = NULL;
}
return error;
} }
static void mirror_presuspend(struct dm_target *ti) static void mirror_presuspend(struct dm_target *ti)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment