Commit fad61490 authored by Trond Myklebust

nfs: Use UNSTABLE + COMMIT for NFS O_DIRECT writes

Currently NFS O_DIRECT writes use FILE_SYNC so that a COMMIT is not
necessary.  This simplifies the internal logic, but this could be a
difficult workload for some servers.

Instead, let's send UNSTABLE writes, and after they all complete, send a
COMMIT for the dirty range.  After the COMMIT returns successfully, then do
the wake_up or fire off aio_complete().

Test plan:
Async direct I/O tests against Solaris (or any server that requires
committed unstable writes).  Reboot server during test.

Based on an earlier patch by Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
parent e17b1fc4
...@@ -69,11 +69,15 @@ struct nfs_direct_req { ...@@ -69,11 +69,15 @@ struct nfs_direct_req {
struct kref kref; /* release manager */ struct kref kref; /* release manager */
/* I/O parameters */ /* I/O parameters */
struct list_head list; /* nfs_read/write_data structs */ struct list_head list, /* nfs_read/write_data structs */
rewrite_list; /* saved nfs_write_data structs */
struct file * filp; /* file descriptor */ struct file * filp; /* file descriptor */
struct kiocb * iocb; /* controlling i/o request */ struct kiocb * iocb; /* controlling i/o request */
wait_queue_head_t wait; /* wait for i/o completion */ wait_queue_head_t wait; /* wait for i/o completion */
struct inode * inode; /* target file of i/o */ struct inode * inode; /* target file of i/o */
unsigned long user_addr; /* location of user's buffer */
size_t user_count; /* total bytes to move */
loff_t pos; /* starting offset in file */
struct page ** pages; /* pages in our buffer */ struct page ** pages; /* pages in our buffer */
unsigned int npages; /* count of pages */ unsigned int npages; /* count of pages */
...@@ -82,8 +86,18 @@ struct nfs_direct_req { ...@@ -82,8 +86,18 @@ struct nfs_direct_req {
int outstanding; /* i/os we're waiting for */ int outstanding; /* i/os we're waiting for */
ssize_t count, /* bytes actually processed */ ssize_t count, /* bytes actually processed */
error; /* any reported error */ error; /* any reported error */
/* commit state */
struct nfs_write_data * commit_data; /* special write_data for commits */
int flags;
#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
struct nfs_writeverf verf; /* unstable write verifier */
}; };
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
/** /**
* nfs_direct_IO - NFS address space operation for direct I/O * nfs_direct_IO - NFS address space operation for direct I/O
* @rw: direction (read or write) * @rw: direction (read or write)
...@@ -160,11 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) ...@@ -160,11 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_init(&dreq->kref); kref_init(&dreq->kref);
init_waitqueue_head(&dreq->wait); init_waitqueue_head(&dreq->wait);
INIT_LIST_HEAD(&dreq->list); INIT_LIST_HEAD(&dreq->list);
INIT_LIST_HEAD(&dreq->rewrite_list);
dreq->iocb = NULL; dreq->iocb = NULL;
spin_lock_init(&dreq->lock); spin_lock_init(&dreq->lock);
dreq->outstanding = 0; dreq->outstanding = 0;
dreq->count = 0; dreq->count = 0;
dreq->error = 0; dreq->error = 0;
dreq->flags = 0;
return dreq; return dreq;
} }
...@@ -299,7 +315,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = { ...@@ -299,7 +315,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
* For each nfs_read_data struct that was allocated on the list, dispatch * For each nfs_read_data struct that was allocated on the list, dispatch
* an NFS READ operation * an NFS READ operation
*/ */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
{ {
struct file *file = dreq->filp; struct file *file = dreq->filp;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
...@@ -307,11 +323,13 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long ...@@ -307,11 +323,13 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
file->private_data; file->private_data;
struct list_head *list = &dreq->list; struct list_head *list = &dreq->list;
struct page **pages = dreq->pages; struct page **pages = dreq->pages;
size_t count = dreq->user_count;
loff_t pos = dreq->pos;
size_t rsize = NFS_SERVER(inode)->rsize; size_t rsize = NFS_SERVER(inode)->rsize;
unsigned int curpage, pgbase; unsigned int curpage, pgbase;
curpage = 0; curpage = 0;
pgbase = user_addr & ~PAGE_MASK; pgbase = dreq->user_addr & ~PAGE_MASK;
do { do {
struct nfs_read_data *data; struct nfs_read_data *data;
size_t bytes; size_t bytes;
...@@ -373,6 +391,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size ...@@ -373,6 +391,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
if (!dreq) if (!dreq)
return -ENOMEM; return -ENOMEM;
dreq->user_addr = user_addr;
dreq->user_count = count;
dreq->pos = pos;
dreq->pages = pages; dreq->pages = pages;
dreq->npages = nr_pages; dreq->npages = nr_pages;
igrab(inode); igrab(inode);
...@@ -383,13 +404,137 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size ...@@ -383,13 +404,137 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
rpc_clnt_sigmask(clnt, &oldset); rpc_clnt_sigmask(clnt, &oldset);
nfs_direct_read_schedule(dreq, user_addr, count, pos); nfs_direct_read_schedule(dreq);
result = nfs_direct_wait(dreq); result = nfs_direct_wait(dreq);
rpc_clnt_sigunmask(clnt, &oldset); rpc_clnt_sigunmask(clnt, &oldset);
return result; return result;
} }
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
{
list_splice_init(&dreq->rewrite_list, &dreq->list);
while (!list_empty(&dreq->list)) {
struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
list_del(&data->pages);
nfs_writedata_release(data);
}
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct list_head *pos;
list_splice_init(&dreq->rewrite_list, &dreq->list);
list_for_each(pos, &dreq->list)
dreq->outstanding++;
dreq->count = 0;
nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}
/*
 * rpc_call_done callback for the COMMIT that follows unstable O_DIRECT
 * writes.
 *
 * @task:     the RPC task that carried the COMMIT
 * @calldata: the nfs_write_data used for the COMMIT; its ->req field holds
 *            the owning nfs_direct_req
 *
 * If the COMMIT failed, or if the verifier it returned differs from the one
 * recorded from the WRITE replies, the request is flagged with
 * NFS_ODIRECT_RESCHED_WRITES so that nfs_direct_write_complete() resends the
 * data with FLUSH_STABLE.
 *
 * A failed COMMIT is deliberately NOT stored in dreq->error: the writes are
 * about to be resent, and nothing in the reschedule path clears dreq->error,
 * so recording it here would leave a stale error on the request even when
 * the resend succeeds.  If the resend itself fails, the write completion
 * callback records that error.
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;

	if (unlikely(task->tk_status < 0)) {
		/* COMMIT failed: fall back to resending the writes */
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		/* Only compare verifiers when the reply is actually valid */
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}
/*
 * RPC callbacks used for the COMMIT sent after unstable O_DIRECT writes:
 * nfs_direct_commit_result handles the reply, nfs_commit_release is the
 * release hook for the commit's nfs_write_data.
 */
static const struct rpc_call_ops nfs_commit_direct_ops = {
.rpc_call_done = nfs_direct_commit_result,
.rpc_release = nfs_commit_release,
};
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
struct file *file = dreq->filp;
struct nfs_open_context *ctx = (struct nfs_open_context *)
file->private_data;
struct nfs_write_data *data = dreq->commit_data;
struct rpc_task *task = &data->task;
data->inode = dreq->inode;
data->cred = ctx->cred;
data->args.fh = NFS_FH(data->inode);
data->args.offset = dreq->pos;
data->args.count = dreq->user_count;
data->res.count = 0;
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
&nfs_commit_direct_ops, data);
NFS_PROTO(data->inode)->commit_setup(data, 0);
data->task.tk_priority = RPC_PRIORITY_NORMAL;
data->task.tk_cookie = (unsigned long)data->inode;
/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
dreq->commit_data = NULL;
dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
lock_kernel();
rpc_execute(&data->task);
unlock_kernel();
}
/*
 * Decide what happens once a round of I/O for @dreq has finished.
 *
 * dreq->flags is sampled and cleared, then:
 *   - NFS_ODIRECT_DO_COMMIT:      a COMMIT is scheduled;
 *   - NFS_ODIRECT_RESCHED_WRITES: the writes are rescheduled;
 *   - anything else:              the request is finished: the data update
 *     on @inode is ended, an unused commit structure (if any) and all write
 *     structures are freed, and nfs_direct_complete() is called.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	const int pending = dreq->flags;

	dreq->flags = 0;

	if (pending == NFS_ODIRECT_DO_COMMIT) {
		nfs_direct_commit_schedule(dreq);
		return;
	}
	if (pending == NFS_ODIRECT_RESCHED_WRITES) {
		nfs_direct_write_reschedule(dreq);
		return;
	}

	/* Nothing left to do on the wire: tear down and complete */
	nfs_end_data_update(inode);
	if (dreq->commit_data != NULL)
		nfs_commit_free(dreq->commit_data);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
dreq->commit_data = nfs_commit_alloc(0);
if (dreq->commit_data != NULL)
dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * no commit structure is allocated, dreq->commit_data is simply set to NULL.
 */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
dreq->commit_data = NULL;
}
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * there is no commit or reschedule step, so the request is finished
 * directly: end the data update on @inode, free the write structures and
 * complete the request.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
nfs_end_data_update(inode);
nfs_direct_free_writedata(dreq);
nfs_direct_complete(dreq);
}
#endif
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize) static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{ {
struct list_head *list; struct list_head *list;
...@@ -424,14 +569,13 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize ...@@ -424,14 +569,13 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize
break; break;
nbytes -= wsize; nbytes -= wsize;
} }
nfs_alloc_commit_data(dreq);
kref_get(&dreq->kref); kref_get(&dreq->kref);
return dreq; return dreq;
} }
/*
* NB: Return the value of the first error return code. Subsequent
* errors after the first one are ignored.
*/
static void nfs_direct_write_result(struct rpc_task *task, void *calldata) static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{ {
struct nfs_write_data *data = calldata; struct nfs_write_data *data = calldata;
...@@ -440,41 +584,62 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) ...@@ -440,41 +584,62 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
if (nfs_writeback_done(task, data) != 0) if (nfs_writeback_done(task, data) != 0)
return; return;
/* If the server fell back to an UNSTABLE write, it's an error. */
if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
status = -EIO;
spin_lock(&dreq->lock); spin_lock(&dreq->lock);
if (likely(status >= 0)) if (likely(status >= 0))
dreq->count += data->res.count; dreq->count += data->res.count;
else else
dreq->error = status; dreq->error = task->tk_status;
if (data->res.verf->committed != NFS_FILE_SYNC) {
switch (dreq->flags) {
case 0:
memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
dreq->flags = NFS_ODIRECT_DO_COMMIT;
break;
case NFS_ODIRECT_DO_COMMIT:
if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
dprintk("NFS: %5u write verify failed\n", task->tk_pid);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
}
}
}
/* In case we have to resend */
data->args.stable = NFS_FILE_SYNC;
spin_unlock(&dreq->lock);
}
/*
* NB: Return the value of the first error return code. Subsequent
* errors after the first one are ignored.
*/
static void nfs_direct_write_release(void *calldata)
{
struct nfs_write_data *data = calldata;
struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
spin_lock(&dreq->lock);
if (--dreq->outstanding) { if (--dreq->outstanding) {
spin_unlock(&dreq->lock); spin_unlock(&dreq->lock);
return; return;
} }
spin_unlock(&dreq->lock); spin_unlock(&dreq->lock);
nfs_end_data_update(data->inode); nfs_direct_write_complete(dreq, data->inode);
nfs_direct_complete(dreq);
} }
static const struct rpc_call_ops nfs_write_direct_ops = { static const struct rpc_call_ops nfs_write_direct_ops = {
.rpc_call_done = nfs_direct_write_result, .rpc_call_done = nfs_direct_write_result,
.rpc_release = nfs_writedata_release, .rpc_release = nfs_direct_write_release,
}; };
/* /*
* For each nfs_write_data struct that was allocated on the list, dispatch * For each nfs_write_data struct that was allocated on the list, dispatch
* an NFS WRITE operation * an NFS WRITE operation
*
* XXX: For now, support only FILE_SYNC writes. Later we may add
* support for UNSTABLE + COMMIT.
*/ */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
{ {
struct file *file = dreq->filp; struct file *file = dreq->filp;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
...@@ -482,11 +647,13 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long ...@@ -482,11 +647,13 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
file->private_data; file->private_data;
struct list_head *list = &dreq->list; struct list_head *list = &dreq->list;
struct page **pages = dreq->pages; struct page **pages = dreq->pages;
size_t count = dreq->user_count;
loff_t pos = dreq->pos;
size_t wsize = NFS_SERVER(inode)->wsize; size_t wsize = NFS_SERVER(inode)->wsize;
unsigned int curpage, pgbase; unsigned int curpage, pgbase;
curpage = 0; curpage = 0;
pgbase = user_addr & ~PAGE_MASK; pgbase = dreq->user_addr & ~PAGE_MASK;
do { do {
struct nfs_write_data *data; struct nfs_write_data *data;
size_t bytes; size_t bytes;
...@@ -496,7 +663,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long ...@@ -496,7 +663,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
bytes = count; bytes = count;
data = list_entry(list->next, struct nfs_write_data, pages); data = list_entry(list->next, struct nfs_write_data, pages);
list_del_init(&data->pages); list_move_tail(&data->pages, &dreq->rewrite_list);
data->inode = inode; data->inode = inode;
data->cred = ctx->cred; data->cred = ctx->cred;
...@@ -512,7 +679,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long ...@@ -512,7 +679,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
&nfs_write_direct_ops, data); &nfs_write_direct_ops, data);
NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE); NFS_PROTO(inode)->write_setup(data, sync);
data->task.tk_priority = RPC_PRIORITY_NORMAL; data->task.tk_priority = RPC_PRIORITY_NORMAL;
data->task.tk_cookie = (unsigned long) inode; data->task.tk_cookie = (unsigned long) inode;
...@@ -544,11 +711,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz ...@@ -544,11 +711,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
struct inode *inode = iocb->ki_filp->f_mapping->host; struct inode *inode = iocb->ki_filp->f_mapping->host;
struct rpc_clnt *clnt = NFS_CLIENT(inode); struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_direct_req *dreq; struct nfs_direct_req *dreq;
size_t wsize = NFS_SERVER(inode)->wsize;
int sync = 0;
dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize); dreq = nfs_direct_write_alloc(count, wsize);
if (!dreq) if (!dreq)
return -ENOMEM; return -ENOMEM;
if (dreq->commit_data == NULL || count < wsize)
sync = FLUSH_STABLE;
dreq->user_addr = user_addr;
dreq->user_count = count;
dreq->pos = pos;
dreq->pages = pages; dreq->pages = pages;
dreq->npages = nr_pages; dreq->npages = nr_pages;
igrab(inode); igrab(inode);
...@@ -562,7 +736,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz ...@@ -562,7 +736,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
nfs_begin_data_update(inode); nfs_begin_data_update(inode);
rpc_clnt_sigmask(clnt, &oldset); rpc_clnt_sigmask(clnt, &oldset);
nfs_direct_write_schedule(dreq, user_addr, count, pos); nfs_direct_write_schedule(dreq, sync);
result = nfs_direct_wait(dreq); result = nfs_direct_wait(dreq);
rpc_clnt_sigunmask(clnt, &oldset); rpc_clnt_sigunmask(clnt, &oldset);
......
...@@ -422,6 +422,7 @@ void nfs_commit_free(struct nfs_write_data *p); ...@@ -422,6 +422,7 @@ void nfs_commit_free(struct nfs_write_data *p);
extern int nfs_sync_inode(struct inode *, unsigned long, unsigned int, int); extern int nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int); extern int nfs_commit_inode(struct inode *, int);
extern void nfs_commit_release(void *wdata);
#else #else
static inline int static inline int
nfs_commit_inode(struct inode *inode, int how) nfs_commit_inode(struct inode *inode, int how)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment