Commit c63c7b05 authored by Trond Myklebust

NFS: Fix a race when doing NFS write coalescing

Currently we do write coalescing in a very inefficient manner: one pass in
generic_writepages() in order to lock the pages for writing, then one pass
in nfs_flush_mapping() and/or nfs_sync_mapping_wait() in order to gather
the locked pages for coalescing into RPC requests of size "wsize".

In fact, it turns out there is actually a deadlock possible here since we
only start I/O on the second pass. If the user signals the process while
we're in nfs_sync_mapping_wait(), for instance, then we may exit before
starting I/O on all the requests that have been queued up.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
parent 8b09bee3
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <linux/nfs_page.h> #include <linux/nfs_page.h>
#include <linux/nfs_fs.h> #include <linux/nfs_fs.h>
#include <linux/nfs_mount.h> #include <linux/nfs_mount.h>
#include <linux/writeback.h>
#define NFS_PARANOIA 1 #define NFS_PARANOIA 1
...@@ -353,25 +352,6 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, ...@@ -353,25 +352,6 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
return 1; return 1;
} }
/**
 * nfs_pageio_add_list - Feed a list of requests into an I/O descriptor.
 * @desc: destination io descriptor
 * @head: source list of nfs_page requests
 *
 * Moves requests from @head into @desc one at a time, stopping as soon
 * as nfs_pageio_add_request() refuses one (i.e. the request cannot be
 * coalesced with those already gathered: the pages are checked to form
 * a contiguous set and the RPC credentials must be the same).
 * NOTE(review): the loop only advances if nfs_pageio_add_request()
 * dequeues each request it accepts from @head; any refused remainder
 * is left on @head for the caller.
 */
void nfs_pageio_add_list(struct nfs_pageio_descriptor *desc,
struct list_head *head)
{
/* Drain @head front-to-back until empty or a request won't coalesce. */
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
if (!nfs_pageio_add_request(desc, req))
break;
}
}
/** /**
* nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
* @desc: pointer to io descriptor * @desc: pointer to io descriptor
...@@ -382,78 +362,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) ...@@ -382,78 +362,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
} }
#define NFS_SCAN_MAXENTRIES 16 #define NFS_SCAN_MAXENTRIES 16
/**
 * nfs_scan_dirty - Scan the radix tree for dirty requests
 * @mapping: pointer to address space
 * @wbc: writeback_control describing the page range to scan
 * @dst: destination list
 *
 * Moves dirty requests from the inode's nfs_page radix tree onto @dst,
 * locking each one for writeback and clearing its DIRTY tag as it goes.
 * The range scanned comes from @wbc: the entire address_space when
 * @wbc->range_cyclic is set, otherwise [range_start, range_end], with
 * range_end == 0 meaning "to the end of the file".
 * The requests are *not* checked to ensure that they form a contiguous set.
 * You must be holding the inode's req_lock when calling this function.
 *
 * Returns the number of requests moved to @dst (0 if nothing was dirty).
 */
long nfs_scan_dirty(struct address_space *mapping,
struct writeback_control *wbc,
struct list_head *dst)
{
struct nfs_inode *nfsi = NFS_I(mapping->host);
struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
struct nfs_page *req;
pgoff_t idx_start, idx_end;
long res = 0;
int found, i;
/* Fast path: nothing dirty, nothing to scan. */
if (nfsi->ndirty == 0)
return 0;
/* Translate the writeback_control byte range into page indices. */
if (wbc->range_cyclic) {
idx_start = 0;
idx_end = ULONG_MAX;
} else if (wbc->range_end == 0) {
/* range_end == 0 means "from range_start to end of file" */
idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
idx_end = ULONG_MAX;
} else {
idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
}
/* Gather dirty-tagged requests in batches of NFS_SCAN_MAXENTRIES. */
for (;;) {
unsigned int toscan = NFS_SCAN_MAXENTRIES;
found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
(void **)&pgvec[0], idx_start, toscan,
NFS_PAGE_TAG_DIRTY);
/* Did we make progress? */
if (found <= 0)
break;
for (i = 0; i < found; i++) {
req = pgvec[i];
/* Gang lookup may overshoot a bounded range: stop there. */
if (!wbc->range_cyclic && req->wb_index > idx_end)
goto out;
/* Try to lock request and mark it for writeback */
if (!nfs_set_page_writeback_locked(req))
goto next;
/* Ours now: untag it and move it to the caller's list. */
radix_tree_tag_clear(&nfsi->nfs_page_tree,
req->wb_index, NFS_PAGE_TAG_DIRTY);
nfsi->ndirty--;
nfs_list_remove_request(req);
nfs_list_add_request(req, dst);
res++;
/* Avoid overflowing the long return value. */
if (res == LONG_MAX)
goto out;
next:
/* Resume the next gang lookup past this request. */
idx_start = req->wb_index + 1;
}
}
out:
/* Sanity check: the counter and the dirty list must agree. */
WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
return res;
}
/** /**
* nfs_scan_list - Scan a list for matching requests * nfs_scan_list - Scan a list for matching requests
* @nfsi: NFS inode * @nfsi: NFS inode
......
...@@ -38,7 +38,8 @@ ...@@ -38,7 +38,8 @@
static struct nfs_page * nfs_update_request(struct nfs_open_context*, static struct nfs_page * nfs_update_request(struct nfs_open_context*,
struct page *, struct page *,
unsigned int, unsigned int); unsigned int, unsigned int);
static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
struct inode *inode, int ioflags);
static const struct rpc_call_ops nfs_write_partial_ops; static const struct rpc_call_ops nfs_write_partial_ops;
static const struct rpc_call_ops nfs_write_full_ops; static const struct rpc_call_ops nfs_write_full_ops;
static const struct rpc_call_ops nfs_commit_ops; static const struct rpc_call_ops nfs_commit_ops;
...@@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, ...@@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
static int wb_priority(struct writeback_control *wbc) static int wb_priority(struct writeback_control *wbc)
{ {
if (wbc->for_reclaim) if (wbc->for_reclaim)
return FLUSH_HIGHPRI; return FLUSH_HIGHPRI | FLUSH_STABLE;
if (wbc->for_kupdate) if (wbc->for_kupdate)
return FLUSH_LOWPRI; return FLUSH_LOWPRI;
return 0; return 0;
...@@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page) ...@@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page)
* was not tagged. * was not tagged.
* May also return an error if the user signalled nfs_wait_on_request(). * May also return an error if the user signalled nfs_wait_on_request().
*/ */
static int nfs_page_mark_flush(struct page *page) static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
struct page *page)
{ {
struct nfs_page *req; struct nfs_page *req;
struct nfs_inode *nfsi = NFS_I(page->mapping->host); struct nfs_inode *nfsi = NFS_I(page->mapping->host);
...@@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page) ...@@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page)
* request as dirty (in which case we don't care). * request as dirty (in which case we don't care).
*/ */
spin_unlock(req_lock); spin_unlock(req_lock);
/* Prevent deadlock! */
nfs_pageio_complete(pgio);
ret = nfs_wait_on_request(req); ret = nfs_wait_on_request(req);
nfs_release_request(req); nfs_release_request(req);
if (ret != 0) if (ret != 0)
...@@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page) ...@@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page)
/* This request is marked for commit */ /* This request is marked for commit */
spin_unlock(req_lock); spin_unlock(req_lock);
nfs_unlock_request(req); nfs_unlock_request(req);
nfs_pageio_complete(pgio);
return 1; return 1;
} }
if (nfs_set_page_writeback(page) == 0) { if (nfs_set_page_writeback(page) != 0) {
nfs_list_remove_request(req);
/* add the request to the inode's dirty list. */
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index, NFS_PAGE_TAG_DIRTY);
nfs_list_add_request(req, &nfsi->dirty);
nfsi->ndirty++;
spin_unlock(req_lock);
__mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
} else
spin_unlock(req_lock); spin_unlock(req_lock);
BUG();
}
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_WRITEBACK);
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
nfs_unlock_request(req); spin_unlock(req_lock);
nfs_pageio_add_request(pgio, req);
return ret; return ret;
} }
...@@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page) ...@@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page)
*/ */
static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
{ {
struct nfs_pageio_descriptor mypgio, *pgio;
struct nfs_open_context *ctx; struct nfs_open_context *ctx;
struct inode *inode = page->mapping->host; struct inode *inode = page->mapping->host;
unsigned offset; unsigned offset;
...@@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc ...@@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
err = nfs_page_mark_flush(page); if (wbc->for_writepages)
pgio = wbc->fs_private;
else {
nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
pgio = &mypgio;
}
err = nfs_page_async_flush(pgio, page);
if (err <= 0) if (err <= 0)
goto out; goto out;
err = 0; err = 0;
...@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc ...@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
put_nfs_open_context(ctx); put_nfs_open_context(ctx);
if (err != 0) if (err != 0)
goto out; goto out;
err = nfs_page_mark_flush(page); err = nfs_page_async_flush(pgio, page);
if (err > 0) if (err > 0)
err = 0; err = 0;
out: out:
if (!wbc->for_writepages) if (!wbc->for_writepages)
nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); nfs_pageio_complete(pgio);
return err; return err;
} }
...@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) ...@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{ {
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
int err; int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
wbc->fs_private = &pgio;
err = generic_writepages(mapping, wbc); err = generic_writepages(mapping, wbc);
nfs_pageio_complete(&pgio);
if (err) if (err)
return err; return err;
err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); if (pgio.pg_error)
if (err < 0) return pgio.pg_error;
goto out; return 0;
nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
err = 0;
out:
return err;
} }
/* /*
...@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st ...@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
return res; return res;
} }
/*
 * nfs_cancel_dirty_list - throw away a list of dirty requests.
 * @head: list of nfs_page requests to cancel
 *
 * Dequeues every request on @head, ends writeback on its page,
 * removes the request from its inode, and drops the writeback lock.
 */
static void nfs_cancel_dirty_list(struct list_head *head)
{
	for (;;) {
		struct nfs_page *req;

		if (list_empty(head))
			break;
		/* Always cancel from the front of the list. */
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_end_page_writeback(req->wb_page);
		nfs_inode_remove_request(req);
		nfs_clear_page_writeback(req);
	}
}
static void nfs_cancel_commit_list(struct list_head *head) static void nfs_cancel_commit_list(struct list_head *head)
{ {
struct nfs_page *req; struct nfs_page *req;
...@@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou ...@@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou
return -ENOMEM; return -ENOMEM;
} }
static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
struct inode *inode, int ioflags)
{ {
struct nfs_pageio_descriptor desc;
int wpages = NFS_SERVER(inode)->wpages;
int wsize = NFS_SERVER(inode)->wsize; int wsize = NFS_SERVER(inode)->wsize;
/* For single writes, FLUSH_STABLE is more efficient */
if (npages <= wpages && npages == NFS_I(inode)->npages
&& nfs_list_entry(head->next)->wb_bytes <= wsize)
how |= FLUSH_STABLE;
if (wsize < PAGE_CACHE_SIZE) if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&desc, inode, nfs_flush_multi, wsize, how); nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else else
nfs_pageio_init(&desc, inode, nfs_flush_one, wsize, how); nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
nfs_pageio_add_list(&desc, head);
nfs_pageio_complete(&desc);
if (desc.pg_error == 0)
return 0;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_redirty_request(req);
nfs_end_page_writeback(req->wb_page);
nfs_clear_page_writeback(req);
}
return desc.pg_error;
} }
/* /*
...@@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = { ...@@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
.rpc_call_done = nfs_commit_done, .rpc_call_done = nfs_commit_done,
.rpc_release = nfs_commit_release, .rpc_release = nfs_commit_release,
}; };
#else
static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
{
return 0;
}
#endif
static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
{
struct nfs_inode *nfsi = NFS_I(mapping->host);
LIST_HEAD(head);
long res;
spin_lock(&nfsi->req_lock);
res = nfs_scan_dirty(mapping, wbc, &head);
spin_unlock(&nfsi->req_lock);
if (res) {
int error = nfs_flush_list(mapping->host, &head, res, how);
if (error < 0)
return error;
}
return res;
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
int nfs_commit_inode(struct inode *inode, int how) int nfs_commit_inode(struct inode *inode, int how)
{ {
struct nfs_inode *nfsi = NFS_I(inode); struct nfs_inode *nfsi = NFS_I(inode);
...@@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how) ...@@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how)
} }
return res; return res;
} }
#else
static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
{
return 0;
}
#endif #endif
long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
...@@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr ...@@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
ret = nfs_wait_on_requests_locked(inode, idx_start, npages); ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
if (ret != 0) if (ret != 0)
continue; continue;
pages = nfs_scan_dirty(mapping, wbc, &head);
if (pages != 0) {
spin_unlock(&nfsi->req_lock);
if (how & FLUSH_INVALIDATE) {
nfs_cancel_dirty_list(&head);
ret = pages;
} else
ret = nfs_flush_list(inode, &head, pages, how);
spin_lock(&nfsi->req_lock);
continue;
}
if (wbc->pages_skipped != 0)
continue;
if (nocommit) if (nocommit)
break; break;
pages = nfs_scan_commit(inode, &head, idx_start, npages); pages = nfs_scan_commit(inode, &head, idx_start, npages);
...@@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode) ...@@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode)
}; };
int ret; int ret;
ret = generic_writepages(mapping, &wbc); ret = nfs_writepages(mapping, &wbc);
if (ret < 0) if (ret < 0)
goto out; goto out;
ret = nfs_sync_mapping_wait(mapping, &wbc, 0); ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
...@@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo ...@@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
}; };
int ret; int ret;
if (!(how & FLUSH_NOWRITEPAGE)) { ret = nfs_writepages(mapping, &wbc);
ret = generic_writepages(mapping, &wbc);
if (ret < 0) if (ret < 0)
goto out; goto out;
}
ret = nfs_sync_mapping_wait(mapping, &wbc, how); ret = nfs_sync_mapping_wait(mapping, &wbc, how);
if (ret >= 0) if (ret >= 0)
return 0; return 0;
...@@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) ...@@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
int ret; int ret;
BUG_ON(!PageLocked(page)); BUG_ON(!PageLocked(page));
if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { if (clear_page_dirty_for_io(page)) {
ret = nfs_writepage_locked(page, &wbc); ret = nfs_writepage_locked(page, &wbc);
if (ret < 0) if (ret < 0)
goto out; goto out;
......
...@@ -21,8 +21,7 @@ ...@@ -21,8 +21,7 @@
/* /*
* Valid flags for the radix tree * Valid flags for the radix tree
*/ */
#define NFS_PAGE_TAG_DIRTY 0 #define NFS_PAGE_TAG_WRITEBACK 0
#define NFS_PAGE_TAG_WRITEBACK 1
/* /*
* Valid flags for a dirty buffer * Valid flags for a dirty buffer
...@@ -72,9 +71,6 @@ extern void nfs_clear_request(struct nfs_page *req); ...@@ -72,9 +71,6 @@ extern void nfs_clear_request(struct nfs_page *req);
extern void nfs_release_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req);
extern long nfs_scan_dirty(struct address_space *mapping,
struct writeback_control *wbc,
struct list_head *dst);
extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst, extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst,
unsigned long idx_start, unsigned int npages); unsigned long idx_start, unsigned int npages);
extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
...@@ -84,8 +80,6 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, ...@@ -84,8 +80,6 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
int how); int how);
extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
struct nfs_page *); struct nfs_page *);
extern void nfs_pageio_add_list(struct nfs_pageio_descriptor *,
struct list_head *);
extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
extern int nfs_wait_on_request(struct nfs_page *); extern int nfs_wait_on_request(struct nfs_page *);
extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_request(struct nfs_page *req);
......
...@@ -59,6 +59,8 @@ struct writeback_control { ...@@ -59,6 +59,8 @@ struct writeback_control {
unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned for_writepages:1; /* This is a writepages() call */ unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */ unsigned range_cyclic:1; /* range_start is cyclic */
void *fs_private; /* For use by ->writepages() */
}; };
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment