Commit 5f2cc086 authored by Chris Mason

Btrfs: Avoid unplug storms during commit

While doing a commit, btrfs makes sure all the metadata blocks
were properly written to disk, calling wait_on_page_writeback for
each page.  This writeback happens after allowing another transaction
to start, so it competes for the disk with other processes in the FS.

If the page writeback bit is still set, each wait_on_page_writeback might
trigger an unplug, even though the page might be waiting for checksumming
to finish or might be waiting for the async work queue to submit the
bio.

This trades wait_on_page_writeback for waiting on the extent writeback
bits.  It won't trigger any unplugs and substantially improves performance
in a number of workloads.

This also changes the async bio submission to avoid requeueing if there
is only one device.  The requeue just wastes CPU time because there are
no other devices to service.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 42e70e7a
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h" #include "ctree.h"
#include "disk-io.h" #include "disk-io.h"
#include "transaction.h" #include "transaction.h"
...@@ -331,6 +332,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ...@@ -331,6 +332,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
int werr = 0; int werr = 0;
struct page *page; struct page *page;
struct inode *btree_inode = root->fs_info->btree_inode; struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
u64 start = 0; u64 start = 0;
u64 end; u64 end;
unsigned long index; unsigned long index;
...@@ -371,6 +373,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ...@@ -371,6 +373,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
page_cache_release(page); page_cache_release(page);
} }
} }
/*
* we unplug once and then use the wait_on_extent_bit for
* everything else
*/
blk_run_address_space(btree_inode->i_mapping);
while(1) { while(1) {
ret = find_first_extent_bit(dirty_pages, 0, &start, &end, ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
EXTENT_DIRTY); EXTENT_DIRTY);
...@@ -391,7 +398,28 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ...@@ -391,7 +398,28 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
if (err) if (err)
werr = err; werr = err;
} }
if (PageWriteback(page)) {
/*
* we don't wait on the page writeback bit
* because that triggers a lot of unplugs.
* The extent bits are much nicer to
* the disks, but come with a slightly
* higher latency because we aren't forcing
* unplugs.
*/
wait_on_extent_writeback(io_tree,
page_offset(page),
page_offset(page) +
PAGE_CACHE_SIZE - 1);
}
if (PageWriteback(page)) {
/*
* the state bits get cleared before the
* page bits, lets add some extra
* paranoia here
*/
wait_on_page_writeback(page); wait_on_page_writeback(page);
}
page_cache_release(page); page_cache_release(page);
cond_resched(); cond_resched();
} }
......
...@@ -200,7 +200,8 @@ loop: ...@@ -200,7 +200,8 @@ loop:
* is now congested. Back off and let other work structs * is now congested. Back off and let other work structs
* run instead * run instead
*/ */
if (pending && bdi_write_congested(bdi)) { if (pending && bdi_write_congested(bdi) &&
fs_info->fs_devices->open_devices > 1) {
struct bio *old_head; struct bio *old_head;
spin_lock(&device->io_lock); spin_lock(&device->io_lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment