Commit 9ed74f2d authored by Josef Bacik, committed by Chris Mason

Btrfs: proper -ENOSPC handling

At the start of a transaction we do a btrfs_reserve_metadata_space() and
specify how many items we plan on modifying.  Then once we've done our
modifications and such, just call btrfs_unreserve_metadata_space() for
the same number of items we reserved.
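
In rough kernel-style C, the intended calling pattern looks like this (a minimal
sketch with a hypothetical caller, using the btrfs_start_transaction()/
btrfs_end_transaction() interfaces of that era; error handling trimmed):

	/*
	 * Sketch only: reserve for the items we plan to touch, do the
	 * work inside a transaction, then return the reservation.
	 */
	static int update_two_items(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;
		int ret;

		/* e.g. one inode item plus one dir item */
		ret = btrfs_reserve_metadata_space(root, 2);
		if (ret)
			return ret;

		trans = btrfs_start_transaction(root, 1);

		/* ... modify the two items here ... */

		ret = btrfs_end_transaction(trans, root);
		btrfs_unreserve_metadata_space(root, 2);
		return ret;
	}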

For keeping track of metadata needed for data I've had to add an extent_io op
for when we merge extents.  This lets us track space properly when we are doing
sequential writes, so we don't end up reserving way more metadata space than
what we need.
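
To make the sequential-write case concrete, here is a toy userspace model with
assumed numbers (256 buffered 4K writes into one file, not taken from the
patch); without merge tracking each write would keep its own per-extent
reservation even though the contiguous delalloc range merges into one extent:

	#include <stdio.h>

	int main(void)
	{
		int writes = 256;      /* 1MB written 4K at a time */
		int reserved = writes; /* one reservation taken per write */
		int extents = 1;       /* contiguous delalloc merges into one extent */

		printf("reserved for %d extents, actually need %d, can return %d\n",
		       reserved, extents, reserved - extents);
		return 0;
	}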

The only place where the metadata space accounting is not done is in the
relocation code.  This is because Yan is going to be reworking that code in the
near future, so running btrfs-vol -b could still possibly result in an
ENOSPC-related panic.  This patch also turns off the metadata_ratio stuff in order to
allow users to more efficiently use their disk space.

This patch makes it so we track how much metadata we need for an inode's
delayed allocation extents by tracking how many extents are currently
waiting for allocation.  It introduces two new callbacks for the
extent_io tree: merge_extent_hook and split_extent_hook.  These help
us keep track of when we merge delalloc extents together and split them
up.  Reservations are handled before any of the actual dirtying occurs,
and then we unreserve after we dirty.
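
A minimal sketch of what these hooks amount to, using the hook signatures and
the btrfs_inode counters added by this patch (the real implementations live in
the collapsed fs/btrfs/inode.c diff; updates are assumed here to be serialized
by the extent tree's lock, which holds when the hooks are called from
merge_state()/split_state()):

	static int sketch_merge_extent_hook(struct inode *inode,
					    struct extent_state *new,
					    struct extent_state *other)
	{
		/* two delalloc extents become one: one fewer extent to account */
		if (other->state & EXTENT_DELALLOC)
			BTRFS_I(inode)->delalloc_extents--;
		return 0;
	}

	static int sketch_split_extent_hook(struct inode *inode,
					    struct extent_state *orig, u64 split)
	{
		/* one delalloc extent becomes two: one more extent to account */
		if (orig->state & EXTENT_DELALLOC)
			BTRFS_I(inode)->delalloc_extents++;
		return 0;
	}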

btrfs_unreserve_metadata_for_delalloc() will make the appropriate
unreservations as needed based on the number of reservations we
currently have and the number of extents we currently have.  Doing the
reservation outside of the actual dirtying lets us do
things like filemap_flush() the inode to try and force delalloc to
happen, or as a last resort actually start allocation on all delalloc
inodes in the fs.  This has survived dbench, fs_mark and an fsx torture
test.
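
The gist of the unreserve side, as a simplified model rather than the actual
extent-tree.c code (which converts items to bytes and adjusts the space_info
counters under its lock):

	static int sketch_unreserve_for_delalloc(struct btrfs_root *root,
						 struct inode *inode, int num_items)
	{
		struct btrfs_inode *bi = BTRFS_I(inode);

		/* keep the reservation while every extent still needs it */
		if (bi->delalloc_reserved_extents <= bi->delalloc_extents)
			return 0;

		bi->delalloc_reserved_extents--;
		return btrfs_unreserve_metadata_space(root, num_items);
	}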
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent c65ddb52
fs/btrfs/btrfs_inode.h
@@ -127,6 +127,14 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/*
+	 * These two counters are for delalloc metadata reservations.  We keep
+	 * track of how many extents we've accounted for vs how many extents we
+	 * have.
+	 */
+	int delalloc_reserved_extents;
+	int delalloc_extents;
+
 	/*
 	 * ordered_data_close is set by truncate when a file that used
 	 * to have good data has been truncated to zero. When it is set
fs/btrfs/ctree.h
@@ -675,18 +675,19 @@ struct btrfs_space_info {
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
 	u64 bytes_super;	/* total bytes reserved for the super blocks */
-
-	/* delalloc accounting */
-	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
-				   this space is not necessarily reserved yet
-				   by the allocator */
+	u64 bytes_root;		/* the number of bytes needed to commit a
+				   transaction */
 	u64 bytes_may_use;	/* number of bytes that may be used for
-				   delalloc */
+				   delalloc/allocations */
+	u64 bytes_delalloc;	/* number of bytes currently reserved for
+				   delayed allocation */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
+	int force_delalloc;	/* make people start doing filemap_flush until
+				   we're under a threshold */
 
 	struct list_head list;
@@ -695,6 +696,9 @@ struct btrfs_space_info {
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
+
+	int allocating_chunk;
+	wait_queue_head_t wait;
 };
 
 /*
@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+					  struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+					struct inode *inode, int num_items);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
fs/btrfs/disk-io.c
@@ -1629,7 +1629,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	fs_info->metadata_ratio = 8;
+	fs_info->metadata_ratio = 0;
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
fs/btrfs/extent-tree.c
[large diff collapsed in the web view]
fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
 	return NULL;
 }
 
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+		     struct extent_state *other)
+{
+	if (tree->ops && tree->ops->merge_extent_hook)
+		tree->ops->merge_extent_hook(tree->mapping->host, new,
+					     other);
+}
+
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->end == state->start - 1 &&
 		    other->state == state->state) {
+			merge_cb(tree, state, other);
 			state->start = other->start;
 			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->start == state->end + 1 &&
 		    other->state == state->state) {
+			merge_cb(tree, state, other);
 			other->start = state->start;
 			state->tree = NULL;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
 			state = NULL;
 		}
 	}
+
 	return 0;
 }
 
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
 			 struct extent_state *state,
 			 unsigned long bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook) {
-		tree->ops->set_bit_hook(tree->mapping->host, state->start,
-					state->end, state->state, bits);
+		return tree->ops->set_bit_hook(tree->mapping->host,
+					       state->start, state->end,
+					       state->state, bits);
 	}
+
+	return 0;
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
 			   struct extent_state *state,
 			   unsigned long bits)
 {
-	if (tree->ops && tree->ops->clear_bit_hook) {
-		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
-					  state->end, state->state, bits);
-	}
+	if (tree->ops && tree->ops->clear_bit_hook)
+		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 /*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
 			int bits)
 {
 	struct rb_node *node;
+	int ret;
 
 	if (end < start) {
 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
 		       (unsigned long long)start);
 		WARN_ON(1);
 	}
-	if (bits & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
 	state->start = start;
 	state->end = end;
-	set_state_cb(tree, state, bits);
+	ret = set_state_cb(tree, state, bits);
+	if (ret)
+		return ret;
+
+	if (bits & EXTENT_DIRTY)
+		tree->dirty_bytes += end - start + 1;
 	state->state |= bits;
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
 	return 0;
 }
 
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+		    u64 split)
+{
+	if (tree->ops && tree->ops->split_extent_hook)
+		return tree->ops->split_extent_hook(tree->mapping->host,
+						    orig, split);
+	return 0;
+}
+
 /*
  * split a given extent state struct in two, inserting the preallocated
  * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		       struct extent_state *prealloc, u64 split)
 {
 	struct rb_node *node;
+
+	split_cb(tree, orig, split);
+
 	prealloc->start = orig->start;
 	prealloc->end = split - 1;
 	prealloc->state = orig->state;
@@ -542,8 +571,8 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set |= clear_state_bit(tree, state, bits,
-					wake, delete);
+			set |= clear_state_bit(tree, state, bits, wake,
+					       delete);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
@@ -561,12 +590,11 @@ hit_next:
 		prealloc = alloc_extent_state(GFP_ATOMIC);
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
-
 		if (wake)
 			wake_up(&state->wq);
-		set |= clear_state_bit(tree, prealloc, bits,
-				       wake, delete);
+
+		set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
 		prealloc = NULL;
 		goto out;
 	}
@@ -667,16 +695,23 @@ out:
 	return 0;
 }
 
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
 			   int bits)
 {
+	int ret;
+
+	ret = set_state_cb(tree, state, bits);
+	if (ret)
+		return ret;
+
 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
-	set_state_cb(tree, state, bits);
 	state->state |= bits;
+
+	return 0;
 }
 
 static void cache_state(struct extent_state *state,
@@ -758,7 +793,10 @@ hit_next:
 			goto out;
 		}
 
-		set_state_bits(tree, state, bits);
+		err = set_state_bits(tree, state, bits);
+		if (err)
+			goto out;
+
 		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
@@ -805,7 +843,9 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set_state_bits(tree, state, bits);
+			err = set_state_bits(tree, state, bits);
+			if (err)
+				goto out;
 			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
@@ -829,11 +869,13 @@ hit_next:
 			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
-		cache_state(prealloc, cached_state);
-		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
-		if (err)
+		if (err) {
+			prealloc = NULL;
 			goto out;
+		}
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
 	}
@@ -852,7 +894,11 @@ hit_next:
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
-		set_state_bits(tree, prealloc, bits);
+		err = set_state_bits(tree, prealloc, bits);
+		if (err) {
+			prealloc = NULL;
+			goto out;
+		}
 		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
fs/btrfs/extent_io.h
@@ -60,8 +60,13 @@ struct extent_io_ops {
 			      struct extent_state *state, int uptodate);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
-	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
-			      unsigned long old, unsigned long bits);
+	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+			      unsigned long bits);
+	int (*merge_extent_hook)(struct inode *inode,
+				 struct extent_state *new,
+				 struct extent_state *other);
+	int (*split_extent_hook)(struct inode *inode,
+				 struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
@@ -79,10 +84,14 @@ struct extent_state {
 	u64 start;
 	u64 end; /* inclusive */
 	struct rb_node rb_node;
+
+	/* ADD NEW ELEMENTS AFTER THIS */
 	struct extent_io_tree *tree;
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
+	u64 split_start;
+	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	if (err)
+		return err;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
 		SetPageUptodate(p);
@@ -927,6 +930,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	err = file_remove_suid(file);
 	if (err)
 		goto out_nolock;
+
+	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	if (err)
+		goto out_nolock;
+
 	file_update_time(file);
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1028,6 +1036,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		err = ret;
+	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 
 out_nolock:
 	kfree(pages);
fs/btrfs/inode.c
[large diff collapsed in the web view]
fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_metadata_free_space(root);
+	/*
+	 * 1 - inode item
+	 * 2 - refs
+	 * 1 - root item
+	 * 2 - dir items
+	 */
+	ret = btrfs_reserve_metadata_space(root, 6);
 	if (ret)
 		return ret;
@@ -340,6 +346,9 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
+
+	btrfs_unreserve_metadata_space(root, 6);
+
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_metadata_free_space(root);
+	/*
+	 * 1 - inode item
+	 * 2 - refs
+	 * 1 - root item
+	 * 2 - dir items
+	 */
+	ret = btrfs_reserve_metadata_space(root, 6);
 	if (ret)
 		goto fail_unlock;
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot) {
 		ret = -ENOMEM;
+		btrfs_unreserve_metadata_space(root, 6);
 		goto fail_unlock;
 	}
 	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
 	if (!pending_snapshot->name) {
 		ret = -ENOMEM;
 		kfree(pending_snapshot);
+		btrfs_unreserve_metadata_space(root, 6);
 		goto fail_unlock;
 	}
 	memcpy(pending_snapshot->name, name, namelen);
fs/btrfs/transaction.c
@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->alloc_exclude_start = 0;
 	h->delayed_ref_updates = 0;
 
+	if (!current->journal_info)
+		current->journal_info = h;
+
 	root->fs_info->running_transaction->use_count++;
 	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
 	mutex_unlock(&info->trans_mutex);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
 
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(&pending->root_key, &key, sizeof(key));
 fail:
 	kfree(new_root_item);
+	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	return ret;
 }