Commit 3f157a2f authored by Chris Mason

Btrfs: Online btree defragmentation fixes

The btree defragger wasn't making forward progress because the new key wasn't
being saved by the btrfs_search_forward function.

This also disables the automatic btree defrag because it wasn't scaling well
to huge filesystems.  The auto-defrag needs to be done differently.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 1b1e2135
......@@ -63,10 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
{
int i;
int keep = p->keep_locks;
int skip = p->skip_locking;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
p->slots[i] = 0;
if (!p->nodes[i])
continue;
if (p->locks[i]) {
......@@ -74,10 +73,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
p->nodes[i] = NULL;
}
memset(p, 0, sizeof(*p));
p->keep_locks = keep;
p->skip_locking = skip;
}
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
......@@ -463,8 +460,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = cur->start;
last_block = cur->start;
*last_ret = search_start;
if (parent_level == 1)
btrfs_clear_buffer_defrag(cur);
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
}
......@@ -2969,8 +2964,138 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
return 1;
}
/*
* A helper function to walk down the tree starting at min_key, and looking
* for nodes or leaves that are either in cache or have a minimum
* transaction id. This is used by the btree defrag code, but could
* also be used to search for blocks that have changed since a given
* transaction id.
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
* This does lock as it descends, and path->keep_locks should be set
* to 1 by the caller.
*
* This honors path->lowest_level to prevent descent past a given level
* of the tree.
*
* Parameters:
*   root       - tree to search; the walk starts from its locked root node.
*   min_key    - in/out. Key to start searching from. It is overwritten
*                with the key that was found only when 0 is returned. It
*                is also handed to btrfs_find_next_key() when the current
*                node has no acceptable slot.
*   path       - filled in with the nodes, locks and slots visited. The
*                path is NOT released here on any return; presumably the
*                caller is expected to call btrfs_release_path().
*   cache_only - when non-zero, a child pointer is only accepted if
*                btrfs_find_tree_block() returns a buffer for it and
*                btrfs_buffer_uptodate() accepts that buffer.
*   min_trans  - child pointers whose generation is below this value are
*                skipped; if the root node's own generation is below it
*                the search ends immediately with 1.
*
* returns zero if something useful was found, < 0 on error and 1 if there
* was nothing in the tree that matched the search criteria.
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path, int cache_only,
u64 min_trans)
{
struct extent_buffer *cur;
struct btrfs_key found_key;
int slot;
u32 nritems;
int level;
int ret = 1;
/* restart point: begin again from the root after the path was released */
again:
cur = btrfs_lock_root_node(root);
level = btrfs_header_level(cur);
path->nodes[level] = cur;
path->locks[level] = 1;
/* the whole tree is older than min_trans, nothing can match */
if (btrfs_header_generation(cur) < min_trans) {
ret = 1;
goto out;
}
while(1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
/*
 * NOTE(review): the return value of bin_search is ignored here, so an
 * exact match and "insertion position" are treated the same way --
 * confirm that is intended.
 */
bin_search(cur, min_key, level, &slot);
/* at level = 0, we're done, setup the path and exit */
if (level == 0) {
ret = 0;
path->slots[level] = slot;
/*
 * NOTE(review): slot is not compared against nritems before the
 * item key is read -- confirm bin_search cannot hand back
 * slot == nritems for a leaf here.
 */
btrfs_item_key_to_cpu(cur, &found_key, slot);
goto out;
}
/*
* check this node pointer against the cache_only and
* min_trans parameters. If it isn't in cache or is too
* old, skip to the next one.
*/
while(slot < nritems) {
u64 blockptr;
u64 gen;
struct extent_buffer *tmp;
blockptr = btrfs_node_blockptr(cur, slot);
gen = btrfs_node_ptr_generation(cur, slot);
/* pointer is older than the requested transaction, skip it */
if (gen < min_trans) {
slot++;
continue;
}
/* generation is acceptable and no cache check was requested */
if (!cache_only)
break;
tmp = btrfs_find_tree_block(root, blockptr,
btrfs_level_size(root, level - 1));
/* accept the slot; the reference taken by the lookup is dropped */
if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
free_extent_buffer(tmp);
break;
}
if (tmp)
free_extent_buffer(tmp);
slot++;
}
/*
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
if (slot >= nritems) {
ret = btrfs_find_next_key(root, path, min_key, level,
cache_only, min_trans);
/*
 * a next key was found: drop everything held in the path and
 * search again from the root using the min_key passed to
 * btrfs_find_next_key above. Any other result is returned
 * as is.
 */
if (ret == 0) {
btrfs_release_path(root, path);
goto again;
} else {
goto out;
}
}
/* save our key for returning back */
btrfs_node_key_to_cpu(cur, &found_key, slot);
path->slots[level] = slot;
/* the caller asked us not to descend below this level */
if (level == path->lowest_level) {
ret = 0;
/* presumably releases locks held higher up in the path */
unlock_up(path, level, 1);
goto out;
}
/*
 * NOTE(review): the result of read_node_slot is locked without a NULL
 * check -- confirm it cannot fail at this point.
 */
cur = read_node_slot(root, cur, slot);
btrfs_tree_lock(cur);
path->locks[level - 1] = 1;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1);
}
out:
/* only hand the found key back to the caller on success */
if (ret == 0)
memcpy(min_key, &found_key, sizeof(found_key));
return ret;
}
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
* tree based on the current path and the cache_only and min_trans
* parameters.
*
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
*
* path->keep_locks should be set to 1 on the search made before
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level)
struct btrfs_key *key, int lowest_level,
int cache_only, u64 min_trans)
{
int level = lowest_level;
int slot;
......@@ -2982,6 +3107,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
slot = path->slots[level] + 1;
c = path->nodes[level];
next:
if (slot >= btrfs_header_nritems(c)) {
level++;
if (level == BTRFS_MAX_LEVEL) {
......@@ -2991,8 +3117,28 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
}
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else
else {
u64 blockptr = btrfs_node_blockptr(c, slot);
u64 gen = btrfs_node_ptr_generation(c, slot);
if (cache_only) {
struct extent_buffer *cur;
cur = btrfs_find_tree_block(root, blockptr,
btrfs_level_size(root, level - 1));
if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
slot++;
if (cur)
free_extent_buffer(cur);
goto next;
}
free_extent_buffer(cur);
}
if (gen < min_trans) {
slot++;
goto next;
}
btrfs_node_key_to_cpu(c, key, slot);
}
return 0;
}
return 1;
......@@ -3095,6 +3241,12 @@ done:
return 0;
}
/*
* this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
* searching until it gets past min_objectid or finds an item of 'type'
*
* returns 0 if something is found, 1 if nothing was found and < 0 on error
*/
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type)
......
......@@ -609,6 +609,7 @@ struct btrfs_root {
u64 last_inode_alloc;
int ref_cows;
int track_dirty;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
int defrag_running;
......@@ -1412,7 +1413,11 @@ int btrfs_previous_item(struct btrfs_root *root,
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level);
struct btrfs_key *key, int lowest_level,
int cache_only, u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path, int cache_only,
u64 min_trans);
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
......
......@@ -295,7 +295,6 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
......@@ -355,7 +354,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
ret = -EIO;
......@@ -736,6 +734,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
memset(&root->root_kobj, 0, sizeof(root->root_kobj));
root->defrag_trans_start = fs_info->generation;
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
root->defrag_level = 0;
......@@ -1168,7 +1167,6 @@ static int transaction_kthread(void *arg)
goto sleep;
}
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_defrag_dirty_roots(root->fs_info);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
sleep:
......@@ -1434,12 +1432,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root,
"btrfs-transaction");
if (!fs_info->transaction_kthread)
goto fail_trans_kthread;
goto fail_cleaner;
return tree_root;
fail_trans_kthread:
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
fail_extent_root:
free_extent_buffer(extent_root->node);
......@@ -1662,7 +1660,6 @@ int close_ctree(struct btrfs_root *root)
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
btrfs_defrag_dirty_roots(root->fs_info);
btrfs_clean_old_snapshots(root);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
......@@ -1794,58 +1791,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
return;
}
/*
 * Set the EXTENT_DEFRAG bit over the byte range covered by @buf
 * (buf->start through buf->start + buf->len - 1) in the io_tree of the
 * filesystem's btree inode.
 */
void btrfs_set_buffer_defrag(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	set_extent_bits(&BTRFS_I(btree_inode)->io_tree,
			buf->start, buf->start + buf->len - 1,
			EXTENT_DEFRAG, GFP_NOFS);
}
/*
 * Set the EXTENT_DEFRAG_DONE bit over the byte range covered by @buf
 * (buf->start through buf->start + buf->len - 1) in the io_tree of the
 * filesystem's btree inode.
 */
void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	set_extent_bits(&BTRFS_I(btree_inode)->io_tree,
			buf->start, buf->start + buf->len - 1,
			EXTENT_DEFRAG_DONE, GFP_NOFS);
}
/*
 * Test the EXTENT_DEFRAG bit over the byte range covered by @buf in the
 * io_tree of the filesystem's btree inode.
 *
 * Returns the result of test_range_bit() for that range.
 */
int btrfs_buffer_defrag(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
			      buf->start, buf->start + buf->len - 1,
			      EXTENT_DEFRAG, 0);
}
/*
 * Test the EXTENT_DEFRAG_DONE bit over the byte range covered by @buf in
 * the io_tree of the filesystem's btree inode.
 *
 * Returns the result of test_range_bit() for that range.
 */
int btrfs_buffer_defrag_done(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
			      buf->start, buf->start + buf->len - 1,
			      EXTENT_DEFRAG_DONE, 0);
}
/*
 * Clear the EXTENT_DEFRAG_DONE bit over the byte range covered by @buf in
 * the io_tree of the filesystem's btree inode.
 *
 * Returns the result of clear_extent_bits().
 */
int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
				 buf->start, buf->start + buf->len - 1,
				 EXTENT_DEFRAG_DONE, GFP_NOFS);
}
/*
 * Clear the EXTENT_DEFRAG bit over the byte range covered by @buf in the
 * io_tree of the filesystem's btree inode.
 *
 * Returns the result of clear_extent_bits().
 */
int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
{
	struct inode *btree_inode;

	/* reach the btree inode through the root owning the buffer's first page */
	btree_inode = BTRFS_I(buf->first_page->mapping->host)
				->root->fs_info->btree_inode;

	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
				 buf->start, buf->start + buf->len - 1,
				 EXTENT_DEFRAG, GFP_NOFS);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
......
......@@ -61,12 +61,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int wait_on_tree_block_writeback(struct btrfs_root *root,
struct extent_buffer *buf);
void btrfs_set_buffer_defrag(struct extent_buffer *buf);
void btrfs_set_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_buffer_defrag(struct extent_buffer *buf);
int btrfs_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
......
......@@ -2095,8 +2095,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
if (!btrfs_test_opt(root, SSD))
btrfs_set_buffer_defrag(buf);
trans->blocks_used++;
return buf;
}
......
......@@ -365,7 +365,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
btrfs_clean_old_snapshots(root);
btrfs_defrag_dirty_roots(root->fs_info);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
sb->s_dirt = 0;
......
......@@ -30,7 +30,6 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
#define BTRFS_ROOT_TRANS_TAG 0
#define BTRFS_ROOT_DEFRAG_TAG 1
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
......@@ -92,9 +91,6 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_DEFRAG_TAG);
root->commit_root = btrfs_root_node(root);
} else {
WARN_ON(1);
......@@ -403,44 +399,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
cond_resched();
trans = btrfs_start_transaction(root, 1);
if (ret != -EAGAIN)
if (root->fs_info->closing || ret != -EAGAIN)
break;
}
root->defrag_running = 0;
smp_mb();
radix_tree_tag_clear(&info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_DEFRAG_TAG);
btrfs_end_transaction(trans, root);
return 0;
}
/*
 * Call btrfs_defrag_root() with cacheonly == 1 on every root in
 * info->fs_roots_radix that carries BTRFS_ROOT_DEFRAG_TAG, walking the
 * radix tree in batches by objectid, and finally on info->extent_root.
 *
 * The return values of btrfs_defrag_root() are not examined; this
 * function always returns 0.
 */
int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
{
	struct btrfs_root *batch[1];
	struct btrfs_root *cur_root;
	u64 next_objectid = 0;
	int found;
	int idx;

	for (;;) {
		found = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
						   (void **)batch,
						   next_objectid,
						   ARRAY_SIZE(batch),
						   BTRFS_ROOT_DEFRAG_TAG);
		if (found == 0)
			break;

		for (idx = 0; idx < found; idx++) {
			cur_root = batch[idx];
			/* resume the next lookup just past this root */
			next_objectid = cur_root->root_key.objectid + 1;
			btrfs_defrag_root(cur_root, 1);
		}
	}

	btrfs_defrag_root(info->extent_root, 1);
	return 0;
}
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
struct list_head *list)
{
......
......@@ -84,7 +84,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
struct list_head *dead_list);
int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
......
......@@ -32,10 +32,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int wret;
int level;
int orig_level;
int i;
int is_extent = 0;
int next_key_ret = 0;
u64 last_ret = 0;
u64 min_trans = 0;
if (cache_only)
goto out;
if (root->fs_info->extent_root == root) {
/*
......@@ -43,10 +46,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
* we can't defrag the extent root without deadlock
*/
goto out;
#if 0
mutex_lock(&root->fs_info->alloc_mutex);
is_extent = 1;
#endif
}
if (root->ref_cows == 0 && !is_extent)
......@@ -84,6 +83,17 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->lowest_level = 1;
path->keep_locks = 1;
if (cache_only)
min_trans = root->defrag_trans_start;
ret = btrfs_search_forward(root, &key, path, cache_only, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
goto out;
}
btrfs_release_path(root, path);
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
......@@ -95,7 +105,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
next_key_ret = btrfs_find_next_key(root, path, &key, 1);
next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
min_trans);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
cache_only, &last_ret,
......@@ -106,19 +117,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
}
for (i = 1; i < BTRFS_MAX_LEVEL; i++) {
if (path->locks[i]) {
btrfs_tree_unlock(path->nodes[i]);
path->locks[i] = 0;
}
if (path->nodes[i]) {
free_extent_buffer(path->nodes[i]);
path->nodes[i] = NULL;
}
}
btrfs_release_path(root, path);
if (is_extent)
btrfs_extent_post_op(trans, root);
out:
if (is_extent)
mutex_unlock(&root->fs_info->alloc_mutex);
......@@ -138,6 +139,7 @@ done:
if (ret != -EAGAIN) {
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
root->defrag_trans_start = trans->transid;
}
return ret;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment