Commit e37c9e69 authored by Chris Mason's avatar Chris Mason Committed by David Woodhouse

Btrfs: many allocator fixes, pretty solid

Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 3e1ad54f
......@@ -7,6 +7,7 @@
* Get rid of struct ctree_path, limiting tree levels held at one time
* Add generation number to key pointer in nodes
* Add generation number to inode
* Add ability to switch a block group from data to metadata or vice versa
* Release
* Do real tree locking
* Add extent mirroring (backup copies of blocks)
......
......@@ -77,7 +77,7 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
}
int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
int nr)
unsigned long start, int nr)
{
unsigned long *bits;
unsigned long *gang[4];
......@@ -85,10 +85,13 @@ int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
int ret;
int i;
int total_found = 0;
unsigned long slot;
ret = radix_tree_gang_lookup(radix, (void **)gang, 0, ARRAY_SIZE(gang));
slot = start / BIT_RADIX_BITS_PER_ARRAY;
ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
ARRAY_SIZE(gang));
found = start % BIT_RADIX_BITS_PER_ARRAY;
for (i = 0; i < ret && nr > 0; i++) {
found = 0;
bits = gang[i];
while(nr > 0) {
found = find_next_bit(bits + 1,
......@@ -104,6 +107,7 @@ int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
} else
break;
}
found = 0;
}
return total_found;
}
......@@ -6,7 +6,7 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
int nr);
unsigned long start, int nr);
static inline void init_bit_radix(struct radix_tree_root *radix)
{
......
......@@ -259,7 +259,9 @@ struct btrfs_block_group_cache {
u64 first_free;
u64 last_alloc;
u64 pinned;
u64 last_prealloc;
int data;
int cached;
};
struct crypto_hash;
......@@ -273,6 +275,7 @@ struct btrfs_fs_info {
struct radix_tree_root dev_radix;
struct radix_tree_root block_group_radix;
struct radix_tree_root block_group_data_radix;
struct radix_tree_root extent_map_radix;
u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
int extent_tree_insert_nr;
......
......@@ -551,6 +551,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
init_bit_radix(&fs_info->pinned_radix);
init_bit_radix(&fs_info->pending_del_radix);
init_bit_radix(&fs_info->extent_map_radix);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
......
......@@ -12,6 +12,97 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
static int del_pending_extents(struct btrfs_trans_handle *trans, struct
btrfs_root *extent_root);
static int cache_block_group(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group)
{
struct btrfs_path *path;
int ret;
struct btrfs_key key;
struct btrfs_leaf *leaf;
struct radix_tree_root *extent_radix;
int slot;
u64 i;
u64 last = 0;
u64 hole_size;
int found = 0;
root = root->fs_info->extent_root;
extent_radix = &root->fs_info->extent_map_radix;
if (block_group->cached)
return 0;
if (block_group->data)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
printk("cache block group %Lu\n", block_group->key.objectid);
key.objectid = block_group->key.objectid;
key.flags = 0;
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (ret && path->slots[0] > 0)
path->slots[0]--;
while(1) {
leaf = btrfs_buffer_leaf(path->nodes[0]);
slot = path->slots[0];
if (slot >= btrfs_header_nritems(&leaf->header)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
else {
if (found) {
hole_size = block_group->key.objectid +
block_group->key.offset - last;
} else {
last = block_group->key.objectid;
hole_size = block_group->key.offset;
}
for (i = 0; i < hole_size; i++) {
set_radix_bit(extent_radix,
last + i);
}
break;
}
}
btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
if (key.objectid >= block_group->key.objectid +
block_group->key.offset) {
if (found) {
hole_size = block_group->key.objectid +
block_group->key.offset - last;
} else {
last = block_group->key.objectid;
hole_size = block_group->key.offset;
}
for (i = 0; i < hole_size; i++) {
set_radix_bit(extent_radix, last + i);
}
break;
}
if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
if (!found) {
last = key.objectid + key.offset;
found = 1;
} else {
hole_size = key.objectid - last;
for (i = 0; i < hole_size; i++) {
set_radix_bit(extent_radix, last + i);
}
last = key.objectid + key.offset;
}
}
path->slots[0]++;
}
block_group->cached = 1;
btrfs_free_path(path);
return 0;
}
static struct btrfs_block_group_cache *lookup_block_group(struct
btrfs_fs_info *info,
u64 blocknr)
......@@ -44,6 +135,63 @@ static struct btrfs_block_group_cache *lookup_block_group(struct
return NULL;
}
static u64 leaf_range(struct btrfs_root *root)
{
u64 size = BTRFS_LEAF_DATA_SIZE(root);
size = size / (sizeof(struct btrfs_extent_item) +
sizeof(struct btrfs_item));
return size;
}
static u64 find_search_start(struct btrfs_root *root,
struct btrfs_block_group_cache **cache_ret,
u64 search_start, int num)
{
unsigned long gang[8];
int ret;
struct btrfs_block_group_cache *cache = *cache_ret;
u64 last = max(search_start, cache->key.objectid);
if (cache->data)
goto out;
if (num > 1) {
last = max(last, cache->last_prealloc);
}
again:
cache_block_group(root, cache);
while(1) {
ret = find_first_radix_bit(&root->fs_info->extent_map_radix,
gang, last, ARRAY_SIZE(gang));
if (!ret)
goto out;
last = gang[ret-1] + 1;
if (num > 1) {
if (ret != ARRAY_SIZE(gang)) {
goto new_group;
}
if (gang[ret-1] - gang[0] > leaf_range(root)) {
continue;
}
}
if (gang[0] >= cache->key.objectid + cache->key.offset) {
goto new_group;
}
return gang[0];
}
out:
return max(cache->last_alloc, search_start);
new_group:
cache = lookup_block_group(root->fs_info, last + cache->key.offset - 1);
if (!cache) {
return max((*cache_ret)->last_alloc, search_start);
}
cache = btrfs_find_block_group(root, cache,
last + cache->key.offset - 1, 0);
*cache_ret = cache;
goto again;
}
struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
struct btrfs_block_group_cache
*hint, u64 search_start,
......@@ -89,13 +237,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
}
last = hint->key.offset * 2;
if (hint->key.objectid >= last)
last = max(search_start, hint->key.objectid - last);
last = max(search_start + hint->key.offset - 1,
hint->key.objectid - last);
else
last = hint->key.objectid + hint->key.offset;
hint_last = last;
} else {
hint_last = search_start;
last = search_start;
if (hint)
hint_last = max(hint->key.objectid, search_start);
else
hint_last = search_start;
last = hint_last;
}
while(1) {
ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
......@@ -357,13 +510,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 blocknr, u64 num, int alloc)
u64 blocknr, u64 num, int alloc, int mark_free)
{
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *info = root->fs_info;
u64 total = num;
u64 old_val;
u64 block_in_group;
u64 i;
while(total) {
cache = lookup_block_group(info, blocknr);
......@@ -380,18 +534,38 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val = btrfs_block_group_used(&cache->item);
num = min(total, cache->key.offset - block_in_group);
total -= num;
blocknr += num;
if (alloc) {
old_val += num;
if (blocknr > cache->last_alloc)
cache->last_alloc = blocknr;
if (!cache->data) {
for (i = 0; i < num; i++) {
clear_radix_bit(&info->extent_map_radix,
blocknr + i);
}
}
} else {
old_val -= num;
if (blocknr < cache->first_free)
cache->first_free = blocknr;
if (!cache->data && mark_free) {
for (i = 0; i < num; i++) {
set_radix_bit(&info->extent_map_radix,
blocknr + i);
}
}
if (old_val < (cache->key.offset * 8) / 10 &&
old_val + num >= (cache->key.offset * 8) / 10) {
printk("group %Lu now available\n", cache->key.objectid);
radix_tree_tag_set(cache->radix,
cache->key.objectid +
cache->key.offset - 1,
BTRFS_BLOCK_GROUP_AVAIL);
}
}
btrfs_set_block_group_used(&cache->item, old_val);
total -= num;
blocknr += num;
}
return 0;
}
......@@ -413,9 +587,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
int ret;
int i;
struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
while(1) {
ret = find_first_radix_bit(pinned_radix, gang,
ret = find_first_radix_bit(pinned_radix, gang, 0,
ARRAY_SIZE(gang));
if (!ret)
break;
......@@ -430,6 +605,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
block_group->pinned--;
if (gang[i] < block_group->last_alloc)
block_group->last_alloc = gang[i];
if (gang[i] < block_group->last_prealloc)
block_group->last_prealloc = gang[i];
if (!block_group->data)
set_radix_bit(extent_radix, gang[i]);
}
try_remove_page(btree_inode->i_mapping,
gang[i] << (PAGE_CACHE_SHIFT -
......@@ -508,7 +687,8 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
* remove an extent from the root, returns 0 on success
*/
static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
*root, u64 blocknr, u64 num_blocks, int pin)
*root, u64 blocknr, u64 num_blocks, int pin,
int mark_free)
{
struct btrfs_path *path;
struct btrfs_key key;
......@@ -556,10 +736,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_del_item(trans, extent_root, path);
if (ret)
BUG();
ret = update_block_group(trans, root, blocknr, num_blocks, 0);
ret = update_block_group(trans, root, blocknr, num_blocks, 0,
mark_free);
BUG_ON(ret);
}
btrfs_release_path(extent_root, path);
btrfs_free_path(path);
finish_current_insert(trans, extent_root);
return ret;
......@@ -585,7 +765,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
pinned_radix = &extent_root->fs_info->pinned_radix;
while(1) {
ret = find_first_radix_bit(pending_radix, gang,
ret = find_first_radix_bit(pending_radix, gang, 0,
ARRAY_SIZE(gang));
if (!ret)
break;
......@@ -605,7 +785,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
wret = clear_radix_bit(pending_radix, gang[i]);
BUG_ON(wret);
wret = __free_extent(trans, extent_root,
gang[i], 1, 0);
gang[i], 1, 0, 0);
if (wret)
err = wret;
}
......@@ -627,7 +807,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
pin_down_block(root, blocknr, 1);
return 0;
}
ret = __free_extent(trans, root, blocknr, num_blocks, pin);
ret = __free_extent(trans, root, blocknr, num_blocks, pin, pin == 0);
pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
return ret ? ret : pending_ret;
}
......@@ -688,18 +868,45 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
check_failed:
if (!full_scan && block_group->data != data)
WARN_ON(1);
if (block_group->last_alloc > search_start)
search_start = block_group->last_alloc;
if (!data)
search_start = find_search_start(root, &block_group,
search_start, total_needed);
else
search_start = max(block_group->last_alloc, search_start);
btrfs_init_path(path);
ins->objectid = search_start;
ins->offset = 0;
start_found = 0;
ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
if (ret < 0)
goto error;
if (path->slots[0] > 0)
if (path->slots[0] > 0) {
path->slots[0]--;
}
l = btrfs_buffer_leaf(path->nodes[0]);
btrfs_disk_key_to_cpu(&key, &l->items[path->slots[0]].key);
/*
* a rare case, go back one key if we hit a block group item
* instead of an extent item
*/
if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY &&
key.objectid + key.offset >= search_start) {
ins->objectid = key.objectid;
ins->offset = key.offset - 1;
btrfs_release_path(root, path);
ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
if (ret < 0)
goto error;
if (path->slots[0] > 0) {
path->slots[0]--;
}
}
while (1) {
l = btrfs_buffer_leaf(path->nodes[0]);
......@@ -725,21 +932,23 @@ check_failed:
ins->offset = search_end - ins->objectid;
goto check_pending;
}
btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
goto next;
if (key.objectid >= search_start) {
if (start_found) {
if (last_block < search_start)
last_block = search_start;
hole_size = key.objectid - last_block;
if (hole_size >= num_blocks) {
ins->objectid = last_block;
ins->offset = hole_size;
goto check_pending;
}
if (key.objectid >= search_start && key.objectid > last_block &&
start_found) {
if (last_block < search_start)
last_block = search_start;
hole_size = key.objectid - last_block;
if (hole_size >= num_blocks) {
ins->objectid = last_block;
ins->offset = hole_size;
goto check_pending;
}
}
if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
goto next;
start_found = 1;
last_block = key.objectid + key.offset;
if (last_block >= block_group->key.objectid +
......@@ -759,6 +968,7 @@ check_pending:
*/
btrfs_release_path(root, path);
BUG_ON(ins->objectid < search_start);
if (ins->objectid + num_blocks >= search_end) {
if (full_scan)
return -ENOSPC;
......@@ -780,7 +990,7 @@ check_pending:
info->extent_tree_insert[0] &&
ins->objectid <= last) {
search_start = last + 1;
WARN_ON(1);
WARN_ON(!full_scan);
goto new_group;
}
}
......@@ -790,13 +1000,18 @@ check_pending:
if (ins->objectid + num_blocks > first &&
ins->objectid <= info->extent_tree_prealloc[0]) {
search_start = info->extent_tree_prealloc[0] + 1;
WARN_ON(1);
WARN_ON(!full_scan);
goto new_group;
}
}
if (fill_prealloc) {
int nr;
test_block = ins->objectid;
if (test_block - info->extent_tree_prealloc[total_needed - 1] >=
leaf_range(root)) {
total_found = 0;
info->extent_tree_prealloc_nr = total_found;
}
while(test_block < ins->objectid + ins->offset &&
total_found < total_needed) {
nr = total_needed - total_found - 1;
......@@ -811,11 +1026,15 @@ check_pending:
}
info->extent_tree_prealloc_nr = total_found;
}
block_group = lookup_block_group(info, ins->objectid);
if (block_group) {
block_group->last_alloc = ins->objectid;
if (!data)
trans->block_group = block_group;
if (!data) {
block_group = lookup_block_group(info, ins->objectid);
if (block_group) {
if (fill_prealloc)
block_group->last_prealloc =
info->extent_tree_prealloc[total_needed-1];
else
trans->block_group = block_group;
}
}
ins->offset = num_blocks;
btrfs_free_path(path);
......@@ -824,6 +1043,7 @@ check_pending:
new_group:
if (search_start + num_blocks >= search_end) {
search_start = orig_search_start;
printk("doing full scan!\n");
full_scan = 1;
}
block_group = lookup_block_group(info, search_start);
......@@ -871,26 +1091,57 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
info->extent_tree_insert[info->extent_tree_insert_nr++] =
ins->objectid;
ret = update_block_group(trans, root,
ins->objectid, ins->offset, 1);
ins->objectid, ins->offset, 1, 0);
BUG_ON(ret);
return 0;
}
/*
* if we're doing a data allocation, preallocate room in the
* extent tree first. This way the extent tree blocks end up
* in the correct block group.
*/
if (data) {
ret = find_free_extent(trans, root, 0, search_start,
search_end, &prealloc_key, 0);
if (ret) {
return ret;
}
if (prealloc_key.objectid + prealloc_key.offset >= search_end) {
int nr = info->extent_tree_prealloc_nr;
search_end = info->extent_tree_prealloc[nr - 1] - 1;
} else {
search_start = info->extent_tree_prealloc[0] + 1;
}
}
/* do the real allocation */
ret = find_free_extent(trans, root, num_blocks, search_start,
search_end, ins, data);
if (ret)
if (ret) {
return ret;
}
/* then do prealloc for the extent tree */
if (ins->objectid + ins->offset >= search_end)
search_end = ins->objectid - 1;
else
search_start = ins->objectid + ins->offset;
/*
* if we're doing a metadata allocation, preallocate space in the
* extent tree second. This way, we don't create a tiny hole
* in the allocation map between any unused preallocation blocks
* and the metadata block we're actually allocating. On disk,
* it'll go:
* [block we've allocated], [used prealloc 1], [ unused prealloc ]
* The unused prealloc will get reused the next time around.
*/
if (!data) {
if (ins->objectid + ins->offset >= search_end)
search_end = ins->objectid - 1;
else
search_start = ins->objectid + ins->offset;
ret = find_free_extent(trans, root, 0, search_start,
search_end, &prealloc_key, 0);
if (ret)
return ret;
ret = find_free_extent(trans, root, 0, search_start,
search_end, &prealloc_key, 0);
if (ret) {
return ret;
}
}
super_blocks_used = btrfs_super_blocks_used(info->disk_super);
btrfs_set_super_blocks_used(info->disk_super, super_blocks_used +
......@@ -900,11 +1151,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
finish_current_insert(trans, extent_root);
pending_ret = del_pending_extents(trans, extent_root);
if (ret)
if (ret) {
return ret;
if (pending_ret)
}
if (pending_ret) {
return pending_ret;
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
}
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
return 0;
}
......@@ -920,7 +1173,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct buffer_head *buf;
ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
1, hint, (unsigned long)-1, &ins, 0);
1, 0, (unsigned long)-1, &ins, 0);
if (ret) {
BUG();
return NULL;
......@@ -1134,6 +1387,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
int ret;
int ret2;
unsigned long gang[16];
int i;
ret = free_block_group_radix(&info->block_group_radix);
ret2 = free_block_group_radix(&info->block_group_data_radix);
......@@ -1141,6 +1396,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return ret;
if (ret2)
return ret2;
while(1) {
ret = find_first_radix_bit(&info->extent_map_radix,
gang, 0, ARRAY_SIZE(gang));
if (!ret)
break;
for (i = 0; i < ret; i++) {
clear_radix_bit(&info->extent_map_radix, gang[i]);
}
}
return 0;
}
......@@ -1186,7 +1451,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
break;
}
if (nr & 1)
if (nr % 3)
radix = &info->block_group_data_radix;
else
radix = &info->block_group_radix;
......@@ -1197,8 +1462,14 @@ int btrfs_read_block_groups(struct btrfs_root *root)
memcpy(&cache->key, &found_key, sizeof(found_key));
cache->last_alloc = cache->key.objectid;
cache->first_free = cache->key.objectid;
cache->last_prealloc = cache->key.objectid;
cache->pinned = 0;
cache->data = (nr & 1);
cache->cached = 0;
if (nr % 3)
cache->data = 1;
else
cache->data = 0;
cache->radix = radix;
key.objectid = found_key.objectid + found_key.offset;
......
......@@ -1539,7 +1539,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
kunmap(pages[i]);
}
SetPageChecked(pages[i]);
btrfs_update_inode_block_group(trans, inode);
// btrfs_update_inode_block_group(trans, inode);
ret = btrfs_end_transaction(trans, root);
BUG_ON(ret);
mutex_unlock(&root->fs_info->fs_mutex);
......@@ -1914,7 +1914,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
BUG_ON(ret);
alloc_extent_start = ins.objectid;
btrfs_update_inode_block_group(trans, inode);
// btrfs_update_inode_block_group(trans, inode);
ret = btrfs_end_transaction(trans, root);
mutex_unlock(&root->fs_info->fs_mutex);
......
......@@ -122,7 +122,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
}
dirty_pages = &trans->transaction->dirty_pages;
while(1) {
ret = find_first_radix_bit(dirty_pages, gang, ARRAY_SIZE(gang));
ret = find_first_radix_bit(dirty_pages, gang,
0, ARRAY_SIZE(gang));
if (!ret)
break;
for (i = 0; i < ret; i++) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment