Commit 6be2ded1 authored by Aneesh Kumar K.V's avatar Aneesh Kumar K.V Committed by Theodore Ts'o

ext4: Don't allow lg prealloc list to be grow large.

Currently, the locality group prealloc list is freed only when there
is a block allocation failure. This can result in large number of
entries in the preallocation list making ext4_mb_use_preallocated()
expensive.

To fix this, we convert the locality group prealloc list to a hash
list. The hash index is the order of number of blocks in the prealloc
space with a max order of 9. When adding prealloc space to the list we
make sure total entries for each order does not exceed 8. If it is
more than 8 we discard few entries and make sure the we have only <= 5
entries.
Signed-off-by: default avatarAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 1320cbcf
...@@ -2480,7 +2480,7 @@ err_freesgi: ...@@ -2480,7 +2480,7 @@ err_freesgi:
int ext4_mb_init(struct super_block *sb, int needs_recovery) int ext4_mb_init(struct super_block *sb, int needs_recovery)
{ {
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i; unsigned i, j;
unsigned offset; unsigned offset;
unsigned max; unsigned max;
int ret; int ret;
...@@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ...@@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
struct ext4_locality_group *lg; struct ext4_locality_group *lg;
lg = &sbi->s_locality_groups[i]; lg = &sbi->s_locality_groups[i];
mutex_init(&lg->lg_mutex); mutex_init(&lg->lg_mutex);
INIT_LIST_HEAD(&lg->lg_prealloc_list); for (j = 0; j < PREALLOC_TB_SIZE; j++)
INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock); spin_lock_init(&lg->lg_prealloc_lock);
} }
...@@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ...@@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa) struct ext4_prealloc_space *pa)
{ {
unsigned int len = ac->ac_o_ex.fe_len; unsigned int len = ac->ac_o_ex.fe_len;
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start); &ac->ac_b_ex.fe_start);
...@@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ...@@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
static noinline_for_stack int static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{ {
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg; struct ext4_locality_group *lg;
struct ext4_prealloc_space *pa; struct ext4_prealloc_space *pa;
...@@ -3325,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ...@@ -3325,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
lg = ac->ac_lg; lg = ac->ac_lg;
if (lg == NULL) if (lg == NULL)
return 0; return 0;
order = fls(ac->ac_o_ex.fe_len) - 1;
rcu_read_lock(); if (order > PREALLOC_TB_SIZE - 1)
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { /* The max size of hash table is PREALLOC_TB_SIZE */
spin_lock(&pa->pa_lock); order = PREALLOC_TB_SIZE - 1;
if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count); for (i = order; i < PREALLOC_TB_SIZE; i++) {
ext4_mb_use_group_pa(ac, pa); rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 &&
pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count);
ext4_mb_use_group_pa(ac, pa);
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
}
spin_unlock(&pa->pa_lock); spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
} }
spin_unlock(&pa->pa_lock); rcu_read_unlock();
} }
rcu_read_unlock();
return 0; return 0;
} }
...@@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ...@@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len; pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1); atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock); spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
pa->pa_deleted = 0; pa->pa_deleted = 0;
pa->pa_linear = 1; pa->pa_linear = 1;
...@@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ...@@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list); list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group); ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock); /*
list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); * We will later add the new pa to the right bucket
spin_unlock(pa->pa_obj_lock); * after updating the pa_free in ext4_mb_release_context
*/
return 0; return 0;
} }
...@@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ...@@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
} }
static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_locality_group *lg,
int order, int total_entries)
{
ext4_group_t group = 0;
struct ext4_buddy e4b;
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
mb_debug("discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
* This is the pa that we just used
* for block allocation. So don't
* free that
*/
spin_unlock(&pa->pa_lock);
continue;
}
if (pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
/* only lg prealloc space */
BUG_ON(!pa->pa_linear);
/* seems this one can be freed ... */
pa->pa_deleted = 1;
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &discard_list);
total_entries--;
if (total_entries <= 5) {
/*
* we want to keep only 5 entries
* allowing it to grow to 8. This
* mak sure we don't call discard
* soon for this list.
*/
break;
}
}
spin_unlock(&lg->lg_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, __func__, "Error in loading buddy "
"information for %lu\n", group);
continue;
}
ext4_lock_group(sb, group);
list_del(&pa->pa_group_list);
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);
ext4_mb_release_desc(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
}
/*
* We have incremented pa_count. So it cannot be freed at this
* point. Also we hold lg_mutex. So no parallel allocation is
* possible from this lg. That means pa_free cannot be updated.
*
* A parallel ext4_mb_discard_group_preallocations is possible.
* which can cause the lg_prealloc_list to be updated.
*/
static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
int order, added = 0, lg_prealloc_count = 1;
struct super_block *sb = ac->ac_sb;
struct ext4_locality_group *lg = ac->ac_lg;
struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
order = fls(pa->pa_free) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
rcu_read_lock();
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
/* Add to the tail of the previous entry */
list_add_tail_rcu(&pa->pa_inode_list,
&tmp_pa->pa_inode_list);
added = 1;
/*
* we want to count the total
* number of entries in the list
*/
}
spin_unlock(&tmp_pa->pa_lock);
lg_prealloc_count++;
}
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
rcu_read_unlock();
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
order, lg_prealloc_count);
return;
}
return ;
}
/* /*
* release all resource we used in allocation * release all resource we used in allocation
*/ */
static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{ {
if (ac->ac_pa) { struct ext4_prealloc_space *pa = ac->ac_pa;
if (ac->ac_pa->pa_linear) { if (pa) {
if (pa->pa_linear) {
/* see comment in ext4_mb_use_group_pa() */ /* see comment in ext4_mb_use_group_pa() */
spin_lock(&ac->ac_pa->pa_lock); spin_lock(&pa->pa_lock);
ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; pa->pa_pstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; pa->pa_lstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_free -= ac->ac_b_ex.fe_len;
ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&ac->ac_pa->pa_lock); spin_unlock(&pa->pa_lock);
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
* doesn't grow big.
*/
if (likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
ext4_mb_add_n_trim(ac);
}
} }
ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); ext4_mb_put_pa(ac, ac->ac_sb, pa);
} }
if (ac->ac_bitmap_page) if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page); page_cache_release(ac->ac_bitmap_page);
......
...@@ -164,11 +164,17 @@ struct ext4_free_extent { ...@@ -164,11 +164,17 @@ struct ext4_free_extent {
* Locality group: * Locality group:
* we try to group all related changes together * we try to group all related changes together
* so that writeback can flush/allocate them together as well * so that writeback can flush/allocate them together as well
* Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
* (512). We store prealloc space into the hash based on the pa_free blocks
* order value.ie, fls(pa_free)-1;
*/ */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group { struct ext4_locality_group {
/* for allocator */ /* for allocator */
struct mutex lg_mutex; /* to serialize allocates */ /* to serialize allocates */
struct list_head lg_prealloc_list;/* list of preallocations */ struct mutex lg_mutex;
/* list of preallocations */
struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock; spinlock_t lg_prealloc_lock;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment