Commit 6d61ef40 authored by Balbir Singh, committed by Linus Torvalds

memcg: memory cgroup hierarchical reclaim

This patch introduces hierarchical reclaim.  When an ancestor goes over
its limit, the charging routine points to the parent that is above its
limit.  The reclaim process then starts from the last scanned child of the
ancestor and reclaims until the ancestor goes below its limit.

[akpm@linux-foundation.org: coding-style fixes]
[d-nishimura@mtf.biglobe.ne.jp: mem_cgroup_from_res_counter should handle both mem->res and mem->memsw]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 28dbc4b6
...@@ -143,6 +143,13 @@ struct mem_cgroup { ...@@ -143,6 +143,13 @@ struct mem_cgroup {
struct mem_cgroup_lru_info info; struct mem_cgroup_lru_info info;
int prev_priority; /* for recording reclaim priority */ int prev_priority; /* for recording reclaim priority */
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from. Protected by cgroup_lock()
*/
struct mem_cgroup *last_scanned_child;
int obsolete; int obsolete;
atomic_t refcnt; atomic_t refcnt;
/* /*
...@@ -461,6 +468,149 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, ...@@ -461,6 +468,149 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
return nr_taken; return nr_taken;
} }
/*
 * Map a pointer to a res_counter embedded in a struct mem_cgroup back to
 * the enclosing mem_cgroup. @field names which embedded counter @cnt
 * points at, so the correct offset is subtracted.
 */
#define mem_cgroup_from_res_counter(cnt, field)		\
	container_of(cnt, struct mem_cgroup, field)
/*
* This routine finds the DFS walk successor. This routine should be
* called with cgroup_mutex held
*/
static struct mem_cgroup *
mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
curr_cgroup = curr->css.cgroup;
root_cgroup = root_mem->css.cgroup;
if (!list_empty(&curr_cgroup->children)) {
/*
* Walk down to children
*/
mem_cgroup_put(curr);
cgroup = list_entry(curr_cgroup->children.next,
struct cgroup, sibling);
curr = mem_cgroup_from_cont(cgroup);
mem_cgroup_get(curr);
goto done;
}
visit_parent:
if (curr_cgroup == root_cgroup) {
mem_cgroup_put(curr);
curr = root_mem;
mem_cgroup_get(curr);
goto done;
}
/*
* Goto next sibling
*/
if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
mem_cgroup_put(curr);
cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
sibling);
curr = mem_cgroup_from_cont(cgroup);
mem_cgroup_get(curr);
goto done;
}
/*
* Go up to next parent and next parent's sibling if need be
*/
curr_cgroup = curr_cgroup->parent;
goto visit_parent;
done:
root_mem->last_scanned_child = curr;
return curr;
}
/*
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
{
struct cgroup *cgroup;
struct mem_cgroup *ret;
bool obsolete = (root_mem->last_scanned_child &&
root_mem->last_scanned_child->obsolete);
/*
* Scan all children under the mem_cgroup mem
*/
cgroup_lock();
if (list_empty(&root_mem->css.cgroup->children)) {
ret = root_mem;
goto done;
}
if (!root_mem->last_scanned_child || obsolete) {
if (obsolete)
mem_cgroup_put(root_mem->last_scanned_child);
cgroup = list_first_entry(&root_mem->css.cgroup->children,
struct cgroup, sibling);
ret = mem_cgroup_from_cont(cgroup);
mem_cgroup_get(ret);
} else
ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
root_mem);
done:
root_mem->last_scanned_child = ret;
cgroup_unlock();
return ret;
}
/*
 * Dance down the hierarchy if needed to reclaim memory. We remember the
 * last child we reclaimed from, so that we don't end up penalizing
 * one child extensively based on its position in the children list.
 *
 * @root_mem: the original ancestor that we are reclaiming from
 * @gfp_mask: allocation flags passed through to try_to_free_mem_cgroup_pages()
 * @noswap:   passed through to try_to_free_mem_cgroup_pages()
 *
 * Returns 0 as soon as @root_mem->res is back under its limit. Otherwise
 * returns the result of the last try_to_free_mem_cgroup_pages() call made
 * before the walk wrapped around to @root_mem.
 *
 * Must be called without cgroup_lock() held: it is taken here around
 * mem_cgroup_get_next_node(), and mem_cgroup_get_first_node() takes it
 * itself.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						gfp_t gfp_mask, bool noswap)
{
	struct mem_cgroup *next_mem;
	int ret = 0;

	/*
	 * Reclaim unconditionally and don't check for return value.
	 * We need to reclaim in the current group and down the tree.
	 * One might think about checking for children before reclaiming,
	 * but there might be left over accounting, even after children
	 * have left.
	 */
	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
	if (res_counter_check_under_limit(&root_mem->res))
		return 0;

	next_mem = mem_cgroup_get_first_node(root_mem);

	while (next_mem != root_mem) {
		if (next_mem->obsolete) {
			/*
			 * Restart the walk. Two things are deliberately
			 * NOT done here:
			 *  - no cgroup_lock(): mem_cgroup_get_first_node()
			 *    acquires it itself, so taking it here as well
			 *    would self-deadlock;
			 *  - no mem_cgroup_put(next_mem): next_mem is the
			 *    node cached in root_mem->last_scanned_child,
			 *    and mem_cgroup_get_first_node() already drops
			 *    the reference on an obsolete cached node, so
			 *    a put here would drop it twice.
			 */
			next_mem = mem_cgroup_get_first_node(root_mem);
			continue;
		}
		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
		if (res_counter_check_under_limit(&root_mem->res))
			return 0;
		/* mem_cgroup_get_next_node() requires cgroup_mutex. */
		cgroup_lock();
		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
		cgroup_unlock();
	}
	return ret;
}
/* /*
* Unlike exported interface, "oom" parameter is added. if oom==true, * Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked. * oom-killer can be invoked.
...@@ -469,7 +619,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ...@@ -469,7 +619,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcg, gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom) bool oom)
{ {
struct mem_cgroup *mem; struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct res_counter *fail_res; struct res_counter *fail_res;
/* /*
...@@ -511,12 +661,18 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ...@@ -511,12 +661,18 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
/* mem+swap counter fails */ /* mem+swap counter fails */
res_counter_uncharge(&mem->res, PAGE_SIZE); res_counter_uncharge(&mem->res, PAGE_SIZE);
noswap = true; noswap = true;
} mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
} else
/* mem counter fails */
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
res);
if (!(gfp_mask & __GFP_WAIT)) if (!(gfp_mask & __GFP_WAIT))
goto nomem; goto nomem;
if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap)) ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
continue; noswap);
/* /*
* try_to_free_mem_cgroup_pages() might not give us a full * try_to_free_mem_cgroup_pages() might not give us a full
...@@ -1732,6 +1888,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) ...@@ -1732,6 +1888,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL); res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL);
mem->last_scanned_child = NULL;
return &mem->css; return &mem->css;
free_out: free_out:
for_each_node_state(node, N_POSSIBLE) for_each_node_state(node, N_POSSIBLE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment