Commit e557b087 authored by Balbir Singh, committed by James Toy

Change the memory cgroup to remove the overhead associated with accounting
all pages in the root cgroup.  As a side-effect, we can no longer set a
memory hard limit in the root cgroup.

A new flag to track whether the page has been accounted or not has been
added as well.  Flags are now set atomically for page_cgroup;
pcg_default_flags is now obsolete and has been removed.
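
For illustration, a minimal sketch (not part of the patch) of what the atomic
flag setting looks like on the charge path; the Set*/Clear* helpers are
generated by the SETPCGFLAG/CLEARPCGFLAG macros in the page_cgroup header, so
SetPageCgroupCache() is assumed to boil down to set_bit(PCG_CACHE, &pc->flags):

        /* __mem_cgroup_commit_charge(), condensed: charging a file-cache page */
        pc->mem_cgroup = mem;
        smp_wmb();              /* publish pc->mem_cgroup before the flag bits */
        SetPageCgroupCache(pc); /* atomic set_bit() on PCG_CACHE */
        SetPageCgroupUsed(pc);  /* atomic set_bit() on PCG_USED */
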
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent a680d5dc
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
pages that are selected for reclaiming come from the per cgroup LRU
list.
+NOTE: Reclaim does not work for the root cgroup, since we cannot
+set any limits on the root cgroup.

2. Locking

The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
mega or gigabytes.
NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup anymore.

# cat /cgroups/0/memory.limit_in_bytes
4194304
......
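
As a usage illustration of the new note above (assuming the hierarchy is
mounted at /cgroups as in these examples, and that the write fails with
-EINVAL as in the mem_cgroup_write() hunk below), setting a limit on the root
cgroup would now be rejected with something like:

        # echo 4M > /cgroups/memory.limit_in_bytes
        -bash: echo: write error: Invalid argument
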
@@ -38,6 +38,7 @@ enum {
        PCG_LOCK, /* page cgroup is locked */
        PCG_CACHE, /* charged as cache */
        PCG_USED, /* this object is in use. */
+       PCG_ACCT_LRU, /* page has been accounted for */
};

#define TESTPCGFLAG(uname, lname) \
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
        { clear_bit(PCG_##lname, &pc->flags); }

+#define TESTCLEARPCGFLAG(uname, lname) \
+static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
+       { return test_and_clear_bit(PCG_##lname, &pc->flags); }
+
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
+CLEARPCGFLAG(Cache, CACHE)
+SETPCGFLAG(Cache, CACHE)

TESTPCGFLAG(Used, USED)
CLEARPCGFLAG(Used, USED)
+SETPCGFLAG(Used, USED)
+
+SETPCGFLAG(AcctLRU, ACCT_LRU)
+CLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(AcctLRU, ACCT_LRU)
+TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)

static inline int page_cgroup_nid(struct page_cgroup *pc)
{
......
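
As a reading aid (an illustrative expansion, not code added by the patch),
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) above generates approximately:

        static inline int TestClearPageCgroupAcctLRU(struct page_cgroup *pc)
        {
                return test_and_clear_bit(PCG_ACCT_LRU, &pc->flags);
        }

mem_cgroup_del_lru_list() below relies on this helper to test and clear the
accounting bit in a single atomic step, so the per-zone statistics are
decremented exactly once per accounted page.
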
@@ -43,6 +43,7 @@
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -200,13 +201,8 @@ enum charge_type {
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
#define PCGF_LOCK (1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-       PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-       PCGF_USED | PCGF_LOCK, /* Anon */
-       PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-       0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT (1UL << PCG_ACCT)

/* for encoding cft->private value on file */
#define _MEM (0)
@@ -354,6 +350,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
        return ret;
}

+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+       return (mem == root_mem_cgroup);
+}
+
/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +372,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
        struct page_cgroup *pc;
-       struct mem_cgroup *mem;
        struct mem_cgroup_per_zone *mz;

        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
        /* can happen while we handle swapcache. */
-       if (list_empty(&pc->lru) || !pc->mem_cgroup)
+       if (!TestClearPageCgroupAcctLRU(pc))
                return;
+       VM_BUG_ON(!pc->mem_cgroup);
        /*
         * We don't check PCG_USED bit. It's cleared when the "page" is finally
         * removed from global LRU.
         */
        mz = page_cgroup_zoneinfo(pc);
-       mem = pc->mem_cgroup;
        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+       if (mem_cgroup_is_root(pc->mem_cgroup))
+               return;
+       VM_BUG_ON(list_empty(&pc->lru));
        list_del_init(&pc->lru);
        return;
}
...@@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) ...@@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
* For making pc->mem_cgroup visible, insert smp_rmb() here. * For making pc->mem_cgroup visible, insert smp_rmb() here.
*/ */
smp_rmb(); smp_rmb();
/* unused page is not rotated. */ /* unused or root page is not rotated. */
if (!PageCgroupUsed(pc)) if (!PageCgroupUsed(pc) || PageCgroupAcctLRU(pc))
return; return;
mz = page_cgroup_zoneinfo(pc); mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]); list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +428,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
+       VM_BUG_ON(PageCgroupAcctLRU(pc));
        /*
         * Used bit is set without atomic ops but after smp_wmb().
         * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +439,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        mz = page_cgroup_zoneinfo(pc);
        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+       SetPageCgroupAcctLRU(pc);
+       if (mem_cgroup_is_root(pc->mem_cgroup))
+               return;
        list_add(&pc->lru, &mz->lists[lru]);
}
@@ -469,7 +476,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
        spin_lock_irqsave(&zone->lru_lock, flags);
        /* link when the page is linked to LRU but page_cgroup isn't */
-       if (PageLRU(page) && list_empty(&pc->lru))
+       if (PageLRU(page) && !PageCgroupAcctLRU(pc))
                mem_cgroup_add_lru_list(page, page_lru(page));
        spin_unlock_irqrestore(&zone->lru_lock, flags);
}
@@ -1125,9 +1132,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
                css_put(&mem->css);
                return;
        }

        pc->mem_cgroup = mem;
        smp_wmb();
-       pc->flags = pcg_default_flags[ctype];
+       switch (ctype) {
+       case MEM_CGROUP_CHARGE_TYPE_CACHE:
+       case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+               SetPageCgroupCache(pc);
+               SetPageCgroupUsed(pc);
+               break;
+       case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+               ClearPageCgroupCache(pc);
+               SetPageCgroupUsed(pc);
+               break;
+       default:
+               break;
+       }

        mem_cgroup_charge_statistics(mem, pc, true);
@@ -2083,6 +2103,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
        name = MEMFILE_ATTR(cft->private);
        switch (name) {
        case RES_LIMIT:
+               if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+                       ret = -EINVAL;
+                       break;
+               }
                /* This function does all necessary parse...reuse it */
                ret = res_counter_memparse_write_strategy(buffer, &val);
                if (ret)
@@ -2549,6 +2573,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        if (cont->parent == NULL) {
                enable_swap_cgroup();
                parent = NULL;
+               root_mem_cgroup = mem;
        } else {
                parent = mem_cgroup_from_cont(cont->parent);
                mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +2602,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        return &mem->css;
free_out:
        __mem_cgroup_free(mem);
+       root_mem_cgroup = NULL;
        return ERR_PTR(error);
}
......