Implement reclaim from groups over their soft limit

Permit reclaim from memory cgroups on contention (via the direct reclaim path). memory cgroup soft limit reclaim finds the group that exceeds its soft limit by the largest number of pages and reclaims pages from it and then reinserts the cgroup into its correct place in the rbtree. Add additional checks to mem_cgroup_hierarchical_reclaim() to detect long loops in case all swap is turned off. The code has been refactored and the loop check (loop < 2) has been enhanced for soft limits. For soft limits, we try to do more targetted reclaim. Instead of bailing out after two loops, the routine now reclaims memory proportional to the size by which the soft limit is exceeded. The proportion has been empirically determined. Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Implement reclaim from groups over their soft limit
Permit reclaim from memory cgroups on contention (via the direct reclaim path). memory cgroup soft limit reclaim finds the group that exceeds its soft limit by the largest number of pages and reclaims pages from it and then reinserts the cgroup into its correct place in the rbtree. Add additional checks to mem_cgroup_hierarchical_reclaim() to detect long loops in case all swap is turned off. The code has been refactored and the loop check (loop < 2) has been enhanced for soft limits. For soft limits, we try to do more targetted reclaim. Instead of bailing out after two loops, the routine now reclaims memory proportional to the size by which the soft limit is exceeded. The proportion has been empirically determined. Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
30147beb · Balbir Singh · James Toy · c4ac8782 · 30147beb · 30147beb
Commit 30147beb authored Aug 25, 2009 by Balbir Singh Committed by James Toy Aug 25, 2009
Showing with 259 additions and 15 deletions

include/linux/memcontrol.h include/linux/memcontrol.h +11 -0

include/linux/swap.h include/linux/swap.h +5 -0

mm/memcontrol.c mm/memcontrol.c +199 -14

mm/vmscan.c mm/vmscan.c +44 -1

No files found.
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -118,6 +118,9 @@ static inline bool mem_cgroup_disabled(void)

 extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;

@@ -276,6 +279,14 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
 {
 }

+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid, int priority)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */

 #endif /* _LINUX_MEMCONTROL_H */

--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -239,6 +239,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);
+extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone,
+						int nid);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1840,11 +1840,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,

 #ifdef CONFIG_CGROUP_MEM_RES_CTLR

+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone, int nid)
+{
+	struct scan_control sc = {
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = !noswap,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = swappiness,
+		.order = 0,
+		.mem_cgroup = mem,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	nodemask_t nm  = nodemask_of_node(nid);
+
+	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+	sc.nodemask = &nm;
+	sc.nr_reclaimed = 0;
+	sc.nr_scanned = 0;
+	/*
+	 * NOTE: Although we can get the priority field, using it
+	 * here is not a good idea, since it limits the pages we can scan.
+	 * if we don't reclaim here, the shrink_zone from balance_pgdat
+	 * will pick up pages from other mem cgroup's as well. We hack
+	 * the priority and make it zero.
+	 */
+	shrink_zone(0, zone, &sc);
+	return sc.nr_reclaimed;
+}
+
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
 					   bool noswap,
 					   unsigned int swappiness)
 {
+	struct zonelist *zonelist;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1856,7 +1890,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
 		.nodemask = NULL, /* we don't care the placement */
 	};
-	struct zonelist *zonelist;

 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1978,6 +2011,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
+			int nid, zid;

 			if (!populated_zone(zone))
 				continue;
@@ -1992,6 +2026,15 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
+
+			nid = pgdat->node_id;
+			zid = zone_idx(zone);
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 * For now we ignore the return value
+			 */
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+							nid, zid);
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.