Commit 4365a567 authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds

oom-kill: fix NUMA constraint check with nodemask

Fix node-oriented allocation handling in oom-kill.c.  I myself think of this
as a bugfix, not as an enhancement.

These days, things have changed as follows:
  - alloc_pages() takes a nodemask as its argument, __alloc_pages_nodemask().
  - mempolicy doesn't maintain its own private zonelists.
  (And cpuset doesn't use nodemask for __alloc_pages_nodemask())

So, current oom-killer's check function is wrong.

This patch does
  - check nodemask, if nodemask && nodemask doesn't cover all
    node_states[N_HIGH_MEMORY], this is CONSTRAINT_MEMORY_POLICY.
  - Scan all zonelist under nodemask, if it hits cpuset's wall
    this failure is from cpuset.
And
  - modifies the caller of out_of_memory not to call oom if __GFP_THISNODE.
    This doesn't change "current" behavior. If callers use __GFP_THISNODE
    they should handle "page allocation failure" by themselves.

  - handle __GFP_NOFAIL+__GFP_THISNODE path.
    This is something like a FIXME but this gfpmask is not used now.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3b4798cb
...@@ -339,7 +339,7 @@ static struct sysrq_key_op sysrq_term_op = { ...@@ -339,7 +339,7 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored) static void moom_callback(struct work_struct *ignored)
{ {
out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0); out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
} }
static DECLARE_WORK(moom_work, moom_callback); static DECLARE_WORK(moom_work, moom_callback);
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/types.h> #include <linux/types.h>
#include <linux/nodemask.h>
struct zonelist; struct zonelist;
struct notifier_block; struct notifier_block;
...@@ -26,7 +27,8 @@ enum oom_constraint { ...@@ -26,7 +27,8 @@ enum oom_constraint {
extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask);
extern int register_oom_notifier(struct notifier_block *nb); extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb);
......
...@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) ...@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/* /*
* Determine the type of allocation constraint. * Determine the type of allocation constraint.
*/ */
static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
gfp_t gfp_mask, nodemask_t *nodemask)
{
struct zone *zone; struct zone *zone;
struct zoneref *z; struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(gfp_mask); enum zone_type high_zoneidx = gfp_zone(gfp_mask);
nodemask_t nodes = node_states[N_HIGH_MEMORY];
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) /*
if (cpuset_zone_allowed_softwall(zone, gfp_mask)) * Reach here only when __GFP_NOFAIL is used. So, we should avoid
node_clear(zone_to_nid(zone), nodes); * to kill current.We have to random task kill in this case.
else * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
return CONSTRAINT_CPUSET; */
if (gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
if (!nodes_empty(nodes)) /*
* The nodemask here is a nodemask passed to alloc_pages(). Now,
* cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
* feature. mempolicy is an only user of nodemask here.
* check mempolicy's nodemask contains all N_HIGH_MEMORY
*/
if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
return CONSTRAINT_MEMORY_POLICY; return CONSTRAINT_MEMORY_POLICY;
#endif
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
return CONSTRAINT_CPUSET;
return CONSTRAINT_NONE; return CONSTRAINT_NONE;
} }
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
gfp_t gfp_mask, nodemask_t *nodemask)
{
return CONSTRAINT_NONE;
}
#endif
/* /*
* Simple selection loop. We chose the process with the highest * Simple selection loop. We chose the process with the highest
...@@ -613,7 +632,8 @@ rest_and_return: ...@@ -613,7 +632,8 @@ rest_and_return:
* OR try to be smart about which process to kill. Note that we * OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good. * don't have to be perfect here, we just have to be good.
*/ */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{ {
unsigned long freed = 0; unsigned long freed = 0;
enum oom_constraint constraint; enum oom_constraint constraint;
...@@ -632,7 +652,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) ...@@ -632,7 +652,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
* Check if there were limitations on the allocation (only relevant for * Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling. * NUMA) that may require different handling.
*/ */
constraint = constrained_alloc(zonelist, gfp_mask); constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
read_lock(&tasklist_lock); read_lock(&tasklist_lock);
switch (constraint) { switch (constraint) {
......
...@@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, ...@@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (page) if (page)
goto out; goto out;
/* The OOM killer will not help higher order allocs */ if (!(gfp_mask & __GFP_NOFAIL)) {
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) /* The OOM killer will not help higher order allocs */
goto out; if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
* The caller should handle page allocation failure by itself if
* it specifies __GFP_THISNODE.
* Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
*/
if (gfp_mask & __GFP_THISNODE)
goto out;
}
/* Exhausted what can be done so it's blamo time */ /* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order); out_of_memory(zonelist, gfp_mask, order, nodemask);
out: out:
clear_zonelist_oom(zonelist, gfp_mask); clear_zonelist_oom(zonelist, gfp_mask);
...@@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int cpu) ...@@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int cpu)
if (percpu_pagelist_fraction) if (percpu_pagelist_fraction)
setup_pagelist_highmark(zone_pcp(zone, cpu), setup_pagelist_highmark(zone_pcp(zone, cpu),
(zone->present_pages / percpu_pagelist_fraction)); (zone->present_pages / percpu_pagelist_fraction));
} }
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment