memcg: fix OOM killer under memcg

author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Thu, 2 Apr 2009 23:57:38 +0000 (16:57 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 3 Apr 2009 02:04:55 +0000 (19:04 -0700)
author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Thu, 2 Apr 2009 23:57:38 +0000 (16:57 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Apr 2009 02:04:55 +0000 (19:04 -0700)
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt

index 523a9c16c400526e31d05a1aadee8164ef768800..8a11caf417a06777d48cfe9afe5517cd3925cb15 100644 (file)
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,5 +1,5 @@
  Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2009/1/19
+Last Updated: 2009/1/20
  Base Kernel Version: based on 2.6.29-rc2.
  
  Because VM is getting complex (one of reasons is memcg...), memcg's behavior
@@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
         # kill malloc task.
  
         Of course, tmpfs v.s. swapoff test should be tested, too.
+
+ 9.8 OOM-Killer
+       An out-of-memory condition caused by a memcg's limit will kill
+       a task under that memcg. When hierarchy is used, a task somewhere
+       under the hierarchy will be killed by the kernel.
+       In this case, panic_on_oom shouldn't be invoked and tasks
+       in other groups shouldn't be killed.
+
+       It's not difficult to cause OOM under memcg as follows.
+       Case A) when you can swapoff
+       #swapoff -a
+       #echo 50M > /memory.limit_in_bytes
+       run 51M of malloc
+
+       Case B) when you use mem+swap limitation.
+       #echo 50M > memory.limit_in_bytes
+       #echo 50M > memory.memsw.limit_in_bytes
+       run 51M of malloc
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index b2816fba5306f1539eca62c6746d79237f27773a..43763bd772b922a80d09454f2be060bf4fbf25e1 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
  
  /* Returns true if root is ancestor of cg */
  bool css_is_ancestor(struct cgroup_subsys_state *cg,
-                    struct cgroup_subsys_state *root);
+                    const struct cgroup_subsys_state *root);
  
  /* Get id and depth of css */
  unsigned short css_id(struct cgroup_subsys_state *css);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index f2a3f5c9936c909a0b021425eec64a043a46b99f..382109b5baebc2a79d31f041ceb51babd1474c95 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
  }
  
  bool css_is_ancestor(struct cgroup_subsys_state *child,
-                   struct cgroup_subsys_state *root)
+                   const struct cgroup_subsys_state *root)
  {
         struct css_id *child_id = rcu_dereference(child->id);
         struct css_id *root_id = rcu_dereference(root->id);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 6f6a575e77ad970156b2fceee7a022707ce1805f..025f8abfae2d945f420f1c0f25e88bddafb6b838 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  {
         struct mem_cgroup *mem = NULL;
+
+       if (!mm)
+               return NULL;
         /*
          * Because we have no locks, mm->owner's may be being moved to other
          * cgroup. We use css_tryget() here even if this looks
@@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page,
  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
  {
         int ret;
+       struct mem_cgroup *curr = NULL;
  
         task_lock(task);
-       ret = task->mm && mm_match_cgroup(task->mm, mem);
+       rcu_read_lock();
+       curr = try_get_mem_cgroup_from_mm(task->mm);
+       rcu_read_unlock();
         task_unlock(task);
+       if (!curr)
+               return 0;
+       if (curr->use_hierarchy)
+               ret = css_is_ancestor(&curr->css, &mem->css);
+       else
+               ret = (curr == mem);
+       css_put(&curr->css);
         return ret;
  }
  
@@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
         rcu_read_unlock();
         return ret;
  }
+
+static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+{
+       mem->last_oom_jiffies = jiffies;
+       return 0;
+}
+
+static void record_last_oom(struct mem_cgroup *mem)
+{
+       mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+}
+
+
  /*
   * Unlike exported interface, "oom" parameter is added. if oom==true,
   * oom-killer can be invoked.
@@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                                 mutex_lock(&memcg_tasklist);
                                 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
                                 mutex_unlock(&memcg_tasklist);
-                               mem_over_limit->last_oom_jiffies = jiffies;
+                               record_last_oom(mem_over_limit);
                         }
                         goto nomem;
                 }
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
	Thu, 2 Apr 2009 23:57:38 +0000 (16:57 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 3 Apr 2009 02:04:55 +0000 (19:04 -0700)
Documentation/cgroups/memcg_test.txt		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history