[PATCH] cpuset: rebind vma mempolicies fix

author Paul Jackson <pj@sgi.com>

Sun, 8 Jan 2006 09:01:59 +0000 (01:01 -0800)

committer Linus Torvalds <torvalds@g5.osdl.org>

Mon, 9 Jan 2006 04:13:44 +0000 (20:13 -0800)
author Paul Jackson <pj@sgi.com>
Sun, 8 Jan 2006 09:01:59 +0000 (01:01 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Mon, 9 Jan 2006 04:13:44 +0000 (20:13 -0800)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h

index 74357cb9bc7ca0caeec26bbf6196be7a767d1abf..c7ac77e873b3fa800e4c4d3a3db0bd43dcaf7192 100644 (file)
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
  extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
  extern void mpol_rebind_task(struct task_struct *tsk,
                                         const nodemask_t *new);
+extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
+
+#ifdef CONFIG_CPUSETS /* Kconfig symbol is CPUSETS; CONFIG_CPUSET is never defined */
+#define current_cpuset_is_being_rebound() \
+                               (cpuset_being_rebound == current->cpuset)
+#else /* no cpusets: the rebind check in __mpol_copy() is always false */
+#define current_cpuset_is_being_rebound() 0
+#endif /* CONFIG_CPUSETS */
+
  extern struct mempolicy default_policy;
  extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
                 unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
  int do_migrate_pages(struct mm_struct *mm,
         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
  
+extern void *cpuset_being_rebound;     /* Trigger mpol_copy vma rebind */
+
  #else
  
  struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
  {
  }
  
+static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+}
+
+#define set_cpuset_being_rebound(x) do {} while (0)
+
  static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
                 unsigned long addr)
  {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 6004719f26eef5b09770c006df3830fb62a340ea..19f87565be17f0c2fdd62c876c511fdc8f703942 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
  }
  
  /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpuset's mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
   * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such task's mm->mmap_sem, scan its vmas and rebind
+ * their mempolicies to the cpuset's new mems_allowed.
   */
  
  static int update_nodemask(struct cpuset *cs, char *buf)
  {
         struct cpuset trialcs;
+       struct task_struct *g, *p;
+       struct mm_struct **mmarray;
+       int i, n, ntasks;
+       int fudge;
         int retval;
  
         trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
         cs->mems_generation = atomic_read(&cpuset_mems_generation);
         up(&callback_sem);
  
+       set_cpuset_being_rebound(cs);           /* causes mpol_copy() rebind */
+
+       fudge = 10;                             /* spare mmarray[] slots */
+       fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+       retval = -ENOMEM;
+
+       /*
+        * Allocate mmarray[] to hold mm reference for each task
+        * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+        * tasklist_lock.  We could use GFP_ATOMIC, but with a
+        * few more lines of code, we can retry until we get a big
+        * enough mmarray[] w/o using GFP_ATOMIC.
+        */
+       while (1) {
+               ntasks = atomic_read(&cs->count);       /* guess */
+               ntasks += fudge;
+               mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+               if (!mmarray)
+                       goto done;
+               write_lock_irq(&tasklist_lock);         /* block fork */
+               if (atomic_read(&cs->count) <= ntasks)
+                       break;                          /* got enough */
+               write_unlock_irq(&tasklist_lock);       /* try again */
+               kfree(mmarray);
+       }
+
+       n = 0;
+
+       /* Load up mmarray[] with mm reference for each task in cpuset. */
+       do_each_thread(g, p) {
+               struct mm_struct *mm;
+
+               if (n >= ntasks) {
+                       printk(KERN_WARNING
+                               "Cpuset mempolicy rebind incomplete.\n");
+                       continue;
+               }
+               if (p->cpuset != cs)
+                       continue;
+               mm = get_task_mm(p);
+               if (!mm)
+                       continue;
+               mmarray[n++] = mm;
+       } while_each_thread(g, p);
+       write_unlock_irq(&tasklist_lock);
+
+       /*
+        * Now that we've dropped the tasklist spinlock, we can
+        * rebind the vma mempolicies of each mm in mmarray[] to their
+        * new cpuset, and release that mm.  The mpol_rebind_mm()
+        * call takes mmap_sem, which we couldn't take while holding
+        * tasklist_lock.  Forks can happen again now - the mpol_copy()
+        * cpuset_being_rebound check will catch such forks, and rebind
+        * their vma mempolicies too.  Because we still hold the global
+        * cpuset manage_sem, we know that no other rebind effort will
+        * be contending for the global variable cpuset_being_rebound.
+        * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+        * is idempotent.
+        */
+       for (i = 0; i < n; i++) {
+               struct mm_struct *mm = mmarray[i];
+
+               mpol_rebind_mm(mm, &cs->mems_allowed);
+               mmput(mm);
+       }
+
+       /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+       kfree(mmarray);
+       set_cpuset_being_rebound(NULL);
+       retval = 0;
  done:
         return retval;
  }
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         struct cpuset *oldcs;
         cpumask_t cpus;
         nodemask_t from, to;
+       struct mm_struct *mm;
  
         if (sscanf(pidbuf, "%d", &pid) != 1)
                 return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         to = cs->mems_allowed;
  
         up(&callback_sem);
+
+       mm = get_task_mm(tsk);
+       if (mm) {
+               mpol_rebind_mm(mm, &to);
+               mmput(mm);
+       }
+
         if (is_memory_migrate(cs))
                 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
         put_task_struct(tsk);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index c39bd86f4ea08a069ad5b1463b3df3806b671fdb..1850d0aef4ac3aba3abc99caf9b479c75319e368 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  }
  EXPORT_SYMBOL(alloc_pages_current);
  
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
  /* Slow path of a mempolicy copy */
  struct mempolicy *__mpol_copy(struct mempolicy *old)
  {
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
  
         if (!new)
                 return ERR_PTR(-ENOMEM);
+       if (current_cpuset_is_being_rebound()) {
+               nodemask_t mems = cpuset_mems_allowed(current);
+               mpol_rebind_policy(old, &mems);
+       }
         *new = *old;
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
@@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
         mpol_rebind_policy(tsk->mempolicy, new);
  }
  
+/*
+ * Rebind the mempolicy of every vma in mm to the nodemask 'new'.
+ *
+ * Caller must hold a reference to mm.  Takes mm->mmap_sem for write.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+       struct vm_area_struct *vma;
+
+       down_write(&mm->mmap_sem);      /* held across the whole vma walk */
+       for (vma = mm->mmap; vma; vma = vma->vm_next)
+               mpol_rebind_policy(vma->vm_policy, new);
+       up_write(&mm->mmap_sem);
+}
+
  /*
   * Display pages allocated per node and memory policy via /proc.
   */
author	Paul Jackson <pj@sgi.com>
	Sun, 8 Jan 2006 09:01:59 +0000 (01:01 -0800)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Mon, 9 Jan 2006 04:13:44 +0000 (20:13 -0800)
include/linux/mempolicy.h		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history