[PATCH] cpuset semaphore depth check deadlock fix

author Paul Jackson <pj@sgi.com>

Sat, 10 Sep 2005 07:26:06 +0000 (00:26 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Sat, 10 Sep 2005 17:06:21 +0000 (10:06 -0700)
author Paul Jackson <pj@sgi.com>
Sat, 10 Sep 2005 07:26:06 +0000 (00:26 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Sat, 10 Sep 2005 17:06:21 +0000 (10:06 -0700)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index c551e6a1447e57949e84435fe3fb846799d30f8b..8a1fcfe80fc722ff11c33d044ed195b9e05bc877 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -782,6 +782,7 @@ struct task_struct {
         short il_next;
  #endif
  #ifdef CONFIG_CPUSETS
+       short cpuset_sem_nest_depth;
         struct cpuset *cpuset;
         nodemask_t mems_allowed;
         int cpuset_mems_generation;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 712d02029971ed456a4025d23f0cd4d187d2f6ff..407b5f0a8c8eeed2aea648b08748771dd284b3d7 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -181,6 +181,37 @@ static struct super_block *cpuset_sb = NULL;
  
  static DECLARE_MUTEX(cpuset_sem);
  
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a task's mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem.  Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)	/* nesting-aware down(psem) for the current task */
+{
+       if (current->cpuset_sem_nest_depth == 0)	/* outermost call by this task: really take psem */
+               down(psem);
+       current->cpuset_sem_nest_depth++;	/* always count the level so cpuset_up() can pair with it */
+}
+
+static inline void cpuset_up(struct semaphore *psem)	/* counterpart of cpuset_down(): nesting-aware up(psem) */
+{
+       current->cpuset_sem_nest_depth--;	/* leave one nesting level first, then test the result */
+       if (current->cpuset_sem_nest_depth == 0)	/* outermost level left: really release psem */
+               up(psem);
+}
+
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   *  cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
@@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
   * Refresh current tasks mems_allowed and mems_generation from
   * current tasks cpuset.  Call with cpuset_sem held.
   *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation.  Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem.  Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the task's context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
   */
  
  static void refresh_mems(void)
@@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
         }
         buffer[nbytes] = 0;     /* nul-terminate */
  
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
  
         if (is_removed(cs)) {
                 retval = -ENODEV;
@@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
         if (retval == 0)
                 retval = nbytes;
  out2:
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
         cpuset_release_agent(pathbuf);
  out1:
         kfree(buffer);
@@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
  {
         cpumask_t mask;
  
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         mask = cs->cpus_allowed;
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
  
         return cpulist_scnprintf(page, PAGE_SIZE, mask);
  }
@@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
  {
         nodemask_t mask;
  
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         mask = cs->mems_allowed;
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
  
         return nodelist_scnprintf(page, PAGE_SIZE, mask);
  }
@@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
         if (!cs)
                 return -ENOMEM;
  
-       down(&cpuset_sem);
-       refresh_mems();
+       cpuset_down(&cpuset_sem);
         cs->flags = 0;
         if (notify_on_release(parent))
                 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
          * will down() this new directory's i_sem and if we race with
          * another mkdir, we might deadlock.
          */
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
  
         err = cpuset_populate_dir(cs->dentry);
         /* If err < 0, we have a half-filled directory - oh well ;) */
         return 0;
  err:
         list_del(&cs->sibling);
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
         kfree(cs);
         return err;
  }
@@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
  
         /* the vfs holds both inode->i_sem already */
  
-       down(&cpuset_sem);
-       refresh_mems();
+       cpuset_down(&cpuset_sem);
         if (atomic_read(&cs->count) > 0) {
-               up(&cpuset_sem);
+               cpuset_up(&cpuset_sem);
                 return -EBUSY;
         }
         if (!list_empty(&cs->children)) {
-               up(&cpuset_sem);
+               cpuset_up(&cpuset_sem);
                 return -EBUSY;
         }
         parent = cs->parent;
@@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
         spin_unlock(&d->d_lock);
         cpuset_d_remove_dir(d);
         dput(d);
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
         cpuset_release_agent(pathbuf);
         return 0;
  }
@@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk)
         if (notify_on_release(cs)) {
                 char *pathbuf = NULL;
  
-               down(&cpuset_sem);
+               cpuset_down(&cpuset_sem);
                 if (atomic_dec_and_test(&cs->count))
                         check_for_release(cs, &pathbuf);
-               up(&cpuset_sem);
+               cpuset_up(&cpuset_sem);
                 cpuset_release_agent(pathbuf);
         } else {
                 atomic_dec(&cs->count);
@@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
  {
         cpumask_t mask;
  
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         task_lock((struct task_struct *)tsk);
         guarantee_online_cpus(tsk->cpuset, &mask);
         task_unlock((struct task_struct *)tsk);
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
  
         return mask;
  }
@@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void)
         if (!cs)
                 return;         /* task is exiting */
         if (current->cpuset_mems_generation != cs->mems_generation) {
-               down(&cpuset_sem);
+               cpuset_down(&cpuset_sem);
                 refresh_mems();
-               up(&cpuset_sem);
+               cpuset_up(&cpuset_sem);
         }
  }
  
@@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
                 return 0;
  
         /* Not hardwall and node outside mems_allowed: scan up cpusets */
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         cs = current->cpuset;
         if (!cs)
                 goto done;              /* current task exiting */
         cs = nearest_exclusive_ancestor(cs);
         allowed = node_isset(node, cs->mems_allowed);
  done:
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
         return allowed;
  }
  
@@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
         const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
         int overlap = 0;                /* do cpusets overlap? */
  
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         cs1 = current->cpuset;
         if (!cs1)
                 goto done;              /* current task exiting */
@@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
         cs2 = nearest_exclusive_ancestor(cs2);
         overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
  done:
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
  
         return overlap;
  }
@@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
                 return -ENOMEM;
  
         tsk = m->private;
-       down(&cpuset_sem);
+       cpuset_down(&cpuset_sem);
         task_lock(tsk);
         cs = tsk->cpuset;
         task_unlock(tsk);
@@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
         seq_puts(m, buf);
         seq_putc(m, '\n');
  out:
-       up(&cpuset_sem);
+       cpuset_up(&cpuset_sem);
         kfree(buf);
         return retval;
  }
author	Paul Jackson <pj@sgi.com>
	Sat, 10 Sep 2005 07:26:06 +0000 (00:26 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Sat, 10 Sep 2005 17:06:21 +0000 (10:06 -0700)
include/linux/sched.h		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history