cgroup: fix cgroup_path() vs rename() race

author Li Zefan <lizefan@huawei.com>

Fri, 1 Mar 2013 07:01:56 +0000 (15:01 +0800)

committer Tejun Heo <tj@kernel.org>

Mon, 4 Mar 2013 17:50:08 +0000 (09:50 -0800)
author Li Zefan <lizefan@huawei.com>
Fri, 1 Mar 2013 07:01:56 +0000 (15:01 +0800)
committer Tejun Heo <tj@kernel.org>
Mon, 4 Mar 2013 17:50:08 +0000 (09:50 -0800)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h

index f2b292925ccda3a2e8301eec4924e5fac989a354..4e595ee8c9151aea5c0005bffbe1d936c4603083 100644 (file)
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
  {
         int ret;
  
-       rcu_read_lock();
         ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-       rcu_read_unlock();
         if (ret)
                 strncpy(buf, "<unavailable>", buflen);
         return ret;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 900af5964f5543f6dd1b5308419e93c70cb0ca07..75c6ec1ba1ba5b61e51eb9c4e44a8089ab7e78f0 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -150,6 +150,11 @@ enum {
         CGRP_CPUSET_CLONE_CHILDREN,
  };
  
+struct cgroup_name {
+       struct rcu_head rcu_head;
+       char name[];
+};
+
  struct cgroup {
         unsigned long flags;            /* "unsigned long" so bitops work */
  
@@ -172,6 +177,19 @@ struct cgroup {
         struct cgroup *parent;          /* my parent */
         struct dentry *dentry;          /* cgroup fs entry, RCU protected */
  
+       /*
+        * This is a copy of dentry->d_name, and it's needed because
+        * we can't use dentry->d_name in cgroup_path().
+        *
+        * You must acquire rcu_read_lock() to access cgrp->name, and
+        * the only place that can change it is rename(), which is
+        * protected by parent dir's i_mutex.
+        *
+        * Normally you should use cgroup_name() wrapper rather than
+        * access it directly.
+        */
+       struct cgroup_name __rcu *name;
+
         /* Private pointers for each registered subsystem */
         struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
  
@@ -404,6 +422,12 @@ struct cgroup_scanner {
         void *data;
  };
  
+/* Caller should hold rcu_read_lock() */
+static inline const char *cgroup_name(const struct cgroup *cgrp)
+{
+       return rcu_dereference(cgrp->name)->name;
+}
+
  int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
  int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
  
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index a32f9432666cc9f0f736ca24639a58fc45557ce8..50682168abc2429eba40eb18499ca72624016b7d 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
  /* This flag indicates whether tasks in the fork and exit paths should
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
@@ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
         return inode;
  }
  
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+       struct cgroup_name *name;
+
+       name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+       if (!name)
+               return NULL;
+       strcpy(name->name, dentry->d_name.name);
+       return name;
+}
+
  static void cgroup_free_fn(struct work_struct *work)
  {
         struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work)
         simple_xattrs_free(&cgrp->xattrs);
  
         ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+       kfree(rcu_dereference_raw(cgrp->name));
         kfree(cgrp);
  }
  
@@ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
         INIT_LIST_HEAD(&root->allcg_list);
         root->number_of_cgroups = 1;
         cgrp->root = root;
+       cgrp->name = &root_cgroup_name;
         cgrp->top_cgroup = cgrp;
         init_cgroup_housekeeping(cgrp);
         list_add_tail(&cgrp->allcg_node, &root->allcg_list);
@@ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj;
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference.  Writes path of cgroup into buf.  Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf.  Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
-       struct dentry *dentry = cgrp->dentry;
+       int ret = -ENAMETOOLONG;
         char *start;
  
-       rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
-                          "cgroup_path() called without proper locking");
-
-       if (cgrp == dummytop) {
-               /*
-                * Inactive subsystems have no dentry for their root
-                * cgroup
-                */
-               strcpy(buf, "/");
-               return 0;
-       }
-
         start = buf + buflen - 1;
-
         *start = '\0';
-       for (;;) {
-               int len = dentry->d_name.len;
  
+       rcu_read_lock();
+       while (cgrp) {
+               const char *name = cgroup_name(cgrp);
+               int len;
+
+               len = strlen(name);
                 if ((start -= len) < buf)
-                       return -ENAMETOOLONG;
-               memcpy(start, dentry->d_name.name, len);
-               cgrp = cgrp->parent;
-               if (!cgrp)
-                       break;
+                       goto out;
+               memcpy(start, name, len);
  
-               dentry = cgrp->dentry;
                 if (!cgrp->parent)
-                       continue;
+                       break;
+
                 if (--start < buf)
-                       return -ENAMETOOLONG;
+                       goto out;
                 *start = '/';
+
+               cgrp = cgrp->parent;
         }
+       ret = 0;
         memmove(buf, start, buf + buflen - start);
-       return 0;
+out:
+       rcu_read_unlock();
+       return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_path);
  
@@ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
  {
+       int ret;
+       struct cgroup_name *name, *old_name;
+       struct cgroup *cgrp;
+
+       /*
+        * It's convinient to use parent dir's i_mutex to protected
+        * cgrp->name.
+        */
+       lockdep_assert_held(&old_dir->i_mutex);
+
         if (!S_ISDIR(old_dentry->d_inode->i_mode))
                 return -ENOTDIR;
         if (new_dentry->d_inode)
                 return -EEXIST;
         if (old_dir != new_dir)
                 return -EIO;
-       return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+       cgrp = __d_cgrp(old_dentry);
+
+       name = cgroup_alloc_name(new_dentry);
+       if (!name)
+               return -ENOMEM;
+
+       ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (ret) {
+               kfree(name);
+               return ret;
+       }
+
+       old_name = cgrp->name;
+       rcu_assign_pointer(cgrp->name, name);
+
+       kfree_rcu(old_name, rcu_head);
+       return 0;
  }
  
  static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                              umode_t mode)
  {
         struct cgroup *cgrp;
+       struct cgroup_name *name;
         struct cgroupfs_root *root = parent->root;
         int err = 0;
         struct cgroup_subsys *ss;
@@ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (!cgrp)
                 return -ENOMEM;
  
+       name = cgroup_alloc_name(dentry);
+       if (!name)
+               goto err_free_cgrp;
+       rcu_assign_pointer(cgrp->name, name);
+
         cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
         if (cgrp->id < 0)
-               goto err_free_cgrp;
+               goto err_free_name;
  
         /*
          * Only live parents can have children.  Note that the liveliness
@@ -4276,6 +4320,8 @@ err_free_all:
         deactivate_super(sb);
  err_free_id:
         ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+       kfree(rcu_dereference_raw(cgrp->name));
  err_free_cgrp:
         kfree(cgrp);
         return err;
author	Li Zefan <lizefan@huawei.com>
	Fri, 1 Mar 2013 07:01:56 +0000 (15:01 +0800)
committer	Tejun Heo <tj@kernel.org>
	Mon, 4 Mar 2013 17:50:08 +0000 (09:50 -0800)
block/blk-cgroup.h		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history