cgroup: use percpu refcnt for cgroup_subsys_states

author Tejun Heo <tj@kernel.org>

Fri, 14 Jun 2013 02:39:16 +0000 (19:39 -0700)

committer Tejun Heo <tj@kernel.org>

Fri, 14 Jun 2013 02:43:12 +0000 (19:43 -0700)
author Tejun Heo <tj@kernel.org>
Fri, 14 Jun 2013 02:39:16 +0000 (19:39 -0700)
committer Tejun Heo <tj@kernel.org>
Fri, 14 Jun 2013 02:43:12 +0000 (19:43 -0700)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index e345d8b90046fe7b87b0f96b79cafe6274754883..b7bd4beae29467e35b5fd59227c63eb946da232c 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
  #include <linux/workqueue.h>
  #include <linux/xattr.h>
  #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
  
  #ifdef CONFIG_CGROUPS
  
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
          */
         struct cgroup *cgroup;
  
-       /*
-        * State maintained by the cgroup system to allow subsystems
-        * to be "busy". Should be accessed via css_get(),
-        * css_tryget() and css_put().
-        */
-
-       atomic_t refcnt;
+       /* reference count - access via css_[try]get() and css_put() */
+       struct percpu_ref refcnt;
  
         unsigned long flags;
         /* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
  {
         /* We don't need to reference count the root state */
         if (!(css->flags & CSS_ROOT))
-               atomic_inc(&css->refcnt);
+               percpu_ref_get(&css->refcnt);
  }
  
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
  /**
   * css_tryget - try to obtain a reference on the specified css
   * @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  {
         if (css->flags & CSS_ROOT)
                 return true;
-       return __css_tryget(css);
+       return percpu_ref_tryget(&css->refcnt);
  }
  
-extern void __css_put(struct cgroup_subsys_state *css);
-
  /**
   * css_put - put a css reference
   * @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
  static inline void css_put(struct cgroup_subsys_state *css)
  {
         if (!(css->flags & CSS_ROOT))
-               __css_put(css);
+               percpu_ref_put(&css->refcnt);
  }
  
  /* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
         struct list_head pidlists;
         struct mutex pidlist_mutex;
  
-       /* For RCU-protected deletion */
+       /* For css percpu_ref killing and RCU-protected deletion */
         struct rcu_head rcu_head;
         struct work_struct destroy_work;
+       atomic_t css_kill_cnt;
  
         /* List of events which userspace want to receive */
         struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index ebbfc043153f6a7520da5c9f1a1ac8326c8f84b4..2e9da7bf25cb740a2738b86ff3e211f8cf8d5306 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
  
  #include <linux/atomic.h>
  
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS         INT_MIN
-
  /*
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
  static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                               struct cftype cfts[], bool is_add);
  
-static int css_unbias_refcnt(int refcnt)
-{
-       return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-       int v = atomic_read(&css->refcnt);
-
-       return css_unbias_refcnt(v);
-}
-
  /* convenient tests for these bits */
  static inline bool cgroup_is_dead(const struct cgroup *cgrp)
  {
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
         deactivate_super(sb);
  }
  
+static void css_release(struct percpu_ref *ref)
+{
+       struct cgroup_subsys_state *css =
+               container_of(ref, struct cgroup_subsys_state, refcnt);
+
+       schedule_work(&css->dput_work);
+}
+
  static void init_cgroup_css(struct cgroup_subsys_state *css,
                                struct cgroup_subsys *ss,
                                struct cgroup *cgrp)
  {
         css->cgroup = cgrp;
-       atomic_set(&css->refcnt, 1);
         css->flags = 0;
         css->id = NULL;
         if (cgrp == dummytop)
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                         err = PTR_ERR(css);
                         goto err_free_all;
                 }
+
+               err = percpu_ref_init(&css->refcnt, css_release);
+               if (err)
+                       goto err_free_all;
+
                 init_cgroup_css(css, ss, cgrp);
+
                 if (ss->use_id) {
                         err = alloc_css_id(ss, parent, cgrp);
                         if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
  
  err_free_all:
         for_each_subsys(root, ss) {
-               if (cgrp->subsys[ss->subsys_id])
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+               if (css) {
+                       percpu_ref_cancel_init(&css->refcnt);
                         ss->css_free(cgrp);
+               }
         }
         mutex_unlock(&cgroup_mutex);
         /* Release the reference count that we took on the superblock */
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
  
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+       if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+               return;
+
+       /* percpu ref's of all css's are killed, kick off the next step */
+       INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+       schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+       struct cgroup_subsys_state *css =
+               container_of(ref, struct cgroup_subsys_state, refcnt);
+
+       cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
  static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
                 return -EBUSY;
  
         /*
-        * Block new css_tryget() by deactivating refcnt and mark @cgrp
-        * removed.  This makes future css_tryget() attempts fail which we
-        * guarantee to ->css_offline() callbacks.
+        * Block new css_tryget() by killing css refcnts.  cgroup core
+        * guarantees that, by the time ->css_offline() is invoked, no new
+        * css reference will be given out via css_tryget().  We can't
+        * simply call percpu_ref_kill() and proceed to offlining css's
+        * because percpu_ref_kill() doesn't guarantee that the ref is seen
+        * as killed on all CPUs on return.
+        *
+        * Use percpu_ref_kill_and_confirm() to get notifications as each
+        * css is confirmed to be seen as killed on all CPUs.  The
+        * notification callback keeps track of the number of css's to be
+        * killed and schedules cgroup_offline_fn() to perform the rest of
+        * destruction once the percpu refs of all css's are confirmed to
+        * be killed.
          */
+       atomic_set(&cgrp->css_kill_cnt, 1);
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  
-               WARN_ON(atomic_read(&css->refcnt) < 0);
-               atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+               /*
+                * Killing would put the base ref, but we need to keep it
+                * alive until after ->css_offline.
+                */
+               percpu_ref_get(&css->refcnt);
+
+               atomic_inc(&cgrp->css_kill_cnt);
+               percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
         }
+       cgroup_css_killed(cgrp);
  
         /*
          * Mark @cgrp dead.  This prevents further task migration and child
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         }
         spin_unlock(&cgrp->event_list_lock);
  
-       INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
-       schedule_work(&cgrp->destroy_work);
-
         return 0;
  };
  
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
  static void cgroup_offline_fn(struct work_struct *work)
  {
         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
  
         mutex_lock(&cgroup_mutex);
  
-       /* tell subsystems to initate destruction */
+       /*
+        * css_tryget() is guaranteed to fail now.  Tell subsystems to
+        * initiate destruction.
+        */
         for_each_subsys(cgrp->root, ss)
                 offline_css(ss, cgrp);
  
         /*
-        * Put all the base refs.  Each css holds an extra reference to the
-        * cgroup's dentry and cgroup removal proceeds regardless of css
-        * refs.  On the last put of each css, whenever that may be, the
-        * extra dentry ref is put so that dentry destruction happens only
-        * after all css's are released.
+        * Put the css refs from cgroup_destroy_locked().  Each css holds
+        * an extra reference to the cgroup's dentry and cgroup removal
+        * proceeds regardless of css refs.  On the last put of each css,
+        * whenever that may be, the extra dentry ref is put so that dentry
+        * destruction happens only after all css's are released.
          */
         for_each_subsys(cgrp->root, ss)
                 css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
         }
  }
  
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-       while (true) {
-               int t, v;
-
-               v = css_refcnt(css);
-               t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-               if (likely(t == v))
-                       return true;
-               else if (t < 0)
-                       return false;
-               cpu_relax();
-       }
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-       int v;
-
-       v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-       if (v == 0)
-               schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
          * on this or this is under rcu_read_lock(). Once css->id is allocated,
          * it's unchanged until freed.
          */
-       cssid = rcu_dereference_check(css->id, css_refcnt(css));
+       cssid = rcu_dereference_raw(css->id);
  
         if (cssid)
                 return cssid->id;
author	Tejun Heo <tj@kernel.org>
	Fri, 14 Jun 2013 02:39:16 +0000 (19:39 -0700)
committer	Tejun Heo <tj@kernel.org>
	Fri, 14 Jun 2013 02:43:12 +0000 (19:43 -0700)
include/linux/cgroup.h		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history