sched: Introduce primitives to account for CFS bandwidth tracking

author Paul Turner <pjt@google.com>

Thu, 21 Jul 2011 16:43:28 +0000 (09:43 -0700)

committer Ingo Molnar <mingo@elte.hu>

Sun, 14 Aug 2011 10:03:20 +0000 (12:03 +0200)
author Paul Turner <pjt@google.com>
Thu, 21 Jul 2011 16:43:28 +0000 (09:43 -0700)
committer Ingo Molnar <mingo@elte.hu>
Sun, 14 Aug 2011 10:03:20 +0000 (12:03 +0200)
diff --git a/init/Kconfig b/init/Kconfig

index d62778390e5556af22a95cc61d6b5e0cdeeb262e..d19b3a77ab44e6114d49783b5eccb54755d54ea0 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
         depends on CGROUP_SCHED
         default CGROUP_SCHED
  
+config CFS_BANDWIDTH
+       bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+       depends on EXPERIMENTAL
+       depends on FAIR_GROUP_SCHED
+       default n
+       help
+         This option allows users to define CPU bandwidth rates (limits) for
+         tasks running within the fair group scheduler.  Groups with no limit
+         set are considered to be unconstrained and will run with no
+         restriction.
+         See Documentation/scheduler/sched-bwc.txt for more information.
+
  config RT_GROUP_SCHED
         bool "Group scheduling for SCHED_RR/FIFO"
         depends on EXPERIMENTAL
diff --git a/kernel/sched.c b/kernel/sched.c

index cd1a531ca8ff33ce527c3a22dc57f30a38fcb314..f08cb23be96c3904a1910380a4516f2c5a80b034 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -247,6 +247,14 @@ struct cfs_rq;
  
  static LIST_HEAD(task_groups);
  
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+       raw_spinlock_t lock;
+       ktime_t period;
+       u64 quota;
+#endif
+};
+
  /* task group related information */
  struct task_group {
         struct cgroup_subsys_state css;
@@ -278,6 +286,8 @@ struct task_group {
  #ifdef CONFIG_SCHED_AUTOGROUP
         struct autogroup *autogroup;
  #endif
+
+       struct cfs_bandwidth cfs_bandwidth;
  };
  
  /* task_group_lock serializes the addition/removal of task groups */
@@ -377,9 +387,48 @@ struct cfs_rq {
  
         unsigned long load_contribution;
  #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       int runtime_enabled;
+       s64 runtime_remaining;
+#endif
  #endif
  };
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       raw_spin_lock_init(&cfs_b->lock);
+       cfs_b->quota = RUNTIME_INF;
+       cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
  /* Real-Time classes' related field in a runqueue: */
  struct rt_rq {
         struct rt_prio_array active;
@@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
         /* allow initial update_cfs_load() to truncate */
         cfs_rq->load_stamp = 1;
  #endif
+       init_cfs_rq_runtime(cfs_rq);
  
         tg->cfs_rq[cpu] = cfs_rq;
         tg->se[cpu] = se;
@@ -8110,6 +8160,7 @@ void __init sched_init(void)
                  * We achieve this by letting root_task_group's tasks sit
                  * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                  */
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
@@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg)
  {
         int i;
  
+       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
@@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         tg->shares = NICE_0_LOAD;
  
+       init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(i) {
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                       GFP_KERNEL, cpu_to_node(i));
@@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
         return walk_tg_tree(tg_schedulable, tg_nop, &data);
  }
  
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
                 u64 rt_period, u64 rt_runtime)
  {
         int i, err = 0;
@@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
         if (rt_runtime_us < 0)
                 rt_runtime = RUNTIME_INF;
  
-       return tg_set_bandwidth(tg, rt_period, rt_runtime);
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
  long sched_group_rt_runtime(struct task_group *tg)
@@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         if (rt_period == 0)
                 return -EINVAL;
  
-       return tg_set_bandwidth(tg, rt_period, rt_runtime);
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
  long sched_group_rt_period(struct task_group *tg)
@@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
  
         return (u64) scale_load_down(tg->shares);
  }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+       int i;
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+       static DEFINE_MUTEX(mutex);
+
+       if (tg == &root_task_group)
+               return -EINVAL;
+
+       /*
+        * Ensure we have at least some amount of bandwidth every period.
+        * This is to prevent reaching a state of large arrears when throttled
+        * via entity_tick() resulting in prolonged exit starvation.
+        */
+       if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+               return -EINVAL;
+
+       /*
+        * Likewise, bound things on the other side by preventing insane quota
+        * periods.  This also allows us to normalize in computing quota
+        * feasibility.
+        */
+       if (period > max_cfs_quota_period)
+               return -EINVAL;
+
+       mutex_lock(&mutex);
+       raw_spin_lock_irq(&cfs_b->lock);
+       cfs_b->period = ns_to_ktime(period);
+       cfs_b->quota = quota;
+       raw_spin_unlock_irq(&cfs_b->lock);
+
+       for_each_possible_cpu(i) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+               struct rq *rq = rq_of(cfs_rq);
+
+               raw_spin_lock_irq(&rq->lock);
+               cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+               cfs_rq->runtime_remaining = 0;
+               raw_spin_unlock_irq(&rq->lock);
+       }
+       mutex_unlock(&mutex);
+
+       return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+       u64 quota, period;
+
+       period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+       if (cfs_quota_us < 0)
+               quota = RUNTIME_INF;
+       else
+               quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+       u64 quota_us;
+
+       if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+               return -1;
+
+       quota_us = tg_cfs_bandwidth(tg)->quota;
+       do_div(quota_us, NSEC_PER_USEC);
+
+       return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+       u64 quota, period;
+
+       period = (u64)cfs_period_us * NSEC_PER_USEC;
+       quota = tg_cfs_bandwidth(tg)->quota;
+
+       if (period <= 0)
+               return -EINVAL;
+
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+       u64 cfs_period_us;
+
+       cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+       do_div(cfs_period_us, NSEC_PER_USEC);
+
+       return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+                               s64 cfs_quota_us)
+{
+       return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+                               u64 cfs_period_us)
+{
+       return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = {
                 .write_u64 = cpu_shares_write_u64,
         },
  #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .name = "cfs_quota_us",
+               .read_s64 = cpu_cfs_quota_read_s64,
+               .write_s64 = cpu_cfs_quota_write_s64,
+       },
+       {
+               .name = "cfs_period_us",
+               .read_u64 = cpu_cfs_period_read_u64,
+               .write_u64 = cpu_cfs_period_write_u64,
+       },
+#endif
  #ifdef CONFIG_RT_GROUP_SCHED
         {
                 .name = "rt_runtime_us",
@@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = {
         .subsys_id = cpuacct_subsys_id,
  };
  #endif /* CONFIG_CGROUP_CPUACCT */
-
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index f86b0cb5eb298a62372d6ef87d8b3bf77fe3fac3..f24f4171019d61267cfaafb474b7cf059243b88e 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1234,6 +1234,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
                 check_preempt_tick(cfs_rq, curr);
  }
  
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+       return 100000000ULL;
+}
+#endif
+
  /**************************************************
   * CFS operations on tasks:
   */
author	Paul Turner <pjt@google.com>
	Thu, 21 Jul 2011 16:43:28 +0000 (09:43 -0700)
committer	Ingo Molnar <mingo@elte.hu>
	Sun, 14 Aug 2011 10:03:20 +0000 (12:03 +0200)
init/Kconfig		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history