sched: Add new scheduler syscalls to support an extended scheduling parameters ABI

author Dario Faggioli <raistlin@linux.it>

Thu, 7 Nov 2013 13:43:36 +0000 (14:43 +0100)

committer Ingo Molnar <mingo@kernel.org>

Mon, 13 Jan 2014 12:41:04 +0000 (13:41 +0100)
author Dario Faggioli <raistlin@linux.it>
Thu, 7 Nov 2013 13:43:36 +0000 (14:43 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 13 Jan 2014 12:41:04 +0000 (13:41 +0100)
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h

index 141baa3f9a72d1d931da67c30c600fa85bc4bfc5..acabef1a75df00637def7a94af9f1425d281b7f0 100644 (file)
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
  
  #include <uapi/asm/unistd.h>
  
-#define __NR_syscalls  (380)
+#define __NR_syscalls  (384)
  #define __ARM_NR_cmpxchg               (__ARM_NR_BASE+0x00fff0)
  
  #define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h

index af33b44990ed4a395662f0c5e7021049e5755a78..fb5584d0cc050a6c55b30ff8342615a5a39a1c2f 100644 (file)
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,8 @@
  #define __NR_process_vm_writev         (__NR_SYSCALL_BASE+377)
  #define __NR_kcmp                      (__NR_SYSCALL_BASE+378)
  #define __NR_finit_module              (__NR_SYSCALL_BASE+379)
+#define __NR_sched_setattr             (__NR_SYSCALL_BASE+380)
+#define __NR_sched_getattr             (__NR_SYSCALL_BASE+381)
  
  /*
   * This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S

index c6ca7e376773fcc73ef619e1c5a97793d97b41b4..166e945de832f22b603d6b0de2ca3eb92f2ec732 100644 (file)
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,8 @@
                 CALL(sys_process_vm_writev)
                 CALL(sys_kcmp)
                 CALL(sys_finit_module)
+/* 380 */      CALL(sys_sched_setattr)
+               CALL(sys_sched_getattr)
  #ifndef syscalls_counted
  .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
  #define syscalls_counted
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl

index aabfb8380a1c6cf91e5af8aaed9dd30bd088d9de..96bc506ac6de7db16e1b0dd7ce161587e60f673c 100644 (file)
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
  348    i386    process_vm_writev       sys_process_vm_writev           compat_sys_process_vm_writev
  349    i386    kcmp                    sys_kcmp
  350    i386    finit_module            sys_finit_module
+351    i386    sched_setattr           sys_sched_setattr
+352    i386    sched_getattr           sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl

index 38ae65dfd14ffe91904b4fc409ece711f84e61e9..a12bddc7ccea11edf9f7125d8353f04759d2cfb0 100644 (file)
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
  311    64      process_vm_writev       sys_process_vm_writev
  312    common  kcmp                    sys_kcmp
  313    common  finit_module            sys_finit_module
+314    common  sched_setattr           sys_sched_setattr
+315    common  sched_getattr           sys_sched_getattr
  
  #
  # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 3a1e9857b393f5c1af37cedb59aa2c057156d222..86025b6c6387edf30fcfa413762b9bdb7180b0c4 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,66 @@ struct sched_param {
  
  #include <asm/processor.h>
  
+#define SCHED_ATTR_SIZE_VER0   48      /* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param cannot be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant to describe a so-called
+ * sporadic time-constrained task. In such a model a task is specified by:
+ *  - the activation period or minimum instance inter-arrival time;
+ *  - the maximum (or average, depending on the actual scheduling
+ *    discipline) computation time of all instances, a.k.a. runtime;
+ *  - the deadline (relative to the actual activation time) of each
+ *    instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ *  @size              size of the structure, for fwd/bwd compat.
+ *
+ *  @sched_policy      task's scheduling policy
+ *  @sched_flags       for customizing the scheduler behaviour
+ *  @sched_nice        task's nice value      (SCHED_NORMAL/BATCH)
+ *  @sched_priority    task's static priority (SCHED_FIFO/RR)
+ *  @sched_deadline    representative of the task's deadline
+ *  @sched_runtime     representative of the task's runtime
+ *  @sched_period      representative of the task's period
+ *
+ * Given this task model, there is a multiplicity of scheduling algorithms
+ * and policies that can be used to ensure all the tasks will meet their
+ * timing constraints.
+ */
+struct sched_attr {
+       u32 size;
+
+       u32 sched_policy;
+       u64 sched_flags;
+
+       /* SCHED_NORMAL, SCHED_BATCH */
+       s32 sched_nice;
+
+       /* SCHED_FIFO, SCHED_RR */
+       u32 sched_priority;
+
+       /* SCHED_DEADLINE */
+       u64 sched_runtime;
+       u64 sched_deadline;
+       u64 sched_period;
+};
+
  struct exec_domain;
  struct futex_pi_state;
  struct robust_list_head;
@@ -1958,6 +2018,8 @@ extern int sched_setscheduler(struct task_struct *, int,
                               const struct sched_param *);
  extern int sched_setscheduler_nocheck(struct task_struct *, int,
                                       const struct sched_param *);
+extern int sched_setattr(struct task_struct *,
+                        const struct sched_attr *);
  extern struct task_struct *idle_task(int cpu);
  /**
   * is_idle_task - is the specified task an idle task?
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h

index 94273bbe605007462f6e0daacf01715f2a7aca6a..40ed9e9a77e53c77448104a921c0595f2a7927eb 100644 (file)
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -38,6 +38,7 @@ struct rlimit;
  struct rlimit64;
  struct rusage;
  struct sched_param;
+struct sched_attr;
  struct sel_arg_struct;
  struct semaphore;
  struct sembuf;
@@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
                                         struct sched_param __user *param);
  asmlinkage long sys_sched_setparam(pid_t pid,
                                         struct sched_param __user *param);
+asmlinkage long sys_sched_setattr(pid_t pid,
+                                       struct sched_attr __user *attr);
  asmlinkage long sys_sched_getscheduler(pid_t pid);
  asmlinkage long sys_sched_getparam(pid_t pid,
                                         struct sched_param __user *param);
+asmlinkage long sys_sched_getattr(pid_t pid,
+                                       struct sched_attr __user *attr,
+                                       unsigned int size);
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                         unsigned long __user *user_mask_ptr);
  asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index b21a63ed5d62193447657a96aa0b6c152b9170b4..8174f889076c2b279504edff2bfb293f662149ec 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2817,6 +2817,7 @@ out_unlock:
         __task_rq_unlock(rq);
  }
  #endif
+
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, on_rq;
@@ -2991,22 +2992,29 @@ static struct task_struct *find_process_by_pid(pid_t pid)
         return pid ? find_task_by_vpid(pid) : current;
  }
  
-/* Actually do priority change: must hold rq lock. */
-static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+                          const struct sched_attr *attr)
  {
+       int policy = attr->sched_policy;
+
         p->policy = policy;
-       p->rt_priority = prio;
+
+       if (rt_policy(policy))
+               p->rt_priority = attr->sched_priority;
+       else
+               p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
         p->normal_prio = normal_prio(p);
-       /* we are holding p->pi_lock already */
         p->prio = rt_mutex_getprio(p);
+
         if (rt_prio(p->prio))
                 p->sched_class = &rt_sched_class;
         else
                 p->sched_class = &fair_sched_class;
+
         set_load_weight(p);
  }
-
  /*
   * check the target process has a UID that matches the current process's
   */
@@ -3023,10 +3031,12 @@ static bool check_same_owner(struct task_struct *p)
         return match;
  }
  
-static int __sched_setscheduler(struct task_struct *p, int policy,
-                               const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+                               const struct sched_attr *attr,
+                               bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
+       int policy = attr->sched_policy;
         unsigned long flags;
         const struct sched_class *prev_class;
         struct rq *rq;
@@ -3054,17 +3064,22 @@ recheck:
          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
          * SCHED_BATCH and SCHED_IDLE is 0.
          */
-       if (param->sched_priority < 0 ||
-           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+       if (attr->sched_priority < 0 ||
+           (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
-       if (rt_policy(policy) != (param->sched_priority != 0))
+       if (rt_policy(policy) != (attr->sched_priority != 0))
                 return -EINVAL;
  
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
         if (user && !capable(CAP_SYS_NICE)) {
+               if (fair_policy(policy)) {
+                       if (!can_nice(p, attr->sched_nice))
+                               return -EPERM;
+               }
+
                 if (rt_policy(policy)) {
                         unsigned long rlim_rtprio =
                                         task_rlimit(p, RLIMIT_RTPRIO);
@@ -3074,8 +3089,8 @@ recheck:
                                 return -EPERM;
  
                         /* can't increase priority */
-                       if (param->sched_priority > p->rt_priority &&
-                           param->sched_priority > rlim_rtprio)
+                       if (attr->sched_priority > p->rt_priority &&
+                           attr->sched_priority > rlim_rtprio)
                                 return -EPERM;
                 }
  
@@ -3123,11 +3138,16 @@ recheck:
         /*
          * If not changing anything there's no need to proceed further:
          */
-       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-                       param->sched_priority == p->rt_priority))) {
+       if (unlikely(policy == p->policy)) {
+               if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+                       goto change;
+               if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+                       goto change;
+
                 task_rq_unlock(rq, p, &flags);
                 return 0;
         }
+change:
  
  #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
@@ -3161,7 +3181,7 @@ recheck:
  
         oldprio = p->prio;
         prev_class = p->sched_class;
-       __setscheduler(rq, p, policy, param->sched_priority);
+       __setscheduler(rq, p, attr);
  
         if (running)
                 p->sched_class->set_curr_task(rq);
@@ -3189,10 +3209,27 @@ recheck:
  int sched_setscheduler(struct task_struct *p, int policy,
                        const struct sched_param *param)
  {
-       return __sched_setscheduler(p, policy, param, true);
+       /*
+        * struct sched_param carries no nice value. Seed the attr with the
+        * task's current one: a zero here would be applied as "nice 0" by
+        * __setscheduler() for non-RT policies.
+        */
+       struct sched_attr attr = {
+               .sched_policy   = policy,
+               .sched_priority = param->sched_priority,
+               .sched_nice     = TASK_NICE(p),
+       };
+
+       return __sched_setscheduler(p, &attr, true);
  }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+       return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
  /**
   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
   * @p: the task in question.
@@ -3209,7 +3239,18 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                                 const struct sched_param *param)
  {
-       return __sched_setscheduler(p, policy, param, false);
+       /*
+        * struct sched_param carries no nice value. Seed the attr with the
+        * task's current one: a zero here would be applied as "nice 0" by
+        * __setscheduler() for non-RT policies.
+        */
+       struct sched_attr attr = {
+               .sched_policy   = policy,
+               .sched_priority = param->sched_priority,
+               .sched_nice     = TASK_NICE(p),
+       };
+
+       return __sched_setscheduler(p, &attr, false);
  }
  
  static int
@@ -3234,6 +3268,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
         return retval;
  }
  
+/*
+ * Size-checked copy-in of @uattr; mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr)
+{
+       u32 size;
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+               return -EFAULT;
+
+       /*
+        * Zero the full structure, so fields a short copy does not reach stay 0.
+        */
+       memset(attr, 0, sizeof(*attr));
+
+       ret = get_user(size, &uattr->size);
+       if (ret)
+               return ret;
+
+       if (size > PAGE_SIZE)   /* silly large */
+               goto err_size;
+
+       if (!size)              /* abi compat */
+               size = SCHED_ATTR_SIZE_VER0;
+
+       if (size < SCHED_ATTR_SIZE_VER0)
+               goto err_size;
+
+       /*
+        * If we're handed a bigger struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. new
+        * user-space does not rely on any kernel feature
+        * extensions we don't know about yet.
+        */
+       if (size > sizeof(*attr)) {
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
+
+               addr = (void __user *)uattr + sizeof(*attr);
+               end  = (void __user *)uattr + size;
+
+               for (; addr < end; addr++) {
+                       ret = get_user(val, addr);
+                       if (ret)
+                               return ret;
+                       if (val)
+                               goto err_size;
+               }
+               size = sizeof(*attr);
+       }
+
+       ret = copy_from_user(attr, uattr, size);
+       if (ret)
+               return -EFAULT;
+
+       /*
+        * XXX: do we want to be lenient like existing syscalls; or do we want
+        * to be strict and return an error on out-of-bounds values?
+        */
+       attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+       return ret;
+
+err_size:
+       put_user(sizeof(*attr), &uattr->size);
+       ret = -E2BIG;
+       goto out;
+}
+
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@ -3264,6 +3371,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
         return do_sched_setscheduler(pid, -1, param);
  }
  
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+       struct sched_attr attr;
+       struct task_struct *p;
+       int retval;
+
+       if (!uattr || pid < 0)
+               return -EINVAL;
+
+       /* Propagate the real errno: -E2BIG must not become -EFAULT. */
+       retval = sched_copy_attr(uattr, &attr);
+       if (retval)
+               return retval;
+
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       retval = p ? sched_setattr(p, &attr) : -ESRCH;
+       rcu_read_unlock();
+
+       return retval;
+}
+
  /**
   * sys_sched_getscheduler - get the policy (scheduling class) of a thread
   * @pid: the pid in question.
@@ -3334,6 +3468,92 @@ out_unlock:
         return retval;
  }
  
+/*
+ * Copy @attr out to user space. Returns 0, -EFAULT on a bad @uattr, or
+ * -E2BIG if @usize would cut off a non-zero field.
+ */
+static int sched_read_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr,
+                          unsigned int usize)
+{
+       if (!access_ok(VERIFY_WRITE, uattr, usize))
+               return -EFAULT;
+
+       /*
+        * If we're handed a smaller struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. old
+        * user-space does not get incomplete information.
+        */
+       if (usize < sizeof(*attr)) {
+               unsigned char *addr;
+               unsigned char *end;
+
+               addr = (void *)attr + usize;
+               end  = (void *)attr + sizeof(*attr);
+
+               for (; addr < end; addr++) {
+                       if (*addr)
+                               return -E2BIG;
+               }
+
+               attr->size = usize;
+       }
+
+       /* A larger user buffer must not receive memory beyond *attr. */
+       if (usize > sizeof(*attr))
+               usize = sizeof(*attr);
+
+       if (copy_to_user(uattr, attr, usize))
+               return -EFAULT;
+
+       return 0;
+}
+
+/**
+ * sys_sched_getattr - same as above, but with extended "sched_param"
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: size of the user buffer at @uattr, for fwd/bwd compat.
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+               unsigned int, size)
+{
+       struct sched_attr attr = {
+               .size = sizeof(struct sched_attr),
+       };
+       struct task_struct *p;
+       int retval;
+
+       if (!uattr || pid < 0 || size > PAGE_SIZE ||
+           size < SCHED_ATTR_SIZE_VER0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       retval = -ESRCH;
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       attr.sched_policy = p->policy;
+       if (task_has_rt_policy(p))
+               attr.sched_priority = p->rt_priority;
+       else
+               attr.sched_nice = TASK_NICE(p);
+
+       rcu_read_unlock();
+
+       retval = sched_read_attr(uattr, &attr, size);
+       return retval;
+
+out_unlock:
+       rcu_read_unlock();
+       return retval;
+}
+
  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
         cpumask_var_t cpus_allowed, new_mask;
@@ -6400,13 +6620,16 @@ EXPORT_SYMBOL(__might_sleep);
  static void normalize_task(struct rq *rq, struct task_struct *p)
  {
         const struct sched_class *prev_class = p->sched_class;
+       struct sched_attr attr = {
+               .sched_policy = SCHED_NORMAL,
+       };
         int old_prio = p->prio;
         int on_rq;
  
         on_rq = p->on_rq;
         if (on_rq)
                 dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, SCHED_NORMAL, 0);
+       __setscheduler(rq, p, &attr);
         if (on_rq) {
                 enqueue_task(rq, p, 0);
                 resched_task(rq->curr);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index b3b4a4953efcdae925c77e44fcf2798c69edd040..df023db7721c252f20dc358fe56f5c0d618e43e1 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,11 +81,14 @@ extern void update_cpu_load_active(struct rq *this_rq);
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
+static inline int fair_policy(int policy)
+{
+       return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
  static inline int rt_policy(int policy)
  {
-       if (policy == SCHED_FIFO || policy == SCHED_RR)
-               return 1;
-       return 0;
+       return policy == SCHED_FIFO || policy == SCHED_RR;
  }
  
  static inline int task_has_rt_policy(struct task_struct *p)
author	Dario Faggioli <raistlin@linux.it>
	Thu, 7 Nov 2013 13:43:36 +0000 (14:43 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 13 Jan 2014 12:41:04 +0000 (13:41 +0100)
arch/arm/include/asm/unistd.h		patch \| blob \| history
arch/arm/include/uapi/asm/unistd.h		patch \| blob \| history
arch/arm/kernel/calls.S		patch \| blob \| history
arch/x86/syscalls/syscall_32.tbl		patch \| blob \| history
arch/x86/syscalls/syscall_64.tbl		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/syscalls.h		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history