sched: Fix domain iteration
author Peter Zijlstra <a.p.zijlstra@chello.nl>
Thu, 31 May 2012 12:47:33 +0000 (14:47 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 6 Jun 2012 14:52:26 +0000 (16:52 +0200)
Weird topologies can lead to asymmetric domain setups. This needs
further consideration since these setups are typically non-minimal
too.

For now, make it work by adding an extra mask selecting which CPUs
are allowed to iterate up.

The topology that triggered it is the node distance table from David Rientjes:

10 20 20 30
20 10 20 20
20 20 10 20
30 20 20 10

resulting in boxes that wouldn't even boot.
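
For illustration only (not part of the patch): a minimal user-space sketch of
the idea behind the iteration mask, using plain bitmasks instead of struct
cpumask. The names toy_group and toy_balance_cpu are invented for this
example; the patch itself implements the same selection in
group_balance_cpu() below.

#include <stdio.h>
#include <strings.h>	/* ffs() */

struct toy_group {
	unsigned int cpus;	/* CPUs in the group */
	unsigned int balance;	/* CPUs allowed to iterate up */
};

/* Analogue of group_balance_cpu(): first CPU that is both a member of
 * the group and set in its iteration mask, or -1 if there is none. */
static int toy_balance_cpu(const struct toy_group *sg)
{
	return ffs(sg->cpus & sg->balance) - 1;
}

int main(void)
{
	/*
	 * A group spanning CPUs 0-3 where only CPUs 2-3 may iterate up,
	 * e.g. because the domains of CPUs 0-1 already cover the span.
	 */
	struct toy_group sg = { .cpus = 0xf, .balance = 0xc };
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d %s the balance cpu\n", cpu,
		       cpu == toy_balance_cpu(&sg) ? "is" : "is not");
	return 0;
}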

Reported-by: David Rientjes <rientjes@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6029d8c544762bc04f88adc6ff8c6ca986caffbd..ac321d7534709968ac4d96fff48efce563b46b5d 100644
@@ -876,6 +876,8 @@ struct sched_group_power {
         * Number of busy cpus in this group.
         */
        atomic_t nr_busy_cpus;
+
+       unsigned long cpumask[0]; /* iteration mask */
 };
 
 struct sched_group {
@@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
        return to_cpumask(sg->cpumask);
 }
 
+/*
+ * Return the mask selecting which CPUs in the group are allowed to iterate
+ * up the domain tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+       return to_cpumask(sg->sgp->cpumask);
+}
+
 /**
  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
  * @group: The group whose first cpu is to be returned.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6546083af3e0182d052fbff2f6217c271a9736a9..781acb91a50aa973e73ff119bfd055c8d4e7e45d 100644
@@ -5994,6 +5994,44 @@ struct sched_domain_topology_level {
        struct sd_data      data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth; make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+       const struct cpumask *span = sched_domain_span(sd);
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *sibling;
+       int i;
+
+       for_each_cpu(i, span) {
+               sibling = *per_cpu_ptr(sdd->sd, i);
+               if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                       continue;
+
+               cpumask_set_cpu(i, sched_group_mask(sg));
+       }
+}
+
+/*
+ * Return the canonical balance cpu for this group; this is the first cpu
+ * of this group that is also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+       return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                if (cpumask_test_cpu(i, covered))
                        continue;
 
+               child = *per_cpu_ptr(sdd->sd, i);
+
+               /* See the comment near build_group_mask(). */
+               if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                       continue;
+
                sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                GFP_KERNEL, cpu_to_node(cpu));
 
@@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                        goto fail;
 
                sg_span = sched_group_cpus(sg);
-
-               child = *per_cpu_ptr(sdd->sd, i);
                if (child->child) {
                        child = child->child;
                        cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                cpumask_or(covered, covered, sg_span);
 
                sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-               atomic_inc(&sg->sgp->ref);
+               if (atomic_inc_return(&sg->sgp->ref) == 1)
+                       build_group_mask(sd, sg);
+
 
+               /*
+                * Make sure the first group of this domain contains the
+                * canonical balance cpu. Otherwise the sched_domain iteration
+                * breaks. See update_sg_lb_stats().
+                */
                if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-                              cpumask_first(sg_span) == cpu) {
-                       WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+                   group_balance_cpu(sg) == cpu)
                        groups = sg;
-               }
 
                if (!first)
                        first = sg;
@@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
                cpumask_clear(sched_group_cpus(sg));
                sg->sgp->power = 0;
+               cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
                        if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                sg = sg->next;
        } while (sg != sd->groups);
 
-       if (cpu != group_first_cpu(sg))
+       if (cpu != group_balance_cpu(sg))
                return;
 
        update_group_power(sd, cpu);
@@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                        *per_cpu_ptr(sdd->sg, j) = sg;
 
-                       sgp = kzalloc_node(sizeof(struct sched_group_power),
+                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
                        if (!sgp)
                                return -ENOMEM;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2a2d236f27b8f535e4b89835cb2be529fee8bb4..54cbaa4e7b37c571463017ddbc663c01f2604d59 100644
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        int i;
 
        if (local_group)
-               balance_cpu = group_first_cpu(group);
+               balance_cpu = group_balance_cpu(group);
 
        /* Tally up the load of all CPUs in the group */
        max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                /* Bias balancing toward cpus of our domain */
                if (local_group) {
-                       if (idle_cpu(i) && !first_idle_cpu) {
+                       if (idle_cpu(i) && !first_idle_cpu &&
+                                       cpumask_test_cpu(i, sched_group_mask(group))) {
                                first_idle_cpu = 1;
                                balance_cpu = i;
                        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce95b2b198257fb5e787e21da36d41..6d52cea7f33dc77496ec6db9e975cd2c0fd1c2b9 100644
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
 
+extern int group_balance_cpu(struct sched_group *sg);
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"
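
A companion sketch (again not kernel code) of what build_group_mask() above
computes: a CPU makes it into the group's iteration mask only if it is
contained in its own sibling domain's span, which is exactly the condition an
asymmetric topology such as the one in the changelog can violate.
sibling_span[] and build_mask() are invented names for this illustration.

#include <stdio.h>
#include <strings.h>	/* ffs() */

#define NCPUS 4

/*
 * Analogue of build_group_mask(): only CPUs contained in their own
 * sibling domain's span may iterate up the domain tree.
 */
static unsigned int build_mask(unsigned int span,
			       const unsigned int sibling_span[NCPUS])
{
	unsigned int mask = 0;
	int i;

	for (i = 0; i < NCPUS; i++) {
		if (!(span & (1u << i)))
			continue;
		if (!(sibling_span[i] & (1u << i)))
			continue;	/* domain build stopped early */
		mask |= 1u << i;
	}
	return mask;
}

int main(void)
{
	/* Domain span covers CPUs 0-3; CPU 1's sibling span is empty. */
	unsigned int sibling_span[NCPUS] = { 0xf, 0x0, 0xf, 0xf };
	unsigned int span = 0xf;
	unsigned int mask = build_mask(span, sibling_span);

	printf("iteration mask: 0x%x, balance cpu: %d\n",
	       mask, ffs(span & mask) - 1);
	return 0;
}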