sched: hierarchical load vs find_busiest_group
authorPeter Zijlstra <a.p.zijlstra@chello.nl>
Fri, 27 Jun 2008 11:41:28 +0000 (13:41 +0200)
committerIngo Molnar <mingo@elte.hu>
Fri, 27 Jun 2008 12:31:40 +0000 (14:31 +0200)
find_busiest_group() has some assumptions about task weight being in the
NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this
by replacing it with the average task weight, which will adapt the situation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
kernel/sched.c

index 6a6b0139eb3264fc6bc3441c645ca6c72b13f2b0..5e2aa394a81259bcb678f2cf4a7c3ca276aa86ca 100644 (file)
@@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
+
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
@@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                int __group_imb = 0;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
+               unsigned long sum_avg_load_per_task;
+               unsigned long avg_load_per_task;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
+               sum_avg_load_per_task = avg_load_per_task = 0;
+
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
 
@@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
+
+                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
                }
 
                /*
@@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);
 
-               if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+               /*
+                * Consider the group unbalanced when the imbalance is larger
+                * than the average weight of two tasks.
+                *
+                * APZ: with cgroup the avg task weight can vary wildly and
+                *      might not be a suitable number - should we keep a
+                *      normalized nr_running number somewhere that negates
+                *      the hierarchy?
+                */
+               avg_load_per_task = sg_div_cpu_power(group,
+                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                        __group_imb = 1;
 
                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3267,9 +3287,9 @@ small_imbalance:
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
-                       this_load_per_task = SCHED_LOAD_SCALE;
+                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-               if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+               if (max_load - this_load + 2*busiest_load_per_task >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;