From: Ingo Molnar

Here's the merged sched.patch, against -mm5 + sched-domain-setup-lock.

---

 25-akpm/arch/i386/kernel/smpboot.c   |   49 ++++
 25-akpm/include/asm-i386/processor.h |    8
 25-akpm/include/linux/sched.h        |   10
 25-akpm/kernel/sched.c               |  376 +++++++++++++++--------------------
 4 files changed, 220 insertions(+), 223 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-ingo-rollup arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-ingo-rollup	2004-03-29 23:33:34.508298448 -0800
+++ 25-akpm/arch/i386/kernel/smpboot.c	2004-03-29 23:33:34.515297384 -0800
@@ -1146,16 +1146,19 @@ __init void arch_init_sched_domains(void
 		*cpu_domain = SD_SIBLING_INIT;
 		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->cache_hot_time = cacheflush_time / 2;
 		cpu_domain->parent = phys_domain;
 		cpu_domain->groups = &sched_group_cpus[i];
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
+		phys_domain->cache_hot_time = cacheflush_time / 2;
 		phys_domain->parent = node_domain;
 		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_possible_map;
+		node_domain->cache_hot_time = cacheflush_time;
 		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
 	}
 
@@ -1263,11 +1266,13 @@ __init void arch_init_sched_domains(void
 		*cpu_domain = SD_SIBLING_INIT;
 		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->cache_hot_time = cacheflush_time / 2;
 		cpu_domain->parent = phys_domain;
 		cpu_domain->groups = &sched_group_cpus[i];
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_possible_map;
+		phys_domain->cache_hot_time = cacheflush_time / 2;
 		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
 	}
 
@@ -1324,7 +1329,49 @@ __init void arch_init_sched_domains(void
 	}
 }
 #endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT */
+
+static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+
+void __init arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+
+		*cpu_sd = SD_CPU_INIT;
+		cpu_sd->span = cpu_possible_map;
+		cpu_sd->cache_hot_time = cacheflush_time / 2;
+		cpu_sd->groups = &sched_group_cpus[i];
+	}
+
+	/* Set up CPU groups */
+	for_each_cpu_mask(i, cpu_possible_map) {
+		struct sched_group *cpu = &sched_group_cpus[i];
+
+		cpus_clear(cpu->cpumask);
+		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+	mb(); /* domains were modified outside the lock */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_sd, i);
+	}
+}
+#endif
 
 /* These are wrappers to interface to the new boot process.  Someone who
    understands all this stuff should rewrite it properly.
    --RR 15/Jul/02 */

diff -puN include/asm-i386/processor.h~sched-ingo-rollup include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~sched-ingo-rollup	2004-03-29 23:33:34.509298296 -0800
+++ 25-akpm/include/asm-i386/processor.h	2004-03-29 23:33:34.516297232 -0800
@@ -646,9 +646,11 @@ extern inline void prefetchw(const void
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_DOMAIN
-#define ARCH_HAS_SCHED_WAKE_IDLE
+#ifdef CONFIG_SMP
+# define ARCH_HAS_SCHED_DOMAIN
+# ifdef CONFIG_SCHED_SMT
+#  define ARCH_HAS_SCHED_WAKE_IDLE
+# endif
 #endif
 
 #endif /* __ASM_I386_PROCESSOR_H */

diff -puN include/linux/sched.h~sched-ingo-rollup include/linux/sched.h
--- 25/include/linux/sched.h~sched-ingo-rollup	2004-03-29 23:33:34.511297992 -0800
+++ 25-akpm/include/linux/sched.h	2004-03-29 23:33:34.517297080 -0800
@@ -584,9 +584,9 @@ struct sched_domain {
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 15,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
-				| SD_SHARE_CPUPOWER,	\
+				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
+				| SD_SHARE_CPUPOWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
@@ -602,7 +602,7 @@ struct sched_domain {
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (5*1000000/2),	\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
 				| SD_WAKE_AFFINE,	\
@@ -643,7 +643,7 @@ static inline int set_cpus_allowed(task_
 
 extern unsigned long long sched_clock(void);
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
 extern void sched_balance_exec(void);
 #else
 #define sched_balance_exec()   {}

diff -puN kernel/sched.c~sched-ingo-rollup kernel/sched.c
--- 25/kernel/sched.c~sched-ingo-rollup	2004-03-29 23:33:34.512297840 -0800
+++ 25-akpm/kernel/sched.c	2004-03-29 23:33:34.523296168 -0800
@@ -180,11 +180,14 @@
 	((MAX_TIMESLICE - MIN_TIMESLICE) * \
 		(MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
 
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
 	return BASE_TIMESLICE(p);
 }
 
+#define task_hot(p, now, sd) \
+	(!TASK_INTERACTIVE(p) && ((now)-(p)->timestamp < (sd)->cache_hot_time))
+
 /*
  * These are the runqueue data structures:
  */
@@ -209,14 +212,7 @@ struct prio_array {
 
 struct runqueue {
 	spinlock_t lock;
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
-	unsigned long cpu_load;
-#endif
 	unsigned long long nr_switches;
 	unsigned long expired_timestamp, nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
@@ -262,7 +258,7 @@ static DEFINE_PER_CPU(struct runqueue, r
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
 
@@ -285,7 +281,7 @@ static inline void task_rq_unlock(runque
 
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static inline runqueue_t *this_rq_lock(void)
+static runqueue_t *this_rq_lock(void)
 {
 	runqueue_t *rq;
 
@@ -304,7 +300,7 @@ static inline void rq_unlock(runqueue_t
 /*
  * Adding/removing a task to/from a priority array:
  */
-static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
 	array->nr_active--;
 	list_del(&p->run_list);
@@ -312,7 +308,7 @@ static inline void dequeue_task(struct t
 	__clear_bit(p->prio, array->bitmap);
 }
 
-static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
@@ -320,6 +316,21 @@ static inline void enqueue_task(struct t
 	p->array = array;
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Used by the migration code - we pull tasks from the head of the
+ * remote queue so we want these tasks to show up at the head of the
+ * local queue:
+ */
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+#endif
+
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -440,7 +451,7 @@ static void recalc_task_prio(task_t *p,
  * Update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
-static inline void activate_task(task_t *p, runqueue_t *rq)
+static void activate_task(task_t *p, runqueue_t *rq)
 {
 	unsigned long long now = sched_clock();
 
@@ -476,7 +487,7 @@ static inline void activate_task(task_t
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
 	if (p->state == TASK_UNINTERRUPTIBLE)
@@ -493,7 +504,7 @@ static inline void deactivate_task(struc
  * the target CPU.
 */
 #ifdef CONFIG_SMP
-static inline void resched_task(task_t *p)
+static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
@@ -621,20 +632,9 @@ EXPORT_SYMBOL_GPL(kick_process);
 
 /*
 * Return a low guess at the load of cpu.
 */
-static inline unsigned long get_low_cpu_load(int cpu)
-{
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return min(rq->cpu_load, load_now);
-}
-
-static inline unsigned long get_high_cpu_load(int cpu)
+static inline unsigned long cpu_load(int cpu)
 {
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return max(rq->cpu_load, load_now);
+	return cpu_rq(cpu)->nr_running * SCHED_LOAD_SCALE;
 }
 #endif
 
@@ -695,15 +695,14 @@ static inline int wake_idle(int cpu, tas
  */
 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 {
+	int cpu, this_cpu, success = 0;
 	unsigned long flags;
-	int success = 0;
 	long old_state;
 	runqueue_t *rq;
-	int cpu, this_cpu;
 #ifdef CONFIG_SMP
-	unsigned long long now;
 	unsigned long load, this_load;
 	struct sched_domain *sd;
+	unsigned long long now;
 	int new_cpu;
 #endif
 
@@ -722,65 +721,75 @@ static int try_to_wake_up(task_t * p, un
 	if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
 		goto out_activate;
 
-	new_cpu = cpu;
+	new_cpu = this_cpu;
 
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (cpu == this_cpu)
 		goto out_set_cpu;
 
-	load = get_low_cpu_load(cpu);
-	this_load = get_high_cpu_load(this_cpu);
+	/*
+	 * Passive balance, if the load on the remote CPU is over
+	 * the limit:
+	 */
+	load = cpu_load(cpu) * 100;
+	/*
+	 * add the new task's effect to its new CPU. If sync wakeup then
+	 * subtract current's load effect: this means that they cancel out
+	 * each other in the sync case, then we have +1 load in the !sync case:
+	 */
+	this_load = cpu_load(this_cpu);
+	if (!sync)
+		this_load += SCHED_LOAD_SCALE;
+	this_load *= rq->sd->imbalance_pct;
 
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+	if (load > this_load)
 		goto out_set_cpu;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
-
 	/*
-	 * Passive load balancing. If the queues are very out of balance
-	 * we might as well balance here rather than the periodic load
-	 * balancing.
+	 * Migrate if the source CPU is not idle or the target
+	 * CPU is idle; if the two CPUs share a domain; and if the task
+	 * is not cache-hot.
+	 *
+	 * (Note that these kinds of migrations violate the equilibrium,
+	 * and might trigger follow-on load-balancing - hence we pick
+	 * cache-cold tasks only.)
 	 */
-	if (load > this_load + SCHED_LOAD_SCALE*2)
-		goto out_set_cpu;
+	if (!cpu_load(cpu) && cpu_load(this_cpu))
+		goto out_activate;
 
 	now = sched_clock();
-
-	/*
-	 * Migrate the task to the waking domain.
-	 * Do not violate hard affinity.
-	 */
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_WAKE_AFFINE))
 			break;
-		if (rq->timestamp_last_tick - p->timestamp < sd->cache_hot_time)
+		if (task_hot(p, now, sd))
 			break;
-
+		/*
+		 * The two CPUs share a span of a domain that has affine
+		 * wakeups enabled - the task can be migrated:
+		 */
 		if (cpu_isset(cpu, sd->span))
 			goto out_set_cpu;
 	}
+	/* No luck - fall back to the original CPU: */
+	new_cpu = cpu;
-
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
 		set_task_cpu(p, new_cpu);
-		goto repeat_lock_task;
-	}
-	goto out_activate;
+		task_rq_unlock(rq, &flags);
 
-repeat_lock_task:
-	task_rq_unlock(rq, &flags);
-	rq = task_rq_lock(p, &flags);
-	old_state = p->state;
-	if (!(old_state & state))
-		goto out;
+		/* might preempt at this point */
 
-	if (p->array)
-		goto out_running;
+		rq = task_rq_lock(p, &flags);
+		old_state = p->state;
+		if (!(old_state & state))
+			goto out;
+		if (p->array)
+			goto out_running;
 
-	this_cpu = smp_processor_id();
-	cpu = task_cpu(p);
+		this_cpu = smp_processor_id();
+		cpu = task_cpu(p);
+	}
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -972,7 +981,7 @@ void fastcall sched_exit(task_t * p)
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
-static inline void finish_task_switch(task_t *prev)
+static void finish_task_switch(task_t *prev)
 {
 	runqueue_t *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
@@ -1093,7 +1102,7 @@ unsigned long nr_iowait(void)
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
-static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 {
 	if (rq1 == rq2)
 		spin_lock(&rq1->lock);
@@ -1114,7 +1123,7 @@ static inline void double_rq_lock(runque
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
-static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 {
 	spin_unlock(&rq1->lock);
 	if (rq1 != rq2)
@@ -1129,7 +1138,6 @@ enum idle_type
 };
 
 #ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
 /*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
@@ -1175,19 +1183,19 @@ out:
 */
 static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
 {
+	int i = 0, min_load, this_cpu, best_cpu;
 	cpumask_t tmp;
-	int i, min_load, this_cpu, best_cpu;
 
 	best_cpu = this_cpu = task_cpu(p);
-	min_load = INT_MAX;
+
+	/* subtract the currently running task's load effect: */
+	min_load = cpu_load(this_cpu) - SCHED_LOAD_SCALE;
 
 	cpus_and(tmp, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, tmp);
+
 	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-		if (i == this_cpu)
-			load = get_low_cpu_load(i);
-		else
-			load = get_high_cpu_load(i) + SCHED_LOAD_SCALE;
+		unsigned long load = cpu_load(i);
 
 		if (min_load > load) {
 			best_cpu = i;
@@ -1199,25 +1207,27 @@ static int sched_best_cpu(struct task_st
 
 /*
 * sched_balance_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
+ * domain and try to migrate the current task to the least loaded CPU.
 *
 * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * the task has the smallest effective cache footprint - a completely new
+ * process image is being created, so almost all of the currently existing
+ * cache footprint is irrelevant. So we attempt to balance this task as
+ * broadly as possible, without considering migration costs, which costs
+ * otherwise affect all other types of task migrations.
 */
 void sched_balance_exec(void)
 {
 	struct sched_domain *sd, *best_sd = NULL;
-	int new_cpu;
-	int this_cpu = get_cpu();
+	int new_cpu, this_cpu = get_cpu();
 
-	/* Prefer the current CPU if there's only this task running */
+	/* Prefer the current CPU if there's only this task running: */
 	if (this_rq()->nr_running <= 1)
 		goto out;
 
-	for_each_domain(this_cpu, sd) {
+	for_each_domain(this_cpu, sd)
 		if (sd->flags & SD_BALANCE_EXEC)
 			best_sd = sd;
-	}
 
 	if (best_sd) {
 		new_cpu = sched_best_cpu(current, best_sd);
@@ -1230,12 +1240,11 @@ void sched_balance_exec(void)
 out:
 	put_cpu();
 }
-#endif /* CONFIG_NUMA */
 
 /*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
-static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 {
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
@@ -1253,13 +1262,13 @@
 */
 static inline
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
-		runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
 	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
-	enqueue_task(p, this_array);
+	enqueue_task_head(p, this_array);
 	p->timestamp = sched_clock() -
 				(src_rq->timestamp_last_tick - p->timestamp);
 
 	/*
@@ -1275,7 +1284,7 @@ void pull_task(runqueue_t *src_rq, prio_
 */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		struct sched_domain *sd, enum idle_type idle)
+		     struct sched_domain *sd, enum idle_type idle)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1291,7 +1300,7 @@ int can_migrate_task(task_t *p, runqueue
 
 	/* Aggressive migration if we've failed balancing */
 	if (idle == NEWLY_IDLE || sd->nr_balance_failed < sd->cache_nice_tries) {
-		if (rq->timestamp_last_tick - p->timestamp < sd->cache_hot_time)
+		if (task_hot(p, rq->timestamp_last_tick, sd))
			return 0;
 	}
 
@@ -1306,30 +1315,24 @@ int can_migrate_task(task_t *p, runqueue
 * Called with both runqueues locked.
 */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-			unsigned long max_nr_move, struct sched_domain *sd,
-			enum idle_type idle)
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
 {
-	int idx;
-	int pulled = 0;
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
+	int ret, idx, pulled = 0;
 	task_t *tmp;
 
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
 		goto out;
 
-	/*
-	 * We first consider expired tasks. Those will likely not be
-	 * executed in the near future, and they are most likely to
-	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them.
-	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
+	/* We first consider active tasks. */
+	if (busiest->active->nr_active) {
 		array = busiest->active;
 		dst_array = this_rq->active;
+	} else {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
 	}
 
 new_array:
@@ -1341,22 +1344,27 @@ skip_bitmap:
 	else
 		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
 	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
-			array = busiest->active;
-			dst_array = this_rq->active;
+		if (array == busiest->active && busiest->expired->nr_active) {
+			array = busiest->expired;
+			dst_array = this_rq->expired;
 			goto new_array;
 		}
 		goto out;
 	}
 
 	head = array->queue + idx;
-	curr = head->prev;
+	curr = head->next;
skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
-	curr = curr->prev;
+	curr = curr->next;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+	ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle);
+	if (ret == -1) {
+		idx++;
+		goto skip_bitmap;
+	}
+	if (!ret) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1383,42 +1391,25 @@ out:
 */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		unsigned long *imbalance, enum idle_type idle)
+		   unsigned long *nr_move, enum idle_type idle)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	unsigned long max_load, avg_load, total_load,
+			this_load, total_pwr, delta;
 
 	max_load = this_load = total_load = total_pwr = 0;
 
 	do {
 		cpumask_t tmp;
-		unsigned long load;
-		int local_group;
-		int i, nr_cpus = 0;
-
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		int i;
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
 		cpus_and(tmp, group->cpumask, cpu_online_map);
-		if (unlikely(cpus_empty(tmp))) {
-			WARN_ON(1);
-			return NULL;
-		}
-
-		for_each_cpu_mask(i, tmp) {
-			/* Bias balancing toward cpus of our domain */
-			if (local_group)
-				load = get_high_cpu_load(i);
-			else
-				load = get_low_cpu_load(i);
-
-			nr_cpus++;
-			avg_load += load;
-		}
+		WARN_ON(cpus_empty(tmp));
 
-		if (!nr_cpus)
-			goto nextgroup;
+		avg_load = 0;
+		for_each_cpu_mask(i, tmp)
+			avg_load += cpu_load(i);
 
 		total_load += avg_load;
 		total_pwr += group->cpu_power;
@@ -1426,11 +1417,12 @@ find_busiest_group(struct sched_domain *
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
-		if (local_group) {
+		if (cpu_isset(this_cpu, group->cpumask)) {
 			this_load = avg_load;
 			this = group;
 			goto nextgroup;
-		} else if (avg_load > max_load) {
+		}
+		if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
 		}
@@ -1443,8 +1435,8 @@ nextgroup:
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
+	if ((this_load >= avg_load)
+		|| (100*max_load <= sd->imbalance_pct*this_load))
 		goto out_balanced;
 
 	/*
@@ -1458,67 +1450,32 @@ nextgroup:
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
-	*imbalance = min(max_load - avg_load, avg_load - this_load);
-
-	/* How much load to actually move to equalise the imbalance */
-	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-				/ SCHED_LOAD_SCALE;
-
-	if (*imbalance < SCHED_LOAD_SCALE - 1) {
-		unsigned long pwr_now = 0, pwr_move = 0;
-		unsigned long tmp;
-
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
-			return busiest;
-		}
+	delta = max_load - this_load;
+	if (delta > SCHED_LOAD_SCALE) {
+		delta = min(max_load - avg_load, avg_load - this_load);
 
 		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
+		 * How many tasks to actually move to equalise the
+		 * imbalance: first round up (which will move us across
+		 * the average unless we can precisely balance to the
+		 * average) and get rid of the scaling factor:
 		 */
+		delta += SCHED_LOAD_SCALE-1;
+		*nr_move = delta / SCHED_LOAD_SCALE;
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
-		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-							max_load - tmp);
-
-		/* Amount of load we'd add */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		if (max_load < tmp)
-			tmp = max_load;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
-
-		/* Move if we gain another 8th of a CPU worth of throughput */
-		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
-			goto out_balanced;
-
-		*imbalance = 1;
-		return busiest;
+		if (*nr_move)
+			return busiest;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
-
-	return busiest;
-
 out_balanced:
-	if (busiest && idle != NOT_IDLE && max_load > SCHED_LOAD_SCALE) {
-		*imbalance = 1;
+	if (busiest && idle == NEWLY_IDLE && max_load > SCHED_LOAD_SCALE) {
+		*nr_move = 1;
 		return busiest;
 	}
 
-	*imbalance = 0;
+	*nr_move = 0;
 	return NULL;
 }
-
 /*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
@@ -1531,9 +1488,9 @@ static runqueue_t *find_busiest_queue(st
 	cpus_and(tmp, group->cpumask, cpu_online_map);
 	for_each_cpu_mask(i, tmp) {
-		load = get_low_cpu_load(i);
+		load = cpu_load(i);
 
-		if (load >= max_load) {
+		if (load > max_load) {
 			max_load = load;
 			busiest = cpu_rq(i);
 		}
@@ -1552,8 +1509,8 @@ static int load_balance(int this_cpu, ru
 		struct sched_domain *sd, enum idle_type idle)
 {
 	struct sched_group *group;
-	runqueue_t *busiest;
 	unsigned long imbalance;
+	runqueue_t *busiest;
 	int nr_moved;
 
 	spin_lock(&this_rq->lock);
@@ -1579,7 +1536,6 @@ static int load_balance(int this_cpu, ru
 
 	if (!nr_moved) {
 		sd->nr_balance_failed++;
-
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 			int wake = 0;
 
@@ -1592,12 +1548,11 @@ static int load_balance(int this_cpu, ru
 			spin_unlock(&busiest->lock);
 			if (wake)
 				wake_up_process(busiest->migration_thread);
-
 			/*
-			 * We've kicked active balancing, reset the failure
-			 * counter.
+			 * We've kicked active balancing, reset the
+			 * failure counter:
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = 0;
 		}
 	} else
 		sd->nr_balance_failed = 0;
 
@@ -1683,18 +1638,18 @@ static inline void idle_balance(int this
 */
 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
 {
-	struct sched_domain *sd;
 	struct sched_group *group, *busy_group;
+	struct sched_domain *sd;
 	int i;
 
 	if (busiest->nr_running <= 1)
 		return;
 
-	for_each_domain(busiest_cpu, sd) {
+	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	}
-
-	if (!sd) {
+
+	if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
 		WARN_ON(1);
 		return;
 	}
@@ -1708,13 +1663,13 @@ static void active_load_balance(runqueue
 	do {
 		cpumask_t tmp;
 		runqueue_t *rq;
-		int push_cpu = 0;
+		int push_cpu = 0;
 
 		if (group == busy_group)
 			goto next_group;
 
 		cpus_and(tmp, group->cpumask, cpu_online_map);
-		if (cpus_weight(tmp) == 0)
+		if (!cpus_weight(tmp))
 			goto next_group;
 
 		for_each_cpu_mask(i, tmp) {
@@ -1745,20 +1700,14 @@ next_group:
 #define CPU_OFFSET(cpu)		(HZ * cpu / NR_CPUS)
 
 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
-				enum idle_type idle)
+			   enum idle_type idle)
 {
-	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	this_rq->cpu_load = (old_load + this_load) / 2;
-
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval = sd->balance_interval;
 
@@ -1767,7 +1716,7 @@ static void rebalance_tick(int this_cpu,
 		/* scale ms to jiffies */
 		interval = MSEC_TO_JIFFIES(interval);
-		if (unlikely(interval == 0))
+		if (unlikely(!interval))
 			interval = 1;
 
 		if (j - sd->last_balance >= interval) {
@@ -3479,10 +3428,10 @@ static void __init arch_init_sched_domai
 
 	/* Set up groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t tmp = node_to_cpumask(i);
-		cpumask_t nodemask;
 		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t tmp = node_to_cpumask(i);
+		cpumask_t nodemask;
 		int j;
 
 		cpus_and(nodemask, tmp, cpu_possible_map);
@@ -3606,12 +3555,12 @@ void sched_domain_debug(void)
 			printk(" ");
 		printk("groups:");
 		do {
-			if (group == NULL) {
+			if (!group) {
 				printk(" ERROR: NULL");
 				break;
 			}
 
-			if (cpus_weight(group->cpumask) == 0)
+			if (!cpus_weight(group->cpumask))
 				printk(" ERROR empty group:");
 
 			cpus_and(tmp, groupmask, group->cpumask);
@@ -3691,7 +3640,6 @@ void __init sched_init(void)
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_init;
-		rq->cpu_load = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
_
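
[Editorial note] A few standalone sketches of the logic the patch introduces follow; they are user-space
illustrations, not kernel code, and the struct fields and example values in them are illustrative stand-ins.

First, the cache-hot test encoded by the new task_hot() macro: a task is treated as migration-unfriendly
only if it is not interactive and its last timestamp lies within the domain's cache_hot_time. Minimal
sketch, with TASK_INTERACTIVE reduced to a plain flag:

#include <stdio.h>

/* illustrative stand-ins for the kernel structures task_hot() operates on */
struct task { unsigned long long timestamp; int interactive; };
struct domain { unsigned long long cache_hot_time; };

/* mirrors: !TASK_INTERACTIVE(p) && ((now) - (p)->timestamp < (sd)->cache_hot_time) */
static int task_hot(const struct task *p, unsigned long long now,
		    const struct domain *sd)
{
	return !p->interactive && (now - p->timestamp < sd->cache_hot_time);
}

int main(void)
{
	struct domain sd = { .cache_hot_time = 2500000 };	/* 2.5ms, as in SD_CPU_INIT */
	struct task p = { .timestamp = 1000000, .interactive = 0 };

	/* ran 1ms ago -> still cache-hot, would not be migrated */
	printf("hot at t=2ms:  %d\n", task_hot(&p, 2000000, &sd));
	/* ran 9ms ago -> cache-cold, fair game for migration */
	printf("hot at t=10ms: %d\n", task_hot(&p, 10000000, &sd));
	return 0;
}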
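
Second, the reworked wakeup-affinity check in try_to_wake_up(): the remote CPU's load (scaled by 100) is
compared against the waking CPU's load plus the woken task's own weight (skipped for sync wakeups), scaled
by the domain's imbalance_pct. A sketch of just that comparison, assuming the usual SCHED_LOAD_SCALE of 128
and with cpu_load() reduced to plain parameters:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed scale factor */

/*
 * Returns 1 if the woken task may be pulled to the waking CPU.  'load' and
 * 'this_load' stand in for cpu_load(cpu) and cpu_load(this_cpu);
 * imbalance_pct plays the role of rq->sd->imbalance_pct.
 */
static int wake_affine(unsigned long load, unsigned long this_load,
		       unsigned int imbalance_pct, int sync)
{
	load *= 100;
	if (!sync)
		this_load += SCHED_LOAD_SCALE;	/* account for the woken task itself */
	this_load *= imbalance_pct;
	return load > this_load;
}

int main(void)
{
	/* remote CPU runs 3 tasks, this CPU runs 1, non-sync wakeup, pct = 125 */
	printf("pull: %d\n", wake_affine(3 * SCHED_LOAD_SCALE,
					 1 * SCHED_LOAD_SCALE, 125, 0));
	/* balanced case: both CPUs run 1 task -> leave the task where it is */
	printf("pull: %d\n", wake_affine(1 * SCHED_LOAD_SCALE,
					 1 * SCHED_LOAD_SCALE, 125, 0));
	return 0;
}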
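
Third, find_busiest_group() now reports a task count (*nr_move) rather than a raw load value: the load
delta is rounded up across SCHED_LOAD_SCALE so that moving that many tasks crosses the average unless an
exact balance is possible. A sketch of the arithmetic, with all loads already scaled by SCHED_LOAD_SCALE:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed scale factor */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* how many tasks to pull from the busiest group toward this CPU's group */
static unsigned long calc_nr_move(unsigned long max_load,
				  unsigned long avg_load,
				  unsigned long this_load)
{
	unsigned long delta = max_load - this_load;

	if (delta <= SCHED_LOAD_SCALE)
		return 0;		/* within one task's worth of load: balanced */

	/* do not overshoot either side of the average ... */
	delta = min_ul(max_load - avg_load, avg_load - this_load);
	/* ... then round up and drop the scaling factor */
	delta += SCHED_LOAD_SCALE - 1;
	return delta / SCHED_LOAD_SCALE;
}

int main(void)
{
	/* busiest CPU runs 4 tasks, we run 1, the average is 2.5 tasks */
	printf("nr_move = %lu\n",
	       calc_nr_move(4 * SCHED_LOAD_SCALE, 5 * SCHED_LOAD_SCALE / 2,
			    1 * SCHED_LOAD_SCALE));
	return 0;
}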
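
Finally, the new !CONFIG_SCHED_SMT arch_init_sched_domains() links the per-CPU sched_group structures into
a circular singly linked list, and each CPU's domain then uses its own group as the starting point of the
ring. A reduced sketch of that ring construction (struct fields and NR_CPUS value are illustrative):

#include <stdio.h>

#define NR_CPUS 4	/* illustrative CPU count */

/* reduced stand-in for struct sched_group */
struct group {
	int cpu;
	struct group *next;
};

static struct group groups[NR_CPUS];

int main(void)
{
	struct group *first = NULL, *last = NULL, *g;
	int i;

	/* link every CPU's group into one ring, as the patch does */
	for (i = 0; i < NR_CPUS; i++) {
		groups[i].cpu = i;
		if (!first)
			first = &groups[i];
		if (last)
			last->next = &groups[i];
		last = &groups[i];
	}
	last->next = first;	/* close the ring */

	/* walk the ring once, starting from CPU 2's own group */
	g = &groups[2];
	do {
		printf("cpu %d -> ", g->cpu);
		g = g->next;
	} while (g != &groups[2]);
	printf("back to cpu 2\n");
	return 0;
}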