From: Ingo Molnar

Here's the merged sched.patch, against -mm5 + sched-domain-setup-lock.

---

 25-akpm/arch/i386/kernel/smpboot.c   |   49 ++++
 25-akpm/include/asm-i386/processor.h |    8
 25-akpm/include/linux/sched.h        |   10
 25-akpm/kernel/sched.c               |  376 +++++++++++++++--------------------
 4 files changed, 220 insertions(+), 223 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-ingo-rollup arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-ingo-rollup	2004-03-29 23:33:34.508298448 -0800
+++ 25-akpm/arch/i386/kernel/smpboot.c	2004-03-29 23:33:34.515297384 -0800
@@ -1146,16 +1146,19 @@ __init void arch_init_sched_domains(void
 		*cpu_domain = SD_SIBLING_INIT;
 		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->cache_hot_time = cacheflush_time / 2;
 		cpu_domain->parent = phys_domain;
 		cpu_domain->groups = &sched_group_cpus[i];
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
+		phys_domain->cache_hot_time = cacheflush_time / 2;
 		phys_domain->parent = node_domain;
 		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_possible_map;
+		node_domain->cache_hot_time = cacheflush_time;
 		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
 	}
 
@@ -1263,11 +1266,13 @@ __init void arch_init_sched_domains(void
 		*cpu_domain = SD_SIBLING_INIT;
 		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->cache_hot_time = cacheflush_time / 2;
 		cpu_domain->parent = phys_domain;
 		cpu_domain->groups = &sched_group_cpus[i];
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_possible_map;
+		phys_domain->cache_hot_time = cacheflush_time / 2;
 		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
 	}
 
@@ -1324,7 +1329,49 @@ __init void arch_init_sched_domains(void
 	}
 }
 #endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT */
+
+static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+
+void __init arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+
+		*cpu_sd = SD_CPU_INIT;
+		cpu_sd->span = cpu_possible_map;
+		cpu_sd->cache_hot_time = cacheflush_time / 2;
+		cpu_sd->groups = &sched_group_cpus[i];
+	}
+
+	/* Set up CPU groups */
+	for_each_cpu_mask(i, cpu_possible_map) {
+		struct sched_group *cpu = &sched_group_cpus[i];
+
+		cpus_clear(cpu->cpumask);
+		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+	mb(); /* domains were modified outside the lock */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_sd, i);
+	}
+}
+#endif
 
 /* These are wrappers to interface to the new boot process.  Someone who
    understands all this stuff should rewrite it properly.
    --RR 15/Jul/02 */

diff -puN include/asm-i386/processor.h~sched-ingo-rollup include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~sched-ingo-rollup	2004-03-29 23:33:34.509298296 -0800
+++ 25-akpm/include/asm-i386/processor.h	2004-03-29 23:33:34.516297232 -0800
@@ -646,9 +646,11 @@ extern inline void prefetchw(const void
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_DOMAIN
-#define ARCH_HAS_SCHED_WAKE_IDLE
+#ifdef CONFIG_SMP
+# define ARCH_HAS_SCHED_DOMAIN
+# ifdef CONFIG_SCHED_SMT
+#  define ARCH_HAS_SCHED_WAKE_IDLE
+# endif
 #endif
 
 #endif /* __ASM_I386_PROCESSOR_H */

diff -puN include/linux/sched.h~sched-ingo-rollup include/linux/sched.h
--- 25/include/linux/sched.h~sched-ingo-rollup	2004-03-29 23:33:34.511297992 -0800
+++ 25-akpm/include/linux/sched.h	2004-03-29 23:33:34.517297080 -0800
@@ -584,9 +584,9 @@ struct sched_domain {
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 15,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
-				| SD_SHARE_CPUPOWER,	\
+				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
+				| SD_SHARE_CPUPOWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
@@ -602,7 +602,7 @@ struct sched_domain {
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (5*1000000/2),	\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
 				| SD_WAKE_AFFINE,	\
@@ -643,7 +643,7 @@ static inline int set_cpus_allowed(task_
 
 extern unsigned long long sched_clock(void);
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
 extern void sched_balance_exec(void);
 #else
 #define sched_balance_exec()   {}

diff -puN kernel/sched.c~sched-ingo-rollup kernel/sched.c
--- 25/kernel/sched.c~sched-ingo-rollup	2004-03-29 23:33:34.512297840 -0800
+++ 25-akpm/kernel/sched.c	2004-03-29 23:33:34.523296168 -0800
@@ -180,11 +180,14 @@
 	((MAX_TIMESLICE - MIN_TIMESLICE) * \
 		(MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
 
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
 	return BASE_TIMESLICE(p);
 }
 
+#define task_hot(p, now, sd) \
+	(!TASK_INTERACTIVE(p) && ((now)-(p)->timestamp < (sd)->cache_hot_time))
+
 /*
  * These are the runqueue data structures:
  */
@@ -209,14 +212,7 @@ struct prio_array {
 
 struct runqueue {
 	spinlock_t lock;
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
-	unsigned long cpu_load;
-#endif
 	unsigned long long nr_switches;
 	unsigned long expired_timestamp, nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
@@ -262,7 +258,7 @@ static DEFINE_PER_CPU(struct runqueue, r
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
 
@@ -285,7 +281,7 @@ static inline void task_rq_unlock(runque
 
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static inline runqueue_t *this_rq_lock(void)
+static runqueue_t *this_rq_lock(void)
 {
 	runqueue_t *rq;
 
@@ -304,7 +300,7 @@ static inline void rq_unlock(runqueue_t
 /*
  * Adding/removing a task to/from a priority array:
  */
-static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
 	array->nr_active--;
 	list_del(&p->run_list);
@@ -312,7 +308,7 @@ static inline void dequeue_task(struct t
 	__clear_bit(p->prio, array->bitmap);
 }
 
-static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
@@ -320,6 +316,21 @@ static inline void enqueue_task(struct t
 	p->array = array;
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Used by the migration code - we pull tasks from the head of the
+ * remote queue so we want these tasks to show up at the head of the
+ * local queue:
+ */
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+#endif
+
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -440,7 +451,7 @@ static void recalc_task_prio(task_t *p,
  * Update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
-static inline void activate_task(task_t *p, runqueue_t *rq)
+static void activate_task(task_t *p, runqueue_t *rq)
 {
 	unsigned long long now = sched_clock();
 
@@ -476,7 +487,7 @@ static inline void activate_task(task_t
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
 	if (p->state == TASK_UNINTERRUPTIBLE)
@@ -493,7 +504,7 @@ static inline void deactivate_task(struc
  * the target CPU.
 */
 #ifdef CONFIG_SMP
-static inline void resched_task(task_t *p)
+static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
@@ -621,20 +632,9 @@ EXPORT_SYMBOL_GPL(kick_process);
 
 /*
 * Return a low guess at the load of cpu.
 */
-static inline unsigned long get_low_cpu_load(int cpu)
-{
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return min(rq->cpu_load, load_now);
-}
-
-static inline unsigned long get_high_cpu_load(int cpu)
+static inline unsigned long cpu_load(int cpu)
 {
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return max(rq->cpu_load, load_now);
+	return cpu_rq(cpu)->nr_running * SCHED_LOAD_SCALE;
 }
 #endif
 
@@ -695,15 +695,14 @@ static inline int wake_idle(int cpu, tas
  */
 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 {
+	int cpu, this_cpu, success = 0;
 	unsigned long flags;
-	int success = 0;
 	long old_state;
 	runqueue_t *rq;
-	int cpu, this_cpu;
 #ifdef CONFIG_SMP
-	unsigned long long now;
 	unsigned long load, this_load;
 	struct sched_domain *sd;
+	unsigned long long now;
 	int new_cpu;
 #endif
 
@@ -722,65 +721,75 @@ static int try_to_wake_up(task_t * p, un
 	if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
 		goto out_activate;
 
-	new_cpu = cpu;
+	new_cpu = this_cpu;
 
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (cpu == this_cpu)
 		goto out_set_cpu;
 
-	load = get_low_cpu_load(cpu);
-	this_load = get_high_cpu_load(this_cpu);
+	/*
+	 * Passive balance, if the load on the remote CPU is over
+	 * the limit:
+	 */
+	load = cpu_load(cpu) * 100;
+	/*
+	 * add the new task's effect to its new CPU. If sync wakeup then
+	 * subtract current's load effect: this means that they cancel out
+	 * each other in the sync case, then we have +1 load in the !sync case:
+	 */
+	this_load = cpu_load(this_cpu);
+	if (!sync)
+		this_load += SCHED_LOAD_SCALE;
+	this_load *= rq->sd->imbalance_pct;
 
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+	if (load > this_load)
 		goto out_set_cpu;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
-
 	/*
-	 * Passive load balancing. If the queues are very out of balance
-	 * we might as well balance here rather than the periodic load
-	 * balancing.
+	 * Migrate if the source CPU is not idle or the target
+	 * CPU is idle; if the two CPUs share a domain; and if the task
+	 * is not cache-hot.
+	 *
+	 * (Note that these kinds of migrations violate the equilibrium,
+	 * and might trigger follow-on load-balancing - hence we pick
+	 * cache-cold tasks only.)
 	 */
-	if (load > this_load + SCHED_LOAD_SCALE*2)
-		goto out_set_cpu;
+	if (!cpu_load(cpu) && cpu_load(this_cpu))
+		goto out_activate;
 
 	now = sched_clock();
-
-	/*
-	 * Migrate the task to the waking domain.
-	 * Do not violate hard affinity.
-	 */
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_WAKE_AFFINE))
 			break;
-		if (rq->timestamp_last_tick - p->timestamp < sd->cache_hot_time)
+		if (task_hot(p, now, sd))
 			break;
-
+		/*
+		 * The two CPUs share a span of a domain that has affine
+		 * wakeups enabled - the task can be migrated:
+		 */
 		if (cpu_isset(cpu, sd->span))
 			goto out_set_cpu;
 	}
+	/* No luck - fall back to the original CPU: */
+	new_cpu = cpu;
-
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
 		set_task_cpu(p, new_cpu);
-		goto repeat_lock_task;
-	}
-	goto out_activate;
+		task_rq_unlock(rq, &flags);
 
-repeat_lock_task:
-	task_rq_unlock(rq, &flags);
-	rq = task_rq_lock(p, &flags);
-	old_state = p->state;
-	if (!(old_state & state))
-		goto out;
+		/* might preempt at this point */
 
-	if (p->array)
-		goto out_running;
+		rq = task_rq_lock(p, &flags);
+		old_state = p->state;
+		if (!(old_state & state))
+			goto out;
+		if (p->array)
+			goto out_running;
 
-	this_cpu = smp_processor_id();
-	cpu = task_cpu(p);
+		this_cpu = smp_processor_id();
+		cpu = task_cpu(p);
+	}
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -972,7 +981,7 @@ void fastcall sched_exit(task_t * p)
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
-static inline void finish_task_switch(task_t *prev)
+static void finish_task_switch(task_t *prev)
 {
 	runqueue_t *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
@@ -1093,7 +1102,7 @@ unsigned long nr_iowait(void)
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
-static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 {
 	if (rq1 == rq2)
 		spin_lock(&rq1->lock);
@@ -1114,7 +1123,7 @@ static inline void double_rq_lock(runque
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
-static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 {
 	spin_unlock(&rq1->lock);
 	if (rq1 != rq2)
@@ -1129,7 +1138,6 @@ enum idle_type
 };
 
 #ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
 /*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
@@ -1175,19 +1183,19 @@ out:
 */
 static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
 {
+	int i = 0, min_load, this_cpu, best_cpu;
 	cpumask_t tmp;
-	int i, min_load, this_cpu, best_cpu;
 
 	best_cpu = this_cpu = task_cpu(p);
-	min_load = INT_MAX;
+
+	/* subtract the currently running task's load effect: */
+	min_load = cpu_load(this_cpu) - SCHED_LOAD_SCALE;
 
 	cpus_and(tmp, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, tmp);
+
 	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-		if (i == this_cpu)
-			load = get_low_cpu_load(i);
-		else
-			load = get_high_cpu_load(i) + SCHED_LOAD_SCALE;
+		unsigned long load = cpu_load(i);
 
 		if (min_load > load) {
 			best_cpu = i;
@@ -1199,25 +1207,27 @@ static int sched_best_cpu(struct task_st
 
 /*
 * sched_balance_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
+ * domain and try to migrate the current task to the least loaded CPU.
 *
 * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * the task has the smallest effective cache footprint - a completely new
+ * process image is being created, so almost all of the currently existing
+ * cache footprint is irrelevant. So we attempt to balance this task as
+ * broadly as possible, without considering migration costs, which costs
+ * otherwise affect all other types of task migrations.
 */
 void sched_balance_exec(void)
 {
 	struct sched_domain *sd, *best_sd = NULL;
-	int new_cpu;
-	int this_cpu = get_cpu();
+	int new_cpu, this_cpu = get_cpu();
 
-	/* Prefer the current CPU if there's only this task running */
+	/* Prefer the current CPU if there's only this task running: */
 	if (this_rq()->nr_running <= 1)
 		goto out;
 
-	for_each_domain(this_cpu, sd) {
+	for_each_domain(this_cpu, sd)
 		if (sd->flags & SD_BALANCE_EXEC)
 			best_sd = sd;
-	}
 
 	if (best_sd) {
 		new_cpu = sched_best_cpu(current, best_sd);
@@ -1230,12 +1240,11 @@ void sched_balance_exec(void)
 out:
 	put_cpu();
 }
-#endif /* CONFIG_NUMA */
 
 /*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
-static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 {
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
@@ -1253,13 +1262,13 @@
 */
 static inline
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
-		runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
 	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
-	enqueue_task(p, this_array);
+	enqueue_task_head(p, this_array);
 	p->timestamp = sched_clock() -
 				(src_rq->timestamp_last_tick - p->timestamp);
 
 	/*
@@ -1275,7 +1284,7 @@ void pull_task(runqueue_t *src_rq, prio_
 */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		struct sched_domain *sd, enum idle_type idle)
+		     struct sched_domain *sd, enum idle_type idle)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1291,7 +1300,7 @@ int can_migrate_task(task_t *p, runqueue
 
 	/* Aggressive migration if we've failed balancing */
 	if (idle == NEWLY_IDLE || sd->nr_balance_failed < sd->cache_nice_tries) {
-		if (rq->timestamp_last_tick - p->timestamp < sd->cache_hot_time)
+		if (task_hot(p, rq->timestamp_last_tick, sd))
			return 0;
 	}
 
@@ -1306,30 +1315,24 @@ int can_migrate_task(task_t *p, runqueue
 * Called with both runqueues locked.
 */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-			unsigned long max_nr_move, struct sched_domain *sd,
-			enum idle_type idle)
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
 {
-	int idx;
-	int pulled = 0;
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
+	int ret, idx, pulled = 0;
 	task_t *tmp;
 
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
 		goto out;
 
-	/*
-	 * We first consider expired tasks. Those will likely not be
-	 * executed in the near future, and they are most likely to
-	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them.
-	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
+	/* We first consider active tasks. */
+	if (busiest->active->nr_active) {
 		array = busiest->active;
 		dst_array = this_rq->active;
+	} else {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
 	}
 
 new_array:
@@ -1341,22 +1344,27 @@ skip_bitmap:
 	else
 		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
 	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
-			array = busiest->active;
-			dst_array = this_rq->active;
+		if (array == busiest->active && busiest->expired->nr_active) {
+			array = busiest->expired;
+			dst_array = this_rq->expired;
 			goto new_array;
 		}
 		goto out;
 	}
 
 	head = array->queue + idx;
-	curr = head->prev;
+	curr = head->next;
skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
-	curr = curr->prev;
+	curr = curr->next;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+	ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle);
+	if (ret == -1) {
+		idx++;
+		goto skip_bitmap;
+	}
+	if (!ret) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1383,42 +1391,25 @@ out:
 */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		unsigned long *imbalance, enum idle_type idle)
+		   unsigned long *nr_move, enum idle_type idle)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	unsigned long max_load, avg_load, total_load,
+			this_load, total_pwr, delta;
 
 	max_load = this_load = total_load = total_pwr = 0;
 
 	do {
 		cpumask_t tmp;
-		unsigned long load;
-		int local_group;
-		int i, nr_cpus = 0;
-
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		int i;
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
 		cpus_and(tmp, group->cpumask, cpu_online_map);
-		if (unlikely(cpus_empty(tmp))) {
-			WARN_ON(1);
-			return NULL;
-		}
-
-		for_each_cpu_mask(i, tmp) {
-			/* Bias balancing toward cpus of our domain */
-			if (local_group)
-				load = get_high_cpu_load(i);
-			else
-				load = get_low_cpu_load(i);
-
-			nr_cpus++;
-			avg_load += load;
-		}
+		WARN_ON(cpus_empty(tmp));
 
-		if (!nr_cpus)
-			goto nextgroup;
+		avg_load = 0;
+		for_each_cpu_mask(i, tmp)
+			avg_load += cpu_load(i);
 
 		total_load += avg_load;
 		total_pwr += group->cpu_power;
@@ -1426,11 +1417,12 @@ find_busiest_group(struct sched_domain *
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
-		if (local_group) {
+		if (cpu_isset(this_cpu, group->cpumask)) {
 			this_load = avg_load;
 			this = group;
 			goto nextgroup;
-		} else if (avg_load > max_load) {
+		}
+		if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
 		}
@@ -1443,8 +1435,8 @@ nextgroup:
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
+	if ((this_load >= avg_load)
+		|| (100*max_load <= sd->imbalance_pct*this_load))
 		goto out_balanced;
 
 	/*
@@ -1458,67 +1450,32 @@ nextgroup:
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
-	*imbalance = min(max_load - avg_load, avg_load - this_load);
-
-	/* How much load to actually move to equalise the imbalance */
-	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-				/ SCHED_LOAD_SCALE;
-
-	if (*imbalance < SCHED_LOAD_SCALE - 1) {
-		unsigned long pwr_now = 0, pwr_move = 0;
-		unsigned long tmp;
-
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
-			return busiest;
-		}
+	delta = max_load - this_load;
+	if (delta > SCHED_LOAD_SCALE) {
+		delta = min(max_load - avg_load, avg_load - this_load);
 
 		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
+		 * How many tasks to actually move to equalise the
+		 * imbalance: first round up (which will move us across
+		 * the average unless we can precisely balance to the
+		 * average) and get rid of the scaling factor:
 		 */
+		delta += SCHED_LOAD_SCALE-1;
+		*nr_move = delta / SCHED_LOAD_SCALE;
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
-		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-							max_load - tmp);
-
-		/* Amount of load we'd add */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		if (max_load < tmp)
-			tmp = max_load;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
-
-		/* Move if we gain another 8th of a CPU worth of throughput */
-		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
-			goto out_balanced;
-
-		*imbalance = 1;
-		return busiest;
+		if (*nr_move)
+			return busiest;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
-
-	return busiest;
-
 out_balanced:
-	if (busiest && idle != NOT_IDLE && max_load > SCHED_LOAD_SCALE) {
-		*imbalance = 1;
+	if (busiest && idle == NEWLY_IDLE && max_load > SCHED_LOAD_SCALE) {
+		*nr_move = 1;
 		return busiest;
 	}
 
-	*imbalance = 0;
+	*nr_move = 0;
 	return NULL;
 }
-
 /*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
@@ -1531,9 +1488,9 @@ static runqueue_t *find_busiest_queue(st
 	cpus_and(tmp, group->cpumask, cpu_online_map);
 	for_each_cpu_mask(i, tmp) {
-		load = get_low_cpu_load(i);
+		load = cpu_load(i);
 
-		if (load >= max_load) {
+		if (load > max_load) {
 			max_load = load;
 			busiest = cpu_rq(i);
 		}
@@ -1552,8 +1509,8 @@ static int load_balance(int this_cpu, ru
 		struct sched_domain *sd, enum idle_type idle)
 {
 	struct sched_group *group;
-	runqueue_t *busiest;
 	unsigned long imbalance;
+	runqueue_t *busiest;
 	int nr_moved;
 
 	spin_lock(&this_rq->lock);
@@ -1579,7 +1536,6 @@ static int load_balance(int this_cpu, ru
 
 	if (!nr_moved) {
 		sd->nr_balance_failed++;
-
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 			int wake = 0;
 
@@ -1592,12 +1548,11 @@ static int load_balance(int this_cpu, ru
 			spin_unlock(&busiest->lock);
 			if (wake)
 				wake_up_process(busiest->migration_thread);
-
 			/*
-			 * We've kicked active balancing, reset the failure
-			 * counter.
+			 * We've kicked active balancing, reset the
+			 * failure counter:
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = 0;
 		}
 	} else
 		sd->nr_balance_failed = 0;
 
@@ -1683,18 +1638,18 @@ static inline void idle_balance(int this
 */
 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
 {
-	struct sched_domain *sd;
 	struct sched_group *group, *busy_group;
+	struct sched_domain *sd;
 	int i;
 
 	if (busiest->nr_running <= 1)
 		return;
 
-	for_each_domain(busiest_cpu, sd) {
+	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	}
-
-	if (!sd) {
+
+	if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
 		WARN_ON(1);
 		return;
 	}
@@ -1708,13 +1663,13 @@ static void active_load_balance(runqueue
 	do {
 		cpumask_t tmp;
 		runqueue_t *rq;
-		int push_cpu = 0;
+		int push_cpu = 0;
 
 		if (group == busy_group)
 			goto next_group;
 
 		cpus_and(tmp, group->cpumask, cpu_online_map);
-		if (cpus_weight(tmp) == 0)
+		if (!cpus_weight(tmp))
 			goto next_group;
 
 		for_each_cpu_mask(i, tmp) {
@@ -1745,20 +1700,14 @@ next_group:
 #define CPU_OFFSET(cpu)		(HZ * cpu / NR_CPUS)
 
 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
-				enum idle_type idle)
+			   enum idle_type idle)
 {
-	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	this_rq->cpu_load = (old_load + this_load) / 2;
-
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval = sd->balance_interval;
 
@@ -1767,7 +1716,7 @@ static void rebalance_tick(int this_cpu,
 		/* scale ms to jiffies */
 		interval = MSEC_TO_JIFFIES(interval);
-		if (unlikely(interval == 0))
+		if (unlikely(!interval))
 			interval = 1;
 
 		if (j - sd->last_balance >= interval) {
@@ -3479,10 +3428,10 @@ static void __init arch_init_sched_domai
 
 	/* Set up groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t tmp = node_to_cpumask(i);
-		cpumask_t nodemask;
 		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t tmp = node_to_cpumask(i);
+		cpumask_t nodemask;
 		int j;
 
 		cpus_and(nodemask, tmp, cpu_possible_map);
@@ -3606,12 +3555,12 @@ void sched_domain_debug(void)
 			printk(" ");
 		printk("groups:");
 		do {
-			if (group == NULL) {
+			if (!group) {
 				printk(" ERROR: NULL");
 				break;
 			}
 
-			if (cpus_weight(group->cpumask) == 0)
+			if (!cpus_weight(group->cpumask))
 				printk(" ERROR empty group:");
 
 			cpus_and(tmp, groupmask, group->cpumask);
@@ -3691,7 +3640,6 @@ void __init sched_init(void)
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_init;
-		rq->cpu_load = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
_
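
[Editorial note] A few standalone sketches of the logic the patch introduces follow; they are user-space
illustrations, not kernel code, and the struct fields and example values in them are illustrative stand-ins.

First, the cache-hot test encoded by the new task_hot() macro: a task is treated as migration-unfriendly
only if it is not interactive and its last timestamp lies within the domain's cache_hot_time. Minimal
sketch, with TASK_INTERACTIVE reduced to a plain flag:

#include <stdio.h>

/* illustrative stand-ins for the kernel structures task_hot() operates on */
struct task { unsigned long long timestamp; int interactive; };
struct domain { unsigned long long cache_hot_time; };

/* mirrors: !TASK_INTERACTIVE(p) && ((now) - (p)->timestamp < (sd)->cache_hot_time) */
static int task_hot(const struct task *p, unsigned long long now,
		    const struct domain *sd)
{
	return !p->interactive && (now - p->timestamp < sd->cache_hot_time);
}

int main(void)
{
	struct domain sd = { .cache_hot_time = 2500000 };	/* 2.5ms, as in SD_CPU_INIT */
	struct task p = { .timestamp = 1000000, .interactive = 0 };

	/* ran 1ms ago -> still cache-hot, would not be migrated */
	printf("hot at t=2ms:  %d\n", task_hot(&p, 2000000, &sd));
	/* ran 9ms ago -> cache-cold, fair game for migration */
	printf("hot at t=10ms: %d\n", task_hot(&p, 10000000, &sd));
	return 0;
}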
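
Second, the reworked wakeup-affinity check in try_to_wake_up(): the remote CPU's load (scaled by 100) is
compared against the waking CPU's load plus the woken task's own weight (skipped for sync wakeups), scaled
by the domain's imbalance_pct. A sketch of just that comparison, assuming the usual SCHED_LOAD_SCALE of 128
and with cpu_load() reduced to plain parameters:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed scale factor */

/*
 * Returns 1 if the woken task may be pulled to the waking CPU.  'load' and
 * 'this_load' stand in for cpu_load(cpu) and cpu_load(this_cpu);
 * imbalance_pct plays the role of rq->sd->imbalance_pct.
 */
static int wake_affine(unsigned long load, unsigned long this_load,
		       unsigned int imbalance_pct, int sync)
{
	load *= 100;
	if (!sync)
		this_load += SCHED_LOAD_SCALE;	/* account for the woken task itself */
	this_load *= imbalance_pct;
	return load > this_load;
}

int main(void)
{
	/* remote CPU runs 3 tasks, this CPU runs 1, non-sync wakeup, pct = 125 */
	printf("pull: %d\n", wake_affine(3 * SCHED_LOAD_SCALE,
					 1 * SCHED_LOAD_SCALE, 125, 0));
	/* balanced case: both CPUs run 1 task -> leave the task where it is */
	printf("pull: %d\n", wake_affine(1 * SCHED_LOAD_SCALE,
					 1 * SCHED_LOAD_SCALE, 125, 0));
	return 0;
}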
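
Third, find_busiest_group() now reports a task count (*nr_move) rather than a raw load value: the load
delta is rounded up across SCHED_LOAD_SCALE so that moving that many tasks crosses the average unless an
exact balance is possible. A sketch of the arithmetic, with all loads already scaled by SCHED_LOAD_SCALE:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed scale factor */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* how many tasks to pull from the busiest group toward this CPU's group */
static unsigned long calc_nr_move(unsigned long max_load,
				  unsigned long avg_load,
				  unsigned long this_load)
{
	unsigned long delta = max_load - this_load;

	if (delta <= SCHED_LOAD_SCALE)
		return 0;		/* within one task's worth of load: balanced */

	/* do not overshoot either side of the average ... */
	delta = min_ul(max_load - avg_load, avg_load - this_load);
	/* ... then round up and drop the scaling factor */
	delta += SCHED_LOAD_SCALE - 1;
	return delta / SCHED_LOAD_SCALE;
}

int main(void)
{
	/* busiest CPU runs 4 tasks, we run 1, the average is 2.5 tasks */
	printf("nr_move = %lu\n",
	       calc_nr_move(4 * SCHED_LOAD_SCALE, 5 * SCHED_LOAD_SCALE / 2,
			    1 * SCHED_LOAD_SCALE));
	return 0;
}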
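
Finally, the new !CONFIG_SCHED_SMT arch_init_sched_domains() links the per-CPU sched_group structures into
a circular singly linked list, and each CPU's domain then uses its own group as the starting point of the
ring. A reduced sketch of that ring construction (struct fields and NR_CPUS value are illustrative):

#include <stdio.h>

#define NR_CPUS 4	/* illustrative CPU count */

/* reduced stand-in for struct sched_group */
struct group {
	int cpu;
	struct group *next;
};

static struct group groups[NR_CPUS];

int main(void)
{
	struct group *first = NULL, *last = NULL, *g;
	int i;

	/* link every CPU's group into one ring, as the patch does */
	for (i = 0; i < NR_CPUS; i++) {
		groups[i].cpu = i;
		if (!first)
			first = &groups[i];
		if (last)
			last->next = &groups[i];
		last = &groups[i];
	}
	last->next = first;	/* close the ring */

	/* walk the ring once, starting from CPU 2's own group */
	g = &groups[2];
	do {
		printf("cpu %d -> ", g->cpu);
		g = g->next;
	} while (g != &groups[2]);
	printf("back to cpu 2\n");
	return 0;
}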