From 67bb6c036d1fc3d332c8527a36a546e3e72e822c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:35 +0530
Subject: sched: Simple helper functions for find_busiest_group()

Impact: cleanup

Currently the load idx calculation code is in find_busiest_group().
Move that to a static inline helper function.

Similary, to find the first cpu of a sched_group we use
cpumask_first(sched_group_cpus(group))

Use a helper to that. It improves readability in some cases.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091335.13992.55424.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 7b389c7..6aec1e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3189,6 +3189,43 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+/********** Helpers for find_busiest_group ************************/
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+	return cpumask_first(sched_group_cpus(group));
+}
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+					enum cpu_idle_type idle)
+{
+	int load_idx;
+
+	switch (idle) {
+	case CPU_NOT_IDLE:
+		load_idx = sd->busy_idx;
+		break;
+
+	case CPU_NEWLY_IDLE:
+		load_idx = sd->newidle_idx;
+		break;
+	default:
+		load_idx = sd->idle_idx;
+		break;
+	}
+
+	return load_idx;
+}
+/******* find_busiest_group() helpers end here *********************/
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
@@ -3217,12 +3254,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
 
-	if (idle == CPU_NOT_IDLE)
-		load_idx = sd->busy_idx;
-	else if (idle == CPU_NEWLY_IDLE)
-		load_idx = sd->newidle_idx;
-	else
-		load_idx = sd->idle_idx;
+	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
 		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
@@ -3238,7 +3270,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 					       sched_group_cpus(group));
 
 		if (local_group)
-			balance_cpu = cpumask_first(sched_group_cpus(group));
+			balance_cpu = group_first_cpu(group);
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3359,8 +3391,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 */
 		if ((sum_nr_running < min_nr_running) ||
 		    (sum_nr_running == min_nr_running &&
-		     cpumask_first(sched_group_cpus(group)) >
-		     cpumask_first(sched_group_cpus(group_min)))) {
+		     group_first_cpu(group) > group_first_cpu(group_min))) {
 			group_min = group;
 			min_nr_running = sum_nr_running;
 			min_load_per_task = sum_weighted_load /
@@ -3375,8 +3406,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sum_nr_running <= group_capacity - 1) {
 			if (sum_nr_running > leader_nr_running ||
 			    (sum_nr_running == leader_nr_running &&
-			     cpumask_first(sched_group_cpus(group)) <
-			     cpumask_first(sched_group_cpus(group_leader)))) {
+			     group_first_cpu(group) <
+			     group_first_cpu(group_leader))) {
 				group_leader = group;
 				leader_nr_running = sum_nr_running;
 			}
@@ -3504,7 +3535,7 @@ out_balanced:
 		*imbalance = min_load_per_task;
 		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				cpumask_first(sched_group_cpus(group_leader));
+				group_first_cpu(group_leader);
 		}
 		return group_min;
 	}
-- 
cgit v0.10.2


From 6dfdb0629019f307ab18864b1fd3e5dbb02f383c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:40 +0530
Subject: sched: Fix indentations in find_busiest_group() using gotos

Impact: cleanup

Some indentations in find_busiest_group() can minimized by using
early exits with the help of gotos. This improves readability in
a couple of cases.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091340.13992.45062.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 6aec1e7..f87adbe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3403,14 +3403,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * capacity but still has some space to pick up some load
 		 * from other group and save more power
 		 */
-		if (sum_nr_running <= group_capacity - 1) {
-			if (sum_nr_running > leader_nr_running ||
-			    (sum_nr_running == leader_nr_running &&
-			     group_first_cpu(group) <
-			     group_first_cpu(group_leader))) {
-				group_leader = group;
-				leader_nr_running = sum_nr_running;
-			}
+		if (sum_nr_running > group_capacity - 1)
+			goto group_next;
+
+		if (sum_nr_running > leader_nr_running ||
+		    (sum_nr_running == leader_nr_running &&
+		     group_first_cpu(group) < group_first_cpu(group_leader))) {
+			group_leader = group;
+			leader_nr_running = sum_nr_running;
 		}
 group_next:
 #endif
@@ -3531,14 +3531,16 @@ out_balanced:
 	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 		goto ret;
 
-	if (this == group_leader && group_leader != group_min) {
-		*imbalance = min_load_per_task;
-		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				group_first_cpu(group_leader);
-		}
-		return group_min;
+	if (this != group_leader || group_leader == group_min)
+		goto ret;
+
+	*imbalance = min_load_per_task;
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(group_leader);
 	}
+	return group_min;
+
 #endif
 ret:
 	*imbalance = 0;
-- 
cgit v0.10.2


From 381be78fdc829a22f6327a0ed09f54b6270a976d Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:46 +0530
Subject: sched: Define structure to store the sched_group statistics for fbg()

Impact: cleanup

Currently a whole bunch of variables are used to store the
various statistics pertaining to the groups we iterate over
in find_busiest_group().

Group them together in a single data structure and add
appropriate comments.

This will be useful later on when we create helper functions
to calculate the sched_group statistics.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091345.13992.20099.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index f87adbe..109db12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3192,6 +3192,18 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /********** Helpers for find_busiest_group ************************/
 
 /**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+	unsigned long avg_load; /*Avg load across the CPUs of the group */
+	unsigned long group_load; /* Total load over the CPUs of the group */
+	unsigned long sum_nr_running; /* Nr tasks running in the group */
+	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+	unsigned long group_capacity;
+	int group_imb; /* Is there an imbalance in the group ? */
+};
+
+/**
  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
  * @group: The group whose first cpu is to be returned.
  */
@@ -3257,23 +3269,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
-		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
+		struct sg_lb_stats sgs;
+		unsigned long load, max_cpu_load, min_cpu_load;
 		int local_group;
 		int i;
-		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_nr_running, sum_weighted_load;
 		unsigned long sum_avg_load_per_task;
 		unsigned long avg_load_per_task;
 
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
+		memset(&sgs, 0, sizeof(sgs));
 
 		if (local_group)
 			balance_cpu = group_first_cpu(group);
 
 		/* Tally up the load of all CPUs in the group */
-		sum_weighted_load = sum_nr_running = avg_load = 0;
 		sum_avg_load_per_task = avg_load_per_task = 0;
 
 		max_cpu_load = 0;
@@ -3301,9 +3312,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 					min_cpu_load = load;
 			}
 
-			avg_load += load;
-			sum_nr_running += rq->nr_running;
-			sum_weighted_load += weighted_cpuload(i);
+			sgs.group_load += load;
+			sgs.sum_nr_running += rq->nr_running;
+			sgs.sum_weighted_load += weighted_cpuload(i);
 
 			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
@@ -3320,12 +3331,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			goto ret;
 		}
 
-		total_load += avg_load;
+		total_load += sgs.group_load;
 		total_pwr += group->__cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = sg_div_cpu_power(group,
-				avg_load * SCHED_LOAD_SCALE);
+		sgs.avg_load = sg_div_cpu_power(group,
+				sgs.group_load * SCHED_LOAD_SCALE);
 
 
 		/*
@@ -3341,22 +3352,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				sum_avg_load_per_task * SCHED_LOAD_SCALE);
 
 		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			__group_imb = 1;
+			sgs.group_imb = 1;
 
-		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+		sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
-			this_load = avg_load;
+			this_load = sgs.avg_load;
 			this = group;
-			this_nr_running = sum_nr_running;
-			this_load_per_task = sum_weighted_load;
-		} else if (avg_load > max_load &&
-			   (sum_nr_running > group_capacity || __group_imb)) {
-			max_load = avg_load;
+			this_nr_running = sgs.sum_nr_running;
+			this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > max_load &&
+			   (sgs.sum_nr_running > sgs.group_capacity ||
+				sgs.group_imb)) {
+			max_load = sgs.avg_load;
 			busiest = group;
-			busiest_nr_running = sum_nr_running;
-			busiest_load_per_task = sum_weighted_load;
-			group_imb = __group_imb;
+			busiest_nr_running = sgs.sum_nr_running;
+			busiest_load_per_task = sgs.sum_weighted_load;
+			group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3372,7 +3384,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If the local group is idle or completely loaded
 		 * no need to do power savings balance at this domain
 		 */
-		if (local_group && (this_nr_running >= group_capacity ||
+		if (local_group && (this_nr_running >= sgs.group_capacity ||
 				    !this_nr_running))
 			power_savings_balance = 0;
 
@@ -3380,8 +3392,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!power_savings_balance || sum_nr_running >= group_capacity
-		    || !sum_nr_running)
+		if (!power_savings_balance ||
+			sgs.sum_nr_running >= sgs.group_capacity ||
+			!sgs.sum_nr_running)
 			goto group_next;
 
 		/*
@@ -3389,13 +3402,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sum_nr_running < min_nr_running) ||
-		    (sum_nr_running == min_nr_running &&
+		if ((sgs.sum_nr_running < min_nr_running) ||
+		    (sgs.sum_nr_running == min_nr_running &&
 		     group_first_cpu(group) > group_first_cpu(group_min))) {
 			group_min = group;
-			min_nr_running = sum_nr_running;
-			min_load_per_task = sum_weighted_load /
-						sum_nr_running;
+			min_nr_running = sgs.sum_nr_running;
+			min_load_per_task = sgs.sum_weighted_load /
+						sgs.sum_nr_running;
 		}
 
 		/*
@@ -3403,14 +3416,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * capacity but still has some space to pick up some load
 		 * from other group and save more power
 		 */
-		if (sum_nr_running > group_capacity - 1)
+		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sum_nr_running > leader_nr_running ||
-		    (sum_nr_running == leader_nr_running &&
+		if (sgs.sum_nr_running > leader_nr_running ||
+		    (sgs.sum_nr_running == leader_nr_running &&
 		     group_first_cpu(group) < group_first_cpu(group_leader))) {
 			group_leader = group;
-			leader_nr_running = sum_nr_running;
+			leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
-- 
cgit v0.10.2


From 1f8c553d0f11d85f7993fe21015695d266771c00 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:51 +0530
Subject: sched: Create a helper function to calculate sched_group stats for
 fbg()

Impact: cleanup

Create a helper function named update_sg_lb_stats() which
can be invoked to calculate the individual group's statistics
in find_busiest_group().

This reduces the lenght of find_busiest_group() considerably.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Aked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091351.13992.43461.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 109db12..1893d55 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 
 	return load_idx;
 }
+
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			int local_group, const struct cpumask *cpus,
+			int *balance, struct sg_lb_stats *sgs)
+{
+	unsigned long load, max_cpu_load, min_cpu_load;
+	int i;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned long sum_avg_load_per_task;
+	unsigned long avg_load_per_task;
+
+	if (local_group)
+		balance_cpu = group_first_cpu(group);
+
+	/* Tally up the load of all CPUs in the group */
+	sum_avg_load_per_task = avg_load_per_task = 0;
+	max_cpu_load = 0;
+	min_cpu_load = ~0UL;
+
+	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+		struct rq *rq = cpu_rq(i);
+
+		if (*sd_idle && rq->nr_running)
+			*sd_idle = 0;
+
+		/* Bias balancing toward cpus of our domain */
+		if (local_group) {
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
+				balance_cpu = i;
+			}
+
+			load = target_load(i, load_idx);
+		} else {
+			load = source_load(i, load_idx);
+			if (load > max_cpu_load)
+				max_cpu_load = load;
+			if (min_cpu_load > load)
+				min_cpu_load = load;
+		}
+
+		sgs->group_load += load;
+		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_weighted_load += weighted_cpuload(i);
+
+		sum_avg_load_per_task += cpu_avg_load_per_task(i);
+	}
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above
+	 * domains. In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (idle != CPU_NEWLY_IDLE && local_group &&
+	    balance_cpu != this_cpu && balance) {
+		*balance = 0;
+		return;
+	}
+
+	/* Adjust by relative CPU power of the group */
+	sgs->avg_load = sg_div_cpu_power(group,
+			sgs->group_load * SCHED_LOAD_SCALE);
+
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	avg_load_per_task = sg_div_cpu_power(group,
+			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+		sgs->group_imb = 1;
+
+	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 	do {
 		struct sg_lb_stats sgs;
-		unsigned long load, max_cpu_load, min_cpu_load;
 		int local_group;
-		int i;
-		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_avg_load_per_task;
-		unsigned long avg_load_per_task;
 
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
 		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+				local_group, cpus, balance, &sgs);
 
-		if (local_group)
-			balance_cpu = group_first_cpu(group);
-
-		/* Tally up the load of all CPUs in the group */
-		sum_avg_load_per_task = avg_load_per_task = 0;
-
-		max_cpu_load = 0;
-		min_cpu_load = ~0UL;
-
-		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-			struct rq *rq = cpu_rq(i);
-
-			if (*sd_idle && rq->nr_running)
-				*sd_idle = 0;
-
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				if (idle_cpu(i) && !first_idle_cpu) {
-					first_idle_cpu = 1;
-					balance_cpu = i;
-				}
-
-				load = target_load(i, load_idx);
-			} else {
-				load = source_load(i, load_idx);
-				if (load > max_cpu_load)
-					max_cpu_load = load;
-				if (min_cpu_load > load)
-					min_cpu_load = load;
-			}
-
-			sgs.group_load += load;
-			sgs.sum_nr_running += rq->nr_running;
-			sgs.sum_weighted_load += weighted_cpuload(i);
-
-			sum_avg_load_per_task += cpu_avg_load_per_task(i);
-		}
-
-		/*
-		 * First idle cpu or the first cpu(busiest) in this sched group
-		 * is eligible for doing load balancing at this and above
-		 * domains. In the newly idle case, we will allow all the cpu's
-		 * to do the newly idle load balance.
-		 */
-		if (idle != CPU_NEWLY_IDLE && local_group &&
-		    balance_cpu != this_cpu && balance) {
-			*balance = 0;
+		if (balance && !(*balance))
 			goto ret;
-		}
 
 		total_load += sgs.group_load;
 		total_pwr += group->__cpu_power;
 
-		/* Adjust by relative CPU power of the group */
-		sgs.avg_load = sg_div_cpu_power(group,
-				sgs.group_load * SCHED_LOAD_SCALE);
-
-
-		/*
-		 * Consider the group unbalanced when the imbalance is larger
-		 * than the average weight of two tasks.
-		 *
-		 * APZ: with cgroup the avg task weight can vary wildly and
-		 *      might not be a suitable number - should we keep a
-		 *      normalized nr_running number somewhere that negates
-		 *      the hierarchy?
-		 */
-		avg_load_per_task = sg_div_cpu_power(group,
-				sum_avg_load_per_task * SCHED_LOAD_SCALE);
-
-		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			sgs.group_imb = 1;
-
-		sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
 		if (local_group) {
 			this_load = sgs.avg_load;
 			this = group;
-- 
cgit v0.10.2


From 222d656dea57e4e084fbd1e9383e6fed2ca9fa61 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:56 +0530
Subject: sched: Define structure to store the sched_domain statistics for
 fbg()

Impact: cleanup

Currently we use a lot of local variables in find_busiest_group()
to capture the various statistics related to the sched_domain.
Group them together into a single data structure.

This will help us to offload the job of updating the sched_domain
statistics to a helper function.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091356.13992.25970.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 1893d55..8198dbe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3190,6 +3190,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 /********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * 		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest; /* Busiest group in this sd */
+	struct sched_group *this;  /* Local group in this sd */
+	unsigned long total_load;  /* Total load of all groups in sd */
+	unsigned long total_pwr;   /*	Total power of all groups in sd */
+	unsigned long avg_load;	   /* Average load across all groups in sd */
+
+	/** Statistics of this group */
+	unsigned long this_load;
+	unsigned long this_load_per_task;
+	unsigned long this_nr_running;
+
+	/* Statistics of the busiest group */
+	unsigned long max_load;
+	unsigned long busiest_load_per_task;
+	unsigned long busiest_nr_running;
+
+	int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance; /* Is powersave balance needed for this sd */
+	struct sched_group *group_min; /* Least loaded group in sd */
+	struct sched_group *group_leader; /* Group which relieves group_min */
+	unsigned long min_load_per_task; /* load_per_task in group_min */
+	unsigned long leader_nr_running; /* Nr running of group_leader */
+	unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
 
 /**
  * sg_lb_stats - stats of a sched_group required for load_balancing
@@ -3346,23 +3377,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
 		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
-	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	struct sd_lb_stats sds;
+	struct sched_group *group = sd->groups;
 	unsigned long max_pull;
-	unsigned long busiest_load_per_task, busiest_nr_running;
-	unsigned long this_load_per_task, this_nr_running;
-	int load_idx, group_imb = 0;
+	int load_idx;
+
+	memset(&sds, 0, sizeof(sds));
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance = 1;
-	unsigned long leader_nr_running = 0, min_load_per_task = 0;
-	unsigned long min_nr_running = ULONG_MAX;
-	struct sched_group *group_min = NULL, *group_leader = NULL;
+	sds.power_savings_balance = 1;
+	sds.min_nr_running = ULONG_MAX;
 #endif
-
-	max_load = this_load = total_load = total_pwr = 0;
-	busiest_load_per_task = busiest_nr_running = 0;
-	this_load_per_task = this_nr_running = 0;
-
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
@@ -3378,22 +3402,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (balance && !(*balance))
 			goto ret;
 
-		total_load += sgs.group_load;
-		total_pwr += group->__cpu_power;
+		sds.total_load += sgs.group_load;
+		sds.total_pwr += group->__cpu_power;
 
 		if (local_group) {
-			this_load = sgs.avg_load;
-			this = group;
-			this_nr_running = sgs.sum_nr_running;
-			this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > max_load &&
+			sds.this_load = sgs.avg_load;
+			sds.this = group;
+			sds.this_nr_running = sgs.sum_nr_running;
+			sds.this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds.max_load &&
 			   (sgs.sum_nr_running > sgs.group_capacity ||
 				sgs.group_imb)) {
-			max_load = sgs.avg_load;
-			busiest = group;
-			busiest_nr_running = sgs.sum_nr_running;
-			busiest_load_per_task = sgs.sum_weighted_load;
-			group_imb = sgs.group_imb;
+			sds.max_load = sgs.avg_load;
+			sds.busiest = group;
+			sds.busiest_nr_running = sgs.sum_nr_running;
+			sds.busiest_load_per_task = sgs.sum_weighted_load;
+			sds.group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3409,15 +3433,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If the local group is idle or completely loaded
 		 * no need to do power savings balance at this domain
 		 */
-		if (local_group && (this_nr_running >= sgs.group_capacity ||
-				    !this_nr_running))
-			power_savings_balance = 0;
+		if (local_group &&
+			(sds.this_nr_running >= sgs.group_capacity ||
+			!sds.this_nr_running))
+			sds.power_savings_balance = 0;
 
 		/*
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!power_savings_balance ||
+		if (!sds.power_savings_balance ||
 			sgs.sum_nr_running >= sgs.group_capacity ||
 			!sgs.sum_nr_running)
 			goto group_next;
@@ -3427,12 +3452,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sgs.sum_nr_running < min_nr_running) ||
-		    (sgs.sum_nr_running == min_nr_running &&
-		     group_first_cpu(group) > group_first_cpu(group_min))) {
-			group_min = group;
-			min_nr_running = sgs.sum_nr_running;
-			min_load_per_task = sgs.sum_weighted_load /
+		if ((sgs.sum_nr_running < sds.min_nr_running) ||
+		    (sgs.sum_nr_running == sds.min_nr_running &&
+		     group_first_cpu(group) >
+			group_first_cpu(sds.group_min))) {
+			sds.group_min = group;
+			sds.min_nr_running = sgs.sum_nr_running;
+			sds.min_load_per_task = sgs.sum_weighted_load /
 						sgs.sum_nr_running;
 		}
 
@@ -3444,29 +3470,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sgs.sum_nr_running > leader_nr_running ||
-		    (sgs.sum_nr_running == leader_nr_running &&
-		     group_first_cpu(group) < group_first_cpu(group_leader))) {
-			group_leader = group;
-			leader_nr_running = sgs.sum_nr_running;
+		if (sgs.sum_nr_running > sds.leader_nr_running ||
+		    (sgs.sum_nr_running == sds.leader_nr_running &&
+		     group_first_cpu(group) <
+			group_first_cpu(sds.group_leader))) {
+			sds.group_leader = group;
+			sds.leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+	if (!sds.busiest || sds.this_load >= sds.max_load
+		|| sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
+	if (sds.this_load >= sds.avg_load ||
+			100*sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
-	busiest_load_per_task /= busiest_nr_running;
-	if (group_imb)
-		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+	sds.busiest_load_per_task /= sds.busiest_nr_running;
+	if (sds.group_imb)
+		sds.busiest_load_per_task =
+			min(sds.busiest_load_per_task, sds.avg_load);
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -3479,7 +3508,7 @@ group_next:
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
-	if (max_load <= busiest_load_per_task)
+	if (sds.max_load <= sds.busiest_load_per_task)
 		goto out_balanced;
 
 	/*
@@ -3487,17 +3516,18 @@ group_next:
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (max_load < avg_load) {
+	if (sds.max_load < sds.avg_load) {
 		*imbalance = 0;
 		goto small_imbalance;
 	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+	max_pull = min(sds.max_load - sds.avg_load,
+			sds.max_load - sds.busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->__cpu_power,
-				(avg_load - this_load) * this->__cpu_power)
+	*imbalance = min(max_pull * sds.busiest->__cpu_power,
+			(sds.avg_load - sds.this_load) * sds.this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*
@@ -3506,24 +3536,27 @@ group_next:
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < busiest_load_per_task) {
+	if (*imbalance < sds.busiest_load_per_task) {
 		unsigned long tmp, pwr_now, pwr_move;
 		unsigned int imbn;
 
 small_imbalance:
 		pwr_move = pwr_now = 0;
 		imbn = 2;
-		if (this_nr_running) {
-			this_load_per_task /= this_nr_running;
-			if (busiest_load_per_task > this_load_per_task)
+		if (sds.this_nr_running) {
+			sds.this_load_per_task /= sds.this_nr_running;
+			if (sds.busiest_load_per_task >
+					sds.this_load_per_task)
 				imbn = 1;
 		} else
-			this_load_per_task = cpu_avg_load_per_task(this_cpu);
-
-		if (max_load - this_load + busiest_load_per_task >=
-					busiest_load_per_task * imbn) {
-			*imbalance = busiest_load_per_task;
-			return busiest;
+			sds.this_load_per_task =
+				cpu_avg_load_per_task(this_cpu);
+
+		if (sds.max_load - sds.this_load +
+			sds.busiest_load_per_task >=
+				sds.busiest_load_per_task * imbn) {
+			*imbalance = sds.busiest_load_per_task;
+			return sds.busiest;
 		}
 
 		/*
@@ -3532,52 +3565,54 @@ small_imbalance:
 		 * moving them.
 		 */
 
-		pwr_now += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load);
-		pwr_now += this->__cpu_power *
-				min(this_load_per_task, this_load);
+		pwr_now += sds.busiest->__cpu_power *
+				min(sds.busiest_load_per_task, sds.max_load);
+		pwr_now += sds.this->__cpu_power *
+				min(sds.this_load_per_task, sds.this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(busiest,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (max_load > tmp)
-			pwr_move += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load - tmp);
+		tmp = sg_div_cpu_power(sds.busiest,
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+		if (sds.max_load > tmp)
+			pwr_move += sds.busiest->__cpu_power *
+				min(sds.busiest_load_per_task,
+						sds.max_load - tmp);
 
 		/* Amount of load we'd add */
-		if (max_load * busiest->__cpu_power <
-				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(this,
-					max_load * busiest->__cpu_power);
+		if (sds.max_load * sds.busiest->__cpu_power <
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE)
+			tmp = sg_div_cpu_power(sds.this,
+				sds.max_load * sds.busiest->__cpu_power);
 		else
-			tmp = sg_div_cpu_power(this,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += this->__cpu_power *
-				min(this_load_per_task, this_load + tmp);
+			tmp = sg_div_cpu_power(sds.this,
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+		pwr_move += sds.this->__cpu_power *
+				min(sds.this_load_per_task,
+					sds.this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
 		if (pwr_move > pwr_now)
-			*imbalance = busiest_load_per_task;
+			*imbalance = sds.busiest_load_per_task;
 	}
 
-	return busiest;
+	return sds.busiest;
 
 out_balanced:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 		goto ret;
 
-	if (this != group_leader || group_leader == group_min)
+	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
 		goto ret;
 
-	*imbalance = min_load_per_task;
+	*imbalance = sds.min_load_per_task;
 	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(group_leader);
+			group_first_cpu(sds.group_leader);
 	}
-	return group_min;
+	return sds.group_min;
 
 #endif
 ret:
-- 
cgit v0.10.2


From 37abe198b1246ddd206319c43502a687db62d347 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:01 +0530
Subject: sched: Create a helper function to calculate sched_domain stats for
 fbg()

Impact: cleanup

Create a helper function named update_sd_lb_stats() to update the
various sched_domain related statistics in find_busiest_group().

With this we would have moved all the statistics computation out of
find_busiest_group().

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091401.13992.88737.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 8198dbe..ec715f9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3365,32 +3365,33 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 }
-/******* find_busiest_group() helpers end here *********************/
 
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
  */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+			enum cpu_idle_type idle, int *sd_idle,
+			const struct cpumask *cpus, int *balance,
+			struct sd_lb_stats *sds)
 {
-	struct sd_lb_stats sds;
 	struct sched_group *group = sd->groups;
-	unsigned long max_pull;
+	struct sg_lb_stats sgs;
 	int load_idx;
 
-	memset(&sds, 0, sizeof(sds));
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	sds.power_savings_balance = 1;
-	sds.min_nr_running = ULONG_MAX;
+	sds->power_savings_balance = 1;
+	sds->min_nr_running = ULONG_MAX;
 #endif
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
-		struct sg_lb_stats sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -3399,25 +3400,25 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 				local_group, cpus, balance, &sgs);
 
-		if (balance && !(*balance))
-			goto ret;
+		if (local_group && balance && !(*balance))
+			return;
 
-		sds.total_load += sgs.group_load;
-		sds.total_pwr += group->__cpu_power;
+		sds->total_load += sgs.group_load;
+		sds->total_pwr += group->__cpu_power;
 
 		if (local_group) {
-			sds.this_load = sgs.avg_load;
-			sds.this = group;
-			sds.this_nr_running = sgs.sum_nr_running;
-			sds.this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds.max_load &&
+			sds->this_load = sgs.avg_load;
+			sds->this = group;
+			sds->this_nr_running = sgs.sum_nr_running;
+			sds->this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds->max_load &&
 			   (sgs.sum_nr_running > sgs.group_capacity ||
 				sgs.group_imb)) {
-			sds.max_load = sgs.avg_load;
-			sds.busiest = group;
-			sds.busiest_nr_running = sgs.sum_nr_running;
-			sds.busiest_load_per_task = sgs.sum_weighted_load;
-			sds.group_imb = sgs.group_imb;
+			sds->max_load = sgs.avg_load;
+			sds->busiest = group;
+			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3434,15 +3435,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * no need to do power savings balance at this domain
 		 */
 		if (local_group &&
-			(sds.this_nr_running >= sgs.group_capacity ||
-			!sds.this_nr_running))
-			sds.power_savings_balance = 0;
+			(sds->this_nr_running >= sgs.group_capacity ||
+			!sds->this_nr_running))
+			sds->power_savings_balance = 0;
 
 		/*
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!sds.power_savings_balance ||
+		if (!sds->power_savings_balance ||
 			sgs.sum_nr_running >= sgs.group_capacity ||
 			!sgs.sum_nr_running)
 			goto group_next;
@@ -3452,13 +3453,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sgs.sum_nr_running < sds.min_nr_running) ||
-		    (sgs.sum_nr_running == sds.min_nr_running &&
+		if ((sgs.sum_nr_running < sds->min_nr_running) ||
+		    (sgs.sum_nr_running == sds->min_nr_running &&
 		     group_first_cpu(group) >
-			group_first_cpu(sds.group_min))) {
-			sds.group_min = group;
-			sds.min_nr_running = sgs.sum_nr_running;
-			sds.min_load_per_task = sgs.sum_weighted_load /
+			group_first_cpu(sds->group_min))) {
+			sds->group_min = group;
+			sds->min_nr_running = sgs.sum_nr_running;
+			sds->min_load_per_task = sgs.sum_weighted_load /
 						sgs.sum_nr_running;
 		}
 
@@ -3470,18 +3471,46 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sgs.sum_nr_running > sds.leader_nr_running ||
-		    (sgs.sum_nr_running == sds.leader_nr_running &&
+		if (sgs.sum_nr_running > sds->leader_nr_running ||
+		    (sgs.sum_nr_running == sds->leader_nr_running &&
 		     group_first_cpu(group) <
-			group_first_cpu(sds.group_leader))) {
-			sds.group_leader = group;
-			sds.leader_nr_running = sgs.sum_nr_running;
+			group_first_cpu(sds->group_leader))) {
+			sds->group_leader = group;
+			sds->leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
 		group = group->next;
 	} while (group != sd->groups);
 
+}
+/******* find_busiest_group() helpers end here *********************/
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum cpu_idle_type idle,
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+	struct sd_lb_stats sds;
+	unsigned long max_pull;
+
+	memset(&sds, 0, sizeof(sds));
+
+	/*
+	 * Compute the various statistics relavent for load balancing at
+	 * this level.
+	 */
+	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+					balance, &sds);
+
+	if (balance && !(*balance))
+		goto ret;
+
 	if (!sds.busiest || sds.this_load >= sds.max_load
 		|| sds.busiest_nr_running == 0)
 		goto out_balanced;
-- 
cgit v0.10.2


From 2e6f44aeda426054fc58464df1ad571aecca0c92 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:06 +0530
Subject: sched: Create helper to calculate small_imbalance in fbg()

Impact: cleanup

We have two places in find_busiest_group() where we need to calculate
the minor imbalance before returning the busiest group. Encapsulate
this functionality into a seperate helper function.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091406.13992.54316.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ec715f9..540147e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3484,6 +3484,71 @@ group_next:
 	} while (group != sd->groups);
 
 }
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * 			amongst the groups of a sched_domain, during
+ * 			load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+				int this_cpu, unsigned long *imbalance)
+{
+	unsigned long tmp, pwr_now = 0, pwr_move = 0;
+	unsigned int imbn = 2;
+
+	if (sds->this_nr_running) {
+		sds->this_load_per_task /= sds->this_nr_running;
+		if (sds->busiest_load_per_task >
+				sds->this_load_per_task)
+			imbn = 1;
+	} else
+		sds->this_load_per_task =
+			cpu_avg_load_per_task(this_cpu);
+
+	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+			sds->busiest_load_per_task * imbn) {
+		*imbalance = sds->busiest_load_per_task;
+		return;
+	}
+
+	/*
+	 * OK, we don't have enough imbalance to justify moving tasks,
+	 * however we may be able to increase total CPU power used by
+	 * moving them.
+	 */
+
+	pwr_now += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load);
+	pwr_now += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load);
+	pwr_now /= SCHED_LOAD_SCALE;
+
+	/* Amount of load we'd subtract */
+	tmp = sg_div_cpu_power(sds->busiest,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	if (sds->max_load > tmp)
+		pwr_move += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+	/* Amount of load we'd add */
+	if (sds->max_load * sds->busiest->__cpu_power <
+		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		tmp = sg_div_cpu_power(sds->this,
+			sds->max_load * sds->busiest->__cpu_power);
+	else
+		tmp = sg_div_cpu_power(sds->this,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	pwr_move += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load + tmp);
+	pwr_move /= SCHED_LOAD_SCALE;
+
+	/* Move if we gain throughput */
+	if (pwr_move > pwr_now)
+		*imbalance = sds->busiest_load_per_task;
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3547,7 +3612,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 */
 	if (sds.max_load < sds.avg_load) {
 		*imbalance = 0;
-		goto small_imbalance;
+		fix_small_imbalance(&sds, this_cpu, imbalance);
+		goto ret_busiest;
 	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
@@ -3565,67 +3631,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < sds.busiest_load_per_task) {
-		unsigned long tmp, pwr_now, pwr_move;
-		unsigned int imbn;
-
-small_imbalance:
-		pwr_move = pwr_now = 0;
-		imbn = 2;
-		if (sds.this_nr_running) {
-			sds.this_load_per_task /= sds.this_nr_running;
-			if (sds.busiest_load_per_task >
-					sds.this_load_per_task)
-				imbn = 1;
-		} else
-			sds.this_load_per_task =
-				cpu_avg_load_per_task(this_cpu);
-
-		if (sds.max_load - sds.this_load +
-			sds.busiest_load_per_task >=
-				sds.busiest_load_per_task * imbn) {
-			*imbalance = sds.busiest_load_per_task;
-			return sds.busiest;
-		}
-
-		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
-		 */
-
-		pwr_now += sds.busiest->__cpu_power *
-				min(sds.busiest_load_per_task, sds.max_load);
-		pwr_now += sds.this->__cpu_power *
-				min(sds.this_load_per_task, sds.this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(sds.busiest,
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (sds.max_load > tmp)
-			pwr_move += sds.busiest->__cpu_power *
-				min(sds.busiest_load_per_task,
-						sds.max_load - tmp);
-
-		/* Amount of load we'd add */
-		if (sds.max_load * sds.busiest->__cpu_power <
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(sds.this,
-				sds.max_load * sds.busiest->__cpu_power);
-		else
-			tmp = sg_div_cpu_power(sds.this,
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += sds.this->__cpu_power *
-				min(sds.this_load_per_task,
-					sds.this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
-
-		/* Move if we gain throughput */
-		if (pwr_move > pwr_now)
-			*imbalance = sds.busiest_load_per_task;
-	}
+	if (*imbalance < sds.busiest_load_per_task)
+		fix_small_imbalance(&sds, this_cpu, imbalance);
 
+ret_busiest:
 	return sds.busiest;
 
 out_balanced:
-- 
cgit v0.10.2


From dbc523a3b86f9e1765b5e70e6886913b99cc5cec Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:12 +0530
Subject: sched: Create a helper function to calculate imbalance

Move all the imbalance calculation out of find_busiest_group()
through this helper function.

With this change, the structure of find_busiest_group() will be
as follows:

- update_sched_domain_statistics.

- check if imbalance exits.

- update imbalance and return busiest.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091411.13992.43293.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 540147e..934f615 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3487,8 +3487,8 @@ group_next:
 
 /**
  * fix_small_imbalance - Calculate the minor imbalance that exists
- * 			amongst the groups of a sched_domain, during
- * 			load balancing.
+ *			amongst the groups of a sched_domain, during
+ *			load balancing.
  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
  * @imbalance: Variable to store the imbalance.
@@ -3549,6 +3549,47 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	if (pwr_move > pwr_now)
 		*imbalance = sds->busiest_load_per_task;
 }
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *			 groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+		unsigned long *imbalance)
+{
+	unsigned long max_pull;
+	/*
+	 * In the presence of smp nice balancing, certain scenarios can have
+	 * max load less than avg load(as we skip the groups at or below
+	 * its cpu_power, while calculating max_load..)
+	 */
+	if (sds->max_load < sds->avg_load) {
+		*imbalance = 0;
+		return fix_small_imbalance(sds, this_cpu, imbalance);
+	}
+
+	/* Don't want to pull so many tasks that a group would go idle */
+	max_pull = min(sds->max_load - sds->avg_load,
+			sds->max_load - sds->busiest_load_per_task);
+
+	/* How much load to actually move to equalise the imbalance */
+	*imbalance = min(max_pull * sds->busiest->__cpu_power,
+		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+			/ SCHED_LOAD_SCALE;
+
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < sds->busiest_load_per_task)
+		return fix_small_imbalance(sds, this_cpu, imbalance);
+
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3562,7 +3603,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
-	unsigned long max_pull;
 
 	memset(&sds, 0, sizeof(sds));
 
@@ -3605,36 +3645,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.max_load <= sds.busiest_load_per_task)
 		goto out_balanced;
 
-	/*
-	 * In the presence of smp nice balancing, certain scenarios can have
-	 * max load less than avg load(as we skip the groups at or below
-	 * its cpu_power, while calculating max_load..)
-	 */
-	if (sds.max_load < sds.avg_load) {
-		*imbalance = 0;
-		fix_small_imbalance(&sds, this_cpu, imbalance);
-		goto ret_busiest;
-	}
-
-	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(sds.max_load - sds.avg_load,
-			sds.max_load - sds.busiest_load_per_task);
-
-	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds.busiest->__cpu_power,
-			(sds.avg_load - sds.this_load) * sds.this->__cpu_power)
-			/ SCHED_LOAD_SCALE;
-
-	/*
-	 * if *imbalance is less than the average load per runnable task
-	 * there is no gaurantee that any tasks will be moved so we'll have
-	 * a think about bumping its value to force at least one task to be
-	 * moved
-	 */
-	if (*imbalance < sds.busiest_load_per_task)
-		fix_small_imbalance(&sds, this_cpu, imbalance);
-
-ret_busiest:
+	/* Looks like there is an imbalance. Compute it */
+	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
 
 out_balanced:
-- 
cgit v0.10.2


From a021dc03376707c55a3483e32c16b8986d4414cc Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:17 +0530
Subject: sched: Optimize the !power_savings_balance during fbg()

Impact: cleanup, micro-optimization

We don't need to perform power_savings balance if either the
cpu is NOT_IDLE or if the sched_domain doesn't contain the
SD_POWERSAVINGS_BALANCE flag set.

Currently, we check for these conditions multiple number of
times, even though these variables don't change over the scope
of find_busiest_group().

Check once, and store the value in the already exiting
"power_savings_balance" variable.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091417.13992.2657.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 934f615..71e8dca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3386,8 +3386,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	int load_idx;
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	sds->power_savings_balance = 1;
-	sds->min_nr_running = ULONG_MAX;
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
 #endif
 	load_idx = get_sd_load_idx(sd, idle);
 
@@ -3422,12 +3431,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-		/*
-		 * Busy processors will not participate in power savings
-		 * balance.
-		 */
-		if (idle == CPU_NOT_IDLE ||
-				!(sd->flags & SD_POWERSAVINGS_BALANCE))
+
+		if (!sds->power_savings_balance)
 			goto group_next;
 
 		/*
@@ -3651,7 +3656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 out_balanced:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+	if (!sds.power_savings_balance)
 		goto ret;
 
 	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
-- 
cgit v0.10.2


From c071df18525a95b37dd5821a6dc4af83bd18675e Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:22 +0530
Subject: sched: Refactor the power savings balance code

Impact: cleanup

Create seperate helper functions to initialize the
power-savings-balance related variables, to update them and
to check if we have a scope for performing power-savings balance.

Add no-op inline functions for the !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT)
case.

This will eliminate all the #ifdef jungle in find_busiest_group() and the
other helper functions.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091422.13992.73616.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 71e8dca..5f21658 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3270,6 +3270,151 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 }
 
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
+}
+
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * 		load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+
+	if (!sds->power_savings_balance)
+		return;
+
+	/*
+	 * If the local group is idle or completely loaded
+	 * no need to do power savings balance at this domain
+	 */
+	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+				!sds->this_nr_running))
+		sds->power_savings_balance = 0;
+
+	/*
+	 * If a group is already running at full capacity or idle,
+	 * don't include that group in power savings calculations
+	 */
+	if (!sds->power_savings_balance ||
+		sgs->sum_nr_running >= sgs->group_capacity ||
+		!sgs->sum_nr_running)
+		return;
+
+	/*
+	 * Calculate the group which has the least non-idle load.
+	 * This is the group from where we need to pick up the load
+	 * for saving power
+	 */
+	if ((sgs->sum_nr_running < sds->min_nr_running) ||
+	    (sgs->sum_nr_running == sds->min_nr_running &&
+	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+		sds->group_min = group;
+		sds->min_nr_running = sgs->sum_nr_running;
+		sds->min_load_per_task = sgs->sum_weighted_load /
+						sgs->sum_nr_running;
+	}
+
+	/*
+	 * Calculate the group which is almost near its
+	 * capacity but still has some space to pick up some load
+	 * from other group and save more power
+	 */
+	if (sgs->sum_nr_running > sgs->group_capacity - 1)
+		return;
+
+	if (sgs->sum_nr_running > sds->leader_nr_running ||
+	    (sgs->sum_nr_running == sds->leader_nr_running &&
+	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+		sds->group_leader = group;
+		sds->leader_nr_running = sgs->sum_nr_running;
+	}
+}
+
+/**
+ * check_power_save_busiest_group - Check if we have potential to perform
+ *	some power-savings balance. If yes, set the busiest group to be
+ *	the least loaded group in the sched_domain, so that it's CPUs can
+ *	be put to idle.
+ *
+ * @sds: Variable containing the statistics of the sched_domain
+ *	under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	if (!sds->power_savings_balance)
+		return 0;
+
+	if (sds->this != sds->group_leader ||
+			sds->group_leader == sds->group_min)
+		return 0;
+
+	*imbalance = sds->min_load_per_task;
+	sds->busiest = sds->group_min;
+
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(sds->group_leader);
+	}
+
+	return 1;
+
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+	return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @group: sched_group whose statistics are to be updated.
@@ -3385,19 +3530,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	struct sg_lb_stats sgs;
 	int load_idx;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Busy processors will not participate in power savings
-	 * balance.
-	 */
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		sds->power_savings_balance = 0;
-	else {
-		sds->power_savings_balance = 1;
-		sds->min_nr_running = ULONG_MAX;
-		sds->leader_nr_running = 0;
-	}
-#endif
+	init_sd_power_savings_stats(sd, sds, idle);
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
@@ -3430,61 +3563,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->group_imb = sgs.group_imb;
 		}
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-
-		if (!sds->power_savings_balance)
-			goto group_next;
-
-		/*
-		 * If the local group is idle or completely loaded
-		 * no need to do power savings balance at this domain
-		 */
-		if (local_group &&
-			(sds->this_nr_running >= sgs.group_capacity ||
-			!sds->this_nr_running))
-			sds->power_savings_balance = 0;
-
-		/*
-		 * If a group is already running at full capacity or idle,
-		 * don't include that group in power savings calculations
-		 */
-		if (!sds->power_savings_balance ||
-			sgs.sum_nr_running >= sgs.group_capacity ||
-			!sgs.sum_nr_running)
-			goto group_next;
-
-		/*
-		 * Calculate the group which has the least non-idle load.
-		 * This is the group from where we need to pick up the load
-		 * for saving power
-		 */
-		if ((sgs.sum_nr_running < sds->min_nr_running) ||
-		    (sgs.sum_nr_running == sds->min_nr_running &&
-		     group_first_cpu(group) >
-			group_first_cpu(sds->group_min))) {
-			sds->group_min = group;
-			sds->min_nr_running = sgs.sum_nr_running;
-			sds->min_load_per_task = sgs.sum_weighted_load /
-						sgs.sum_nr_running;
-		}
-
-		/*
-		 * Calculate the group which is almost near its
-		 * capacity but still has some space to pick up some load
-		 * from other group and save more power
-		 */
-		if (sgs.sum_nr_running > sgs.group_capacity - 1)
-			goto group_next;
-
-		if (sgs.sum_nr_running > sds->leader_nr_running ||
-		    (sgs.sum_nr_running == sds->leader_nr_running &&
-		     group_first_cpu(group) <
-			group_first_cpu(sds->group_leader))) {
-			sds->group_leader = group;
-			sds->leader_nr_running = sgs.sum_nr_running;
-		}
-group_next:
-#endif
+		update_sd_power_savings_stats(group, sds, local_group, &sgs);
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -3655,21 +3734,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	return sds.busiest;
 
 out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (!sds.power_savings_balance)
-		goto ret;
-
-	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
-		goto ret;
-
-	*imbalance = sds.min_load_per_task;
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds.group_leader);
-	}
-	return sds.group_min;
-
-#endif
+	/*
+	 * There is no obvious imbalance. But check if we can do some balancing
+	 * to save power.
+	 */
+	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+		return sds.busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
-- 
cgit v0.10.2


From b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:27 +0530
Subject: sched: Add comments to find_busiest_group() function

Impact: cleanup

Add /** style comments around find_busiest_group(). Also add a few
explanatory comments.

This concludes the find_busiest_group() cleanup. The function is
now down to 72 lines from the original 313 lines.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091427.13992.18933.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 5f21658..9f8506d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3676,10 +3676,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 }
 /******* find_busiest_group() helpers end here *********************/
 
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *		be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *	is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:	- the busiest group if imbalance exists.
+ *		- If no imbalance and user has opted for power-savings balance,
+ *		   return the least loaded group whose CPUs can be
+ *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -3697,17 +3717,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 					balance, &sds);
 
+	/* Cases where imbalance does not exist from POV of this_cpu */
+	/* 1) this_cpu is not the appropriate cpu to perform load balancing
+	 *    at this level.
+	 * 2) There is no busy sibling group to pull from.
+	 * 3) This group is the busiest group.
+	 * 4) This group is more busy than the avg busieness at this
+	 *    sched_domain.
+	 * 5) The imbalance is within the specified limit.
+	 * 6) Any rebalance would lead to ping-pong
+	 */
 	if (balance && !(*balance))
 		goto ret;
 
-	if (!sds.busiest || sds.this_load >= sds.max_load
-		|| sds.busiest_nr_running == 0)
+	if (!sds.busiest || sds.busiest_nr_running == 0)
+		goto out_balanced;
+
+	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 
-	if (sds.this_load >= sds.avg_load ||
-			100*sds.max_load <= sd->imbalance_pct * sds.this_load)
+	if (sds.this_load >= sds.avg_load)
+		goto out_balanced;
+
+	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
 	sds.busiest_load_per_task /= sds.busiest_nr_running;
-- 
cgit v0.10.2