From de1df26b7cef702a32ae876ed45c1112f523df48 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Feb 2016 02:37:42 +0100 Subject: cpufreq: Clean up default and fallback governor setup The preprocessor magic used for setting the default cpufreq governor (and for using the performance governor as a fallback one for that matter) is really nasty, so replace it with __weak functions and overrides. Signed-off-by: Rafael J. Wysocki Acked-by: Saravana Kannan Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e979ec7..34b17447 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -959,6 +959,11 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return cpufreq_add_dev_symlink(policy); } +__weak struct cpufreq_governor *cpufreq_default_governor(void) +{ + return NULL; +} + static int cpufreq_init_policy(struct cpufreq_policy *policy) { struct cpufreq_governor *gov = NULL; @@ -968,11 +973,14 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) /* Update governor of new_policy to the governor used before hotplug */ gov = find_governor(policy->last_governor); - if (gov) + if (gov) { pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, policy->cpu); - else - gov = CPUFREQ_DEFAULT_GOVERNOR; + } else { + gov = cpufreq_default_governor(); + if (!gov) + return -ENODATA; + } new_policy.governor = gov; @@ -1920,21 +1928,16 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_target); +__weak struct cpufreq_governor *cpufreq_fallback_governor(void) +{ + return NULL; +} + static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) { int ret; - /* Only must be defined when default governor is known to have latency - restrictions, like e.g. conservative or ondemand. 
- That this is the case is already ensured in Kconfig - */ -#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE - struct cpufreq_governor *gov = &cpufreq_gov_performance; -#else - struct cpufreq_governor *gov = NULL; -#endif - /* Don't start any governor operations if we are entering suspend */ if (cpufreq_suspended) return 0; @@ -1948,12 +1951,14 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, if (policy->governor->max_transition_latency && policy->cpuinfo.transition_latency > policy->governor->max_transition_latency) { - if (!gov) - return -EINVAL; - else { + struct cpufreq_governor *gov = cpufreq_fallback_governor(); + + if (gov) { pr_warn("%s governor failed, too long transition latency of HW, fallback to %s governor\n", policy->governor->name, gov->name); policy->governor = gov; + } else { + return -EINVAL; } } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 606ad74ab..8504a70 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -26,10 +26,7 @@ static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE -static -#endif -struct cpufreq_governor cpufreq_gov_conservative = { +static struct cpufreq_governor cpufreq_gov_conservative = { .name = "conservative", .governor = cs_cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, @@ -399,6 +396,11 @@ MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for " MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &cpufreq_gov_conservative; +} + fs_initcall(cpufreq_gov_dbs_init); #else module_init(cpufreq_gov_dbs_init); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index eae5107..929e193 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -31,9 +31,7 @@ static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info); static struct od_ops od_ops; -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND static struct cpufreq_governor cpufreq_gov_ondemand; -#endif static unsigned int default_powersave_bias; @@ -554,6 +552,19 @@ static struct common_dbs_data od_dbs_cdata = { .mutex = __MUTEX_INITIALIZER(od_dbs_cdata.mutex), }; +static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event) +{ + return cpufreq_governor_dbs(policy, &od_dbs_cdata, event); +} + +static struct cpufreq_governor cpufreq_gov_ondemand = { + .name = "ondemand", + .governor = od_cpufreq_governor_dbs, + .max_transition_latency = TRANSITION_LATENCY_LIMIT, + .owner = THIS_MODULE, +}; + static void od_set_powersave_bias(unsigned int powersave_bias) { struct cpufreq_policy *policy; @@ -605,22 +616,6 @@ void od_unregister_powersave_bias_handler(void) } EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler); -static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event) -{ - return cpufreq_governor_dbs(policy, &od_dbs_cdata, event); -} - -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND -static -#endif -struct cpufreq_governor cpufreq_gov_ondemand = { - .name = "ondemand", - .governor = od_cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, -}; - static int __init cpufreq_gov_dbs_init(void) { return cpufreq_register_governor(&cpufreq_gov_ondemand); @@ -638,6 
+633,11 @@ MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for " MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &cpufreq_gov_ondemand; +} + fs_initcall(cpufreq_gov_dbs_init); #else module_init(cpufreq_gov_dbs_init); diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index cf117de..af9f4b9 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -33,10 +33,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy, return 0; } -#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE -static -#endif -struct cpufreq_governor cpufreq_gov_performance = { +static struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .governor = cpufreq_governor_performance, .owner = THIS_MODULE, @@ -52,6 +49,19 @@ static void __exit cpufreq_gov_performance_exit(void) cpufreq_unregister_governor(&cpufreq_gov_performance); } +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &cpufreq_gov_performance; +} +#endif +#ifndef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE +struct cpufreq_governor *cpufreq_fallback_governor(void) +{ + return &cpufreq_gov_performance; +} +#endif + MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'performance'"); MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index e3b874c..b8b4002 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -33,10 +33,7 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy, return 0; } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE -static -#endif -struct cpufreq_governor cpufreq_gov_powersave = { +static struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .governor = cpufreq_governor_powersave, .owner = THIS_MODULE, @@ -57,6 +54,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'"); MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &cpufreq_gov_powersave; +} + fs_initcall(cpufreq_gov_powersave_init); #else module_init(cpufreq_gov_powersave_init); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 4dbf1db..4d16f45 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -89,10 +89,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, return rc; } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE -static -#endif -struct cpufreq_governor cpufreq_gov_userspace = { +static struct cpufreq_governor cpufreq_gov_userspace = { .name = "userspace", .governor = cpufreq_governor_userspace, .store_setspeed = cpufreq_set, @@ -116,6 +113,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'"); MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &cpufreq_gov_userspace; +} + fs_initcall(cpufreq_gov_userspace_init); #else module_init(cpufreq_gov_userspace_init); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 88a4215..d0bf555 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -464,29 +464,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, int cpufreq_register_governor(struct cpufreq_governor *governor); void 
cpufreq_unregister_governor(struct cpufreq_governor *governor); -/* CPUFREQ DEFAULT GOVERNOR */ -/* - * Performance governor is fallback governor if any other gov failed to auto - * load due latency restrictions - */ -#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE -extern struct cpufreq_governor cpufreq_gov_performance; -#endif -#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE -#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance) -#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE) -extern struct cpufreq_governor cpufreq_gov_powersave; -#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave) -#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE) -extern struct cpufreq_governor cpufreq_gov_userspace; -#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace) -#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND) -extern struct cpufreq_governor cpufreq_gov_ondemand; -#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand) -#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) -extern struct cpufreq_governor cpufreq_gov_conservative; -#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) -#endif +struct cpufreq_governor *cpufreq_default_governor(void); +struct cpufreq_governor *cpufreq_fallback_governor(void); /********************************************************************* * FREQUENCY TABLE HELPERS * -- cgit v0.10.2 From 86622cb8c57abb05fe95bea3a068949c0ca79fc3 Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:37 +0530 Subject: cpufreq: powernv: Free 'chips' on module exit Free the dynamically allocated memory for 'chips' on module exit. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 547890f..53f980b 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -612,6 +612,7 @@ static void __exit powernv_cpufreq_exit(void) unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); + kfree(chips); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- cgit v0.10.2 From 6d167a44e6c8da3316e037b788585fcf96112bea Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:38 +0530 Subject: cpufreq: powernv: Hot-plug safe the kworker thread In the kworker thread powernv_cpufreq_work_fn(), we can end up sending an IPI to a CPU that is going offline. This rare corner case is fixed using {get/put}_online_cpus(). Along with this fix, this patch also changes the code to do the cpumask_{clear/and} operations in one shot. Suggested-by: Shreyas B Prabhu Suggested-by: Gautham R Shenoy Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 53f980b..a271b0f 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -423,18 +424,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); unsigned int cpu; - cpumask_var_t mask; + cpumask_t mask; - smp_call_function_any(&chip->mask, + get_online_cpus(); + cpumask_and(&mask, &chip->mask, cpu_online_mask); + smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); if (!chip->restore) - return; + goto out; chip->restore = false; - cpumask_copy(mask, &chip->mask); - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int index, tcpu; + for_each_cpu(cpu, &mask) { + int index; struct cpufreq_policy policy; cpufreq_get_policy(&policy, cpu); @@ -442,9 +444,10 @@ void powernv_cpufreq_work_fn(struct work_struct *work) policy.cur, CPUFREQ_RELATION_C, &index); powernv_cpufreq_target_index(&policy, index); - for_each_cpu(tcpu, policy.cpus) - cpumask_clear_cpu(tcpu, mask); + cpumask_andnot(&mask, &mask, policy.cpus); } +out: + put_online_cpus(); } static char throttle_reason[][30] = { -- cgit v0.10.2 From 96c4726f01cdbf53acf74cf2394e287d74bf40a3 Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:39 +0530 Subject: cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpu_to_chip_id() walks the device tree to find the chip id, taking a contended device tree lock. This adds unnecessary overhead in a hot path. So instead of calling cpu_to_chip_id() every time, cache the chip ids for all cores in the array 'core_to_chip_map' and use it in the hot path. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a271b0f..c670314 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -43,6 +43,7 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static unsigned int *core_to_chip_map; static struct chip { unsigned int id; @@ -313,13 +314,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -559,19 +561,29 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t cpu_mask; + int ret = -ENOMEM; - for_each_possible_cpu(cpu) { + core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), + GFP_KERNEL); + if (!core_to_chip_map) + goto out; + + cpumask_copy(&cpu_mask, cpu_possible_mask); + for_each_cpu(cpu, &cpu_mask) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } + core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; + cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - return -ENOMEM; + goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; @@ -582,6 +594,10 @@ static int init_chip_info(void) } return 0; +free_chip_map: + kfree(core_to_chip_map); +out: + return ret; } static int __init powernv_cpufreq_init(void) @@ -616,6 +632,7 @@ static void __exit powernv_cpufreq_exit(void) opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); kfree(chips); + kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- cgit v0.10.2 From 0306e481d479a58eff17c27adf213fbb5822946b Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:40 +0530 Subject: cpufreq: powernv/tracing: Add powernv_throttle tracepoint This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Signed-off-by: Rafael J. 
Wysocki diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- cgit v0.10.2 From c89f2682a39192433c296bf97b834fd2815a758b Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:41 +0530 Subject: cpufreq: powernv: Replace pr_info with trace print for throttle event Currently we use a printk message to notify of the throttle event. But this can flood the console if the CPU is throttled frequently. So replace the printk with a tracepoint. Also, events like throttling below nominal frequency and OCC_RESET are reduced to pr_warn/pr_warn_once, as pointed out by MFG, so as not to mark them as critical messages. This patch adds 'throttle_reason' to struct chip to store the throttle reason. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index c670314..1bbc10a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; static unsigned int *core_to_chip_map; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throttle_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -331,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data) goto next; chips[i].throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", + cpu, chips[i].id, pmsr_pmax, + powernv_pstate_info.nominal); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throttle_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throttle_reason], + pmsr_pmax); } /* Check if Psafe_mode_active is set in PMSR. 
*/ @@ -359,7 +370,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -452,15 +463,6 @@ out: put_online_cpus(); } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", - "Power Supply Failure", - "Over Current", - "OCC Reset" - }; - static int powernv_cpufreq_occ_msg(struct notifier_block *nb, unsigned long msg_type, void *_msg) { @@ -486,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, */ if (!throttled) { throttled = true; - pr_crit("CPU frequency is throttled for duration\n"); + pr_warn("CPU frequency is throttled for duration\n"); } break; @@ -510,23 +512,18 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, return 0; } - if (omsg.throttle_status && + for (i = 0; i < nr_chips; i++) + if (chips[i].id == omsg.chip) + break; + + if (omsg.throttle_status >= 0 && omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) - pr_info("OCC: Chip %u Pmax reduced due to %s\n", - (unsigned int)omsg.chip, - throttle_reason[omsg.throttle_status]); - else if (!omsg.throttle_status) - pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip, - throttle_reason[omsg.throttle_status]); - else - return 0; + chips[i].throttle_reason = omsg.throttle_status; - for (i = 0; i < nr_chips; i++) - if (chips[i].id == omsg.chip) { - if (!omsg.throttle_status) - chips[i].restore = true; - schedule_work(&chips[i].throttle); - } + if (!omsg.throttle_status) + chips[i].restore = true; + + schedule_work(&chips[i].throttle); } return 0; } @@ -581,16 +578,14 @@ static int init_chip_info(void) cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } - chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); + chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; - chips[i].throttled = false; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); - chips[i].restore = false; } return 0; -- cgit v0.10.2 From 9f8ea969d5cfdd4353d2adb004e8e2286b984369 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:33 +0530 Subject: PM / OPP: get/put regulators from OPP core This allows the OPP core to request/free the regulator resource attached to a device OPP. The regulator is fetched using the name provided by the driver when calling dev_pm_opp_set_regulator(). This will work for both OPP-v1 and v2 bindings. This is a preliminary step for moving the OPP switching logic into the OPP core. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index cf351d3..1e22b71 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "opp.h" @@ -565,6 +566,9 @@ static void _remove_device_opp(struct device_opp *dev_opp) if (dev_opp->prop_name) return; + if (!IS_ERR_OR_NULL(dev_opp->regulator)) + return; + list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp, node); @@ -1085,6 +1089,113 @@ unlock: } EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); +/** + * dev_pm_opp_set_regulator() - Set regulator name for the device + * @dev: Device for which regulator name is being set. + * @name: Name of the regulator.
+ * + * In order to support OPP switching, OPP layer needs to know the name of the + * device's regulator, as the core would be required to switch voltages as well. + * + * This must be called before any OPPs are initialized for the device. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +int dev_pm_opp_set_regulator(struct device *dev, const char *name) +{ + struct device_opp *dev_opp; + struct regulator *reg; + int ret; + + mutex_lock(&dev_opp_list_lock); + + dev_opp = _add_device_opp(dev); + if (!dev_opp) { + ret = -ENOMEM; + goto unlock; + } + + /* This should be called before OPPs are initialized */ + if (WARN_ON(!list_empty(&dev_opp->opp_list))) { + ret = -EBUSY; + goto err; + } + + /* Already have a regulator set */ + if (WARN_ON(!IS_ERR_OR_NULL(dev_opp->regulator))) { + ret = -EBUSY; + goto err; + } + /* Allocate the regulator */ + reg = regulator_get_optional(dev, name); + if (IS_ERR(reg)) { + ret = PTR_ERR(reg); + if (ret != -EPROBE_DEFER) + dev_err(dev, "%s: no regulator (%s) found: %d\n", + __func__, name, ret); + goto err; + } + + dev_opp->regulator = reg; + + mutex_unlock(&dev_opp_list_lock); + return 0; + +err: + _remove_device_opp(dev_opp); +unlock: + mutex_unlock(&dev_opp_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); + +/** + * dev_pm_opp_put_regulator() - Releases resources blocked for regulator + * @dev: Device for which regulator was set. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +void dev_pm_opp_put_regulator(struct device *dev) +{ + struct device_opp *dev_opp; + + mutex_lock(&dev_opp_list_lock); + + /* Check for existing list for 'dev' first */ + dev_opp = _find_device_opp(dev); + if (IS_ERR(dev_opp)) { + dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp)); + goto unlock; + } + + if (IS_ERR_OR_NULL(dev_opp->regulator)) { + dev_err(dev, "%s: Doesn't have regulator set\n", __func__); + goto unlock; + } + + /* Make sure there are no concurrent readers while updating dev_opp */ + WARN_ON(!list_empty(&dev_opp->opp_list)); + + regulator_put(dev_opp->regulator); + dev_opp->regulator = ERR_PTR(-EINVAL); + + /* Try freeing device_opp if this was the last blocking resource */ + _remove_device_opp(dev_opp); + +unlock: + mutex_unlock(&dev_opp_list_lock); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator); + static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp, struct device_node *np) { diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index 690638e..416293b 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -22,6 +22,8 @@ #include #include +struct regulator; + /* Lock to allow exclusive modification to the device and opp lists */ extern struct mutex dev_opp_list_lock; @@ -132,6 +134,7 @@ struct device_list_opp { * @supported_hw: Array of version number to support. * @supported_hw_count: Number of elements in supported_hw array. 
* @prop_name: A name to postfix to many DT properties, while parsing them. + * @regulator: Supply regulator + * @dentry: debugfs dentry pointer of the real device directory (not links). + * @dentry_name: Name of the real dentry. + * @@ -159,6 +162,7 @@ struct device_opp { unsigned int *supported_hw; unsigned int supported_hw_count; const char *prop_name; + struct regulator *regulator; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 95403d2..c70a18a 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -60,6 +60,8 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, void dev_pm_opp_put_supported_hw(struct device *dev); int dev_pm_opp_set_prop_name(struct device *dev, const char *name); void dev_pm_opp_put_prop_name(struct device *dev); +int dev_pm_opp_set_regulator(struct device *dev, const char *name); +void dev_pm_opp_put_regulator(struct device *dev); #else static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) { @@ -151,6 +153,13 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name) static inline void dev_pm_opp_put_prop_name(struct device *dev) {} +static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_put_regulator(struct device *dev) {} + #endif /* CONFIG_PM_OPP */ #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) -- cgit v0.10.2 From 7d34d56ef3349cd5f29cf7aab6650f3414fa81b9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:34 +0530 Subject: PM / OPP: Disable OPPs that aren't supported by the regulator Disable any OPPs where the connected regulator isn't able to provide the specified voltage. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 1e22b71..71545be 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -687,6 +687,22 @@ static struct dev_pm_opp *_allocate_opp(struct device *dev, return opp; } +static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, + struct device_opp *dev_opp) +{ + struct regulator *reg = dev_opp->regulator; + + if (!IS_ERR(reg) && + !regulator_is_supported_voltage(reg, opp->u_volt_min, + opp->u_volt_max)) { + pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", + __func__, opp->u_volt_min, opp->u_volt_max); + return false; + } + + return true; +} + static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct device_opp *dev_opp) { @@ -728,6 +744,12 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n", __func__, ret); + if (!_opp_supported_by_regulators(new_opp, dev_opp)) { + new_opp->available = false; + dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n", + __func__, new_opp->rate); + } + return 0; } -- cgit v0.10.2 From 655c9df961751ce21466f6e97e8033932c27a675 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:35 +0530 Subject: PM / OPP: Introduce dev_pm_opp_get_max_volt_latency() In a few use cases (like cpufreq), it is desired to get the maximum voltage latency for changing OPPs. Add support for that. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 71545be..ffe2406 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -231,6 +231,65 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); /** + * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds + * @dev: device for which we do this operation + * + * Return: This function returns the max voltage latency in nanoseconds. + * + * Locking: This function takes rcu_read_lock(). + */ +unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) +{ + struct device_opp *dev_opp; + struct dev_pm_opp *opp; + struct regulator *reg; + unsigned long latency_ns = 0; + unsigned long min_uV = ~0, max_uV = 0; + int ret; + + rcu_read_lock(); + + dev_opp = _find_device_opp(dev); + if (IS_ERR(dev_opp)) { + rcu_read_unlock(); + return 0; + } + + reg = dev_opp->regulator; + if (IS_ERR_OR_NULL(reg)) { + /* Regulator may not be required for device */ + if (reg) + dev_err(dev, "%s: Invalid regulator (%ld)\n", __func__, + PTR_ERR(reg)); + rcu_read_unlock(); + return 0; + } + + list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) { + if (!opp->available) + continue; + + if (opp->u_volt_min < min_uV) + min_uV = opp->u_volt_min; + if (opp->u_volt_max > max_uV) + max_uV = opp->u_volt_max; + } + + rcu_read_unlock(); + + /* + * The caller needs to ensure that dev_opp (and hence the regulator) + * isn't freed, while we are executing this routine. + */ + ret = regulator_set_voltage_time(reg, min_uV, max_uV); + if (ret > 0) + latency_ns = ret * 1000; + + return latency_ns; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency); + +/** * dev_pm_opp_get_suspend_opp() - Get suspend opp * @dev: device for which we do this operation * diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index c70a18a..5daa430 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -34,6 +34,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp); int dev_pm_opp_get_opp_count(struct device *dev); unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev); +unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev); struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev); struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, @@ -88,6 +89,11 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) return 0; } +static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) +{ + return 0; +} + static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev) { return NULL; -- cgit v0.10.2 From 2174344765f472895c076d703c9cdc58215e1393 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:36 +0530 Subject: PM / OPP: Introduce dev_pm_opp_get_max_transition_latency() In a few use cases (like cpufreq), it is desired to get the maximum latency for changing OPPs. Add support for that. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index ffe2406..b0f5c72 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -290,6 +290,23 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency); /** + * dev_pm_opp_get_max_transition_latency() - Get max transition latency in + * nanoseconds + * @dev: device for which we do this operation + * + * Return: This function returns the max transition latency, in nanoseconds, to + * switch from one OPP to other. + * + * Locking: This function takes rcu_read_lock(). + */ +unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev) +{ + return dev_pm_opp_get_max_volt_latency(dev) + + dev_pm_opp_get_max_clock_latency(dev); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency); + /** * dev_pm_opp_get_suspend_opp() - Get suspend opp * @dev: device for which we do this operation * diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 5daa430..59da3d9 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -35,6 +35,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp); int dev_pm_opp_get_opp_count(struct device *dev); unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev); unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev); +unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev); struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev); struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, @@ -94,6 +95,11 @@ static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) return 0; } +static inline unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev) +{ + return 0; +} + static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev) { return NULL; -- cgit v0.10.2 From 50f8cfbd5897ca182d43f4caf19937153f17a604 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:37 +0530 Subject: PM / OPP: Parse clock-latency and voltage-tolerance for v1 bindings V2 bindings have better support for clock-latency and voltage-tolerance and don't need special care. To use callbacks, like dev_pm_opp_get_max_{transition|volt}_latency(), irrespective of the bindings, the core needs to know clock-latency/voltage-tolerance for the earlier bindings. This patch reads clock-latency/voltage-tolerance from the device node, irrespective of the bindings (to keep it simple), and uses them only for V1 bindings. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index b0f5c72..4fafa73 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -582,6 +582,7 @@ static struct device_opp *_add_device_opp(struct device *dev) { struct device_opp *dev_opp; struct device_list_opp *list_dev; + struct device_node *np; /* Check for existing list for 'dev' first */ dev_opp = _find_device_opp(dev); @@ -604,6 +605,21 @@ static struct device_opp *_add_device_opp(struct device *dev) return NULL; } + /* + * Only required for backward compatibility with v1 bindings, but isn't + * harmful for other cases. And so we do it unconditionally.
+ */ + np = of_node_get(dev->of_node); + if (np) { + u32 val; + + if (!of_property_read_u32(np, "clock-latency", &val)) + dev_opp->clock_latency_ns_max = val; + of_property_read_u32(np, "voltage-tolerance", + &dev_opp->voltage_tolerance_v1); + of_node_put(np); + } + srcu_init_notifier_head(&dev_opp->srcu_head); INIT_LIST_HEAD(&dev_opp->opp_list); @@ -861,6 +877,7 @@ static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt, { struct device_opp *dev_opp; struct dev_pm_opp *new_opp; + unsigned long tol; int ret; /* Hold our list modification lock here */ @@ -874,7 +891,10 @@ /* populate the opp table */ new_opp->rate = freq; + tol = u_volt * dev_opp->voltage_tolerance_v1 / 100; new_opp->u_volt = u_volt; + new_opp->u_volt_min = u_volt - tol; + new_opp->u_volt_max = u_volt + tol; new_opp->available = true; new_opp->dynamic = dynamic; diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index 416293b..fe44beb 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -138,6 +138,8 @@ struct device_list_opp { * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. * + * @voltage_tolerance_v1: In percentage, for v1 bindings only. + * * This is an internal data structure maintaining the link to opps attached to * a device. This structure is not meant to be shared to users as it is * meant for book keeping and private to OPP library. @@ -156,6 +158,10 @@ struct device_opp { struct device_node *np; unsigned long clock_latency_ns_max; + + /* For backward compatibility with v1 bindings */ + unsigned int voltage_tolerance_v1; + bool shared_opp; struct dev_pm_opp *suspend_opp; -- cgit v0.10.2 From d54974c2513f487e9e70fbdc79c5da51c53e23da Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:38 +0530 Subject: PM / OPP: Manage device clk The OPP core now has almost everything needed to manage a device's OPP transitions; the only thing left is the device's clk. Get that as well. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 4fafa73..7d7749c 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -583,6 +584,7 @@ static struct device_opp *_add_device_opp(struct device *dev) struct device_opp *dev_opp; struct device_list_opp *list_dev; struct device_node *np; + int ret; /* Check for existing list for 'dev' first */ dev_opp = _find_device_opp(dev); @@ -620,6 +622,15 @@ static struct device_opp *_add_device_opp(struct device *dev) of_node_put(np); } + /* Find clk for the device */ + dev_opp->clk = clk_get(dev, NULL); + if (IS_ERR(dev_opp->clk)) { + ret = PTR_ERR(dev_opp->clk); + if (ret != -EPROBE_DEFER) + dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, + ret); + } + srcu_init_notifier_head(&dev_opp->srcu_head); INIT_LIST_HEAD(&dev_opp->opp_list); @@ -661,6 +672,10 @@ static void _remove_device_opp(struct device_opp *dev_opp) if (!IS_ERR_OR_NULL(dev_opp->regulator)) return; + /* Release clk */ + if (!IS_ERR(dev_opp->clk)) + clk_put(dev_opp->clk); + list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp, node); diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index fe44beb..4f1bdfc 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -22,6 +22,7 @@ #include #include +struct clk; struct regulator; /* Lock to allow exclusive modification to the device and opp lists */ @@ -134,6 +135,7 @@ struct device_list_opp { * @supported_hw: Array of version number to support. * @supported_hw_count: Number of elements in supported_hw array. * @prop_name: A name to postfix to many DT properties, while parsing them. + * @clk: Device's clock handle * @regulator: Supply regulator * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. @@ -168,6 +170,7 @@ struct device_opp { unsigned int *supported_hw; unsigned int supported_hw_count; const char *prop_name; + struct clk *clk; struct regulator *regulator; #ifdef CONFIG_DEBUG_FS -- cgit v0.10.2 From 6a0712f6f199e737aa5913d28ec4bd3a25de9660 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:39 +0530 Subject: PM / OPP: Add dev_pm_opp_set_rate() This adds a routine, dev_pm_opp_set_rate(), responsible for configuring power-supply and clock source for an OPP. The OPP is found by matching against the target_freq passed to the routine. This shall replace similar code present in most of the OPP users and help simplify them a lot. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 7d7749c..ab711c2 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -529,6 +529,182 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor); +/* + * The caller needs to ensure that device_opp (and hence the clk) isn't freed, + * while clk returned here is used. 
+ */ +static struct clk *_get_opp_clk(struct device *dev) +{ + struct device_opp *dev_opp; + struct clk *clk; + + rcu_read_lock(); + + dev_opp = _find_device_opp(dev); + if (IS_ERR(dev_opp)) { + dev_err(dev, "%s: device opp doesn't exist\n", __func__); + clk = ERR_CAST(dev_opp); + goto unlock; + } + + clk = dev_opp->clk; + if (IS_ERR(clk)) + dev_err(dev, "%s: No clock available for the device\n", + __func__); + +unlock: + rcu_read_unlock(); + return clk; +} + +static int _set_opp_voltage(struct device *dev, struct regulator *reg, + unsigned long u_volt, unsigned long u_volt_min, + unsigned long u_volt_max) +{ + int ret; + + /* Regulator not available for device */ + if (IS_ERR(reg)) { + dev_dbg(dev, "%s: regulator not available: %ld\n", __func__, + PTR_ERR(reg)); + return 0; + } + + dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min, + u_volt, u_volt_max); + + ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt, + u_volt_max); + if (ret) + dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n", + __func__, u_volt_min, u_volt, u_volt_max, ret); + + return ret; +} + +/** + * dev_pm_opp_set_rate() - Configure new OPP based on frequency + * @dev: device for which we do this operation + * @target_freq: frequency to achieve + * + * This configures the power-supplies and clock source to the levels specified + * by the OPP corresponding to the target_freq. + * + * Locking: This function takes rcu_read_lock(). + */ +int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) +{ + struct device_opp *dev_opp; + struct dev_pm_opp *old_opp, *opp; + struct regulator *reg; + struct clk *clk; + unsigned long freq, old_freq; + unsigned long u_volt, u_volt_min, u_volt_max; + unsigned long ou_volt, ou_volt_min, ou_volt_max; + int ret; + + if (unlikely(!target_freq)) { + dev_err(dev, "%s: Invalid target frequency %lu\n", __func__, + target_freq); + return -EINVAL; + } + + clk = _get_opp_clk(dev); + if (IS_ERR(clk)) + return PTR_ERR(clk); + + freq = clk_round_rate(clk, target_freq); + if ((long)freq <= 0) + freq = target_freq; + + old_freq = clk_get_rate(clk); + + /* Return early if nothing to do */ + if (old_freq == freq) { + dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", + __func__, freq); + return 0; + } + + rcu_read_lock(); + + dev_opp = _find_device_opp(dev); + if (IS_ERR(dev_opp)) { + dev_err(dev, "%s: device opp doesn't exist\n", __func__); + rcu_read_unlock(); + return PTR_ERR(dev_opp); + } + + old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq); + if (!IS_ERR(old_opp)) { + ou_volt = old_opp->u_volt; + ou_volt_min = old_opp->u_volt_min; + ou_volt_max = old_opp->u_volt_max; + } else { + dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n", + __func__, old_freq, PTR_ERR(old_opp)); + } + + opp = dev_pm_opp_find_freq_ceil(dev, &freq); + if (IS_ERR(opp)) { + ret = PTR_ERR(opp); + dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n", + __func__, freq, ret); + rcu_read_unlock(); + return ret; + } + + u_volt = opp->u_volt; + u_volt_min = opp->u_volt_min; + u_volt_max = opp->u_volt_max; + + reg = dev_opp->regulator; + + rcu_read_unlock(); + + /* Scaling up? 
Scale voltage before frequency */ + if (freq > old_freq) { + ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, + u_volt_max); + if (ret) + goto restore_voltage; + } + + /* Change frequency */ + + dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", + __func__, old_freq, freq); + + ret = clk_set_rate(clk, freq); + if (ret) { + dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, + ret); + goto restore_voltage; + } + + /* Scaling down? Scale voltage after frequency */ + if (freq < old_freq) { + ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, + u_volt_max); + if (ret) + goto restore_freq; + } + + return 0; + +restore_freq: + if (clk_set_rate(clk, old_freq)) + dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", + __func__, old_freq); +restore_voltage: + /* This shouldn't harm even if the voltages weren't updated earlier */ + if (!IS_ERR(old_opp)) + _set_opp_voltage(dev, reg, ou_volt, ou_volt_min, ou_volt_max); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); + /* List-dev Helpers */ static void _kfree_list_dev_rcu(struct rcu_head *head) { diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 59da3d9..cccaf4a 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -64,6 +64,7 @@ int dev_pm_opp_set_prop_name(struct device *dev, const char *name); void dev_pm_opp_put_prop_name(struct device *dev); int dev_pm_opp_set_regulator(struct device *dev, const char *name); void dev_pm_opp_put_regulator(struct device *dev); +int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); #else static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) { @@ -172,6 +173,11 @@ static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name) static inline void dev_pm_opp_put_regulator(struct device *dev) {} +static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) +{ + return -EINVAL; +} + #endif /* CONFIG_PM_OPP */ #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) -- cgit v0.10.2 From 896d6a4c0f41a93809b83f9e58aad73874a89d99 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:40 +0530 Subject: cpufreq: dt: Convert few pr_debug/err() calls to dev_dbg/err() We have the device structure available now, let's use it for better print messages. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 0ca74d0..ace0168 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -246,7 +246,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) */ ret = dev_pm_opp_get_opp_count(cpu_dev); if (ret <= 0) { - pr_debug("OPP table is not ready, deferring probe\n"); + dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n"); ret = -EPROBE_DEFER; goto out_free_opp; } @@ -325,7 +325,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { - pr_err("failed to init cpufreq table: %d\n", ret); + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); goto out_free_priv; } -- cgit v0.10.2 From 457e99e60a8f5a40b7da204c0bfc8a86ad2161b9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:41 +0530 Subject: cpufreq: dt: Rename 'need_update' to 'opp_v1' That's the real purpose of this field, i.e. to take special care of old OPP V1 bindings. Let's name it accordingly, so that it can be used elsewhere.
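As an aside, the dev_pm_opp_set_rate() and latency helpers introduced above are consumed roughly as follows. This is a minimal sketch of a hypothetical scaling driver (invented names, not code from this series), showing the intended call pattern:

#include <linux/device.h>
#include <linux/pm_opp.h>

static int my_set_freq(struct device *dev, unsigned long target_hz)
{
	/*
	 * Worst case to switch between any two OPPs: regulator slew time
	 * (dev_pm_opp_get_max_volt_latency()) plus the max clock latency.
	 * A governor could use this to bound its sampling interval.
	 */
	dev_dbg(dev, "max transition latency: %lu ns\n",
		dev_pm_opp_get_max_transition_latency(dev));

	/*
	 * Picks the OPP for target_hz, scales voltage before frequency
	 * when going up (and after it when going down), and rolls the
	 * voltage back if clk_set_rate() fails.
	 */
	return dev_pm_opp_set_rate(dev, target_hz);
}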
Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index ace0168..0047d20 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -199,7 +199,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) struct dev_pm_opp *suspend_opp; unsigned long min_uV = ~0, max_uV = 0; unsigned int transition_latency; - bool need_update = false; + bool opp_v1 = false; int ret; ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk); @@ -223,7 +223,7 @@ * finding shared-OPPs for backward compatibility. */ if (ret == -ENOENT) - need_update = true; + opp_v1 = true; else goto out_node_put; } @@ -251,7 +251,7 @@ goto out_free_opp; } - if (need_update) { + if (opp_v1) { struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data(); if (!pd || !pd->independent_clocks) -- cgit v0.10.2 From 391d9aef8145204e0a5d67be3bd1fc45c5396dae Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:42 +0530 Subject: cpufreq: dt: OPP layers handles clock-latency for V1 bindings as well "clock-latency" is handled by the OPP layer for all bindings, so there is no need to make special calls for V1 bindings. Use dev_pm_opp_get_max_clock_latency() in both cases. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 0047d20..4c9f8a8 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -265,10 +265,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) if (ret) dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n", __func__, ret); - - of_property_read_u32(np, "clock-latency", &transition_latency); - } else { - transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev); } priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -279,6 +275,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance); + transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev); if (!transition_latency) transition_latency = CPUFREQ_ETERNAL; -- cgit v0.10.2 From 050794aaebbb9f2c2c50b340b6998273e7c64189 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:43 +0530 Subject: cpufreq: dt: Pass regulator name to the OPP core The OPP core can handle the regulators by itself, but it needs to know the name of the regulator to fetch. Add support for that. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 4c9f8a8..2af75f8 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -34,6 +34,7 @@ struct private_data { struct regulator *cpu_reg; struct thermal_cooling_device *cdev; unsigned int voltage_tolerance; /* in percentage */ + const char *reg_name; }; static struct freq_attr *cpufreq_dt_attr[] = { @@ -119,6 +120,30 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index) return ret; } +/* + * An earlier version of opp-v1 bindings used to name the regulator + * "cpu0-supply", we still need to handle that for backwards compatibility.
+ */ +static const char *find_supply_name(struct device *dev, struct device_node *np) +{ + struct property *pp; + int cpu = dev->id; + + /* Try "cpu0" for older DTs */ + if (!cpu) { + pp = of_find_property(np, "cpu0-supply", NULL); + if (pp) + return "cpu0"; + } + + pp = of_find_property(np, "cpu-supply", NULL); + if (pp) + return "cpu"; + + dev_dbg(dev, "no regulator for cpu%d\n", cpu); + return NULL; +} + static int allocate_resources(int cpu, struct device **cdev, struct regulator **creg, struct clk **cclk) { @@ -200,6 +225,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) unsigned long min_uV = ~0, max_uV = 0; unsigned int transition_latency; bool opp_v1 = false; + const char *name; int ret; ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk); @@ -229,6 +255,20 @@ } /* + * OPP layer will be taking care of regulators now, but it needs to know + * the name of the regulator first. + */ + name = find_supply_name(cpu_dev, np); + if (name) { + ret = dev_pm_opp_set_regulator(cpu_dev, name); + if (ret) { + dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", + policy->cpu, ret); + goto out_node_put; + } + } + + /* * Initialize OPP tables for all policy->cpus. They will be shared by * all CPUs which have marked their CPUs shared with OPP bindings. * @@ -273,6 +313,7 @@ goto out_free_opp; } + priv->reg_name = name; of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance); transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev); @@ -366,6 +407,8 @@ out_free_priv: kfree(priv); out_free_opp: dev_pm_opp_of_cpumask_remove_table(policy->cpus); + if (name) + dev_pm_opp_put_regulator(cpu_dev); out_node_put: of_node_put(np); out_put_reg_clk: @@ -383,6 +426,9 @@ static int cpufreq_exit(struct cpufreq_policy *policy) cpufreq_cooling_unregister(priv->cdev); dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); + if (priv->reg_name) + dev_pm_opp_put_regulator(priv->cpu_dev); + clk_put(policy->clk); if (!IS_ERR(priv->cpu_reg)) regulator_put(priv->cpu_reg); -- cgit v0.10.2 From 6def6ea75e6dea45f01a16ae3cfb5b5ce48dd5e9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:44 +0530 Subject: cpufreq: dt: Unsupported OPPs are already disabled The core already has a valid regulator set for the device OPP, and the unsupported OPPs are already disabled by the core. There is no need to repeat that in the user drivers; get rid of it. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 2af75f8..c3fe894 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -349,8 +349,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) min_uV = opp_uV; if (opp_uV > max_uV) max_uV = opp_uV; - } else { - dev_pm_opp_disable(cpu_dev, opp_freq); } opp_freq++; -- cgit v0.10.2 From 755b888ff098c9f762717a9fbda7e05b16619069 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:45 +0530 Subject: cpufreq: dt: Reuse dev_pm_opp_get_max_transition_latency() The OPP layer now has all the information needed to calculate the transition latency (clock_latency + voltage_latency). Let's reuse the OPP layer helper dev_pm_opp_get_max_transition_latency() instead of open coding the same in the cpufreq-dt driver.
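One ordering detail worth illustrating: per the dev_pm_opp_set_regulator() kerneldoc above, the supply must be named before any OPPs are added (the call returns -EBUSY otherwise), and released in reverse order on teardown. Here is a hedged sketch of a hypothetical probe path (invented names; it assumes the DT helper dev_pm_opp_of_add_table() is used to populate the table, and drivers without a dedicated supply simply skip the regulator call, much like find_supply_name() above returning NULL):

#include <linux/pm_opp.h>

static int my_probe(struct device *dev)
{
	int ret;

	/* 1. Name the supply first; the OPP core takes the reference. */
	ret = dev_pm_opp_set_regulator(dev, "cpu");
	if (ret)
		return ret;	/* may be -EPROBE_DEFER */

	/* 2. Only now populate the OPP table from DT. */
	ret = dev_pm_opp_of_add_table(dev);
	if (ret)
		goto put_regulator;

	return 0;

put_regulator:
	/* Release in reverse order on failure (and on remove). */
	dev_pm_opp_put_regulator(dev);
	return ret;
}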
Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index c3fe894..6f80ce5 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -222,7 +222,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) struct regulator *cpu_reg; struct clk *cpu_clk; struct dev_pm_opp *suspend_opp; - unsigned long min_uV = ~0, max_uV = 0; unsigned int transition_latency; bool opp_v1 = false; const char *name; @@ -316,49 +315,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) priv->reg_name = name; of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance); - transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev); - if (!transition_latency) - transition_latency = CPUFREQ_ETERNAL; - - if (!IS_ERR(cpu_reg)) { - unsigned long opp_freq = 0; - - /* - * Disable any OPPs where the connected regulator isn't able to - * provide the specified voltage and record minimum and maximum - * voltage levels. - */ - while (1) { - struct dev_pm_opp *opp; - unsigned long opp_uV, tol_uV; - - rcu_read_lock(); - opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq); - if (IS_ERR(opp)) { - rcu_read_unlock(); - break; - } - opp_uV = dev_pm_opp_get_voltage(opp); - rcu_read_unlock(); - - tol_uV = opp_uV * priv->voltage_tolerance / 100; - if (regulator_is_supported_voltage(cpu_reg, - opp_uV - tol_uV, - opp_uV + tol_uV)) { - if (opp_uV < min_uV) - min_uV = opp_uV; - if (opp_uV > max_uV) - max_uV = opp_uV; - } - - opp_freq++; - } - - ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV); - if (ret > 0) - transition_latency += ret * 1000; - } - ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); @@ -393,6 +349,10 @@ static int cpufreq_init(struct cpufreq_policy *policy) cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; } + transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev); + if (!transition_latency) + transition_latency = CPUFREQ_ETERNAL; + policy->cpuinfo.transition_latency = transition_latency; of_node_put(np); -- cgit v0.10.2 From 78c3ba5df96c875b1668e1cd3ee0a69e62454f32 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:46 +0530 Subject: cpufreq: dt: Use dev_pm_opp_set_rate() to switch frequency OPP core supports frequency/voltage changes based on the target frequency now, use that instead of open coding the same in cpufreq-dt driver. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 6f80ce5..150a172 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -45,79 +45,10 @@ static struct freq_attr *cpufreq_dt_attr[] = { static int set_target(struct cpufreq_policy *policy, unsigned int index) { - struct dev_pm_opp *opp; - struct cpufreq_frequency_table *freq_table = policy->freq_table; - struct clk *cpu_clk = policy->clk; struct private_data *priv = policy->driver_data; - struct device *cpu_dev = priv->cpu_dev; - struct regulator *cpu_reg = priv->cpu_reg; - unsigned long volt = 0, tol = 0; - int volt_old = 0; - unsigned int old_freq, new_freq; - long freq_Hz, freq_exact; - int ret; - - freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000); - if (freq_Hz <= 0) - freq_Hz = freq_table[index].frequency * 1000; - - freq_exact = freq_Hz; - new_freq = freq_Hz / 1000; - old_freq = clk_get_rate(cpu_clk) / 1000; - - if (!IS_ERR(cpu_reg)) { - unsigned long opp_freq; - - rcu_read_lock(); - opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz); - if (IS_ERR(opp)) { - rcu_read_unlock(); - dev_err(cpu_dev, "failed to find OPP for %ld\n", - freq_Hz); - return PTR_ERR(opp); - } - volt = dev_pm_opp_get_voltage(opp); - opp_freq = dev_pm_opp_get_freq(opp); - rcu_read_unlock(); - tol = volt * priv->voltage_tolerance / 100; - volt_old = regulator_get_voltage(cpu_reg); - dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n", - opp_freq / 1000, volt); - } - - dev_dbg(cpu_dev, "%u MHz, %d mV --> %u MHz, %ld mV\n", - old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1, - new_freq / 1000, volt ? volt / 1000 : -1); - - /* scaling up? scale voltage before frequency */ - if (!IS_ERR(cpu_reg) && new_freq > old_freq) { - ret = regulator_set_voltage_tol(cpu_reg, volt, tol); - if (ret) { - dev_err(cpu_dev, "failed to scale voltage up: %d\n", - ret); - return ret; - } - } - - ret = clk_set_rate(cpu_clk, freq_exact); - if (ret) { - dev_err(cpu_dev, "failed to set clock rate: %d\n", ret); - if (!IS_ERR(cpu_reg) && volt_old > 0) - regulator_set_voltage_tol(cpu_reg, volt_old, tol); - return ret; - } - /* scaling down? scale voltage after frequency */ - if (!IS_ERR(cpu_reg) && new_freq < old_freq) { - ret = regulator_set_voltage_tol(cpu_reg, volt, tol); - if (ret) { - dev_err(cpu_dev, "failed to scale voltage down: %d\n", - ret); - clk_set_rate(cpu_clk, old_freq * 1000); - } - } - - return ret; + return dev_pm_opp_set_rate(priv->cpu_dev, + policy->freq_table[index].frequency * 1000); } /* -- cgit v0.10.2 From df2c8ec28e73d47392b8cb24828c15c54819da41 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:47 +0530 Subject: cpufreq: dt: No need to fetch voltage-tolerance It's already done by the core and we don't need to get it anymore. And so, we don't need to get the OF node in cpufreq_init() anymore; move that to find_supply_name() instead. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 150a172..bbafd7b 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -33,7 +33,6 @@ struct private_data { struct device *cpu_dev; struct regulator *cpu_reg; struct thermal_cooling_device *cdev; - unsigned int voltage_tolerance; /* in percentage */ const char *reg_name; }; @@ -55,24 +54,38 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index) * An earlier version of opp-v1 bindings used to name the regulator * "cpu0-supply", we still need to handle that for backwards compatibility. */ -static const char *find_supply_name(struct device *dev, struct device_node *np) +static const char *find_supply_name(struct device *dev) { + struct device_node *np; struct property *pp; int cpu = dev->id; + const char *name = NULL; + + np = of_node_get(dev->of_node); + + /* This must be valid for sure */ + if (WARN_ON(!np)) + return NULL; /* Try "cpu0" for older DTs */ if (!cpu) { pp = of_find_property(np, "cpu0-supply", NULL); - if (pp) - return "cpu0"; + if (pp) { + name = "cpu0"; + goto node_put; + } } pp = of_find_property(np, "cpu-supply", NULL); - if (pp) - return "cpu"; + if (pp) { + name = "cpu"; + goto node_put; + } dev_dbg(dev, "no regulator for cpu%d\n", cpu); - return NULL; +node_put: + of_node_put(np); + return name; } static int allocate_resources(int cpu, struct device **cdev, @@ -147,7 +160,6 @@ try_again: static int cpufreq_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; - struct device_node *np; struct private_data *priv; struct device *cpu_dev; struct regulator *cpu_reg; @@ -164,13 +176,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) return ret; } - np = of_node_get(cpu_dev->of_node); - if (!np) { - dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu); - ret = -ENOENT; - goto out_put_reg_clk; - } - /* Get OPP-sharing information from "operating-points-v2" bindings */ ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, policy->cpus); if (ret) { @@ -181,20 +186,20 @@ static int cpufreq_init(struct cpufreq_policy *policy) if (ret == -ENOENT) opp_v1 = true; else - goto out_node_put; + goto out_put_reg_clk; } /* * OPP layer will be taking care of regulators now, but it needs to know * the name of the regulator first. */ - name = find_supply_name(cpu_dev, np); + name = find_supply_name(cpu_dev); if (name) { ret = dev_pm_opp_set_regulator(cpu_dev, name); if (ret) { dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", policy->cpu, ret); - goto out_node_put; + goto out_put_reg_clk; } } @@ -244,7 +249,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) } priv->reg_name = name; - of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance); ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { @@ -286,8 +290,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; - of_node_put(np); - return 0; out_free_cpufreq_table: @@ -298,8 +300,6 @@ out_free_opp: dev_pm_opp_of_cpumask_remove_table(policy->cpus); if (name) dev_pm_opp_put_regulator(cpu_dev); -out_node_put: - of_node_put(np); out_put_reg_clk: clk_put(cpu_clk); if (!IS_ERR(cpu_reg)) -- cgit v0.10.2 From dd02a3d920083b6cb0ee4f0eaf2c599b740bf5fe Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 10:30:48 +0530 Subject: cpufreq: dt: No need to allocate resources anymore OPP layer manages it now and cpufreq-dt driver doesn't need it. 
But, we still need to check for availability of resources for deferred probing. Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index bbafd7b..f951f91 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -31,7 +31,6 @@ struct private_data { struct device *cpu_dev; - struct regulator *cpu_reg; struct thermal_cooling_device *cdev; const char *reg_name; }; @@ -88,73 +87,59 @@ node_put: return name; } -static int allocate_resources(int cpu, struct device **cdev, - struct regulator **creg, struct clk **cclk) +static int resources_available(void) { struct device *cpu_dev; struct regulator *cpu_reg; struct clk *cpu_clk; int ret = 0; - char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg; + const char *name; - cpu_dev = get_cpu_device(cpu); + cpu_dev = get_cpu_device(0); if (!cpu_dev) { - pr_err("failed to get cpu%d device\n", cpu); + pr_err("failed to get cpu0 device\n"); return -ENODEV; } - /* Try "cpu0" for older DTs */ - if (!cpu) - reg = reg_cpu0; - else - reg = reg_cpu; - -try_again: - cpu_reg = regulator_get_optional(cpu_dev, reg); - ret = PTR_ERR_OR_ZERO(cpu_reg); + cpu_clk = clk_get(cpu_dev, NULL); + ret = PTR_ERR_OR_ZERO(cpu_clk); if (ret) { /* - * If cpu's regulator supply node is present, but regulator is - * not yet registered, we should try defering probe. + * If cpu's clk node is present, but clock is not yet + * registered, we should try defering probe. */ - if (ret == -EPROBE_DEFER) { - dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n", - cpu); - return ret; - } - - /* Try with "cpu-supply" */ - if (reg == reg_cpu0) { - reg = reg_cpu; - goto try_again; - } + if (ret == -EPROBE_DEFER) + dev_dbg(cpu_dev, "clock not ready, retry\n"); + else + dev_err(cpu_dev, "failed to get clock: %d\n", ret); - dev_dbg(cpu_dev, "no regulator for cpu%d: %d\n", cpu, ret); + return ret; } - cpu_clk = clk_get(cpu_dev, NULL); - ret = PTR_ERR_OR_ZERO(cpu_clk); - if (ret) { - /* put regulator */ - if (!IS_ERR(cpu_reg)) - regulator_put(cpu_reg); + clk_put(cpu_clk); + + name = find_supply_name(cpu_dev); + /* Platform doesn't require regulator */ + if (!name) + return 0; + cpu_reg = regulator_get_optional(cpu_dev, name); + ret = PTR_ERR_OR_ZERO(cpu_reg); + if (ret) { /* - * If cpu's clk node is present, but clock is not yet - * registered, we should try defering probe. + * If cpu's regulator supply node is present, but regulator is + * not yet registered, we should try defering probe. 
*/ if (ret == -EPROBE_DEFER) - dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu); + dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n"); else - dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu, - ret); - } else { - *cdev = cpu_dev; - *creg = cpu_reg; - *cclk = cpu_clk; + dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret); + + return ret; } - return ret; + regulator_put(cpu_reg); + return 0; } static int cpufreq_init(struct cpufreq_policy *policy) @@ -162,7 +147,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) struct cpufreq_frequency_table *freq_table; struct private_data *priv; struct device *cpu_dev; - struct regulator *cpu_reg; struct clk *cpu_clk; struct dev_pm_opp *suspend_opp; unsigned int transition_latency; @@ -170,9 +154,16 @@ static int cpufreq_init(struct cpufreq_policy *policy) const char *name; int ret; - ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk); - if (ret) { - pr_err("%s: Failed to allocate resources: %d\n", __func__, ret); + cpu_dev = get_cpu_device(policy->cpu); + if (!cpu_dev) { + pr_err("failed to get cpu%d device\n", policy->cpu); + return -ENODEV; + } + + cpu_clk = clk_get(cpu_dev, NULL); + if (IS_ERR(cpu_clk)) { + ret = PTR_ERR(cpu_clk); + dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret); return ret; } @@ -186,7 +177,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) if (ret == -ENOENT) opp_v1 = true; else - goto out_put_reg_clk; + goto out_put_clk; } /* @@ -199,7 +190,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) if (ret) { dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", policy->cpu, ret); - goto out_put_reg_clk; + goto out_put_clk; } } @@ -257,9 +248,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) } priv->cpu_dev = cpu_dev; - priv->cpu_reg = cpu_reg; policy->driver_data = priv; - policy->clk = cpu_clk; rcu_read_lock(); @@ -300,10 +289,8 @@ out_free_opp: dev_pm_opp_of_cpumask_remove_table(policy->cpus); if (name) dev_pm_opp_put_regulator(cpu_dev); -out_put_reg_clk: +out_put_clk: clk_put(cpu_clk); - if (!IS_ERR(cpu_reg)) - regulator_put(cpu_reg); return ret; } @@ -319,8 +306,6 @@ static int cpufreq_exit(struct cpufreq_policy *policy) dev_pm_opp_put_regulator(priv->cpu_dev); clk_put(policy->clk); - if (!IS_ERR(priv->cpu_reg)) - regulator_put(priv->cpu_reg); kfree(priv); return 0; @@ -373,9 +358,6 @@ static struct cpufreq_driver dt_cpufreq_driver = { static int dt_cpufreq_probe(struct platform_device *pdev) { - struct device *cpu_dev; - struct regulator *cpu_reg; - struct clk *cpu_clk; int ret; /* @@ -385,19 +367,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev) * * FIXME: Is checking this only for CPU0 sufficient ? 
*/ - ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk); + ret = resources_available(); if (ret) return ret; - clk_put(cpu_clk); - if (!IS_ERR(cpu_reg)) - regulator_put(cpu_reg); - dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); ret = cpufreq_register_driver(&dt_cpufreq_driver); if (ret) - dev_err(cpu_dev, "failed register driver: %d\n", ret); + dev_err(&pdev->dev, "failed register driver: %d\n", ret); return ret; } -- cgit v0.10.2 From f97238373b8662a6d580e204df2e7bcbfa43e27a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Jan 2016 20:08:52 -0600 Subject: PM / sleep: declare __tracedata symbols as char[] rather than char Accessing more than one byte from a symbol declared simply 'char' is undefined behavior, as reported by UBSAN: UBSAN: Undefined behaviour in drivers/base/power/trace.c:178:18 load of address ffffffff8203fc78 with insufficient space for an object of type 'char' Avoid this by declaring the symbols as arrays. Signed-off-by: Eric Biggers Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a311cfa..a697579 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -166,14 +166,14 @@ void generate_pm_trace(const void *tracedata, unsigned int user) } EXPORT_SYMBOL(generate_pm_trace); -extern char __tracedata_start, __tracedata_end; +extern char __tracedata_start[], __tracedata_end[]; static int show_file_hash(unsigned int value) { int match; char *tracedata; match = 0; - for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; + for (tracedata = __tracedata_start ; tracedata < __tracedata_end ; tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); -- cgit v0.10.2 From f7b382b988233b5851eddf4531651ffe4133e88c Mon Sep 17 00:00:00 2001 From: Abhilash Jindal Date: Sun, 31 Jan 2016 14:29:01 -0500 Subject: PM/freezer: y2038, use boottime to compare tstamps Wall time obtained from do_gettimeofday() gives a 32-bit timeval, which can only represent time until January 2038. This patch moves to ktime_t, a 64-bit time representation. Also, wall time is susceptible to sudden jumps due to the user setting the time or due to NTP. Boot time increases monotonically, which makes it better suited for subtracting two timestamps. Signed-off-by: Abhilash Jindal Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786..df058be 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); -- cgit v0.10.2 From 22e09b333f0b395b3eb6ab6efa4b3284e2c06810 Mon Sep 17 00:00:00 2001 From: saurabh Date: Wed, 28 Oct 2015 08:54:01 +0530 Subject: PM / suspend: replacing printk Replace printk() calls with the appropriate pr_info() and pr_err() helpers in order to fix checkpatch.pl warnings. Signed-off-by: Saurabh Sengar Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133..230a772 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); -- cgit v0.10.2 From 78ecc56247f0ec2bc0cf6f2f2af69e98d99767bc Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Thu, 11 Feb 2016 11:25:59 +0000 Subject: PM / OPP: Fix NULL pointer dereference crash when disabling OPPs Commit 7d34d56ef334 (PM / OPP: Disable OPPs that aren't supported by the regulator) causes a crash on Tegra124 Jetson TK1 when using the DFLL clock source for the CPU. The DFLL manages the voltage itself, so there is no regulator specified for the OPPs, and we get a crash when we try to dereference the regulator pointer. Fix this by checking the regulator pointer with IS_ERR_OR_NULL() before dereferencing it. Fixes: 7d34d56ef334 (PM / OPP: Disable OPPs that aren't supported by the regulator) Signed-off-by: Jon Hunter Reported-by: Bartlomiej Zolnierkiewicz Acked-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index ab711c2..d7cd4e2 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -975,7 +975,7 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, { struct regulator *reg = dev_opp->regulator; - if (!IS_ERR(reg) && + if (!IS_ERR_OR_NULL(reg) && !regulator_is_supported_voltage(reg, opp->u_volt_min, opp->u_volt_max)) { pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", -- cgit v0.10.2 From 6541aef01a9b308f280d3f2d26a46858e6dbef6a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 12 Feb 2016 23:56:21 +0100 Subject: cpufreq: Drop unnecessary checks from show() and store() The show() and store() routines in the cpufreq core don't need to check if the struct freq_attr they want to use really provides the callbacks they need as expected (if that's not the case, it means a bug in the code anyway), so change them to avoid doing that. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 34b17447..78a262f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -818,12 +818,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) ssize_t ret; down_read(&policy->rwsem); - - if (fattr->show) - ret = fattr->show(policy, buf); - else - ret = -EIO; - + ret = fattr->show(policy, buf); up_read(&policy->rwsem); return ret; @@ -838,18 +833,12 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, get_online_cpus(); - if (!cpu_online(policy->cpu)) - goto unlock; - - down_write(&policy->rwsem); - - if (fattr->store) + if (cpu_online(policy->cpu)) { + down_write(&policy->rwsem); ret = fattr->store(policy, buf, count); - else - ret = -EIO; + up_write(&policy->rwsem); + } - up_write(&policy->rwsem); -unlock: put_online_cpus(); return ret; -- cgit v0.10.2 From c88c395f4a6485f23f81e385c79945d68bcd5c5d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 15 Feb 2016 10:21:53 +0530 Subject: PM / OPP: Initialize u_volt_min/max to a valid value We kept u_volt_min/max initialized to 0 when only the target voltage was present in DT, instead of the target/min/max triplet. This didn't go well with the regulator framework, as in a few calls the min voltage was set to the target and the max was set to 0, which resulted in a kernel crash like the one below: kernel BUG at ../drivers/regulator/core.c:216!
[] (regulator_check_voltage) from [] (regulator_set_voltage_unlocked+0x58/0x230) [] (regulator_set_voltage_unlocked) from [] (regulator_set_voltage+0x28/0x54) [] (regulator_set_voltage) from [] (_set_opp_voltage+0x30/0x98) [] (_set_opp_voltage) from [] (dev_pm_opp_set_rate+0xf0/0x28c) [] (dev_pm_opp_set_rate) from [] (__cpufreq_driver_target+0x184/0x2b4) [] (__cpufreq_driver_target) from [] (dbs_check_cpu+0x1b0/0x1f4) [] (dbs_check_cpu) from [] (cpufreq_governor_dbs+0x324/0x5c4) [] (cpufreq_governor_dbs) from [] (__cpufreq_governor+0xe4/0x1ec) [] (__cpufreq_governor) from [] (cpufreq_init_policy+0x64/0x8c) [] (cpufreq_init_policy) from [] (cpufreq_online+0x2fc/0x708) [] (cpufreq_online) from [] (subsys_interface_register+0x94/0xd8) [] (subsys_interface_register) from [] (cpufreq_register_driver+0x14c/0x19c) [] (cpufreq_register_driver) from [] (dt_cpufreq_probe+0x70/0xec) [] (dt_cpufreq_probe) from [] (platform_drv_probe+0x4c/0xb0) [] (platform_drv_probe) from [] (driver_probe_device+0x214/0x2c0) [] (driver_probe_device) from [] (__driver_attach+0x8c/0x90) [] (__driver_attach) from [] (bus_for_each_dev+0x68/0x9c) [] (bus_for_each_dev) from [] (bus_add_driver+0x1a0/0x218) [] (bus_add_driver) from [] (driver_register+0x78/0xf8) [] (driver_register) from [] (do_one_initcall+0x90/0x1d8) [] (do_one_initcall) from [] (kernel_init_freeable+0x15c/0x1fc) [] (kernel_init_freeable) from [] (kernel_init+0x8/0xf0) [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) Code: e1550004 baffffeb e3a00000 e8bd8070 (e7f001f2) Fix that by initializing u_volt_min/max to the target voltage in such cases. Reported-and-tested-by: Krzysztof Kozlowski Fixes: 274659029c9d (PM / OPP: Add support to parse "operating-points-v2" bindings) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index d7cd4e2..19fd7e7 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1157,8 +1157,14 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, } opp->u_volt = microvolt[0]; + + if (count == 1) { + opp->u_volt_min = opp->u_volt; + opp->u_volt_max = opp->u_volt; + } else { - opp->u_volt_min = microvolt[1]; - opp->u_volt_max = microvolt[2]; + opp->u_volt_min = microvolt[1]; + opp->u_volt_max = microvolt[2]; + } /* Search for "opp-microamp-" */ prop = NULL; -- cgit v0.10.2 From fc5cbf0c94b6f7fd62fdddff892207290510945d Mon Sep 17 00:00:00 2001 From: Axel Haslam Date: Mon, 15 Feb 2016 11:10:51 +0100 Subject: PM / Domains: Support for multiple states Some hardware (e.g. OMAP) has the ability to enter different low power modes for a given power domain. This allows for more fine-grained control over the power state of the platform. As a typical example, some registers of the hardware may be implemented with retention flip-flops and be able to retain their state at lower voltages, allowing for faster on/off latencies and an increased window of opportunity to enter an intermediate low power state other than "off". When trying to set a power domain to off, the genpd governor will choose the deepest state that will respect the QoS constraints of all the devices and sub-domains on the power domain. The state chosen by the governor is saved in the "state_idx" field of the generic_pm_domain structure and shall be used by the power_off and power_on callbacks to perform the necessary actions to set the power domain into (and out of) the state indicated by state_idx.
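As an illustration only (hypothetical platform code, loosely modeled on the imx6 conversion later in this series; foo_enter_retention() and foo_power_off_completely() are made-up helpers and the latency values are invented), a domain with a shallow retention state and a deeper off state could look like this:

	static int foo_domain_power_off(struct generic_pm_domain *domain)
	{
		/* The governor has already stored its chosen state in state_idx. */
		if (domain->state_idx == 0)
			return foo_enter_retention();		/* hypothetical */

		return foo_power_off_completely();		/* hypothetical */
	}

	static struct generic_pm_domain foo_domain = {
		.name = "FOO",
		.power_off = foo_domain_power_off,
		.states = {
			[0] = {	/* retention: shallow, fast to enter and exit */
				.power_off_latency_ns = 10000,
				.power_on_latency_ns = 50000,
			},
			[1] = {	/* off: deepest, slowest */
				.power_off_latency_ns = 25000,
				.power_on_latency_ns = 2000000,
			},
		},
		.state_count = 2,
	};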
States must be declared in ascending order from shallowest to deepest, deepest meaning the state which takes longest to enter and exit. For platforms that don't declare any states, a single "off" state is used. Once all platforms are converted to use the state array, the legacy on/off latencies will be removed. [ Lina: Modified genpd state initialization and removed use of save_state_latency_ns in genpd timing data ] Suggested-by: Kevin Hilman Signed-off-by: Lina Iyer Signed-off-by: Axel Haslam Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 301b785..4c6f46b 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -104,6 +104,7 @@ static void genpd_sd_counter_inc(struct generic_pm_domain *genpd) static int genpd_power_on(struct generic_pm_domain *genpd, bool timed) { + unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; int ret; @@ -120,10 +121,10 @@ static int genpd_power_on(struct generic_pm_domain *genpd, bool timed) return ret; elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); - if (elapsed_ns <= genpd->power_on_latency_ns) + if (elapsed_ns <= genpd->states[state_idx].power_on_latency_ns) return ret; - genpd->power_on_latency_ns = elapsed_ns; + genpd->states[state_idx].power_on_latency_ns = elapsed_ns; genpd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "on", elapsed_ns); @@ -133,6 +134,7 @@ static int genpd_power_on(struct generic_pm_domain *genpd, bool timed) static int genpd_power_off(struct generic_pm_domain *genpd, bool timed) { + unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; int ret; @@ -149,10 +151,10 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool timed) return ret; elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); - if (elapsed_ns <= genpd->power_off_latency_ns) + if (elapsed_ns <= genpd->states[state_idx].power_off_latency_ns) return ret; - genpd->power_off_latency_ns = elapsed_ns; + genpd->states[state_idx].power_off_latency_ns = elapsed_ns; genpd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "off", elapsed_ns); @@ -585,6 +587,8 @@ static void pm_genpd_sync_poweroff(struct generic_pm_domain *genpd, || atomic_read(&genpd->sd_count) > 0) return; + /* Choose the deepest state when suspending */ + genpd->state_idx = genpd->state_count - 1; genpd_power_off(genpd, timed); genpd->status = GPD_STATE_POWER_OFF; @@ -1508,6 +1512,26 @@ void pm_genpd_init(struct generic_pm_domain *genpd, genpd->dev_ops.start = pm_clk_resume; } + if (genpd->state_idx >= GENPD_MAX_NUM_STATES) { + pr_warn("Initial state index out of bounds.\n"); + genpd->state_idx = GENPD_MAX_NUM_STATES - 1; + } + + if (genpd->state_count > GENPD_MAX_NUM_STATES) { + pr_warn("Limiting states to %d\n", GENPD_MAX_NUM_STATES); + genpd->state_count = GENPD_MAX_NUM_STATES; + } + + /* Use only one "off" state if there were no states declared */ + if (genpd->state_count == 0) { + genpd->states[0].power_on_latency_ns = + genpd->power_on_latency_ns; + genpd->states[0].power_off_latency_ns = + genpd->power_off_latency_ns; + + genpd->state_count = 1; + } + mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); mutex_unlock(&gpd_list_lock); @@ -1872,7 +1896,12 @@ static int pm_genpd_summary_one(struct seq_file *s, if (WARN_ON(genpd->status >= ARRAY_SIZE(status_lookup))) goto exit; -
seq_printf(s, "%-30s %-15s ", genpd->name, status_lookup[genpd->status]); + seq_printf(s, "%-30s %s", genpd->name, status_lookup[genpd->status]); + + if (genpd->status == GPD_STATE_POWER_OFF) + seq_printf(s, " %-13d ", genpd->state_idx); + else + seq_printf(s, " %-15s ", ""); /* * Modifications on the list require holding locks on both diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index 1e937ac..00a5436 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -98,7 +98,8 @@ static bool default_stop_ok(struct device *dev) * * This routine must be executed under the PM domain's lock. */ -static bool default_power_down_ok(struct dev_pm_domain *pd) +static bool __default_power_down_ok(struct dev_pm_domain *pd, + unsigned int state) { struct generic_pm_domain *genpd = pd_to_genpd(pd); struct gpd_link *link; @@ -106,27 +107,9 @@ static bool default_power_down_ok(struct dev_pm_domain *pd) s64 min_off_time_ns; s64 off_on_time_ns; - if (genpd->max_off_time_changed) { - struct gpd_link *link; - - /* - * We have to invalidate the cached results for the masters, so - * use the observation that default_power_down_ok() is not - * going to be called for any master until this instance - * returns. - */ - list_for_each_entry(link, &genpd->slave_links, slave_node) - link->master->max_off_time_changed = true; - - genpd->max_off_time_changed = false; - genpd->cached_power_down_ok = false; - genpd->max_off_time_ns = -1; - } else { - return genpd->cached_power_down_ok; - } + off_on_time_ns = genpd->states[state].power_off_latency_ns + + genpd->states[state].power_on_latency_ns; - off_on_time_ns = genpd->power_off_latency_ns + - genpd->power_on_latency_ns; min_off_time_ns = -1; /* @@ -186,8 +169,6 @@ static bool default_power_down_ok(struct dev_pm_domain *pd) min_off_time_ns = constraint_ns; } - genpd->cached_power_down_ok = true; - /* * If the computed minimum device off time is negative, there are no * latency constraints, so the domain can spend arbitrary time in the @@ -201,10 +182,45 @@ static bool default_power_down_ok(struct dev_pm_domain *pd) * time and the time needed to turn the domain on is the maximum * theoretical time this domain can spend in the "off" state. */ - genpd->max_off_time_ns = min_off_time_ns - genpd->power_on_latency_ns; + genpd->max_off_time_ns = min_off_time_ns - + genpd->states[state].power_on_latency_ns; return true; } +static bool default_power_down_ok(struct dev_pm_domain *pd) +{ + struct generic_pm_domain *genpd = pd_to_genpd(pd); + struct gpd_link *link; + + if (!genpd->max_off_time_changed) + return genpd->cached_power_down_ok; + + /* + * We have to invalidate the cached results for the masters, so + * use the observation that default_power_down_ok() is not + * going to be called for any master until this instance + * returns. + */ + list_for_each_entry(link, &genpd->slave_links, slave_node) + link->master->max_off_time_changed = true; + + genpd->max_off_time_ns = -1; + genpd->max_off_time_changed = false; + genpd->cached_power_down_ok = true; + genpd->state_idx = genpd->state_count - 1; + + /* Find a state to power down to, starting from the deepest. 
*/ + while (!__default_power_down_ok(pd, genpd->state_idx)) { + if (genpd->state_idx == 0) { + genpd->cached_power_down_ok = false; + break; + } + genpd->state_idx--; + } + + return genpd->cached_power_down_ok; +} + static bool always_on_power_down_ok(struct dev_pm_domain *domain) { return false; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index db21d39..1726c4a 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -19,6 +19,8 @@ /* Defines used for the flags field in the struct generic_pm_domain */ #define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ +#define GENPD_MAX_NUM_STATES 8 /* Number of possible low power states */ + enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ GPD_STATE_POWER_OFF, /* PM domain is off */ @@ -37,6 +39,11 @@ struct gpd_dev_ops { bool (*active_wakeup)(struct device *dev); }; +struct genpd_power_state { + s64 power_off_latency_ns; + s64 power_on_latency_ns; +}; + struct generic_pm_domain { struct dev_pm_domain domain; /* PM domain operations */ struct list_head gpd_list_node; /* Node in the global PM domains list */ @@ -66,6 +73,10 @@ struct generic_pm_domain { void (*detach_dev)(struct generic_pm_domain *domain, struct device *dev); unsigned int flags; /* Bit field of configs for genpd */ + struct genpd_power_state states[GENPD_MAX_NUM_STATES]; + unsigned int state_count; /* number of states */ + unsigned int state_idx; /* state that genpd will go to when off */ + }; static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd) -- cgit v0.10.2 From eaa2d73ef9985f9e0fee7c3f04268c729100498d Mon Sep 17 00:00:00 2001 From: Axel Haslam Date: Mon, 15 Feb 2016 11:10:52 +0100 Subject: ARM: imx6: pm: declare pm domain latency on power_state struct The generic_pm_domain structure uses an array of latencies to be able to declare multiple intermediate states. Declare a single "OFF" state with the default latencies, so that the power_off_latency_ns and power_on_latency_ns fields of the generic_pm_domain structure can eventually be removed. [ Lina: pm_genpd_init() argument change ] Signed-off-by: Lina Iyer Signed-off-by: Axel Haslam Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c index cfc696b..fd87205 100644 --- a/arch/arm/mach-imx/gpc.c +++ b/arch/arm/mach-imx/gpc.c @@ -374,8 +374,13 @@ static struct pu_domain imx6q_pu_domain = { .name = "PU", .power_off = imx6q_pm_pu_power_off, .power_on = imx6q_pm_pu_power_on, - .power_off_latency_ns = 25000, - .power_on_latency_ns = 2000000, + .states = { + [0] = { + .power_off_latency_ns = 25000, + .power_on_latency_ns = 2000000, + }, + }, + .state_count = 1, }, }; -- cgit v0.10.2 From 90e63452ac3a42c9ff10b12880429e8592cf39ec Mon Sep 17 00:00:00 2001 From: Axel Haslam Date: Mon, 15 Feb 2016 11:10:53 +0100 Subject: PM / Domains: remove old power on/off latencies Now that all known users have been converted to use state latencies, we can remove the latency fields in the generic_pm_domain structure. Signed-off-by: Axel Haslam Acked-by: Ulf Hansson Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 4c6f46b..e8ca290 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1523,14 +1523,8 @@ void pm_genpd_init(struct generic_pm_domain *genpd, } /* Use only one "off" state if there were no states declared */ - if (genpd->state_count == 0) { - genpd->states[0].power_on_latency_ns = - genpd->power_on_latency_ns; - genpd->states[0].power_off_latency_ns = - genpd->power_off_latency_ns; - + if (genpd->state_count == 0) genpd->state_count = 1; - } mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1726c4a..49cd889 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -61,9 +61,7 @@ struct generic_pm_domain { unsigned int prepared_count; /* Suspend counter of prepared devices */ bool suspend_power_off; /* Power status before system suspend */ int (*power_off)(struct generic_pm_domain *domain); - s64 power_off_latency_ns; int (*power_on)(struct generic_pm_domain *domain); - s64 power_on_latency_ns; struct gpd_dev_ops dev_ops; s64 max_off_time_ns; /* Maximum allowed "suspended" time. */ bool max_off_time_changed; -- cgit v0.10.2 From 0c717d0f9cb46259dce5272705adce64a2d646d9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 15 Feb 2016 21:56:42 +0530 Subject: PM / OPP: Initialize regulator pointer to an error value We are currently required to do two checks on the regulator pointer: IS_ERR() and a NULL check. Multiple instances have been reported where these were not used consistently, resulting in crashes. Fix that by initializing the regulator pointer with an error value and checking it only against an error. This makes the code more consistent and more efficient. Fixes: 7d34d56ef334 (PM / OPP: Disable OPPs that aren't supported by the regulator) Reported-and-tested-by: Jon Hunter Reported-and-tested-by: Tony Lindgren Reported-and-tested-by: Guenter Roeck Signed-off-by: Viresh Kumar [ rjw: Initialize to -ENXIO ] Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 19fd7e7..5fb2f06 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -257,7 +257,7 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) } reg = dev_opp->regulator; - if (IS_ERR_OR_NULL(reg)) { + if (IS_ERR(reg)) { /* Regulator may not be required for device */ if (reg) dev_err(dev, "%s: Invalid regulator (%ld)\n", __func__, @@ -798,6 +798,9 @@ static struct device_opp *_add_device_opp(struct device *dev) of_node_put(np); } + /* Set regulator to a non-NULL error value */ + dev_opp->regulator = ERR_PTR(-ENXIO); + /* Find clk for the device */ dev_opp->clk = clk_get(dev, NULL); if (IS_ERR(dev_opp->clk)) { @@ -845,7 +848,7 @@ static void _remove_device_opp(struct device_opp *dev_opp) if (dev_opp->prop_name) return; - if (!IS_ERR_OR_NULL(dev_opp->regulator)) + if (!IS_ERR(dev_opp->regulator)) return; /* Release clk */ @@ -975,7 +978,7 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, { struct regulator *reg = dev_opp->regulator; - if (!IS_ERR_OR_NULL(reg) && + if (!IS_ERR(reg) && !regulator_is_supported_voltage(reg, opp->u_volt_min, opp->u_volt_max)) { pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", @@ -1441,7 +1444,7 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name) } /* Already have a regulator set */ - if (WARN_ON(!IS_ERR_OR_NULL(dev_opp->regulator))) { + if (WARN_ON(!IS_ERR(dev_opp->regulator))) { ret = -EBUSY; goto err; } @@ -1492,7 +1495,7 @@ void dev_pm_opp_put_regulator(struct device *dev) goto unlock; } - if (IS_ERR_OR_NULL(dev_opp->regulator)) { + if (IS_ERR(dev_opp->regulator)) { dev_err(dev, "%s: Doesn't have regulator set\n", __func__); goto unlock; } @@ -1501,7 +1504,7 @@ void dev_pm_opp_put_regulator(struct device *dev) WARN_ON(!list_empty(&dev_opp->opp_list)); regulator_put(dev_opp->regulator); - dev_opp->regulator = ERR_PTR(-EINVAL); + dev_opp->regulator = ERR_PTR(-ENXIO); /* Try freeing device_opp if this was the last blocking resource */ _remove_device_opp(dev_opp); -- cgit v0.10.2 From 3b95bd160547f56a68aeb972c33ae9511e7a8380 Mon Sep 17 00:00:00 2001 From: Aleksey Makarov Date: Tue, 16 Feb 2016 15:52:38 +0300 Subject: ACPI: introduce a function to find the first physical device Factor out the code that finds the first physical device of a given ACPI device. It is used in several places. Signed-off-by: Aleksey Makarov Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 296b7a1..c3af108 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -43,7 +43,6 @@ static const struct acpi_device_id forbidden_id_list[] = { struct platform_device *acpi_create_platform_device(struct acpi_device *adev) { struct platform_device *pdev = NULL; - struct acpi_device *acpi_parent; struct platform_device_info pdevinfo; struct resource_entry *rentry; struct list_head resource_list; @@ -82,22 +81,8 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) * attached to it, that physical device should be the parent of the * platform device we are about to create. 
*/ - pdevinfo.parent = NULL; - acpi_parent = adev->parent; - if (acpi_parent) { - struct acpi_device_physical_node *entry; - struct list_head *list; - - mutex_lock(&acpi_parent->physical_node_lock); - list = &acpi_parent->physical_node_list; - if (!list_empty(list)) { - entry = list_first_entry(list, - struct acpi_device_physical_node, - node); - pdevinfo.parent = entry->dev; - } - mutex_unlock(&acpi_parent->physical_node_lock); - } + pdevinfo.parent = adev->parent ? + acpi_get_first_physical_node(adev->parent) : NULL; pdevinfo.name = dev_name(&adev->dev); pdevinfo.id = -1; pdevinfo.res = resources; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 891c42d..0e85678 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -479,24 +479,38 @@ static void acpi_device_remove_notify_handler(struct acpi_device *device) Device Matching -------------------------------------------------------------------------- */ -static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, - const struct device *dev) +/** + * acpi_get_first_physical_node - Get first physical node of an ACPI device + * @adev: ACPI device in question + * + * Return: First physical node of ACPI device @adev + */ +struct device *acpi_get_first_physical_node(struct acpi_device *adev) { struct mutex *physical_node_lock = &adev->physical_node_lock; + struct device *phys_dev; mutex_lock(physical_node_lock); if (list_empty(&adev->physical_node_list)) { - adev = NULL; + phys_dev = NULL; } else { const struct acpi_device_physical_node *node; node = list_first_entry(&adev->physical_node_list, struct acpi_device_physical_node, node); - if (node->dev != dev) - adev = NULL; + + phys_dev = node->dev; } mutex_unlock(physical_node_lock); - return adev; + return phys_dev; +} + +static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, + const struct device *dev) +{ + const struct device *phys_dev = acpi_get_first_physical_node(adev); + + return phys_dev && phys_dev == dev ? adev : NULL; } /** diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 1e6833a..8668891 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -106,6 +106,7 @@ bool acpi_device_is_present(struct acpi_device *adev); bool acpi_device_is_battery(struct acpi_device *adev); bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev); +struct device *acpi_get_first_physical_node(struct acpi_device *adev); /* -------------------------------------------------------------------------- Device Matching and Notification -- cgit v0.10.2 From 6ce2e188a6ae339340d9bbf5bb0b81db20454353 Mon Sep 17 00:00:00 2001 From: Graeme Gregory Date: Wed, 20 Jan 2016 20:29:27 +0600 Subject: ACPI / scan: AMBA bus probing support On ARM64, some devices use the AMBA bus and not the platform bus for probing, so add support for this. A dummy clock is used for apb_pclk, as ACPI does not have a suitable clock representation, and to keep the core AMBA bus code unchanged between probing methods. Acked-by: Russell King Signed-off-by: Graeme Gregory Signed-off-by: Aleksey Makarov Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index cb648a4..edeb2d1 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -43,6 +43,7 @@ acpi-y += pci_root.o pci_link.o pci_irq.o acpi-y += acpi_lpss.o acpi_apd.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o +acpi-$(CONFIG_ARM_AMBA) += acpi_amba.o acpi-y += int340x_thermal.o acpi-y += power.o acpi-y += event.o diff --git a/drivers/acpi/acpi_amba.c b/drivers/acpi/acpi_amba.c new file mode 100644 index 0000000..2a61b54 --- /dev/null +++ b/drivers/acpi/acpi_amba.c @@ -0,0 +1,122 @@ + +/* + * ACPI support for platform bus type. + * + * Copyright (C) 2015, Linaro Ltd + * Author: Graeme Gregory + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static const struct acpi_device_id amba_id_list[] = { + {"ARMH0061", 0}, /* PL061 GPIO Device */ + {"", 0}, +}; + +static void amba_register_dummy_clk(void) +{ + static struct clk *amba_dummy_clk; + + /* If clock already registered */ + if (amba_dummy_clk) + return; + + amba_dummy_clk = clk_register_fixed_rate(NULL, "apb_pclk", NULL, + CLK_IS_ROOT, 0); + clk_register_clkdev(amba_dummy_clk, "apb_pclk", NULL); +} + +static int amba_handler_attach(struct acpi_device *adev, + const struct acpi_device_id *id) +{ + struct amba_device *dev; + struct resource_entry *rentry; + struct list_head resource_list; + bool address_found = false; + int irq_no = 0; + int ret; + + /* If the ACPI node already has a physical device attached, skip it. */ + if (adev->physical_node_count) + return 0; + + dev = amba_device_alloc(dev_name(&adev->dev), 0, 0); + if (!dev) { + dev_err(&adev->dev, "%s(): amba_device_alloc() failed\n", + __func__); + return -ENOMEM; + } + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_resources(adev, &resource_list, NULL, NULL); + if (ret < 0) + goto err_free; + + list_for_each_entry(rentry, &resource_list, node) { + switch (resource_type(rentry->res)) { + case IORESOURCE_MEM: + if (!address_found) { + dev->res = *rentry->res; + address_found = true; + } + break; + case IORESOURCE_IRQ: + if (irq_no < AMBA_NR_IRQS) + dev->irq[irq_no++] = rentry->res->start; + break; + default: + dev_warn(&adev->dev, "Invalid resource\n"); + break; + } + } + + acpi_dev_free_resource_list(&resource_list); + + /* + * If the ACPI node has a parent and that parent has a physical device + * attached to it, that physical device should be the parent of + * the amba device we are about to create. 
+ */ + if (adev->parent) + dev->dev.parent = acpi_get_first_physical_node(adev->parent); + + ACPI_COMPANION_SET(&dev->dev, adev); + + ret = amba_device_add(dev, &iomem_resource); + if (ret) { + dev_err(&adev->dev, "%s(): amba_device_add() failed (%d)\n", + __func__, ret); + goto err_free; + } + + return 1; + +err_free: + amba_device_put(dev); + return ret; +} + +static struct acpi_scan_handler amba_handler = { + .ids = amba_id_list, + .attach = amba_handler_attach, +}; + +void __init acpi_amba_init(void) +{ + amba_register_dummy_clk(); + acpi_scan_add_handler(&amba_handler); +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 8668891..9860df0 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -29,6 +29,11 @@ void acpi_processor_init(void); void acpi_platform_init(void); void acpi_pnp_init(void); void acpi_int340x_thermal_init(void); +#ifdef CONFIG_ARM_AMBA +void acpi_amba_init(void); +#else +static inline void acpi_amba_init(void) {} +#endif int acpi_sysfs_init(void); void acpi_container_init(void); void acpi_memory_hotplug_init(void); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 407a376..5f28cf7 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1930,6 +1930,7 @@ int __init acpi_scan_init(void) acpi_memory_hotplug_init(); acpi_pnp_init(); acpi_int340x_thermal_init(); + acpi_amba_init(); acpi_scan_add_handler(&generic_device_handler); -- cgit v0.10.2 From b6ec26fb963133d16acb473bf4fcda09630b2298 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 2 Dec 2015 14:10:44 +0000 Subject: ACPI / processor_idle: replace PREFIX with pr_fmt Like few of the other ACPI modules, replace PREFIX with pr_fmt and change all the printk call sites to use pr_* companion functions in processor_idle. Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 175c86b..e057aa8 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -23,6 +23,7 @@ * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +#define pr_fmt(fmt) "ACPI: " fmt #include #include @@ -43,8 +44,6 @@ #include #endif -#define PREFIX "ACPI: " - #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); @@ -81,9 +80,9 @@ static int set_max_cstate(const struct dmi_system_id *id) if (max_cstate > ACPI_PROCESSOR_MAX_POWER) return 0; - printk(KERN_NOTICE PREFIX "%s detected - limiting to C%ld max_cstate." - " Override with \"processor.max_cstate=%d\"\n", id->ident, - (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1); + pr_notice("%s detected - limiting to C%ld max_cstate." + " Override with \"processor.max_cstate=%d\"\n", id->ident, + (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1); max_cstate = (long)id->driver_data; @@ -351,7 +350,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) /* There must be at least 2 elements */ if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) { - printk(KERN_ERR PREFIX "not enough elements in _CST\n"); + pr_err("not enough elements in _CST\n"); ret = -EFAULT; goto end; } @@ -360,7 +359,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) /* Validate number of power states. 
*/ if (count < 1 || count != cst->package.count - 1) { - printk(KERN_ERR PREFIX "count given by _CST is not valid\n"); + pr_err("count given by _CST is not valid\n"); ret = -EFAULT; goto end; } @@ -469,11 +468,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) * (From 1 through ACPI_PROCESSOR_MAX_POWER - 1) */ if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) { - printk(KERN_WARNING - "Limiting number of power states to max (%d)\n", - ACPI_PROCESSOR_MAX_POWER); - printk(KERN_WARNING - "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); + pr_warn("Limiting number of power states to max (%d)\n", + ACPI_PROCESSOR_MAX_POWER); + pr_warn("Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); break; } } @@ -1097,8 +1094,8 @@ int acpi_processor_power_init(struct acpi_processor *pr) retval = cpuidle_register_driver(&acpi_idle_driver); if (retval) return retval; - printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n", - acpi_idle_driver.name); + pr_debug("%s registered with cpuidle\n", + acpi_idle_driver.name); } dev = kzalloc(sizeof(*dev), GFP_KERNEL); -- cgit v0.10.2 From 7024b18ca461bab45e5fb329f6e3d904d5109401 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 16 Feb 2016 20:19:18 +0100 Subject: cpuidle: menu: avoid expensive square root computation Computing the integer square root is a rather expensive operation, at least compared to doing a 64x64 -> 64 multiply (avg*avg) and, on 64 bit platforms, doing an extra comparison to a constant (variance <= U64_MAX/36). On 64 bit platforms, this does mean that we add a restriction on the range of the variance where we end up using the estimate (since previously the stddev <= ULONG_MAX was a tautology), but on the other hand, we extend the range quite substantially on 32 bit platforms - in both cases, we now allow standard deviations up to 715 seconds, which is for example guaranteed if all observations are less than 1430 seconds. Signed-off-by: Rasmus Villemoes Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 0742b32..beef7ae 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -200,7 +200,7 @@ static void get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int max, thresh; - uint64_t avg, stddev; + uint64_t avg, variance; thresh = UINT_MAX; /* Discard outliers above this value */ @@ -224,36 +224,35 @@ again: else do_div(avg, divisor); - /* Then try to determine standard deviation */ - stddev = 0; + /* Then try to determine variance */ + variance = 0; for (i = 0; i < INTERVALS; i++) { unsigned int value = data->intervals[i]; if (value <= thresh) { int64_t diff = value - avg; - stddev += diff * diff; + variance += diff * diff; } } if (divisor == INTERVALS) - stddev >>= INTERVAL_SHIFT; + variance >>= INTERVAL_SHIFT; else - do_div(stddev, divisor); + do_div(variance, divisor); /* - * The typical interval is obtained when standard deviation is small - * or standard deviation is small compared to the average interval. - * - * int_sqrt() formal parameter type is unsigned long. When the - * greatest difference to an outlier exceeds ~65 ms * sqrt(divisor) - * the resulting squared standard deviation exceeds the input domain - * of int_sqrt on platforms where unsigned long is 32 bits in size. - * In such case reject the candidate average. 
+ * The typical interval is obtained when standard deviation is + * small (stddev <= 20 us, variance <= 400 us^2) or standard + * deviation is small compared to the average interval (avg > + * 6*stddev, avg^2 > 36*variance). The average is smaller than + * UINT_MAX aka U32_MAX, so computing its square does not + * overflow a u64. We simply reject this candidate average if + * the standard deviation is greater than 715 s (which is + * rather unlikely). * * Use this result only if there is no timer to wake us up sooner. */ - if (likely(stddev <= ULONG_MAX)) { - stddev = int_sqrt(stddev); - if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3)) - || stddev <= 20) { + if (likely(variance <= U64_MAX/36)) { + if (((avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) + || variance <= 400) { if (data->next_timer_us > avg) data->predicted_us = avg; return; -- cgit v0.10.2 From 3b99669b75db04e411bb298591224a9e8e4f57fb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 16 Feb 2016 20:19:19 +0100 Subject: cpuidle: menu: help gcc generate slightly better code We know that the avg variable actually ends up holding a 32 bit quantity, since it's an average of such numbers. It is only a u64 because it is temporarily used to hold the sum. Making it an actual u32 allows gcc to generate slightly better code, e.g. when computing the square, it can do a 32x32->64 multiply. Signed-off-by: Rasmus Villemoes Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index beef7ae..27fc733 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -199,8 +199,8 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); static void get_typical_interval(struct menu_device *data) { int i, divisor; - unsigned int max, thresh; - uint64_t avg, variance; + unsigned int max, thresh, avg; + uint64_t sum, variance; thresh = UINT_MAX; /* Discard outliers above this value */ @@ -208,28 +208,28 @@ again: /* First calculate the average of past intervals */ max = 0; - avg = 0; + sum = 0; divisor = 0; for (i = 0; i < INTERVALS; i++) { unsigned int value = data->intervals[i]; if (value <= thresh) { - avg += value; + sum += value; divisor++; if (value > max) max = value; } } if (divisor == INTERVALS) - avg >>= INTERVAL_SHIFT; + avg = sum >> INTERVAL_SHIFT; else - do_div(avg, divisor); + avg = div_u64(sum, divisor); /* Then try to determine variance */ variance = 0; for (i = 0; i < INTERVALS; i++) { unsigned int value = data->intervals[i]; if (value <= thresh) { - int64_t diff = value - avg; + int64_t diff = (int64_t)value - avg; variance += diff * diff; } } @@ -251,7 +251,7 @@ again: * Use this result only if there is no timer to wake us up sooner. 
*/ if (likely(variance <= U64_MAX/36)) { - if (((avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) + if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) || variance <= 400) { if (data->next_timer_us > avg) data->predicted_us = avg; -- cgit v0.10.2 From 69807a638f91524ed75027f808cd277417ecee7a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 21 Nov 2015 12:22:47 -0500 Subject: tools/power turbostat: decode more CPUID fields for debugging, dump a few more fields: CPUID(1): SSE3 MONITOR EIST TM2 TSC MSR ACPI-TM TM cpu0: MSR_IA32_MISC_ENABLE: 0x00850089 (TCC EIST MONITOR) Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0dac7e0..7ef8b9f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2673,6 +2673,19 @@ guess: return 0; } + +void decode_misc_enable_msr(void) +{ + unsigned long long msr; + + if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr)) + fprintf(stderr, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%s %s %s)\n", + base_cpu, msr, + msr & (1 << 3) ? "TCC" : "", + msr & (1 << 16) ? "EIST" : "", + msr & (1 << 18) ? "MONITOR" : ""); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx, max_level; @@ -2696,9 +2709,19 @@ void process_cpuid() if (family == 6 || family == 0xf) model += ((fms >> 16) & 0xf) << 4; - if (debug) + if (debug) { fprintf(stderr, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", max_level, family, model, stepping, family, model, stepping); + fprintf(stderr, "CPUID(1): %s %s %s %s %s %s %s %s\n", + ecx & (1 << 0) ? "SSE3" : "-", + ecx & (1 << 3) ? "MONITOR" : "-", + ecx & (1 << 7) ? "EIST" : "-", + ecx & (1 << 8) ? "TM2" : "-", + edx & (1 << 4) ? "TSC" : "-", + edx & (1 << 5) ? "MSR" : "-", + edx & (1 << 22) ? "ACPI-TM" : "-", + edx & (1 << 29) ? "TM" : "-"); + } if (!(edx & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -2739,6 +2762,9 @@ void process_cpuid() do_ptm ? "" : "No ", has_epb ? "" : "No "); + if (debug) + decode_misc_enable_msr(); + if (max_level > 0x15) { unsigned int eax_crystal; unsigned int ebx_tsc; -- cgit v0.10.2 From 61a87ba7893a256d86c7eea6a7ab10d38ccac9b2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 23 Nov 2015 02:30:51 -0500 Subject: tools/power turbostat: CPUID(0x16) leaf shows base, max, and bus frequency This CPUID leaf is available on Skylake: CPUID(0x16): base_mhz: 1500 max_mhz: 2200 bus_mhz: 100 Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7ef8b9f..4c4470f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2688,7 +2688,7 @@ void decode_misc_enable_msr(void) void process_cpuid() { - unsigned int eax, ebx, ecx, edx, max_level; + unsigned int eax, ebx, ecx, edx, max_level, max_extended_level; unsigned int fms, family, model, stepping; eax = ebx = ecx = edx = 0; @@ -2732,9 +2732,9 @@ void process_cpuid() * This check is valid for both Intel and AMD. 
*/ ebx = ecx = edx = 0; - __get_cpuid(0x80000000, &max_level, &ebx, &ecx, &edx); + __get_cpuid(0x80000000, &max_extended_level, &ebx, &ecx, &edx); - if (max_level >= 0x80000007) { + if (max_extended_level >= 0x80000007) { /* * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8 @@ -2765,7 +2765,7 @@ void process_cpuid() if (debug) decode_misc_enable_msr(); - if (max_level > 0x15) { + if (max_level >= 0x15) { unsigned int eax_crystal; unsigned int ebx_tsc; @@ -2799,6 +2799,19 @@ void process_cpuid() } } } + if (max_level >= 0x16) { + unsigned int base_mhz, max_mhz, bus_mhz, edx; + + /* + * CPUID 16H Base MHz, Max MHz, Bus MHz + */ + base_mhz = max_mhz = bus_mhz = edx = 0; + + __get_cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &edx); + if (debug) + fprintf(stderr, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", + base_mhz, max_mhz, bus_mhz); + } if (has_aperf) aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model); @@ -3151,7 +3164,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(stderr, "turbostat version 4.8 26-Sep, 2015" + fprintf(stderr, "turbostat version 4.9 22 Nov, 2015" " - Len Brown \n"); } -- cgit v0.10.2 From 670e27d809a9a29943e1d2e45823fa4fc16c29f0 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 1 Dec 2015 01:36:39 -0500 Subject: x86 msr-index: Simplify syntax for HWP fields syntax only, no functional change Signed-off-by: Len Brown diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 690b402..5c5e7e5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -230,10 +230,10 @@ #define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) /* IA32_HWP_CAPABILITIES */ -#define HWP_HIGHEST_PERF(x) (x & 0xff) -#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) -#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) -#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) +#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff) +#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff) +#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff) +#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ #define HWP_MIN_PERF(x) (x & 0xff) -- cgit v0.10.2 From 7f5c258e1ce1e0909d3195694ac79f143051e513 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 1 Dec 2015 01:36:39 -0500 Subject: tools/power turbostat: decode HWP registers # turbostat --debug ... CPUID(6): ... HWP, HWPnotify, HWPwindow, HWPepp, HWPpkg ... ... 
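[Illustrative sketch, not part of the original patch: the shift-then-mask form from the msr-index cleanup above, applied to the sample MSR_HWP_CAPABILITIES value shown just below. It prints "high 0x16 guar 0x9 eff 0x5 low 0x1".]

	#include <stdio.h>

	#define HWP_HIGHEST_PERF(x)		(((x) >>  0) & 0xff)
	#define HWP_GUARANTEED_PERF(x)		(((x) >>  8) & 0xff)
	#define HWP_MOSTEFFICIENT_PERF(x)	(((x) >> 16) & 0xff)
	#define HWP_LOWEST_PERF(x)		(((x) >> 24) & 0xff)

	int main(void)
	{
		unsigned long long msr = 0x01050916ULL;	/* sample value from below */

		printf("high 0x%llx guar 0x%llx eff 0x%llx low 0x%llx\n",
		       HWP_HIGHEST_PERF(msr), HWP_GUARANTEED_PERF(msr),
		       HWP_MOSTEFFICIENT_PERF(msr), HWP_LOWEST_PERF(msr));
		return 0;
	}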
cpu0: MSR_PM_ENABLE: 0x00000001 (HWP)
cpu0: MSR_HWP_CAPABILITIES: 0x01050916 (high 0x16 guar 0x9 eff 0x5 low 0x1)
cpu0: MSR_HWP_REQUEST: 0x80001604 (min 0x4 max 0x16 des 0x0 epp 0x80 window 0x0 pkg 0x0)
cpu0: MSR_HWP_INTERRUPT: 0x00000001 (EN_Guaranteed_Perf_Change, Dis_Excursion_Min)
cpu0: MSR_HWP_STATUS: 0x00000000 (No-Guaranteed_Perf_Change, No-Excursion_Min)

Signed-off-by: Len Brown

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 4c4470f..af3b955 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -98,6 +98,12 @@ unsigned int crystal_hz;
 unsigned long long tsc_hz;
 int base_cpu;
 double discover_bclk(unsigned int family, unsigned int model);
+unsigned int has_hwp;			/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
+					/* IA32_HWP_REQUEST, IA32_HWP_STATUS */
+unsigned int has_hwp_notify;		/* IA32_HWP_INTERRUPT */
+unsigned int has_hwp_activity_window;	/* IA32_HWP_REQUEST[bits 41:32] */
+unsigned int has_hwp_epp;		/* IA32_HWP_REQUEST[bits 31:24] */
+unsigned int has_hwp_pkg;		/* IA32_HWP_REQUEST_PKG */

 #define RAPL_PKG (1 << 0)		/* 0x610 MSR_PKG_POWER_LIMIT */
@@ -2041,6 +2047,97 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	return 0;
 }

+/*
+ * print_hwp()
+ * Decode the HWP MSRs: PM_ENABLE, HWP_CAPABILITIES, HWP_REQUEST[_PKG],
+ * HWP_INTERRUPT and HWP_STATUS
+ */
+int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+	unsigned long long msr;
+	int cpu;
+
+	if (!has_hwp)
+		return 0;
+
+	cpu = t->cpu_id;
+
+	/* MSR_HWP_CAPABILITIES is per-package */
+	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+		return 0;
+
+	if (cpu_migrate(cpu)) {
+		fprintf(stderr, "Could not migrate to CPU %d\n", cpu);
+		return -1;
+	}
+
+	if (get_msr(cpu, MSR_PM_ENABLE, &msr))
+		return 0;
+
+	fprintf(stderr, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n",
+		cpu, msr, (msr & (1 << 0)) ? "" : "No-");
+
+	/* MSR_PM_ENABLE[bit 0] == 1 if HWP is enabled and the HWP MSRs are visible */
+	if ((msr & (1 << 0)) == 0)
+		return 0;
+
+	if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
+		return 0;
+
+	fprintf(stderr, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
+		"(high 0x%x guar 0x%x eff 0x%x low 0x%x)\n",
+		cpu, msr,
+		(unsigned int)HWP_HIGHEST_PERF(msr),
+		(unsigned int)HWP_GUARANTEED_PERF(msr),
+		(unsigned int)HWP_MOSTEFFICIENT_PERF(msr),
+		(unsigned int)HWP_LOWEST_PERF(msr));
+
+	if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
+		return 0;
+
+	fprintf(stderr, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
+		"(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x pkg 0x%x)\n",
+		cpu, msr,
+		(unsigned int)(((msr) >> 0) & 0xff),
+		(unsigned int)(((msr) >> 8) & 0xff),
+		(unsigned int)(((msr) >> 16) & 0xff),
+		(unsigned int)(((msr) >> 24) & 0xff),
+		(unsigned int)(((msr) >> 32) & 0x3ff),
+		(unsigned int)(((msr) >> 42) & 0x1));
+
+	if (has_hwp_pkg) {
+		if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
+			return 0;
+
+		fprintf(stderr, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
+			"(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x)\n",
+			cpu, msr,
+			(unsigned int)(((msr) >> 0) & 0xff),
+			(unsigned int)(((msr) >> 8) & 0xff),
+			(unsigned int)(((msr) >> 16) & 0xff),
+			(unsigned int)(((msr) >> 24) & 0xff),
+			(unsigned int)(((msr) >> 32) & 0x3ff));
+	}
+	if (has_hwp_notify) {
+		if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
+			return 0;
+
+		fprintf(stderr, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
+			"(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
+			cpu, msr,
+			((msr) & 0x1) ? "EN" : "Dis",
+			((msr) & 0x2) ?
"EN" : "Dis"); + } + if (get_msr(cpu, MSR_HWP_STATUS, &msr)) + return 0; + + fprintf(stderr, "cpu%d: MSR_HWP_STATUS: 0x%08llx " + "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", + cpu, msr, + ((msr) & 0x1) ? "" : "No-", + ((msr) & 0x2) ? "" : "No-"); + + return 0; +} /* * print_perf_limit() @@ -2686,6 +2783,7 @@ void decode_misc_enable_msr(void) msr & (1 << 18) ? "MONITOR" : ""); } + void process_cpuid() { unsigned int eax, ebx, ecx, edx, max_level, max_extended_level; @@ -2753,14 +2851,25 @@ void process_cpuid() has_aperf = ecx & (1 << 0); do_dts = eax & (1 << 0); do_ptm = eax & (1 << 6); + has_hwp = eax & (1 << 7); + has_hwp_notify = eax & (1 << 8); + has_hwp_activity_window = eax & (1 << 9); + has_hwp_epp = eax & (1 << 10); + has_hwp_pkg = eax & (1 << 11); has_epb = ecx & (1 << 3); if (debug) - fprintf(stderr, "CPUID(6): %sAPERF, %sDTS, %sPTM, %sEPB\n", - has_aperf ? "" : "No ", - do_dts ? "" : "No ", - do_ptm ? "" : "No ", - has_epb ? "" : "No "); + fprintf(stderr, "CPUID(6): %sAPERF, %sDTS, %sPTM, %sHWP, " + "%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n", + has_aperf ? "" : "No-", + do_dts ? "" : "No-", + do_ptm ? "" : "No-", + has_hwp ? "" : "No-", + has_hwp_notify ? "" : "No-", + has_hwp_activity_window ? "" : "No-", + has_hwp_epp ? "" : "No-", + has_hwp_pkg ? "" : "No-", + has_epb ? "" : "No-"); if (debug) decode_misc_enable_msr(); @@ -3088,6 +3197,9 @@ void turbostat_init() if (debug) + for_all_cpus(print_hwp, ODD_COUNTERS); + + if (debug) for_all_cpus(print_epb, ODD_COUNTERS); if (debug) @@ -3164,7 +3276,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(stderr, "turbostat version 4.9 22 Nov, 2015" + fprintf(stderr, "turbostat version 4.10 10 Dec, 2015" " - Len Brown \n"); } -- cgit v0.10.2 From f0057310b40efe9f797ff337e9464e6a6fb9d782 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 3 Dec 2015 01:35:36 -0500 Subject: tools/power turbostat: Decode MSR_MISC_PWR_MGMT This MSR is helpful to show if P-state HW coordination is enabled or disabled. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index af3b955..c600340 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2783,6 +2783,26 @@ void decode_misc_enable_msr(void) msr & (1 << 18) ? "MONITOR" : ""); } +/* + * Decode MSR_MISC_PWR_MGMT + * + * Decode the bits according to the Nehalem documentation + * bit[0] seems to continue to have same meaning going forward + * bit[1] less so... + */ +void decode_misc_pwr_mgmt_msr(void) +{ + unsigned long long msr; + + if (!do_nhm_platform_info) + return; + + if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) + fprintf(stderr, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB)\n", + base_cpu, msr, + msr & (1 << 0) ? "DIS" : "EN", + msr & (1 << 1) ? "EN" : "DIS"); +} void process_cpuid() { @@ -2936,6 +2956,9 @@ void process_cpuid() do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); + if (debug) + decode_misc_pwr_mgmt_msr(); + rapl_probe(family, model); perf_limit_reasons_probe(family, model); -- cgit v0.10.2 From 5bc8ac0f68284e3c05e0465afb59c62c996d9d8a Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Thu, 18 Feb 2016 14:51:46 +0000 Subject: Documentation: cpufreq: intel_pstate: fix typo This just swaps a colon for a quote in the intel_pstate documentation. Signed-off-by: Felipe Franciosi Signed-off-by: Rafael J. 
Wysocki

diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index f7b12c0..e6bd1e6 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -25,7 +25,7 @@ callback, so cpufreq core can't request a transition to a specific frequency.
 The driver provides minimum and maximum frequency limits and callbacks to set a
 policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
 The cpufreq core can request the driver to operate in any of the two policies:
-"performance: and "powersave". The driver decides which frequency to use based
+"performance" and "powersave". The driver decides which frequency to use based
 on the above policy selection considering minimum and maximum frequency limits.

 The Intel P-State driver falls under the latter category, which implements the
-- cgit v0.10.2

From a5da64477ee79efa748df256928ec8840a2a7986 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Tue, 16 Feb 2016 14:17:52 +0530
Subject: PM / OPP: Fix incorrect comments

Some comments were just copy/pasted from other sections and don't match
the routines they were added for. Fix them.

Signed-off-by: Viresh Kumar
Reviewed-by: Stephen Boyd
Signed-off-by: Rafael J. Wysocki

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 5fb2f06..bdae09c 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -1254,7 +1254,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
 /**
  * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @dev: Device for which supported-hw has to be set.
+ * @dev: Device for which supported-hw has to be put.
  *
  * This is required only for the V2 bindings, and is called for a matching
  * dev_pm_opp_set_supported_hw(). Until this is called, the device_opp structure
@@ -1303,7 +1303,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
 /**
  * dev_pm_opp_set_prop_name() - Set prop-extn name
- * @dev: Device for which the regulator has to be set.
+ * @dev: Device for which the prop-name has to be set.
  * @name: name to postfix to properties.
  *
  * This is required only for the V2 bindings, and it enables a platform to
@@ -1362,7 +1362,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
 /**
  * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @dev: Device for which the regulator has to be set.
+ * @dev: Device for which the prop-name has to be put.
  *
  * This is required only for the V2 bindings, and is called for a matching
  * dev_pm_opp_set_prop_name(). Until this is called, the device_opp structure
-- cgit v0.10.2

From 2c2709dc6921c5d246b686521f932c73a20f428f Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Tue, 16 Feb 2016 14:17:53 +0530
Subject: PM / OPP: Rename structures for clarity

Stephen pointed out recently that a few structures always confuse him,
as they aren't named properly. This patch tries to address that:

Names are updated as:
- device_opp or dev_opp -> opp_table
- dev_opp_list -> opp_tables
- dev_opp_list_lock -> opp_table_lock
- device_list_opp -> opp_device (it was never a list, but a structure)
- list_dev -> opp_dev
- And similar changes in comments and function names as well.

This also fixes checkpatch warnings that were generated with this patch.

No functional changes.

Suggested-by: Stephen Boyd
Signed-off-by: Viresh Kumar
Reviewed-by: Stephen Boyd
Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index bdae09c..433b600 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -25,40 +25,40 @@ #include "opp.h" /* - * The root of the list of all devices. All device_opp structures branch off - * from here, with each device_opp containing the list of opp it supports in + * The root of the list of all opp-tables. All opp_table structures branch off + * from here, with each opp_table containing the list of opps it supports in * various states of availability. */ -static LIST_HEAD(dev_opp_list); +static LIST_HEAD(opp_tables); /* Lock to allow exclusive modification to the device and opp lists */ -DEFINE_MUTEX(dev_opp_list_lock); +DEFINE_MUTEX(opp_table_lock); #define opp_rcu_lockdep_assert() \ do { \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ - !lockdep_is_held(&dev_opp_list_lock), \ - "Missing rcu_read_lock() or " \ - "dev_opp_list_lock protection"); \ + !lockdep_is_held(&opp_table_lock), \ + "Missing rcu_read_lock() or " \ + "opp_table_lock protection"); \ } while (0) -static struct device_list_opp *_find_list_dev(const struct device *dev, - struct device_opp *dev_opp) +static struct opp_device *_find_opp_dev(const struct device *dev, + struct opp_table *opp_table) { - struct device_list_opp *list_dev; + struct opp_device *opp_dev; - list_for_each_entry(list_dev, &dev_opp->dev_list, node) - if (list_dev->dev == dev) - return list_dev; + list_for_each_entry(opp_dev, &opp_table->dev_list, node) + if (opp_dev->dev == dev) + return opp_dev; return NULL; } -static struct device_opp *_managed_opp(const struct device_node *np) +static struct opp_table *_managed_opp(const struct device_node *np) { - struct device_opp *dev_opp; + struct opp_table *opp_table; - list_for_each_entry_rcu(dev_opp, &dev_opp_list, node) { - if (dev_opp->np == np) { + list_for_each_entry_rcu(opp_table, &opp_tables, node) { + if (opp_table->np == np) { /* * Multiple devices can point to the same OPP table and * so will have same node-pointer, np. @@ -66,7 +66,7 @@ static struct device_opp *_managed_opp(const struct device_node *np) * But the OPPs will be considered as shared only if the * OPP table contains a "opp-shared" property. */ - return dev_opp->shared_opp ? dev_opp : NULL; + return opp_table->shared_opp ? opp_table : NULL; } } @@ -74,24 +74,24 @@ static struct device_opp *_managed_opp(const struct device_node *np) } /** - * _find_device_opp() - find device_opp struct using device pointer - * @dev: device pointer used to lookup device OPPs + * _find_opp_table() - find opp_table struct using device pointer + * @dev: device pointer used to lookup OPP table * - * Search list of device OPPs for one containing matching device. Does a RCU - * reader operation to grab the pointer needed. + * Search OPP table for one containing matching device. Does a RCU reader + * operation to grab the pointer needed. * - * Return: pointer to 'struct device_opp' if found, otherwise -ENODEV or + * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or * -EINVAL based on type of error. * * Locking: For readers, this function must be called under rcu_read_lock(). - * device_opp is a RCU protected pointer, which means that device_opp is valid + * opp_table is a RCU protected pointer, which means that opp_table is valid * as long as we are under RCU lock. * - * For Writers, this function must be called with dev_opp_list_lock held. + * For Writers, this function must be called with opp_table_lock held. 
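+ * For Writers, this function must be called with opp_table_lock held.

[Illustrative sketch, not part of the original patch: the two calling contexts that opp_rcu_lockdep_assert() accepts, condensed. example_reader() and example_updater() are invented names; the reader side mirrors what dev_pm_opp_get_max_clock_latency() below does.]

	/* Reader: the opp_table pointer is only valid inside the RCU section. */
	static unsigned long example_reader(struct device *dev)
	{
		struct opp_table *opp_table;
		unsigned long latency = 0;

		rcu_read_lock();
		opp_table = _find_opp_table(dev);
		if (!IS_ERR(opp_table))
			latency = opp_table->clock_latency_ns_max;
		rcu_read_unlock();

		return latency;
	}

	/* Updater: hold opp_table_lock across the lookup and the modification. */
	static void example_updater(struct device *dev)
	{
		struct opp_table *opp_table;

		mutex_lock(&opp_table_lock);
		opp_table = _find_opp_table(dev);
		if (!IS_ERR(opp_table)) {
			/* ... modify the table while the mutex is held ... */
		}
		mutex_unlock(&opp_table_lock);
	}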
*/ -struct device_opp *_find_device_opp(struct device *dev) +struct opp_table *_find_opp_table(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; opp_rcu_lockdep_assert(); @@ -100,9 +100,9 @@ struct device_opp *_find_device_opp(struct device *dev) return ERR_PTR(-EINVAL); } - list_for_each_entry_rcu(dev_opp, &dev_opp_list, node) - if (_find_list_dev(dev, dev_opp)) - return dev_opp; + list_for_each_entry_rcu(opp_table, &opp_tables, node) + if (_find_opp_dev(dev, opp_table)) + return opp_table; return ERR_PTR(-ENODEV); } @@ -215,16 +215,16 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo); */ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; unsigned long clock_latency_ns; rcu_read_lock(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) clock_latency_ns = 0; else - clock_latency_ns = dev_opp->clock_latency_ns_max; + clock_latency_ns = opp_table->clock_latency_ns_max; rcu_read_unlock(); return clock_latency_ns; @@ -241,7 +241,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); */ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *opp; struct regulator *reg; unsigned long latency_ns = 0; @@ -250,13 +250,13 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) rcu_read_lock(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { rcu_read_unlock(); return 0; } - reg = dev_opp->regulator; + reg = opp_table->regulator; if (IS_ERR(reg)) { /* Regulator may not be required for device */ if (reg) @@ -266,7 +266,7 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) return 0; } - list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { if (!opp->available) continue; @@ -279,7 +279,7 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) rcu_read_unlock(); /* - * The caller needs to ensure that dev_opp (and hence the regulator) + * The caller needs to ensure that opp_table (and hence the regulator) * isn't freed, while we are executing this routine. 
*/ ret = regulator_set_voltage_time(reg, min_uV, max_uV); @@ -322,21 +322,21 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency); */ struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; opp_rcu_lockdep_assert(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp) || !dev_opp->suspend_opp || - !dev_opp->suspend_opp->available) + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table) || !opp_table->suspend_opp || + !opp_table->suspend_opp->available) return NULL; - return dev_opp->suspend_opp; + return opp_table->suspend_opp; } EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp); /** - * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list + * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table * @dev: device for which we do this operation * * Return: This function returns the number of available opps if there are any, @@ -346,21 +346,21 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp); */ int dev_pm_opp_get_opp_count(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *temp_opp; int count = 0; rcu_read_lock(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - count = PTR_ERR(dev_opp); - dev_err(dev, "%s: device OPP not found (%d)\n", + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + count = PTR_ERR(opp_table); + dev_err(dev, "%s: OPP table not found (%d)\n", __func__, count); goto out_unlock; } - list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { if (temp_opp->available) count++; } @@ -377,7 +377,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count); * @freq: frequency to search for * @available: true/false - match for available opp * - * Return: Searches for exact match in the opp list and returns pointer to the + * Return: Searches for exact match in the opp table and returns pointer to the * matching opp if found, else returns ERR_PTR in case of error and should * be handled using IS_ERR. 
Error return values can be: * EINVAL: for bad pointer @@ -401,19 +401,20 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); opp_rcu_lockdep_assert(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - int r = PTR_ERR(dev_opp); - dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r); + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + int r = PTR_ERR(opp_table); + + dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r); return ERR_PTR(r); } - list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { if (temp_opp->available == available && temp_opp->rate == freq) { opp = temp_opp; @@ -449,7 +450,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact); struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); opp_rcu_lockdep_assert(); @@ -459,11 +460,11 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, return ERR_PTR(-EINVAL); } - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) - return ERR_CAST(dev_opp); + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) + return ERR_CAST(opp_table); - list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { if (temp_opp->available && temp_opp->rate >= *freq) { opp = temp_opp; *freq = opp->rate; @@ -499,7 +500,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil); struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); opp_rcu_lockdep_assert(); @@ -509,11 +510,11 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, return ERR_PTR(-EINVAL); } - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) - return ERR_CAST(dev_opp); + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) + return ERR_CAST(opp_table); - list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { if (temp_opp->available) { /* go to the next node, before choosing prev */ if (temp_opp->rate > *freq) @@ -530,24 +531,24 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor); /* - * The caller needs to ensure that device_opp (and hence the clk) isn't freed, + * The caller needs to ensure that opp_table (and hence the clk) isn't freed, * while clk returned here is used. 
*/ static struct clk *_get_opp_clk(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct clk *clk; rcu_read_lock(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_err(dev, "%s: device opp doesn't exist\n", __func__); - clk = ERR_CAST(dev_opp); + clk = ERR_CAST(opp_table); goto unlock; } - clk = dev_opp->clk; + clk = opp_table->clk; if (IS_ERR(clk)) dev_err(dev, "%s: No clock available for the device\n", __func__); @@ -594,7 +595,7 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg, */ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *old_opp, *opp; struct regulator *reg; struct clk *clk; @@ -628,11 +629,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) rcu_read_lock(); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { dev_err(dev, "%s: device opp doesn't exist\n", __func__); rcu_read_unlock(); - return PTR_ERR(dev_opp); + return PTR_ERR(opp_table); } old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq); @@ -658,7 +659,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) u_volt_min = opp->u_volt_min; u_volt_max = opp->u_volt_max; - reg = dev_opp->regulator; + reg = opp_table->regulator; rcu_read_unlock(); @@ -705,81 +706,81 @@ restore_voltage: } EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); -/* List-dev Helpers */ -static void _kfree_list_dev_rcu(struct rcu_head *head) +/* OPP-dev Helpers */ +static void _kfree_opp_dev_rcu(struct rcu_head *head) { - struct device_list_opp *list_dev; + struct opp_device *opp_dev; - list_dev = container_of(head, struct device_list_opp, rcu_head); - kfree_rcu(list_dev, rcu_head); + opp_dev = container_of(head, struct opp_device, rcu_head); + kfree_rcu(opp_dev, rcu_head); } -static void _remove_list_dev(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static void _remove_opp_dev(struct opp_device *opp_dev, + struct opp_table *opp_table) { - opp_debug_unregister(list_dev, dev_opp); - list_del(&list_dev->node); - call_srcu(&dev_opp->srcu_head.srcu, &list_dev->rcu_head, - _kfree_list_dev_rcu); + opp_debug_unregister(opp_dev, opp_table); + list_del(&opp_dev->node); + call_srcu(&opp_table->srcu_head.srcu, &opp_dev->rcu_head, + _kfree_opp_dev_rcu); } -struct device_list_opp *_add_list_dev(const struct device *dev, - struct device_opp *dev_opp) +struct opp_device *_add_opp_dev(const struct device *dev, + struct opp_table *opp_table) { - struct device_list_opp *list_dev; + struct opp_device *opp_dev; int ret; - list_dev = kzalloc(sizeof(*list_dev), GFP_KERNEL); - if (!list_dev) + opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL); + if (!opp_dev) return NULL; - /* Initialize list-dev */ - list_dev->dev = dev; - list_add_rcu(&list_dev->node, &dev_opp->dev_list); + /* Initialize opp-dev */ + opp_dev->dev = dev; + list_add_rcu(&opp_dev->node, &opp_table->dev_list); - /* Create debugfs entries for the dev_opp */ - ret = opp_debug_register(list_dev, dev_opp); + /* Create debugfs entries for the opp_table */ + ret = opp_debug_register(opp_dev, opp_table); if (ret) dev_err(dev, "%s: Failed to register opp debugfs (%d)\n", __func__, ret); - return list_dev; + return opp_dev; } /** - * _add_device_opp() - Find device OPP table or allocate a new one + * _add_opp_table() - Find OPP table or allocate a new one * @dev: 
device for which we do this operation * * It tries to find an existing table first, if it couldn't find one, it * allocates a new OPP table and returns that. * - * Return: valid device_opp pointer if success, else NULL. + * Return: valid opp_table pointer if success, else NULL. */ -static struct device_opp *_add_device_opp(struct device *dev) +static struct opp_table *_add_opp_table(struct device *dev) { - struct device_opp *dev_opp; - struct device_list_opp *list_dev; + struct opp_table *opp_table; + struct opp_device *opp_dev; struct device_node *np; int ret; - /* Check for existing list for 'dev' first */ - dev_opp = _find_device_opp(dev); - if (!IS_ERR(dev_opp)) - return dev_opp; + /* Check for existing table for 'dev' first */ + opp_table = _find_opp_table(dev); + if (!IS_ERR(opp_table)) + return opp_table; /* - * Allocate a new device OPP table. In the infrequent case where a new + * Allocate a new OPP table. In the infrequent case where a new * device is needed to be added, we pay this penalty. */ - dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL); - if (!dev_opp) + opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL); + if (!opp_table) return NULL; - INIT_LIST_HEAD(&dev_opp->dev_list); + INIT_LIST_HEAD(&opp_table->dev_list); - list_dev = _add_list_dev(dev, dev_opp); - if (!list_dev) { - kfree(dev_opp); + opp_dev = _add_opp_dev(dev, opp_table); + if (!opp_dev) { + kfree(opp_table); return NULL; } @@ -792,79 +793,80 @@ static struct device_opp *_add_device_opp(struct device *dev) u32 val; if (!of_property_read_u32(np, "clock-latency", &val)) - dev_opp->clock_latency_ns_max = val; + opp_table->clock_latency_ns_max = val; of_property_read_u32(np, "voltage-tolerance", - &dev_opp->voltage_tolerance_v1); + &opp_table->voltage_tolerance_v1); of_node_put(np); } /* Set regulator to a non-NULL error value */ - dev_opp->regulator = ERR_PTR(-ENXIO); + opp_table->regulator = ERR_PTR(-ENXIO); /* Find clk for the device */ - dev_opp->clk = clk_get(dev, NULL); - if (IS_ERR(dev_opp->clk)) { - ret = PTR_ERR(dev_opp->clk); + opp_table->clk = clk_get(dev, NULL); + if (IS_ERR(opp_table->clk)) { + ret = PTR_ERR(opp_table->clk); if (ret != -EPROBE_DEFER) dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); } - srcu_init_notifier_head(&dev_opp->srcu_head); - INIT_LIST_HEAD(&dev_opp->opp_list); + srcu_init_notifier_head(&opp_table->srcu_head); + INIT_LIST_HEAD(&opp_table->opp_list); - /* Secure the device list modification */ - list_add_rcu(&dev_opp->node, &dev_opp_list); - return dev_opp; + /* Secure the device table modification */ + list_add_rcu(&opp_table->node, &opp_tables); + return opp_table; } /** - * _kfree_device_rcu() - Free device_opp RCU handler + * _kfree_device_rcu() - Free opp_table RCU handler * @head: RCU head */ static void _kfree_device_rcu(struct rcu_head *head) { - struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head); + struct opp_table *opp_table = container_of(head, struct opp_table, + rcu_head); - kfree_rcu(device_opp, rcu_head); + kfree_rcu(opp_table, rcu_head); } /** - * _remove_device_opp() - Removes a device OPP table - * @dev_opp: device OPP table to be removed. + * _remove_opp_table() - Removes a OPP table + * @opp_table: OPP table to be removed. * - * Removes/frees device OPP table it it doesn't contain any OPPs. + * Removes/frees OPP table if it doesn't contain any OPPs. 
*/ -static void _remove_device_opp(struct device_opp *dev_opp) +static void _remove_opp_table(struct opp_table *opp_table) { - struct device_list_opp *list_dev; + struct opp_device *opp_dev; - if (!list_empty(&dev_opp->opp_list)) + if (!list_empty(&opp_table->opp_list)) return; - if (dev_opp->supported_hw) + if (opp_table->supported_hw) return; - if (dev_opp->prop_name) + if (opp_table->prop_name) return; - if (!IS_ERR(dev_opp->regulator)) + if (!IS_ERR(opp_table->regulator)) return; /* Release clk */ - if (!IS_ERR(dev_opp->clk)) - clk_put(dev_opp->clk); + if (!IS_ERR(opp_table->clk)) + clk_put(opp_table->clk); - list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp, - node); + opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device, + node); - _remove_list_dev(list_dev, dev_opp); + _remove_opp_dev(opp_dev, opp_table); /* dev_list must be empty now */ - WARN_ON(!list_empty(&dev_opp->dev_list)); + WARN_ON(!list_empty(&opp_table->dev_list)); - list_del_rcu(&dev_opp->node); - call_srcu(&dev_opp->srcu_head.srcu, &dev_opp->rcu_head, + list_del_rcu(&opp_table->node); + call_srcu(&opp_table->srcu_head.srcu, &opp_table->rcu_head, _kfree_device_rcu); } @@ -881,17 +883,17 @@ static void _kfree_opp_rcu(struct rcu_head *head) /** * _opp_remove() - Remove an OPP from a table definition - * @dev_opp: points back to the device_opp struct this opp belongs to + * @opp_table: points back to the opp_table struct this opp belongs to * @opp: pointer to the OPP to remove * @notify: OPP_EVENT_REMOVE notification should be sent or not * - * This function removes an opp definition from the opp list. + * This function removes an opp definition from the opp table. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * It is assumed that the caller holds required mutex for an RCU updater * strategy. */ -static void _opp_remove(struct device_opp *dev_opp, +static void _opp_remove(struct opp_table *opp_table, struct dev_pm_opp *opp, bool notify) { /* @@ -899,22 +901,23 @@ static void _opp_remove(struct device_opp *dev_opp, * frequency/voltage list. */ if (notify) - srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp); + srcu_notifier_call_chain(&opp_table->srcu_head, + OPP_EVENT_REMOVE, opp); opp_debug_remove_one(opp); list_del_rcu(&opp->node); - call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu); + call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu); - _remove_device_opp(dev_opp); + _remove_opp_table(opp_table); } /** - * dev_pm_opp_remove() - Remove an OPP from OPP list + * dev_pm_opp_remove() - Remove an OPP from OPP table * @dev: device for which we do this operation * @freq: OPP to remove with matching 'freq' * - * This function removes an opp from the opp list. + * This function removes an opp from the opp table. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. 
Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -923,17 +926,17 @@ static void _opp_remove(struct device_opp *dev_opp, void dev_pm_opp_remove(struct device *dev, unsigned long freq) { struct dev_pm_opp *opp; - struct device_opp *dev_opp; + struct opp_table *opp_table; bool found = false; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) goto unlock; - list_for_each_entry(opp, &dev_opp->opp_list, node) { + list_for_each_entry(opp, &opp_table->opp_list, node) { if (opp->rate == freq) { found = true; break; @@ -946,14 +949,14 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq) goto unlock; } - _opp_remove(dev_opp, opp, true); + _opp_remove(opp_table, opp, true); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); } EXPORT_SYMBOL_GPL(dev_pm_opp_remove); static struct dev_pm_opp *_allocate_opp(struct device *dev, - struct device_opp **dev_opp) + struct opp_table **opp_table) { struct dev_pm_opp *opp; @@ -964,8 +967,8 @@ static struct dev_pm_opp *_allocate_opp(struct device *dev, INIT_LIST_HEAD(&opp->node); - *dev_opp = _add_device_opp(dev); - if (!*dev_opp) { + *opp_table = _add_opp_table(dev); + if (!*opp_table) { kfree(opp); return NULL; } @@ -974,9 +977,9 @@ static struct dev_pm_opp *_allocate_opp(struct device *dev, } static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, - struct device_opp *dev_opp) + struct opp_table *opp_table) { - struct regulator *reg = dev_opp->regulator; + struct regulator *reg = opp_table->regulator; if (!IS_ERR(reg) && !regulator_is_supported_voltage(reg, opp->u_volt_min, @@ -990,21 +993,21 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, } static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, - struct device_opp *dev_opp) + struct opp_table *opp_table) { struct dev_pm_opp *opp; - struct list_head *head = &dev_opp->opp_list; + struct list_head *head = &opp_table->opp_list; int ret; /* * Insert new OPP in order of increasing frequency and discard if * already present. * - * Need to use &dev_opp->opp_list in the condition part of the 'for' + * Need to use &opp_table->opp_list in the condition part of the 'for' * loop, don't replace it with head otherwise it will become an infinite * loop. */ - list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) { + list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { if (new_opp->rate > opp->rate) { head = &opp->node; continue; @@ -1022,15 +1025,15 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, 0 : -EEXIST; } - new_opp->dev_opp = dev_opp; + new_opp->opp_table = opp_table; list_add_rcu(&new_opp->node, head); - ret = opp_debug_create_one(new_opp, dev_opp); + ret = opp_debug_create_one(new_opp, opp_table); if (ret) dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n", __func__, ret); - if (!_opp_supported_by_regulators(new_opp, dev_opp)) { + if (!_opp_supported_by_regulators(new_opp, opp_table)) { new_opp->available = false; dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n", __func__, new_opp->rate); @@ -1046,14 +1049,14 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, * @u_volt: Voltage in uVolts for this OPP * @dynamic: Dynamically added OPPs. 
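[Illustrative sketch, not part of the original patch: how a platform driver seeds a table through this path, via the public dev_pm_opp_add() wrapper around _opp_add_v1(). The frequencies and voltages are invented.]

	static int example_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;
		int ret;

		ret = dev_pm_opp_add(dev, 500000000, 900000);	/* 500 MHz at 900 mV */
		if (ret)
			return ret;

		return dev_pm_opp_add(dev, 1000000000, 1100000);	/* 1 GHz at 1100 mV */
	}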
* - * This function adds an opp definition to the opp list and returns status. + * This function adds an opp definition to the opp table and returns status. * The opp is made available by default and it can be controlled using * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove. * * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table * and freed by dev_pm_opp_of_remove_table. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1069,15 +1072,15 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt, bool dynamic) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *new_opp; unsigned long tol; int ret; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - new_opp = _allocate_opp(dev, &dev_opp); + new_opp = _allocate_opp(dev, &opp_table); if (!new_opp) { ret = -ENOMEM; goto unlock; @@ -1085,36 +1088,36 @@ static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt, /* populate the opp table */ new_opp->rate = freq; - tol = u_volt * dev_opp->voltage_tolerance_v1 / 100; + tol = u_volt * opp_table->voltage_tolerance_v1 / 100; new_opp->u_volt = u_volt; new_opp->u_volt_min = u_volt - tol; new_opp->u_volt_max = u_volt + tol; new_opp->available = true; new_opp->dynamic = dynamic; - ret = _opp_add(dev, new_opp, dev_opp); + ret = _opp_add(dev, new_opp, opp_table); if (ret) goto free_opp; - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); /* * Notify the changes in the availability of the operable * frequency/voltage list. */ - srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); + srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp); return 0; free_opp: - _opp_remove(dev_opp, new_opp, false); + _opp_remove(opp_table, new_opp, false); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } /* TODO: Support multiple regulators */ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, - struct device_opp *dev_opp) + struct opp_table *opp_table) { u32 microvolt[3] = {0}; u32 val; @@ -1123,9 +1126,9 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, char name[NAME_MAX]; /* Search for "opp-microvolt-" */ - if (dev_opp->prop_name) { + if (opp_table->prop_name) { snprintf(name, sizeof(name), "opp-microvolt-%s", - dev_opp->prop_name); + opp_table->prop_name); prop = of_find_property(opp->np, name, NULL); } @@ -1171,9 +1174,9 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, /* Search for "opp-microamp-" */ prop = NULL; - if (dev_opp->prop_name) { + if (opp_table->prop_name) { snprintf(name, sizeof(name), "opp-microamp-%s", - dev_opp->prop_name); + opp_table->prop_name); prop = of_find_property(opp->np, name, NULL); } @@ -1200,7 +1203,7 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, * OPPs, which are available for those versions, based on its 'opp-supported-hw' * property. 
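[Illustrative sketch, not part of the original patch: the matching rule this enables, reduced to plain C. _opp_is_supported(), later in this patch, ANDs each level of the platform's version array (set via dev_pm_opp_set_supported_hw()) against the OPP's opp-supported-hw masks; every level must share at least one bit. All values are invented.]

	#include <stdbool.h>
	#include <stdio.h>

	static bool opp_supported(const unsigned int *platform_versions,
				  const unsigned int *opp_masks, int levels)
	{
		int i;

		for (i = 0; i < levels; i++)
			if (!(platform_versions[i] & opp_masks[i]))
				return false;
		return true;
	}

	int main(void)
	{
		unsigned int platform[2] = { 0x2, 0x4 };	/* versions this board has */
		unsigned int opp_a[2] = { 0x3, 0x6 };		/* overlaps on both levels */
		unsigned int opp_b[2] = { 0x1, 0x6 };		/* no overlap on level 0 */

		printf("opp_a supported: %d, opp_b supported: %d\n",
		       opp_supported(platform, opp_a, 2),
		       opp_supported(platform, opp_b, 2));
		return 0;
	}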
* - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1209,44 +1212,44 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count) { - struct device_opp *dev_opp; + struct opp_table *opp_table; int ret = 0; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - dev_opp = _add_device_opp(dev); - if (!dev_opp) { + opp_table = _add_opp_table(dev); + if (!opp_table) { ret = -ENOMEM; goto unlock; } - /* Make sure there are no concurrent readers while updating dev_opp */ - WARN_ON(!list_empty(&dev_opp->opp_list)); + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); - /* Do we already have a version hierarchy associated with dev_opp? */ - if (dev_opp->supported_hw) { + /* Do we already have a version hierarchy associated with opp_table? */ + if (opp_table->supported_hw) { dev_err(dev, "%s: Already have supported hardware list\n", __func__); ret = -EBUSY; goto err; } - dev_opp->supported_hw = kmemdup(versions, count * sizeof(*versions), + opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions), GFP_KERNEL); - if (!dev_opp->supported_hw) { + if (!opp_table->supported_hw) { ret = -ENOMEM; goto err; } - dev_opp->supported_hw_count = count; - mutex_unlock(&dev_opp_list_lock); + opp_table->supported_hw_count = count; + mutex_unlock(&opp_table_lock); return 0; err: - _remove_device_opp(dev_opp); + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } @@ -1257,10 +1260,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw); * @dev: Device for which supported-hw has to be put. * * This is required only for the V2 bindings, and is called for a matching - * dev_pm_opp_set_supported_hw(). Until this is called, the device_opp structure + * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure * will not be freed. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. 
Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1268,36 +1271,37 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw); */ void dev_pm_opp_put_supported_hw(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - /* Check for existing list for 'dev' first */ - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp)); + /* Check for existing table for 'dev' first */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "Failed to find opp_table: %ld\n", + PTR_ERR(opp_table)); goto unlock; } - /* Make sure there are no concurrent readers while updating dev_opp */ - WARN_ON(!list_empty(&dev_opp->opp_list)); + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); - if (!dev_opp->supported_hw) { + if (!opp_table->supported_hw) { dev_err(dev, "%s: Doesn't have supported hardware list\n", __func__); goto unlock; } - kfree(dev_opp->supported_hw); - dev_opp->supported_hw = NULL; - dev_opp->supported_hw_count = 0; + kfree(opp_table->supported_hw); + opp_table->supported_hw = NULL; + opp_table->supported_hw_count = 0; - /* Try freeing device_opp if this was the last blocking resource */ - _remove_device_opp(dev_opp); + /* Try freeing opp_table if this was the last blocking resource */ + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); } EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw); @@ -1311,7 +1315,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw); * which the extension will apply are opp-microvolt and opp-microamp. OPP core * should postfix the property name with - while looking for them. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1319,42 +1323,42 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw); */ int dev_pm_opp_set_prop_name(struct device *dev, const char *name) { - struct device_opp *dev_opp; + struct opp_table *opp_table; int ret = 0; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - dev_opp = _add_device_opp(dev); - if (!dev_opp) { + opp_table = _add_opp_table(dev); + if (!opp_table) { ret = -ENOMEM; goto unlock; } - /* Make sure there are no concurrent readers while updating dev_opp */ - WARN_ON(!list_empty(&dev_opp->opp_list)); + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); - /* Do we already have a prop-name associated with dev_opp? */ - if (dev_opp->prop_name) { + /* Do we already have a prop-name associated with opp_table? 
*/ + if (opp_table->prop_name) { dev_err(dev, "%s: Already have prop-name %s\n", __func__, - dev_opp->prop_name); + opp_table->prop_name); ret = -EBUSY; goto err; } - dev_opp->prop_name = kstrdup(name, GFP_KERNEL); - if (!dev_opp->prop_name) { + opp_table->prop_name = kstrdup(name, GFP_KERNEL); + if (!opp_table->prop_name) { ret = -ENOMEM; goto err; } - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return 0; err: - _remove_device_opp(dev_opp); + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } @@ -1365,10 +1369,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name); * @dev: Device for which the prop-name has to be put. * * This is required only for the V2 bindings, and is called for a matching - * dev_pm_opp_set_prop_name(). Until this is called, the device_opp structure + * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure * will not be freed. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1376,34 +1380,35 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name); */ void dev_pm_opp_put_prop_name(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - /* Check for existing list for 'dev' first */ - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp)); + /* Check for existing table for 'dev' first */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "Failed to find opp_table: %ld\n", + PTR_ERR(opp_table)); goto unlock; } - /* Make sure there are no concurrent readers while updating dev_opp */ - WARN_ON(!list_empty(&dev_opp->opp_list)); + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); - if (!dev_opp->prop_name) { + if (!opp_table->prop_name) { dev_err(dev, "%s: Doesn't have a prop-name\n", __func__); goto unlock; } - kfree(dev_opp->prop_name); - dev_opp->prop_name = NULL; + kfree(opp_table->prop_name); + opp_table->prop_name = NULL; - /* Try freeing device_opp if this was the last blocking resource */ - _remove_device_opp(dev_opp); + /* Try freeing opp_table if this was the last blocking resource */ + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); } EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); @@ -1417,7 +1422,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); * * This must be called before any OPPs are initialized for the device. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. 
Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1425,26 +1430,26 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); */ int dev_pm_opp_set_regulator(struct device *dev, const char *name) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct regulator *reg; int ret; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - dev_opp = _add_device_opp(dev); - if (!dev_opp) { + opp_table = _add_opp_table(dev); + if (!opp_table) { ret = -ENOMEM; goto unlock; } /* This should be called before OPPs are initialized */ - if (WARN_ON(!list_empty(&dev_opp->opp_list))) { + if (WARN_ON(!list_empty(&opp_table->opp_list))) { ret = -EBUSY; goto err; } /* Already have a regulator set */ - if (WARN_ON(!IS_ERR(dev_opp->regulator))) { + if (WARN_ON(!IS_ERR(opp_table->regulator))) { ret = -EBUSY; goto err; } @@ -1458,15 +1463,15 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name) goto err; } - dev_opp->regulator = reg; + opp_table->regulator = reg; - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return 0; err: - _remove_device_opp(dev_opp); + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } @@ -1476,7 +1481,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); * dev_pm_opp_put_regulator() - Releases resources blocked for regulator * @dev: Device for which regulator was set. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1484,44 +1489,45 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); */ void dev_pm_opp_put_regulator(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - /* Check for existing list for 'dev' first */ - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp)); + /* Check for existing table for 'dev' first */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "Failed to find opp_table: %ld\n", + PTR_ERR(opp_table)); goto unlock; } - if (IS_ERR(dev_opp->regulator)) { + if (IS_ERR(opp_table->regulator)) { dev_err(dev, "%s: Doesn't have regulator set\n", __func__); goto unlock; } - /* Make sure there are no concurrent readers while updating dev_opp */ - WARN_ON(!list_empty(&dev_opp->opp_list)); + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); - regulator_put(dev_opp->regulator); - dev_opp->regulator = ERR_PTR(-ENXIO); + regulator_put(opp_table->regulator); + opp_table->regulator = ERR_PTR(-ENXIO); - /* Try freeing device_opp if this was the last blocking resource */ - _remove_device_opp(dev_opp); + /* Try freeing opp_table if this was the last blocking resource */ + _remove_opp_table(opp_table); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); } EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator); -static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp, +static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, struct device_node *np) { - unsigned int count = 
dev_opp->supported_hw_count; + unsigned int count = opp_table->supported_hw_count; u32 version; int ret; - if (!dev_opp->supported_hw) + if (!opp_table->supported_hw) return true; while (count--) { @@ -1534,7 +1540,7 @@ static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp, } /* Both of these are bitwise masks of the versions */ - if (!(version & dev_opp->supported_hw[count])) + if (!(version & opp_table->supported_hw[count])) return false; } @@ -1546,11 +1552,11 @@ static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp, * @dev: device for which we do this operation * @np: device node * - * This function adds an opp definition to the opp list and returns status. The + * This function adds an opp definition to the opp table and returns status. The * opp can be controlled using dev_pm_opp_enable/disable functions and may be * removed by dev_pm_opp_remove. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1566,16 +1572,16 @@ static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp, */ static int _opp_add_static_v2(struct device *dev, struct device_node *np) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *new_opp; u64 rate; u32 val; int ret; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - new_opp = _allocate_opp(dev, &dev_opp); + new_opp = _allocate_opp(dev, &opp_table); if (!new_opp) { ret = -ENOMEM; goto unlock; @@ -1588,7 +1594,7 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np) } /* Check if the OPP supports hardware's hierarchy of versions or not */ - if (!_opp_is_supported(dev, dev_opp, np)) { + if (!_opp_is_supported(dev, opp_table, np)) { dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate); goto free_opp; } @@ -1608,30 +1614,30 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np) if (!of_property_read_u32(np, "clock-latency-ns", &val)) new_opp->clock_latency_ns = val; - ret = opp_parse_supplies(new_opp, dev, dev_opp); + ret = opp_parse_supplies(new_opp, dev, opp_table); if (ret) goto free_opp; - ret = _opp_add(dev, new_opp, dev_opp); + ret = _opp_add(dev, new_opp, opp_table); if (ret) goto free_opp; /* OPP to select on device suspend */ if (of_property_read_bool(np, "opp-suspend")) { - if (dev_opp->suspend_opp) { + if (opp_table->suspend_opp) { dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n", - __func__, dev_opp->suspend_opp->rate, + __func__, opp_table->suspend_opp->rate, new_opp->rate); } else { new_opp->suspend = true; - dev_opp->suspend_opp = new_opp; + opp_table->suspend_opp = new_opp; } } - if (new_opp->clock_latency_ns > dev_opp->clock_latency_ns_max) - dev_opp->clock_latency_ns_max = new_opp->clock_latency_ns; + if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max) + opp_table->clock_latency_ns_max = new_opp->clock_latency_ns; - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n", __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt, @@ -1642,13 +1648,13 @@ static int 
_opp_add_static_v2(struct device *dev, struct device_node *np) * Notify the changes in the availability of the operable * frequency/voltage list. */ - srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); + srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp); return 0; free_opp: - _opp_remove(dev_opp, new_opp, false); + _opp_remove(opp_table, new_opp, false); unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } @@ -1658,11 +1664,11 @@ unlock: * @freq: Frequency in Hz for this OPP * @u_volt: Voltage in uVolts for this OPP * - * This function adds an opp definition to the opp list and returns status. + * This function adds an opp definition to the opp table and returns status. * The opp is made available by default and it can be controlled using * dev_pm_opp_enable/disable functions. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1694,7 +1700,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_add); * copy operation, returns 0 if no modification was done OR modification was * successful. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks to * keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1703,7 +1709,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_add); static int _opp_set_availability(struct device *dev, unsigned long freq, bool availability_req) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV); int r = 0; @@ -1712,18 +1718,18 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, if (!new_opp) return -ENOMEM; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - /* Find the device_opp */ - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - r = PTR_ERR(dev_opp); + /* Find the opp_table */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + r = PTR_ERR(opp_table); dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); goto unlock; } /* Do we have the frequency? 
*/ - list_for_each_entry(tmp_opp, &dev_opp->opp_list, node) { + list_for_each_entry(tmp_opp, &opp_table->opp_list, node) { if (tmp_opp->rate == freq) { opp = tmp_opp; break; @@ -1744,21 +1750,21 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, new_opp->available = availability_req; list_replace_rcu(&opp->node, &new_opp->node); - mutex_unlock(&dev_opp_list_lock); - call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu); + mutex_unlock(&opp_table_lock); + call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu); /* Notify the change of the OPP availability */ if (availability_req) - srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ENABLE, - new_opp); + srcu_notifier_call_chain(&opp_table->srcu_head, + OPP_EVENT_ENABLE, new_opp); else - srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_DISABLE, - new_opp); + srcu_notifier_call_chain(&opp_table->srcu_head, + OPP_EVENT_DISABLE, new_opp); return 0; unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); kfree(new_opp); return r; } @@ -1772,7 +1778,7 @@ unlock: * corresponding error value. It is meant to be used for users an OPP available * after being temporarily made unavailable with dev_pm_opp_disable. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function indirectly uses RCU and mutex locks to keep the * integrity of the internal data structures. Callers should ensure that * this function is *NOT* called under RCU protection or in contexts where @@ -1798,7 +1804,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_enable); * control by users to make this OPP not available until the circumstances are * right to make it available again (with a call to dev_pm_opp_enable). * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function indirectly uses RCU and mutex locks to keep the * integrity of the internal data structures. Callers should ensure that * this function is *NOT* called under RCU protection or in contexts where @@ -1816,26 +1822,26 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_disable); /** * dev_pm_opp_get_notifier() - find notifier_head of the device with opp - * @dev: device pointer used to lookup device OPPs. + * @dev: device pointer used to lookup OPP table. * * Return: pointer to notifier head if found, otherwise -ENODEV or * -EINVAL based on type of error casted as pointer. value must be checked * with IS_ERR to determine valid pointer or error result. * - * Locking: This function must be called under rcu_read_lock(). dev_opp is a RCU - * protected pointer. The reason for the same is that the opp pointer which is - * returned will remain valid for use with opp_get_{voltage, freq} only while + * Locking: This function must be called under rcu_read_lock(). opp_table is a + * RCU protected pointer. The reason for the same is that the opp pointer which + * is returned will remain valid for use with opp_get_{voltage, freq} only while * under the locked area. The pointer returned must be used prior to unlocking * with rcu_read_unlock() to maintain the integrity of the pointer. 
*/ struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev) { - struct device_opp *dev_opp = _find_device_opp(dev); + struct opp_table *opp_table = _find_opp_table(dev); - if (IS_ERR(dev_opp)) - return ERR_CAST(dev_opp); /* matching type */ + if (IS_ERR(opp_table)) + return ERR_CAST(opp_table); /* matching type */ - return &dev_opp->srcu_head; + return &opp_table->srcu_head; } EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier); @@ -1843,11 +1849,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier); /** * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT * entries - * @dev: device pointer used to lookup device OPPs. + * @dev: device pointer used to lookup OPP table. * * Free OPPs created using static entries present in DT. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function indirectly uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where @@ -1855,38 +1861,38 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier); */ void dev_pm_opp_of_remove_table(struct device *dev) { - struct device_opp *dev_opp; + struct opp_table *opp_table; struct dev_pm_opp *opp, *tmp; - /* Hold our list modification lock here */ - mutex_lock(&dev_opp_list_lock); + /* Hold our table modification lock here */ + mutex_lock(&opp_table_lock); - /* Check for existing list for 'dev' */ - dev_opp = _find_device_opp(dev); - if (IS_ERR(dev_opp)) { - int error = PTR_ERR(dev_opp); + /* Check for existing table for 'dev' */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + int error = PTR_ERR(opp_table); if (error != -ENODEV) - WARN(1, "%s: dev_opp: %d\n", + WARN(1, "%s: opp_table: %d\n", IS_ERR_OR_NULL(dev) ? 
"Invalid device" : dev_name(dev), error); goto unlock; } - /* Find if dev_opp manages a single device */ - if (list_is_singular(&dev_opp->dev_list)) { + /* Find if opp_table manages a single device */ + if (list_is_singular(&opp_table->dev_list)) { /* Free static OPPs */ - list_for_each_entry_safe(opp, tmp, &dev_opp->opp_list, node) { + list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) { if (!opp->dynamic) - _opp_remove(dev_opp, opp, true); + _opp_remove(opp_table, opp, true); } } else { - _remove_list_dev(_find_list_dev(dev, dev_opp), dev_opp); + _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table); } unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); @@ -1907,22 +1913,22 @@ struct device_node *_of_get_opp_desc_node(struct device *dev) static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) { struct device_node *np; - struct device_opp *dev_opp; + struct opp_table *opp_table; int ret = 0, count = 0; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - dev_opp = _managed_opp(opp_np); - if (dev_opp) { + opp_table = _managed_opp(opp_np); + if (opp_table) { /* OPPs are already managed */ - if (!_add_list_dev(dev, dev_opp)) + if (!_add_opp_dev(dev, opp_table)) ret = -ENOMEM; - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); - /* We have opp-list node now, iterate over it and add OPPs */ + /* We have opp-table node now, iterate over it and add OPPs */ for_each_available_child_of_node(opp_np, np) { count++; @@ -1938,19 +1944,19 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) if (WARN_ON(!count)) return -ENOENT; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - dev_opp = _find_device_opp(dev); - if (WARN_ON(IS_ERR(dev_opp))) { - ret = PTR_ERR(dev_opp); - mutex_unlock(&dev_opp_list_lock); + opp_table = _find_opp_table(dev); + if (WARN_ON(IS_ERR(opp_table))) { + ret = PTR_ERR(opp_table); + mutex_unlock(&opp_table_lock); goto free_table; } - dev_opp->np = opp_np; - dev_opp->shared_opp = of_property_read_bool(opp_np, "opp-shared"); + opp_table->np = opp_np; + opp_table->shared_opp = of_property_read_bool(opp_np, "opp-shared"); - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return 0; @@ -1979,7 +1985,7 @@ static int _of_add_opp_table_v1(struct device *dev) */ nr = prop->length / sizeof(u32); if (nr % 2) { - dev_err(dev, "%s: Invalid OPP list\n", __func__); + dev_err(dev, "%s: Invalid OPP table\n", __func__); return -EINVAL; } @@ -1999,11 +2005,11 @@ static int _of_add_opp_table_v1(struct device *dev) /** * dev_pm_opp_of_add_table() - Initialize opp table from device tree - * @dev: device pointer used to lookup device OPPs. + * @dev: device pointer used to lookup OPP table. * * Register the initial OPP table with the OPP library for given device. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function indirectly uses RCU updater strategy with mutex locks * to keep the integrity of the internal data structures. 
Callers should ensure * that this function is *NOT* called under RCU protection or in contexts where diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c index 9f0c155..ba2bdbd 100644 --- a/drivers/base/power/opp/cpu.c +++ b/drivers/base/power/opp/cpu.c @@ -31,7 +31,7 @@ * @table: Cpufreq table returned back to caller * * Generate a cpufreq table for a provided device- this assumes that the - * opp list is already initialized and ready for usage. + * opp table is already initialized and ready for usage. * * This function allocates required memory for the cpufreq table. It is * expected that the caller does the required maintenance such as freeing @@ -44,7 +44,7 @@ * WARNING: It is important for the callers to ensure refreshing their copy of * the table if any of the mentioned functions have been invoked in the interim. * - * Locking: The internal device_opp and opp structures are RCU protected. + * Locking: The internal opp_table and opp structures are RCU protected. * Since we just use the regular accessor functions to access the internal data * structures, we use RCU read lock inside this function. As a result, users of * this function DONOT need to use explicit locks for invoking. @@ -122,15 +122,15 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table); /* Required only for V1 bindings, as v2 can manage it from DT itself */ int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask) { - struct device_list_opp *list_dev; - struct device_opp *dev_opp; + struct opp_device *opp_dev; + struct opp_table *opp_table; struct device *dev; int cpu, ret = 0; - mutex_lock(&dev_opp_list_lock); + mutex_lock(&opp_table_lock); - dev_opp = _find_device_opp(cpu_dev); - if (IS_ERR(dev_opp)) { + opp_table = _find_opp_table(cpu_dev); + if (IS_ERR(opp_table)) { ret = -EINVAL; goto unlock; } @@ -146,15 +146,15 @@ int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask) continue; } - list_dev = _add_list_dev(dev, dev_opp); - if (!list_dev) { - dev_err(dev, "%s: failed to add list-dev for cpu%d device\n", + opp_dev = _add_opp_dev(dev, opp_table); + if (!opp_dev) { + dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n", __func__, cpu); continue; } } unlock: - mutex_unlock(&dev_opp_list_lock); + mutex_unlock(&opp_table_lock); return ret; } diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c index ddfe477..ef1ae6b 100644 --- a/drivers/base/power/opp/debugfs.c +++ b/drivers/base/power/opp/debugfs.c @@ -34,9 +34,9 @@ void opp_debug_remove_one(struct dev_pm_opp *opp) debugfs_remove_recursive(opp->dentry); } -int opp_debug_create_one(struct dev_pm_opp *opp, struct device_opp *dev_opp) +int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { - struct dentry *pdentry = dev_opp->dentry; + struct dentry *pdentry = opp_table->dentry; struct dentry *d; char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */ @@ -83,52 +83,52 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct device_opp *dev_opp) return 0; } -static int device_opp_debug_create_dir(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static int opp_list_debug_create_dir(struct opp_device *opp_dev, + struct opp_table *opp_table) { - const struct device *dev = list_dev->dev; + const struct device *dev = opp_dev->dev; struct dentry *d; - opp_set_dev_name(dev, dev_opp->dentry_name); + opp_set_dev_name(dev, opp_table->dentry_name); /* Create device specific directory */ - d = debugfs_create_dir(dev_opp->dentry_name, rootdir); 
+ d = debugfs_create_dir(opp_table->dentry_name, rootdir); if (!d) { dev_err(dev, "%s: Failed to create debugfs dir\n", __func__); return -ENOMEM; } - list_dev->dentry = d; - dev_opp->dentry = d; + opp_dev->dentry = d; + opp_table->dentry = d; return 0; } -static int device_opp_debug_create_link(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static int opp_list_debug_create_link(struct opp_device *opp_dev, + struct opp_table *opp_table) { - const struct device *dev = list_dev->dev; + const struct device *dev = opp_dev->dev; char name[NAME_MAX]; struct dentry *d; - opp_set_dev_name(list_dev->dev, name); + opp_set_dev_name(opp_dev->dev, name); /* Create device specific directory link */ - d = debugfs_create_symlink(name, rootdir, dev_opp->dentry_name); + d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name); if (!d) { dev_err(dev, "%s: Failed to create link\n", __func__); return -ENOMEM; } - list_dev->dentry = d; + opp_dev->dentry = d; return 0; } /** * opp_debug_register - add a device opp node to the debugfs 'opp' directory - * @list_dev: list-dev pointer for device - * @dev_opp: the device-opp being added + * @opp_dev: opp-dev pointer for device + * @opp_table: the device-opp being added * * Dynamically adds device specific directory in debugfs 'opp' directory. If the * device-opp is shared with other devices, then links will be created for all @@ -136,73 +136,72 @@ static int device_opp_debug_create_link(struct device_list_opp *list_dev, * * Return: 0 on success, otherwise negative error. */ -int opp_debug_register(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table) { if (!rootdir) { pr_debug("%s: Uninitialized rootdir\n", __func__); return -EINVAL; } - if (dev_opp->dentry) - return device_opp_debug_create_link(list_dev, dev_opp); + if (opp_table->dentry) + return opp_list_debug_create_link(opp_dev, opp_table); - return device_opp_debug_create_dir(list_dev, dev_opp); + return opp_list_debug_create_dir(opp_dev, opp_table); } -static void opp_migrate_dentry(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static void opp_migrate_dentry(struct opp_device *opp_dev, + struct opp_table *opp_table) { - struct device_list_opp *new_dev; + struct opp_device *new_dev; const struct device *dev; struct dentry *dentry; - /* Look for next list-dev */ - list_for_each_entry(new_dev, &dev_opp->dev_list, node) - if (new_dev != list_dev) + /* Look for next opp-dev */ + list_for_each_entry(new_dev, &opp_table->dev_list, node) + if (new_dev != opp_dev) break; /* new_dev is guaranteed to be valid here */ dev = new_dev->dev; debugfs_remove_recursive(new_dev->dentry); - opp_set_dev_name(dev, dev_opp->dentry_name); + opp_set_dev_name(dev, opp_table->dentry_name); - dentry = debugfs_rename(rootdir, list_dev->dentry, rootdir, - dev_opp->dentry_name); + dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir, + opp_table->dentry_name); if (!dentry) { dev_err(dev, "%s: Failed to rename link from: %s to %s\n", - __func__, dev_name(list_dev->dev), dev_name(dev)); + __func__, dev_name(opp_dev->dev), dev_name(dev)); return; } new_dev->dentry = dentry; - dev_opp->dentry = dentry; + opp_table->dentry = dentry; } /** * opp_debug_unregister - remove a device opp node from debugfs opp directory - * @list_dev: list-dev pointer for device - * @dev_opp: the device-opp being removed + * @opp_dev: opp-dev pointer for device + * @opp_table: the device-opp being removed * * Dynamically removes 
device specific directory from debugfs 'opp' directory. */ -void opp_debug_unregister(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +void opp_debug_unregister(struct opp_device *opp_dev, + struct opp_table *opp_table) { - if (list_dev->dentry == dev_opp->dentry) { + if (opp_dev->dentry == opp_table->dentry) { /* Move the real dentry object under another device */ - if (!list_is_singular(&dev_opp->dev_list)) { - opp_migrate_dentry(list_dev, dev_opp); + if (!list_is_singular(&opp_table->dev_list)) { + opp_migrate_dentry(opp_dev, opp_table); goto out; } - dev_opp->dentry = NULL; + opp_table->dentry = NULL; } - debugfs_remove_recursive(list_dev->dentry); + debugfs_remove_recursive(opp_dev->dentry); out: - list_dev->dentry = NULL; + opp_dev->dentry = NULL; } static int __init opp_debug_init(void) diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index 4f1bdfc..f67f806 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -26,12 +26,12 @@ struct clk; struct regulator; /* Lock to allow exclusive modification to the device and opp lists */ -extern struct mutex dev_opp_list_lock; +extern struct mutex opp_table_lock; /* * Internal data structure organization with the OPP layer library is as * follows: - * dev_opp_list (root) + * opp_tables (root) * |- device 1 (represents voltage domain 1) * | |- opp 1 (availability, freq, voltage) * | |- opp 2 .. @@ -40,18 +40,18 @@ extern struct mutex dev_opp_list_lock; * |- device 2 (represents the next voltage domain) * ... * `- device m (represents mth voltage domain) - * device 1, 2.. are represented by dev_opp structure while each opp + * device 1, 2.. are represented by opp_table structure while each opp * is represented by the opp structure. */ /** * struct dev_pm_opp - Generic OPP description structure - * @node: opp list node. The nodes are maintained throughout the lifetime + * @node: opp table node. The nodes are maintained throughout the lifetime * of boot. It is expected only an optimal set of OPPs are * added to the library by the SoC framework. - * RCU usage: opp list is traversed with RCU locks. node + * RCU usage: opp table is traversed with RCU locks. node * modification is possible realtime, hence the modifications - * are protected by the dev_opp_list_lock for integrity. + * are protected by the opp_table_lock for integrity. * IMPORTANT: the opp nodes should be maintained in increasing * order. * @available: true/false - marks if this OPP as available or not @@ -65,7 +65,7 @@ extern struct mutex dev_opp_list_lock; * @u_amp: Maximum current drawn by the device in microamperes * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's * frequency from any other OPP's frequency. - * @dev_opp: points back to the device_opp struct this opp belongs to + * @opp_table: points back to the opp_table struct this opp belongs to * @rcu_head: RCU callback head used for deferred freeing * @np: OPP's device node. 
* @dentry: debugfs dentry pointer (per opp) @@ -87,7 +87,7 @@ struct dev_pm_opp { unsigned long u_amp; unsigned long clock_latency_ns; - struct device_opp *dev_opp; + struct opp_table *opp_table; struct rcu_head rcu_head; struct device_node *np; @@ -98,16 +98,16 @@ struct dev_pm_opp { }; /** - * struct device_list_opp - devices managed by 'struct device_opp' + * struct opp_device - devices managed by 'struct opp_table' * @node: list node * @dev: device to which the struct object belongs * @rcu_head: RCU callback head used for deferred freeing * @dentry: debugfs dentry pointer (per device) * - * This is an internal data structure maintaining the list of devices that are - * managed by 'struct device_opp'. + * This is an internal data structure maintaining the devices that are managed + * by 'struct opp_table'. */ -struct device_list_opp { +struct opp_device { struct list_head node; const struct device *dev; struct rcu_head rcu_head; @@ -118,16 +118,16 @@ struct device_list_opp { }; /** - * struct device_opp - Device opp structure - * @node: list node - contains the devices with OPPs that + * struct opp_table - Device opp structure + * @node: table node - contains the devices with OPPs that * have been registered. Nodes once added are not modified in this - * list. - * RCU usage: nodes are not modified in the list of device_opp, - * however addition is possible and is secured by dev_opp_list_lock + * table. + * RCU usage: nodes are not modified in the table of opp_table, + * however addition is possible and is secured by opp_table_lock * @srcu_head: notifier head to notify the OPP availability changes. * @rcu_head: RCU callback head used for deferred freeing * @dev_list: list of devices that share these OPPs - * @opp_list: list of opps + * @opp_list: table of opps * @np: struct device_node pointer for opp's DT node. * @clock_latency_ns_max: Max clock latency in nanoseconds. * @shared_opp: OPP is shared between multiple devices. @@ -150,7 +150,7 @@ struct device_list_opp { * need to wait for the grace period of both of them before freeing any * resources. And so we have used kfree_rcu() from within call_srcu() handlers. 
*/ -struct device_opp { +struct opp_table { struct list_head node; struct srcu_notifier_head srcu_head; @@ -180,30 +180,27 @@ struct device_opp { }; /* Routines internal to opp core */ -struct device_opp *_find_device_opp(struct device *dev); -struct device_list_opp *_add_list_dev(const struct device *dev, - struct device_opp *dev_opp); +struct opp_table *_find_opp_table(struct device *dev); +struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); struct device_node *_of_get_opp_desc_node(struct device *dev); #ifdef CONFIG_DEBUG_FS void opp_debug_remove_one(struct dev_pm_opp *opp); -int opp_debug_create_one(struct dev_pm_opp *opp, struct device_opp *dev_opp); -int opp_debug_register(struct device_list_opp *list_dev, - struct device_opp *dev_opp); -void opp_debug_unregister(struct device_list_opp *list_dev, - struct device_opp *dev_opp); +int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table); +int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table); +void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table); #else static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {} static inline int opp_debug_create_one(struct dev_pm_opp *opp, - struct device_opp *dev_opp) + struct opp_table *opp_table) { return 0; } -static inline int opp_debug_register(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static inline int opp_debug_register(struct opp_device *opp_dev, + struct opp_table *opp_table) { return 0; } -static inline void opp_debug_unregister(struct device_list_opp *list_dev, - struct device_opp *dev_opp) +static inline void opp_debug_unregister(struct opp_device *opp_dev, + struct opp_table *opp_table) { } #endif /* DEBUG_FS */ -- cgit v0.10.2 From db62fda318a6b9082ee9d230be8b45e56b6545c0 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 17 Feb 2016 11:54:19 +0000 Subject: ACPI / processor: add support for ACPI0010 processor container ACPI 6.0 adds support for an optional processor container device which may contain child objects that are either processor devices or other processor containers. This allows representing hierarchical processor topologies. It is declared using the _HID of ACPI0010. It is an abstract container used to represent CPU topology and should not be used for hotplug purposes. If no matching handler is found for a device in acpi_scan_attach_handler(), acpi_bus_attach() does a default enumeration for those devices with a valid HID in the ACPI namespace. This patch adds a scan handler for these ACPI processor containers to avoid that default enumeration and ensures that platform devices are not created for them. Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki
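For orientation, the scan-handler convention the patch below relies on, as a minimal hypothetical sketch (the example_* names are invented for illustration): an .attach callback that returns a positive value tells acpi_bus_attach() that the device has been claimed, so the default enumeration, and hence platform device creation, is skipped for it.

/* Illustrative sketch only, not part of the patch. */
static int example_container_attach(struct acpi_device *dev,
				    const struct acpi_device_id *id)
{
	return 1;	/* positive: device handled, stop further probing */
}

static const struct acpi_device_id example_container_ids[] = {
	{ "ACPI0010", },	/* processor container _HID */
	{ }			/* table terminator */
};

static struct acpi_scan_handler example_container_handler = {
	.ids = example_container_ids,
	.attach = example_container_attach,
};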
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 6979186..b5e54f2 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -514,7 +514,24 @@ static struct acpi_scan_handler processor_handler = { }, }; +static int acpi_processor_container_attach(struct acpi_device *dev, + const struct acpi_device_id *id) +{ + return 1; +} + +static const struct acpi_device_id processor_container_ids[] = { + { ACPI_PROCESSOR_CONTAINER_HID, }, + { } +}; + +static struct acpi_scan_handler processor_container_handler = { + .ids = processor_container_ids, + .attach = acpi_processor_container_attach, +}; + void __init acpi_processor_init(void) { acpi_scan_add_handler_with_hotplug(&processor_handler, "processor"); + acpi_scan_add_handler(&processor_container_handler); } diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 07fb100..54d7860 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -9,6 +9,7 @@ #define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_DEVICE_NAME "Processor" #define ACPI_PROCESSOR_DEVICE_HID "ACPI0007" +#define ACPI_PROCESSOR_CONTAINER_HID "ACPI0010" #define ACPI_PROCESSOR_BUSY_METRIC 10 -- cgit v0.10.2 From 504997cf96fabf67b4768b7a8ca1a1b622abd839 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 17 Feb 2016 12:03:23 +0000 Subject: ACPI / sleep: move acpi_processor_sleep to sleep.c acpi_processor_sleep is neither related to nor used by the CPUIdle framework. It's used in the system suspend/resume path as a syscore operation. It makes more sense to move it to acpi/sleep.c where all the S-state transition (a.k.a. Linux system suspend/hibernate) related code is present. Also make it depend on CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT so that it's not compiled on architectures like ARM64 where S-states are not yet defined in ACPI. Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki
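For context, a minimal sketch of the syscore pattern being moved here (illustrative only, not part of the patch; the example_* names are invented): syscore callbacks run late in suspend and early in resume, on a single CPU with interrupts disabled, so they must not sleep or take sleeping locks.

#include <linux/syscore_ops.h>

static u32 example_saved_reg;

/* Runs on the last active CPU with interrupts disabled; must not sleep. */
static int example_suspend(void)
{
	example_saved_reg = 0;	/* read and stash hardware state here */
	return 0;		/* a non-zero return aborts the transition */
}

static void example_resume(void)
{
	/* write example_saved_reg back to the hardware here */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_suspend,
	.resume = example_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}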
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 11154a3..d2fa8cb 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -314,7 +314,6 @@ static int __init acpi_processor_driver_init(void) if (result < 0) return result; - acpi_processor_syscore_init(); register_hotcpu_notifier(&acpi_cpu_notifier); acpi_thermal_cpufreq_init(); acpi_processor_ppc_init(); @@ -330,7 +329,6 @@ static void __exit acpi_processor_driver_exit(void) acpi_processor_ppc_exit(); acpi_thermal_cpufreq_exit(); unregister_hotcpu_notifier(&acpi_cpu_notifier); - acpi_processor_syscore_exit(); driver_unregister(&acpi_processor_driver); } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index e057aa8..fadce35 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -31,7 +31,6 @@ #include /* need_resched() */ #include #include -#include #include /* @@ -193,42 +192,6 @@ static void lapic_timer_state_broadcast(struct acpi_processor *pr, #endif -#ifdef CONFIG_PM_SLEEP -static u32 saved_bm_rld; - -static int acpi_processor_suspend(void) -{ - acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &saved_bm_rld); - return 0; -} - -static void acpi_processor_resume(void) -{ - u32 resumed_bm_rld = 0; - - acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &resumed_bm_rld); - if (resumed_bm_rld == saved_bm_rld) - return; - - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, saved_bm_rld); -} - -static struct syscore_ops acpi_processor_syscore_ops = { - .suspend = acpi_processor_suspend, - .resume = acpi_processor_resume, -}; - -void acpi_processor_syscore_init(void) -{ - register_syscore_ops(&acpi_processor_syscore_ops); -} - -void acpi_processor_syscore_exit(void) -{ - unregister_syscore_ops(&acpi_processor_syscore_ops); -} -#endif /* CONFIG_PM_SLEEP */ - #if defined(CONFIG_X86) static void tsc_check_state(int state) { diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 9cb9752..fbfcce3 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -677,6 +678,39 @@ static void acpi_sleep_suspend_setup(void) static inline void acpi_sleep_suspend_setup(void) {} #endif /* !CONFIG_SUSPEND */ +#ifdef CONFIG_PM_SLEEP +static u32 saved_bm_rld; + +static int acpi_save_bm_rld(void) +{ + acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &saved_bm_rld); + return 0; +} + +static void acpi_restore_bm_rld(void) +{ + u32 resumed_bm_rld = 0; + + acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &resumed_bm_rld); + if (resumed_bm_rld == saved_bm_rld) + return; + + acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, saved_bm_rld); +} + +static struct syscore_ops acpi_sleep_syscore_ops = { + .suspend = acpi_save_bm_rld, + .resume = acpi_restore_bm_rld, +}; + +void acpi_sleep_syscore_init(void) +{ + register_syscore_ops(&acpi_sleep_syscore_ops); +} +#else +static inline void acpi_sleep_syscore_init(void) {} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HIBERNATION static unsigned long s4_hardware_signature; static struct acpi_table_facs *facs; @@ -839,6 +873,7 @@ int __init acpi_sleep_init(void) sleep_states[ACPI_STATE_S0] = 1; + acpi_sleep_syscore_init(); acpi_sleep_suspend_setup(); acpi_sleep_hibernate_setup(); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 54d7860..6f1805d 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -395,14 +395,6 @@ static inline int acpi_processor_hotplug(struct
acpi_processor *pr) } #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ -#if defined(CONFIG_PM_SLEEP) & defined(CONFIG_ACPI_PROCESSOR_IDLE) -void acpi_processor_syscore_init(void); -void acpi_processor_syscore_exit(void); -#else -static inline void acpi_processor_syscore_init(void) {} -static inline void acpi_processor_syscore_exit(void) {} -#endif - /* in processor_thermal.c */ int acpi_processor_get_limit_info(struct acpi_processor *pr); extern const struct thermal_cooling_device_ops processor_cooling_ops; -- cgit v0.10.2 From 63af4055726a56e04caf354aac58478d2af07ce8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 20 Feb 2016 21:50:01 -0600 Subject: cpufreq: fix comment about return value of cpufreq_register_driver() The comment has been incorrect since commit 4dea5806d332 ("cpufreq: return EEXIST instead of EBUSY for second registering"). Signed-off-by: Eric Biggers Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 78a262f..d84aff1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2378,7 +2378,7 @@ EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); * submitted by the CPU Frequency driver. * * Registers a CPU Frequency driver to this core code. This code - * returns zero on success, -EBUSY when another driver got here first + * returns zero on success, -EEXIST when another driver got here first * (and isn't unregistered in the meantime). * */ -- cgit v0.10.2 From fd7dc7e6b6521453d1170dc1ed50320024b5ba9d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 21 Feb 2016 12:53:12 -0600 Subject: cpufreq: simplify for_each_suitable_policy() macro Signed-off-by: Eric Biggers Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d84aff1..9be6543 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -38,48 +38,10 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) return cpumask_empty(policy->cpus); } -static bool suitable_policy(struct cpufreq_policy *policy, bool active) -{ - return active == !policy_is_inactive(policy); -} - -/* Finds Next Acive/Inactive policy */ -static struct cpufreq_policy *next_policy(struct cpufreq_policy *policy, - bool active) -{ - do { - /* No more policies in the list */ - if (list_is_last(&policy->policy_list, &cpufreq_policy_list)) - return NULL; - - policy = list_next_entry(policy, policy_list); - } while (!suitable_policy(policy, active)); - - return policy; -} - -static struct cpufreq_policy *first_policy(bool active) -{ - struct cpufreq_policy *policy; - - /* No policies in the list */ - if (list_empty(&cpufreq_policy_list)) - return NULL; - - policy = list_first_entry(&cpufreq_policy_list, typeof(*policy), - policy_list); - - if (!suitable_policy(policy, active)) - policy = next_policy(policy, active); - - return policy; -} - /* Macros to iterate over CPU policies */ -#define for_each_suitable_policy(__policy, __active) \ - for (__policy = first_policy(__active); \ - __policy; \ - __policy = next_policy(__policy, __active)) +#define for_each_suitable_policy(__policy, __active) \ + list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \ + if ((__active) == !policy_is_inactive(__policy)) #define for_each_active_policy(__policy) \ for_each_suitable_policy(__policy, true) -- cgit v0.10.2
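The rewritten macro above relies on a common kernel idiom: a list-iteration macro immediately followed by a bare "if", so that the statement placed after the macro invocation becomes the body of that "if" and only executes for matching entries. A generic sketch of the same shape (illustrative only; the struct and names are invented):

#include <linux/list.h>

struct example_item {
	struct list_head node;
	bool active;
};

static LIST_HEAD(example_list);

/*
 * Iterate the whole list and let the trailing "if" filter entries.
 * Note: an "else" placed right after the loop body would bind to this
 * hidden "if", which is why such macros must be used with care.
 */
#define for_each_example_item(__item, __active)			\
	list_for_each_entry(__item, &example_list, node)		\
		if ((__active) == (__item)->active)

static unsigned int example_count(bool active)
{
	struct example_item *item;
	unsigned int count = 0;

	for_each_example_item(item, active)
		count++;

	return count;
}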
From 41cfd64cf49fc84837341732a142f3d4cdc1e83a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 10:27:46 +0530 Subject: intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. the ->set_policy() callback and the sysfs update path for the files present in the /sys/devices/system/cpu/intel_pstate/ directory. While an update via the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), an update via the ->set_policy() callback applies to the smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of the policy->cpus mask when it is called from the ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For the ->set_policy() path, we are only concerned about policy->cpus, and so the policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep warning reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order, causing an ABBA deadlock. The sequence of events leading to that was:

intel_pstate_init(...)
	...cpufreq_online(...)
		down_write(&policy->rwsem); // Locks policy->rwsem
		...
		cpufreq_init_policy(policy);
			...intel_pstate_hwp_set();
				get_online_cpus(); // Temporarily locks cpu_hotplug.lock
		...
		up_write(&policy->rwsem);

pm_suspend(...)
	...disable_nonboot_cpus()
		_cpu_down()
			cpu_hotplug_begin(); // Locks cpu_hotplug.lock
			__cpu_notify(CPU_DOWN_PREPARE, ...);
				...cpufreq_offline_prepare();
					down_write(&policy->rwsem); // Locks policy->rwsem

Reported-and-tested-by: Joonas Lahtinen Signed-off-by: Viresh Kumar Reviewed-by: Joonas Lahtinen Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki
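The shape of the fix, reduced to a generic sketch (illustrative, not the driver code; the example_* names are invented): push the cpumask down into the worker, and keep the get_online_cpus()/put_online_cpus() pair only in the wrapper used by callers that really need to cover every online CPU.

#include <linux/cpu.h>
#include <linux/cpumask.h>

/* Worker: the caller chooses the CPUs and provides the locking. */
static void example_update_cpus(const struct cpumask *cpumask)
{
	int cpu;

	for_each_cpu(cpu, cpumask) {
		/* per-CPU register/MSR update would go here */
	}
}

/* Sysfs path: no policy lock is held, so pin the online CPU set. */
static void example_update_online_cpus(void)
{
	get_online_cpus();
	example_update_cpus(cpu_online_mask);
	put_online_cpus();
}

/*
 * The ->set_policy() path instead calls example_update_cpus(policy->cpus)
 * directly: policy->rwsem, already held by the core, protects policy->cpus.
 */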
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d47..e856776 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -286,7 +286,7 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); } -static void intel_pstate_hwp_set(void) +static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; @@ -296,9 +296,7 @@ static void intel_pstate_hwp_set(void) hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, cpumask) { rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range; @@ -317,7 +315,12 @@ static void intel_pstate_hwp_set(void) value |= HWP_MAX_PERF(max); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } +} +static void intel_pstate_hwp_set_online_cpus(void) +{ + get_online_cpus(); + intel_pstate_hwp_set(cpu_online_mask); put_online_cpus(); } @@ -439,7 +442,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, limits->no_turbo = clamp_t(int, input, 0, 1); if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus(); return count; } @@ -465,7 +468,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus(); return count; } @@ -490,7 +493,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, int_tofp(100)); if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set_online_cpus(); return count; } @@ -1141,7 +1144,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set(policy->cpus); return 0; } @@ -1173,7 +1176,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) int_tofp(100)); if (hwp_active) - intel_pstate_hwp_set(); + intel_pstate_hwp_set(policy->cpus); return 0; } -- cgit v0.10.2 From 3008ea9b38a4c002791d0a6a2b12f672456eb076 Mon Sep 17 00:00:00 2001 From: waddlesplash Date: Fri, 19 Feb 2016 14:16:03 +0800 Subject: ACPICA: aclocal: Put parens around some definitions. ACPICA commit 7100a109f7d6523330d29f4d088cf1ffb756025f Looking at where these are used, this shouldn't result in any behavioral changes, but it's best practice to have them. Link: https://github.com/acpica/acpica/commit/7100a109 Signed-off-by: waddlesplash Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki
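The usual motivation for parenthesizing such definitions, as a quick illustration (not taken from the patch; the EXAMPLE_* names are invented): without the parentheses, operator precedence at the use site can silently change the result.

#define EXAMPLE_MAX		5
#define EXAMPLE_NUM_BAD		EXAMPLE_MAX+1	/* unparenthesized */
#define EXAMPLE_NUM_GOOD	(EXAMPLE_MAX+1)	/* parenthesized */

/*
 * 2 * EXAMPLE_NUM_BAD  expands to 2 * 5 + 1 == 11 (wrong)
 * 2 * EXAMPLE_NUM_GOOD expands to 2 * (5+1) == 12 (intended)
 */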
diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index e4977fa..9562a10 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -85,7 +85,7 @@ union acpi_parse_object; #define ACPI_MTX_MEMORY 5 /* Debug memory tracking lists */ #define ACPI_MAX_MUTEX 5 -#define ACPI_NUM_MUTEX ACPI_MAX_MUTEX+1 +#define ACPI_NUM_MUTEX (ACPI_MAX_MUTEX+1) /* Lock structure for reader/writer interfaces */ @@ -103,11 +103,11 @@ struct acpi_rw_lock { #define ACPI_LOCK_HARDWARE 1 #define ACPI_MAX_LOCK 1 -#define ACPI_NUM_LOCK ACPI_MAX_LOCK+1 +#define ACPI_NUM_LOCK (ACPI_MAX_LOCK+1) /* This Thread ID means that the mutex is not in use (unlocked) */ -#define ACPI_MUTEX_NOT_ACQUIRED (acpi_thread_id) 0 +#define ACPI_MUTEX_NOT_ACQUIRED ((acpi_thread_id) 0) /* This Thread ID means an invalid thread ID */ -- cgit v0.10.2 From e448c09e5371fa23143923079e558fe86a746bc8 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:16:14 +0800 Subject: ACPICA: Remove incorrect "static" from a global structure ACPICA commit 7a9956a2afd3863fa7d9fe4a64a957389d09f3c2 Reported by Colin Ian King. ACPICA BZ 1239. Link: https://github.com/acpica/acpica/commit/7a9956a2 Link: https://bugs.acpica.org/show_bug.cgi?id=1239 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 52f6bee..5faeab4 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -1125,7 +1125,7 @@ const union acpi_predefined_info acpi_gbl_resource_names[] = { PACKAGE_INFO(0, 0, 0, 0, 0, 0) /* Table terminator */ }; -static const union acpi_predefined_info acpi_gbl_scope_names[] = { +const union acpi_predefined_info acpi_gbl_scope_names[] = { {{"_GPE", 0, 0}}, {{"_PR_", 0, 0}}, {{"_SB_", 0, 0}}, -- cgit v0.10.2 From 27eb7485a77de8ff2bb942a6507e186bef4f191e Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:16:20 +0800 Subject: ACPICA: iASL: Fix some typos with the name strtoul64 ACPICA commit e9622aa824e00997dc92e8638733b7553a4dba26 Was defined as stroul64 in some places. Link: https://github.com/acpica/acpica/commit/e9622aa8 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/utnonansi.c b/drivers/acpi/acpica/utnonansi.c index c427a5c..d1f4a47 100644 --- a/drivers/acpi/acpica/utnonansi.c +++ b/drivers/acpi/acpica/utnonansi.c @@ -171,7 +171,7 @@ acpi_status acpi_ut_strtoul64(char *string, u32 base, u64 *ret_integer) u8 sign_of0x = 0; u8 term = 0; - ACPI_FUNCTION_TRACE_STR(ut_stroul64, string); + ACPI_FUNCTION_TRACE_STR(ut_strtoul64, string); switch (base) { case ACPI_ANY_BASE: -- cgit v0.10.2 From 50db3052dac6e5ff7a8eeaaea138a6aaab6e608e Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:16:27 +0800 Subject: ACPICA: iASL: Update to use internal acpi_ut_strtoul64 function ACPICA commit e959584d23b520c53700e90282312d17b9603ed5 Was using a local Strtoul64, update to use the common acpi_ut_strtoul64 and remove the local Strtoul64. Link: https://github.com/acpica/acpica/commit/e959584d Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/acpi/acpica/utnonansi.c b/drivers/acpi/acpica/utnonansi.c index d1f4a47..d5c3adf 100644 --- a/drivers/acpi/acpica/utnonansi.c +++ b/drivers/acpi/acpica/utnonansi.c @@ -140,6 +140,67 @@ int acpi_ut_stricmp(char *string1, char *string2) return (c1 - c2); } +#if defined (ACPI_DEBUGGER) || defined (ACPI_APPLICATION) +/******************************************************************************* + * + * FUNCTION: acpi_ut_safe_strcpy, acpi_ut_safe_strcat, acpi_ut_safe_strncat + * + * PARAMETERS: Adds a "DestSize" parameter to each of the standard string + * functions. This is the size of the Destination buffer. + * + * RETURN: TRUE if the operation would overflow the destination buffer. + * + * DESCRIPTION: Safe versions of standard Clib string functions. Ensure that + * the result of the operation will not overflow the output string + * buffer. + * + * NOTE: These functions are typically only helpful for processing + * user input and command lines. For most ACPICA code, the + * required buffer length is precisely calculated before buffer + * allocation, so the use of these functions is unnecessary. + * + ******************************************************************************/ + +u8 acpi_ut_safe_strcpy(char *dest, acpi_size dest_size, char *source) +{ + + if (strlen(source) >= dest_size) { + return (TRUE); + } + + strcpy(dest, source); + return (FALSE); +} + +u8 acpi_ut_safe_strcat(char *dest, acpi_size dest_size, char *source) +{ + + if ((strlen(dest) + strlen(source)) >= dest_size) { + return (TRUE); + } + + strcat(dest, source); + return (FALSE); +} + +u8 +acpi_ut_safe_strncat(char *dest, + acpi_size dest_size, + char *source, acpi_size max_transfer_length) +{ + acpi_size actual_transfer_length; + + actual_transfer_length = ACPI_MIN(max_transfer_length, strlen(source)); + + if ((strlen(dest) + actual_transfer_length) >= dest_size) { + return (TRUE); + } + + strncat(dest, source, max_transfer_length); + return (FALSE); +} +#endif + /******************************************************************************* * * FUNCTION: acpi_ut_strtoul64 @@ -155,7 +216,15 @@ int acpi_ut_stricmp(char *string1, char *string2) * 32-bit or 64-bit conversion, depending on the current mode * of the interpreter. * - * NOTE: Does not support Octal strings, not needed. + * NOTES: acpi_gbl_integer_byte_width should be set to the proper width. + * For the core ACPICA code, this width depends on the DSDT + * version. For iASL, the default byte width is always 8. + * + * Does not support Octal strings, not needed at this time. + * + * There is an earlier version of the function after this one, + * below. It is slightly different than this one, and the two + * may eventually may need to be merged. (01/2016). * ******************************************************************************/ @@ -318,63 +387,162 @@ error_exit: } } -#if defined (ACPI_DEBUGGER) || defined (ACPI_APPLICATION) +#ifdef _OBSOLETE_FUNCTIONS +/* TBD: use version in ACPICA main code base? */ +/* DONE: 01/2016 */ + /******************************************************************************* * - * FUNCTION: acpi_ut_safe_strcpy, acpi_ut_safe_strcat, acpi_ut_safe_strncat + * FUNCTION: strtoul64 * - * PARAMETERS: Adds a "DestSize" parameter to each of the standard string - * functions. This is the size of the Destination buffer. 
+ * PARAMETERS: string - Null terminated string + * terminater - Where a pointer to the terminating byte + * is returned + * base - Radix of the string * - * RETURN: TRUE if the operation would overflow the destination buffer. + * RETURN: Converted value * - * DESCRIPTION: Safe versions of standard Clib string functions. Ensure that - * the result of the operation will not overflow the output string - * buffer. - * - * NOTE: These functions are typically only helpful for processing - * user input and command lines. For most ACPICA code, the - * required buffer length is precisely calculated before buffer - * allocation, so the use of these functions is unnecessary. + * DESCRIPTION: Convert a string into an unsigned value. * ******************************************************************************/ -u8 acpi_ut_safe_strcpy(char *dest, acpi_size dest_size, char *source) +acpi_status strtoul64(char *string, u32 base, u64 *ret_integer) { + u32 index; + u32 sign; + u64 return_value = 0; + acpi_status status = AE_OK; - if (strlen(source) >= dest_size) { - return (TRUE); + *ret_integer = 0; + + switch (base) { + case 0: + case 8: + case 10: + case 16: + + break; + + default: + /* + * The specified Base parameter is not in the domain of + * this function: + */ + return (AE_BAD_PARAMETER); } - strcpy(dest, source); - return (FALSE); -} + /* Skip over any white space in the buffer: */ -u8 acpi_ut_safe_strcat(char *dest, acpi_size dest_size, char *source) -{ + while (isspace((int)*string) || *string == '\t') { + ++string; + } - if ((strlen(dest) + strlen(source)) >= dest_size) { - return (TRUE); + /* + * The buffer may contain an optional plus or minus sign. + * If it does, then skip over it but remember what is was: + */ + if (*string == '-') { + sign = ACPI_SIGN_NEGATIVE; + ++string; + } else if (*string == '+') { + ++string; + sign = ACPI_SIGN_POSITIVE; + } else { + sign = ACPI_SIGN_POSITIVE; } - strcat(dest, source); - return (FALSE); -} + /* + * If the input parameter Base is zero, then we need to + * determine if it is octal, decimal, or hexadecimal: + */ + if (base == 0) { + if (*string == '0') { + if (tolower((int)*(++string)) == 'x') { + base = 16; + ++string; + } else { + base = 8; + } + } else { + base = 10; + } + } -u8 -acpi_ut_safe_strncat(char *dest, - acpi_size dest_size, - char *source, acpi_size max_transfer_length) -{ - acpi_size actual_transfer_length; + /* + * For octal and hexadecimal bases, skip over the leading + * 0 or 0x, if they are present. 
+ */ + if (base == 8 && *string == '0') { + string++; + } - actual_transfer_length = ACPI_MIN(max_transfer_length, strlen(source)); + if (base == 16 && *string == '0' && tolower((int)*(++string)) == 'x') { + string++; + } - if ((strlen(dest) + actual_transfer_length) >= dest_size) { - return (TRUE); + /* Main loop: convert the string to an unsigned long */ + + while (*string) { + if (isdigit((int)*string)) { + index = ((u8)*string) - '0'; + } else { + index = (u8)toupper((int)*string); + if (isupper((int)index)) { + index = index - 'A' + 10; + } else { + goto error_exit; + } + } + + if (index >= base) { + goto error_exit; + } + + /* Check to see if value is out of range: */ + + if (return_value > ((ACPI_UINT64_MAX - (u64)index) / (u64)base)) { + goto error_exit; + } else { + return_value *= base; + return_value += index; + } + + ++string; } - strncat(dest, source, max_transfer_length); - return (FALSE); + /* If a minus sign was present, then "the conversion is negated": */ + + if (sign == ACPI_SIGN_NEGATIVE) { + return_value = (ACPI_UINT32_MAX - return_value) + 1; + } + + *ret_integer = return_value; + return (status); + +error_exit: + switch (base) { + case 8: + + status = AE_BAD_OCTAL_CONSTANT; + break; + + case 10: + + status = AE_BAD_DECIMAL_CONSTANT; + break; + + case 16: + + status = AE_BAD_HEX_CONSTANT; + break; + + default: + + /* Base validated above */ + + break; + } + + return (status); } #endif -- cgit v0.10.2 From c340e5f0f8fe12079219ddf3c012bbc989b6f45b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Feb 2016 14:16:34 +0800 Subject: ACPICA: debugger: dbconvert: free pld_info on error return path ACPICA commit 23e644670539e23818fa81e2af5e89ad6657e75c A failed allocation of new_buffer causes a leak of pld_info because the error return path fails to free pld_info. Ensure it is freed on the error exit path. Link: https://github.com/acpica/acpica/commit/23e64467 Signed-off-by: Colin Ian King Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/dbconvert.c b/drivers/acpi/acpica/dbconvert.c index 9fee88f..68f4e0f4 100644 --- a/drivers/acpi/acpica/dbconvert.c +++ b/drivers/acpi/acpica/dbconvert.c @@ -408,7 +408,7 @@ void acpi_db_dump_pld_buffer(union acpi_object *obj_desc) new_buffer = acpi_db_encode_pld_buffer(pld_info); if (!new_buffer) { - return; + goto exit; } /* The two bit-packed buffers should match */ @@ -479,6 +479,7 @@ void acpi_db_dump_pld_buffer(union acpi_object *obj_desc) pld_info->horizontal_offset); } - ACPI_FREE(pld_info); ACPI_FREE(new_buffer); +exit: + ACPI_FREE(pld_info); } -- cgit v0.10.2 From 05fb04b54f6b67f4eb901e99dd5b4604b46ffa0f Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:16:42 +0800 Subject: ACPICA: Remove unnecessary arguments to ACPI_INFO ACPICA commit 181f56605a771e0b91e24b0648d2565ca70bea20 This is used as a purely informational message, without module name and line number information. Therefore, these arguments are not needed and they are unnecessary overhead. Arguments are removed. ACPICA BZ 872. Link: https://github.com/acpica/acpica/commit/181f5660 Link: https://bugs.acpica.org/show_bug.cgi?id=872 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki
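For context, the double parentheses in ACPI_INFO((...)) come from a pre-C99 idiom: the macro takes one parenthesized argument, which is then pasted back as the complete argument list of a printf-style helper, so no variadic macro support is needed. A simplified sketch (illustrative only; the real ACPICA macros also handle compile-time removal of messages):

#include <stdarg.h>
#include <stdio.h>

static void example_info(const char *format, ...);

/*
 * Used as: EXAMPLE_INFO(("%u tables loaded", count));
 * The inner parentheses become example_info's argument list.
 */
#define EXAMPLE_INFO(plist)	example_info plist

static void example_info(const char *format, ...)
{
	va_list args;

	va_start(args, format);
	vprintf(format, args);
	va_end(args);
	putchar('\n');
}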
diff --git a/drivers/acpi/acpica/dbcmds.c b/drivers/acpi/acpica/dbcmds.c index 7ec62c4..772178c 100644 --- a/drivers/acpi/acpica/dbcmds.c +++ b/drivers/acpi/acpica/dbcmds.c @@ -348,7 +348,7 @@ void acpi_db_display_table_info(char *table_arg) } else { /* If the pointer is null, the table has been unloaded */ - ACPI_INFO((AE_INFO, "%4.4s - Table has been unloaded", + ACPI_INFO(("%4.4s - Table has been unloaded", table_desc->signature.ascii)); } } diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index 6a72047..1982310 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -809,8 +809,7 @@ acpi_ds_terminate_control_method(union acpi_operand_object *method_desc, if (method_desc->method. info_flags & ACPI_METHOD_SERIALIZED_PENDING) { if (walk_state) { - ACPI_INFO((AE_INFO, - "Marking method %4.4s as Serialized " + ACPI_INFO(("Marking method %4.4s as Serialized " "because of AE_ALREADY_EXISTS error", walk_state->method_node->name. ascii)); diff --git a/drivers/acpi/acpica/dsobject.c b/drivers/acpi/acpica/dsobject.c index c303e9d..a91de2b 100644 --- a/drivers/acpi/acpica/dsobject.c +++ b/drivers/acpi/acpica/dsobject.c @@ -524,8 +524,7 @@ acpi_ds_build_internal_package_obj(struct acpi_walk_state *walk_state, arg = arg->common.next; } - ACPI_INFO((AE_INFO, - "Actual Package length (%u) is larger than " + ACPI_INFO(("Actual Package length (%u) is larger than " "NumElements field (%u), truncated", i, element_count)); } else if (i < element_count) { diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 9275e62..447fa1c 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -499,8 +499,7 @@ acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, } if (gpe_enabled_count) { - ACPI_INFO((AE_INFO, - "Enabled %u GPEs in block %02X to %02X", + ACPI_INFO(("Enabled %u GPEs in block %02X to %02X", gpe_enabled_count, (u32)gpe_block->block_base_number, (u32)(gpe_block->block_base_number + (gpe_block->gpe_count - 1)))); diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 9fdd8d0..7dc7547 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -281,7 +281,7 @@ void acpi_ev_update_gpes(acpi_owner_id table_owner_id) } if (walk_info.count) { - ACPI_INFO((AE_INFO, "Enabled %u new GPEs", walk_info.count)); + ACPI_INFO(("Enabled %u new GPEs", walk_info.count)); } (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c index 011df21..f741613 100644 --- a/drivers/acpi/acpica/exconfig.c +++ b/drivers/acpi/acpica/exconfig.c @@ -252,7 +252,7 @@ acpi_ex_load_table_op(struct acpi_walk_state *walk_state, status = acpi_get_table_by_index(table_index, &table); if (ACPI_SUCCESS(status)) { - ACPI_INFO((AE_INFO, "Dynamic OEM Table Load:")); + ACPI_INFO(("Dynamic OEM Table Load:")); acpi_tb_print_table_header(0, table); } @@ -472,7 +472,7 @@ acpi_ex_load_op(union acpi_operand_object *obj_desc, /* Install the new table into the local data structures */ - ACPI_INFO((AE_INFO, "Dynamic OEM Table Load:")); + ACPI_INFO(("Dynamic OEM Table Load:")); (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); status = acpi_tb_install_standard_table(ACPI_PTR_TO_PHYSADDR(table), diff --git a/drivers/acpi/acpica/nseval.c b/drivers/acpi/acpica/nseval.c index 65d58be..5d59cfc 100644 --- a/drivers/acpi/acpica/nseval.c +++ b/drivers/acpi/acpica/nseval.c @@ -378,8 +378,7 @@ void
acpi_ns_exec_module_code_list(void) acpi_ut_remove_reference(prev); } - ACPI_INFO((AE_INFO, - "Executed %u blocks of module-level executable AML code", + ACPI_INFO(("Executed %u blocks of module-level executable AML code", method_count)); ACPI_FREE(info); diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index b661a1e..4dc6108 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -267,8 +267,7 @@ acpi_tb_install_standard_table(acpi_physical_address address, if (!reload && acpi_gbl_disable_ssdt_table_install && ACPI_COMPARE_NAME(&new_table_desc.signature, ACPI_SIG_SSDT)) { - ACPI_INFO((AE_INFO, - "Ignoring installation of %4.4s at %8.8X%8.8X", + ACPI_INFO(("Ignoring installation of %4.4s at %8.8X%8.8X", new_table_desc.signature.ascii, ACPI_FORMAT_UINT64(address))); goto release_and_exit; @@ -432,7 +431,7 @@ finish_override: return; } - ACPI_INFO((AE_INFO, "%4.4s 0x%8.8X%8.8X" + ACPI_INFO(("%4.4s 0x%8.8X%8.8X" " %s table override, new table: 0x%8.8X%8.8X", old_table_desc->signature.ascii, ACPI_FORMAT_UINT64(old_table_desc->address), diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c index fd4146d..26d61db 100644 --- a/drivers/acpi/acpica/tbprint.c +++ b/drivers/acpi/acpica/tbprint.c @@ -132,7 +132,7 @@ acpi_tb_print_table_header(acpi_physical_address address, /* FACS only has signature and length fields */ - ACPI_INFO((AE_INFO, "%-4.4s 0x%8.8X%8.8X %06X", + ACPI_INFO(("%-4.4s 0x%8.8X%8.8X %06X", header->signature, ACPI_FORMAT_UINT64(address), header->length)); } else if (ACPI_VALIDATE_RSDP_SIG(header->signature)) { @@ -144,7 +144,7 @@ acpi_tb_print_table_header(acpi_physical_address address, ACPI_OEM_ID_SIZE); acpi_tb_fix_string(local_header.oem_id, ACPI_OEM_ID_SIZE); - ACPI_INFO((AE_INFO, "RSDP 0x%8.8X%8.8X %06X (v%.2d %-6.6s)", + ACPI_INFO(("RSDP 0x%8.8X%8.8X %06X (v%.2d %-6.6s)", ACPI_FORMAT_UINT64(address), (ACPI_CAST_PTR(struct acpi_table_rsdp, header)-> revision > @@ -158,8 +158,7 @@ acpi_tb_print_table_header(acpi_physical_address address, acpi_tb_cleanup_table_header(&local_header, header); - ACPI_INFO((AE_INFO, - "%-4.4s 0x%8.8X%8.8X" + ACPI_INFO(("%-4.4s 0x%8.8X%8.8X" " %06X (v%.2d %-6.6s %-8.8s %08X %-4.4s %08X)", local_header.signature, ACPI_FORMAT_UINT64(address), local_header.length, local_header.revision, diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index 3269bef..9240c76 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -174,9 +174,7 @@ struct acpi_table_header *acpi_tb_copy_dsdt(u32 table_index) ACPI_TABLE_ORIGIN_INTERNAL_VIRTUAL, new_table); - ACPI_INFO((AE_INFO, - "Forced DSDT copy: length 0x%05X copied locally, original unmapped", - new_table->length)); + ACPI_INFO(("Forced DSDT copy: length 0x%05X copied locally, original unmapped", new_table->length)); return (new_table); } diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c index 278666e..c89943b 100644 --- a/drivers/acpi/acpica/tbxfload.c +++ b/drivers/acpi/acpica/tbxfload.c @@ -206,9 +206,7 @@ acpi_status acpi_tb_load_namespace(void) } if (!tables_failed) { - ACPI_INFO((AE_INFO, - "%u ACPI AML tables successfully acquired and loaded\n", - tables_loaded)); + ACPI_INFO(("%u ACPI AML tables successfully acquired and loaded\n", tables_loaded)); } else { ACPI_ERROR((AE_INFO, "%u table load failures, %u successful", @@ -301,7 +299,7 @@ acpi_status acpi_load_table(struct acpi_table_header *table) /* Install the table and load it into the namespace */ - 
ACPI_INFO((AE_INFO, "Host-directed Dynamic ACPI Table Load:")); + ACPI_INFO(("Host-directed Dynamic ACPI Table Load:")); (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); status = acpi_tb_install_standard_table(ACPI_PTR_TO_PHYSADDR(table), diff --git a/drivers/acpi/acpica/uttrack.c b/drivers/acpi/acpica/uttrack.c index c7c2bb8..60c406a 100644 --- a/drivers/acpi/acpica/uttrack.c +++ b/drivers/acpi/acpica/uttrack.c @@ -712,7 +712,7 @@ void acpi_ut_dump_allocations(u32 component, const char *module) /* Print summary */ if (!num_outstanding) { - ACPI_INFO((AE_INFO, "No outstanding allocations")); + ACPI_INFO(("No outstanding allocations")); } else { ACPI_ERROR((AE_INFO, "%u(0x%X) Outstanding allocations", num_outstanding, num_outstanding)); diff --git a/drivers/acpi/acpica/utxferror.c b/drivers/acpi/acpica/utxferror.c index 6fe5959..d9f15cb 100644 --- a/drivers/acpi/acpica/utxferror.c +++ b/drivers/acpi/acpica/utxferror.c @@ -175,8 +175,7 @@ ACPI_EXPORT_SYMBOL(acpi_warning) * TBD: module_name and line_number args are not needed, should be removed. * ******************************************************************************/ -void ACPI_INTERNAL_VAR_XFACE -acpi_info(const char *module_name, u32 line_number, const char *format, ...) +void ACPI_INTERNAL_VAR_XFACE acpi_info(const char *format, ...) { va_list arg_list; diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index c96621e..1755697 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -897,11 +897,9 @@ ACPI_MSG_DEPENDENT_RETURN_VOID(ACPI_PRINTF_LIKE(3) acpi_warning(const char *module_name, u32 line_number, const char *format, ...)) -ACPI_MSG_DEPENDENT_RETURN_VOID(ACPI_PRINTF_LIKE(3) +ACPI_MSG_DEPENDENT_RETURN_VOID(ACPI_PRINTF_LIKE(1) void ACPI_INTERNAL_VAR_XFACE - acpi_info(const char *module_name, - u32 line_number, - const char *format, ...)) + acpi_info(const char *format, ...)) ACPI_MSG_DEPENDENT_RETURN_VOID(ACPI_PRINTF_LIKE(3) void ACPI_INTERNAL_VAR_XFACE acpi_bios_error(const char *module_name, -- cgit v0.10.2 From b3aec9725f387c680de308b5e94ec5bb7a5988a8 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Fri, 19 Feb 2016 14:16:49 +0800 Subject: ACPICA: ACPI 6.0/iASL: Add support for the External AML opcode ACPICA commit 882892feeafe8b8e5be10463133405cd4f1309d9 Support for both the compiler and disassembler. Also, the interpreter will ignore this opcode if it is ever encountered (should not happen). David Box. Link: https://github.com/acpica/acpica/commit/882892fe Signed-off-by: David E. Box Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/exoparg3.c b/drivers/acpi/acpica/exoparg3.c index 28eb861..5aa21c4 100644 --- a/drivers/acpi/acpica/exoparg3.c +++ b/drivers/acpi/acpica/exoparg3.c @@ -123,8 +123,10 @@ acpi_status acpi_ex_opcode_3A_0T_0R(struct acpi_walk_state *walk_state) * op is intended for use by disassemblers in order to properly * disassemble control method invocations. The opcode or group of * opcodes should be surrounded by an "if (0)" clause to ensure that - * AML interpreters never see the opcode. + * AML interpreters never see the opcode. Thus, something is + * wrong if an external opcode ever gets here. 
*/ + ACPI_ERROR((AE_INFO, "Executed External Op")); status = AE_OK; goto cleanup; -- cgit v0.10.2 From cc82f18e8ccdd0968d90bcd5835684566f75598b Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 19 Feb 2016 14:16:56 +0800 Subject: ACPICA: Tables: make default region accessible during the table load ACPICA commit 016b2a0917cca9cf0d40c38a1541017d9cf569dd It has been proven that the default regions should be accessible during table loading in order to execute module-level AML code. This patch moves the default region handler installation code earlier in order to make this happen. Note that by putting the code here, we actually allow OSPMs to override default region handlers between acpi_initialize_subsystem() and acpi_load_tables(), without the need to introduce a region handler override mechanism in acpi_install_address_space_handler(). OSPMs are also encouraged to check the acpi_install_address_space_handler() return value to determine if acpi_remove_address_space_handler() should be invoked before installing a new address space handler. Lv Zheng. Link: https://github.com/acpica/acpica/commit/016b2a09 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c index c89943b..9496b84 100644 --- a/drivers/acpi/acpica/tbxfload.c +++ b/drivers/acpi/acpica/tbxfload.c @@ -47,6 +47,7 @@ #include "accommon.h" #include "acnamesp.h" #include "actables.h" +#include "acevents.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxfload") @@ -68,6 +69,27 @@ acpi_status __init acpi_load_tables(void) ACPI_FUNCTION_TRACE(acpi_load_tables); + /* + * Install the default operation region handlers. These are the + * handlers that are defined by the ACPI specification to be + * "always accessible" -- namely, system_memory, system_IO, and + * PCI_Config. This also means that no _REG methods need to be + * run for these address spaces. We need to have these handlers + * installed before any AML code can be executed, especially any + * module-level code (11/2015). + * Note that we allow OSPMs to install their own region handlers + * between acpi_initialize_subsystem() and acpi_load_tables() to use + * their customized default region handlers. + */ + if (acpi_gbl_group_module_level_code) { + status = acpi_ev_install_region_handlers(); + if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { + ACPI_EXCEPTION((AE_INFO, status, + "During Region initialization")); + return_ACPI_STATUS(status); + } + } + /* Load the namespace from the tables */ status = acpi_tb_load_namespace(); diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index 721b87c..e1ffd7a 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -163,11 +163,13 @@ acpi_status __init acpi_enable_subsystem(u32 flags) * installed before any AML code can be executed, especially any * module-level code (11/2015).
*/ - status = acpi_ev_install_region_handlers(); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "During Region initialization")); - return_ACPI_STATUS(status); + if (!acpi_gbl_group_module_level_code) { + status = acpi_ev_install_region_handlers(); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, + "During Region initialization")); + return_ACPI_STATUS(status); + } } #if (!ACPI_REDUCED_HARDWARE) -- cgit v0.10.2 From 9559130b11f05ddc1db14cfc3fe6ac25a1417a03 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 19 Feb 2016 14:17:05 +0800 Subject: ACPICA: Tune _REG evaluation order in the initialization steps ACPICA commit 77e0c7a482ac30ef857cf3c33d075e5fe5b5e449 This patch tunes _REG evaluations to run later than all table loading facilities: 1. acpi_load_tables(): _REG is currently invoked after this function. 2. acpi_ns_exec_module_code_list(): this executes module-level code, the execution should be a part of the table loading while we currently support this in a deferred way. 3. acpi_ns_initialize_objects(): this parses Region/Field/Buffer/Package where the pkg_length primitive can be seen in the grammar, the parsing should be a part of the table loading while we currently support this in a deferred way. Control method evaluation should happen after loading the tables. So this patch changes the order of _REG evaluation when the acpi_gbl_group_module_level_code experiment is enabled. Lv Zheng. Link: https://github.com/acpica/acpica/commit/77e0c7a4 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index e1ffd7a..9415830 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -262,23 +262,6 @@ acpi_status __init acpi_initialize_objects(u32 flags) ACPI_FUNCTION_TRACE(acpi_initialize_objects); - /* - * Run all _REG methods - * - * Note: Any objects accessed by the _REG methods will be automatically - * initialized, even if they contain executable AML (see the call to - * acpi_ns_initialize_objects below). - */ - acpi_gbl_reg_methods_enabled = TRUE; - if (!(flags & ACPI_NO_ADDRESS_SPACE_INIT)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "[Init] Executing _REG OpRegion methods\n")); - - status = acpi_ev_initialize_op_regions(); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - } #ifdef ACPI_EXEC_APP /* * This call implements the "initialization file" option for acpi_exec. @@ -319,6 +302,24 @@ acpi_status __init acpi_initialize_objects(u32 flags) } /* + * Run all _REG methods + * + * Note: Any objects accessed by the _REG methods will be automatically + * initialized, even if they contain executable AML (see the call to + * acpi_ns_initialize_objects below). + */ + acpi_gbl_reg_methods_enabled = TRUE; + if (!(flags & ACPI_NO_ADDRESS_SPACE_INIT)) { + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "[Init] Executing _REG OpRegion methods\n")); + + status = acpi_ev_initialize_op_regions(); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + } + + /* * Initialize all device objects in the namespace. This runs the device * _STA and _INI methods. */ -- cgit v0.10.2 From ced043663e4becc7b66860ebca6d307182c97dfb Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 19 Feb 2016 14:17:15 +0800 Subject: ACPICA: Namespace: Ensure \_SB._INI executed before any _REG ACPICA commit 8ae25b8d128b6b8509010be321ff6bf2760f3807 There is BIOS code relying on the fact that \_SB._INI should get evaluated before any other control methods.
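As a rough sketch of the ordering this change enforces (illustrative only; the helper names are the ones visible in the nsinit.c hunk below, everything else is made up):

	/* sketch: run the global \_SB._INI before any _REG method */
	status = acpi_ns_evaluate(ini_info);		/* \_SB._INI first */
	if (ACPI_SUCCESS(status))
		info.num_INI++;

	status = acpi_ev_initialize_op_regions();	/* _REG methods after */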
This may imply a gap in the ACPICA/Linux initialization/enumeration process. Until more validation reveals Windows' true behavior, this patch only ensures that \_SB._INI is evaluated before any _REG control methods. This can help to make progress on other initialization order fixes. Lv Zheng. Link: https://github.com/acpica/acpica/commit/8ae25b8d Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h index 9684ed6..022d69c 100644 --- a/drivers/acpi/acpica/acnamesp.h +++ b/drivers/acpi/acpica/acnamesp.h @@ -88,7 +88,7 @@ */ acpi_status acpi_ns_initialize_objects(void); -acpi_status acpi_ns_initialize_devices(void); +acpi_status acpi_ns_initialize_devices(u32 flags); /* * nsload - Namespace loading diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index bd75d46..3281dd8 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -46,6 +46,7 @@ #include "acnamesp.h" #include "acdispat.h" #include "acinterp.h" +#include "acevents.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsinit") @@ -133,82 +134,108 @@ acpi_status acpi_ns_initialize_objects(void) * ******************************************************************************/ -acpi_status acpi_ns_initialize_devices(void) +acpi_status acpi_ns_initialize_devices(u32 flags) { acpi_status status; struct acpi_device_walk_info info; ACPI_FUNCTION_TRACE(ns_initialize_devices); - /* Init counters */ + if (!(flags & ACPI_NO_DEVICE_INIT)) { + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "[Init] Initializing ACPI Devices\n")); - info.device_count = 0; - info.num_STA = 0; - info.num_INI = 0; + /* Init counters */ - ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, - "Initializing Device/Processor/Thermal objects " - "and executing _INI/_STA methods:\n")); + info.device_count = 0; + info.num_STA = 0; + info.num_INI = 0; - /* Tree analysis: find all subtrees that contain _INI methods */ + ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, + "Initializing Device/Processor/Thermal objects " + "and executing _INI/_STA methods:\n")); - status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, FALSE, - acpi_ns_find_ini_methods, NULL, &info, - NULL); - if (ACPI_FAILURE(status)) { - goto error_exit; - } + /* Tree analysis: find all subtrees that contain _INI methods */ - /* Allocate the evaluation information block */ + status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, FALSE, + acpi_ns_find_ini_methods, NULL, + &info, NULL); + if (ACPI_FAILURE(status)) { + goto error_exit; + } + + /* Allocate the evaluation information block */ - info.evaluate_info = - ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); - if (!info.evaluate_info) { - status = AE_NO_MEMORY; - goto error_exit; + info.evaluate_info = + ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); + if (!info.evaluate_info) { + status = AE_NO_MEMORY; + goto error_exit; + } + + /* + * Execute the "global" _INI method that may appear at the root. + * This support is provided for Windows compatibility (Vista+) and + * is not part of the ACPI specification.
+ */ + info.evaluate_info->prefix_node = acpi_gbl_root_node; + info.evaluate_info->relative_pathname = METHOD_NAME__INI; + info.evaluate_info->parameters = NULL; + info.evaluate_info->flags = ACPI_IGNORE_RETURN_VALUE; + + status = acpi_ns_evaluate(info.evaluate_info); + if (ACPI_SUCCESS(status)) { + info.num_INI++; + } } /* - * Execute the "global" _INI method that may appear at the root. This - * support is provided for Windows compatibility (Vista+) and is not - * part of the ACPI specification. + * Run all _REG methods + * + * Note: Any objects accessed by the _REG methods will be automatically + * initialized, even if they contain executable AML (see the call to + * acpi_ns_initialize_objects below). */ - info.evaluate_info->prefix_node = acpi_gbl_root_node; - info.evaluate_info->relative_pathname = METHOD_NAME__INI; - info.evaluate_info->parameters = NULL; - info.evaluate_info->flags = ACPI_IGNORE_RETURN_VALUE; + if (!(flags & ACPI_NO_ADDRESS_SPACE_INIT)) { + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "[Init] Executing _REG OpRegion methods\n")); - status = acpi_ns_evaluate(info.evaluate_info); - if (ACPI_SUCCESS(status)) { - info.num_INI++; + status = acpi_ev_initialize_op_regions(); + if (ACPI_FAILURE(status)) { + goto error_exit; + } } - /* Walk namespace to execute all _INIs on present devices */ + if (!(flags & ACPI_NO_DEVICE_INIT)) { - status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, FALSE, - acpi_ns_init_one_device, NULL, &info, - NULL); + /* Walk namespace to execute all _INIs on present devices */ - /* - * Any _OSI requests should be completed by now. If the BIOS has - * requested any Windows OSI strings, we will always truncate - * I/O addresses to 16 bits -- for Windows compatibility. - */ - if (acpi_gbl_osi_data >= ACPI_OSI_WIN_2000) { - acpi_gbl_truncate_io_addresses = TRUE; - } + status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, FALSE, + acpi_ns_init_one_device, NULL, + &info, NULL); - ACPI_FREE(info.evaluate_info); - if (ACPI_FAILURE(status)) { - goto error_exit; - } + /* + * Any _OSI requests should be completed by now. If the BIOS has + * requested any Windows OSI strings, we will always truncate + * I/O addresses to 16 bits -- for Windows compatibility. + */ + if (acpi_gbl_osi_data >= ACPI_OSI_WIN_2000) { + acpi_gbl_truncate_io_addresses = TRUE; + } - ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, - " Executed %u _INI methods requiring %u _STA executions " - "(examined %u objects)\n", - info.num_INI, info.num_STA, info.device_count)); + ACPI_FREE(info.evaluate_info); + if (ACPI_FAILURE(status)) { + goto error_exit; + } + + ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, + " Executed %u _INI methods requiring %u _STA executions " + "(examined %u objects)\n", + info.num_INI, info.num_STA, + info.device_count)); + } return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index 9415830..66d6f19 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -301,33 +301,14 @@ acpi_status __init acpi_initialize_objects(u32 flags) } } - /* - * Run all _REG methods - * - * Note: Any objects accessed by the _REG methods will be automatically - * initialized, even if they contain executable AML (see the call to - * acpi_ns_initialize_objects below). 
- */ acpi_gbl_reg_methods_enabled = TRUE; - if (!(flags & ACPI_NO_ADDRESS_SPACE_INIT)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "[Init] Executing _REG OpRegion methods\n")); - - status = acpi_ev_initialize_op_regions(); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - } /* - * Initialize all device objects in the namespace. This runs the device - * _STA and _INI methods. + * Initialize all device/region objects in the namespace. This runs + * the device _STA and _INI methods and region _REG methods. */ - if (!(flags & ACPI_NO_DEVICE_INIT)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "[Init] Initializing ACPI Devices\n")); - - status = acpi_ns_initialize_devices(); + if (!(flags & (ACPI_NO_DEVICE_INIT | ACPI_NO_ADDRESS_SPACE_INIT))) { + status = acpi_ns_initialize_devices(flags); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } -- cgit v0.10.2 From ebc3c9bb752b380c1728059fe5017da2f15678ff Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 19 Feb 2016 14:17:22 +0800 Subject: ACPICA: Namespace: Rename acpi_gbl_reg_methods_enabled to acpi_gbl_namespace_initialized ACPICA commit 4be3b82cf45d324366ea8567102d5108c5ef47cb ACPICA commit 19f84c249267fab0bfb138bd14d12510fb4faf24 The global variable actually means the availability of the namespace, and control method evaluations should happen after namespace readiness. Thus this patch renames the global variable to reflect this logic. Lv Zheng. Link: https://github.com/acpica/acpica/commit/4be3b82c Link: https://github.com/acpica/acpica/commit/19f84c24 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 55c8197..51b073b 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -165,7 +165,7 @@ ACPI_GLOBAL(u8, acpi_gbl_next_owner_id_offset); /* Initialization sequencing */ -ACPI_INIT_GLOBAL(u8, acpi_gbl_reg_methods_enabled, FALSE); +ACPI_INIT_GLOBAL(u8, acpi_gbl_namespace_initialized, FALSE); /* Misc */ diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index 47092b4..63924d1 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -600,7 +600,7 @@ acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function) if (region_obj2->extra.method_REG == NULL || region_obj->region.handler == NULL || - !acpi_gbl_reg_methods_enabled) { + !acpi_gbl_namespace_initialized) { return_ACPI_STATUS(AE_OK); } diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index 3281dd8..f029a3d 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -136,7 +136,7 @@ acpi_status acpi_ns_initialize_objects(void) acpi_status acpi_ns_initialize_devices(u32 flags) { - acpi_status status; + acpi_status status = AE_OK; struct acpi_device_walk_info info; ACPI_FUNCTION_TRACE(ns_initialize_devices); diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index 66d6f19..d70649d 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -301,7 +301,7 @@ acpi_status __init acpi_initialize_objects(u32 flags) } } - acpi_gbl_reg_methods_enabled = TRUE; + acpi_gbl_namespace_initialized = TRUE; /* * Initialize all device/region objects in the namespace. This runs -- cgit v0.10.2 From 6019d23a73c79fe4b0f531b0e968b14e1d6f50f1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Feb 2016 00:03:01 +0100 Subject: cpufreq: Rearrange __cpufreq_driver_target() Drop a pointless label at a return statement from __cpufreq_driver_target() and rearrange that function to reduce the indentation level. No intentional functional changes.
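The shape of the rearrangement, as a minimal sketch (hypothetical names, not the driver code itself):

	/* before: branches nest and share one exit label */
	int ret = -EINVAL;

	if (have_fast_path)
		ret = do_fast_path();
	else if (have_table) {
		ret = look_up_table();
		if (ret)
			goto out;
		ret = apply_entry();
	}
out:
	return ret;

	/* after: each case returns directly, keeping the main path flat */
	if (have_fast_path)
		return do_fast_path();
	if (!have_table)
		return -EINVAL;
	ret = look_up_table();
	if (ret)
		return ret;
	return apply_entry();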
Wysocki" Date: Fri, 26 Feb 2016 00:03:01 +0100 Subject: cpufreq: Rearrange __cpufreq_driver_target() Drop a pointless label at a return statement from __cpufreq_driver_target() and rearrange that function to reduce the indentation level. No intentional functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 9be6543..bdf258e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1805,7 +1805,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation) { unsigned int old_target_freq = target_freq; - int retval = -EINVAL; + struct cpufreq_frequency_table *freq_table; + int index, retval; if (cpufreq_disabled()) return -ENODEV; @@ -1832,34 +1833,28 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, policy->restore_freq = policy->cur; if (cpufreq_driver->target) - retval = cpufreq_driver->target(policy, target_freq, relation); - else if (cpufreq_driver->target_index) { - struct cpufreq_frequency_table *freq_table; - int index; + return cpufreq_driver->target(policy, target_freq, relation); - freq_table = cpufreq_frequency_get_table(policy->cpu); - if (unlikely(!freq_table)) { - pr_err("%s: Unable to find freq_table\n", __func__); - goto out; - } - - retval = cpufreq_frequency_table_target(policy, freq_table, - target_freq, relation, &index); - if (unlikely(retval)) { - pr_err("%s: Unable to find matching freq\n", __func__); - goto out; - } + if (!cpufreq_driver->target_index) + return -EINVAL; - if (freq_table[index].frequency == policy->cur) { - retval = 0; - goto out; - } + freq_table = cpufreq_frequency_get_table(policy->cpu); + if (unlikely(!freq_table)) { + pr_err("%s: Unable to find freq_table\n", __func__); + return -EINVAL; + } - retval = __target_index(policy, freq_table, index); + retval = cpufreq_frequency_table_target(policy, freq_table, target_freq, + relation, &index); + if (unlikely(retval)) { + pr_err("%s: Unable to find matching freq\n", __func__); + return retval; } -out: - return retval; + if (freq_table[index].frequency == policy->cur) + return 0; + + return __target_index(policy, freq_table, index); } EXPORT_SYMBOL_GPL(__cpufreq_driver_target); -- cgit v0.10.2 From 9a909a142f02bfa5fd3e203a564abc82fd0240c3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Feb 2016 00:03:58 +0100 Subject: cpufreq: acpi-cpufreq: Drop pointless label from acpi_cpufreq_target() The "out" label at the final return statement in acpi_cpufreq_target() is totally pointless, so drop them and modify the code to return the right values immediately instead of jumping to it. No functional changes. Signed-off-by: Rafael J. 
Signed-off-by: Rafael J. Wysocki Reviewed-by: Srinivas Pandruvada Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 51eef87..17a8d0c 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -434,7 +434,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } else { pr_debug("Already at target state (P%d)\n", next_perf_state); - goto out; + return 0; } } @@ -456,8 +456,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, cmd.val = (u32) perf->states[next_perf_state].control; break; default: - result = -ENODEV; - goto out; + return -ENODEV; } /* cpufreq holds the hotplug lock, so we are safe from here on */ @@ -480,7 +479,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, if (!result) perf->state = next_perf_state; -out: return result; } -- cgit v0.10.2 From 34b0870515aaac6b7ea1ffdc370516b0a8024c82 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Feb 2016 00:22:57 +0100 Subject: cpufreq: Simplify the cpufreq_for_each_valid_entry() That macro uses an internal static inline function that is first totally unnecessary and second hard to read, so simplify it and get rid of that monster. No functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d0bf555..4064cfc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -504,16 +504,6 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, } #endif -static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos) -{ - while ((*pos)->frequency != CPUFREQ_TABLE_END) - if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID) - return true; - else - (*pos)++; - return false; -} - /* * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table * @pos: the cpufreq_frequency_table * to use as a loop cursor. @@ -530,8 +520,11 @@ static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos) * @table: the cpufreq_frequency_table * to iterate over. */ -#define cpufreq_for_each_valid_entry(pos, table) \ - for (pos = table; cpufreq_next_valid(&pos); pos++) +#define cpufreq_for_each_valid_entry(pos, table) \ + for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++) \ + if (pos->frequency == CPUFREQ_ENTRY_INVALID) \ + continue; \ + else int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); -- cgit v0.10.2 From 7791e4aa59ad724e0b4c8b4dea547a5735108972 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 25 Feb 2016 15:09:19 -0800 Subject: cpufreq: intel_pstate: Enable HWP by default If the processor supports HWP, enable it by default without checking for the CPU model. This will allow HWP to be enabled on all supported processors without a driver change.
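A condensed sketch of the feature-based match (mirroring the hwp_support_ids table in the diff below; the surrounding code is illustrative, not the driver source):

	static const struct x86_cpu_id hwp_ids[] __initconst = {
		{ X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
		{}	/* terminator */
	};

	if (x86_match_cpu(hwp_ids) && !no_hwp)
		hwp_active++;	/* any HWP-capable CPU qualifies; no model list lookup */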
Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e856776..ebe8506b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1400,6 +1400,11 @@ static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } static inline bool intel_pstate_has_acpi_ppc(void) { return false; } #endif /* CONFIG_ACPI */ +static const struct x86_cpu_id hwp_support_ids[] __initconst = { + { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP }, + {} +}; + static int __init intel_pstate_init(void) { int cpu, rc = 0; @@ -1409,17 +1414,16 @@ static int __init intel_pstate_init(void) if (no_load) return -ENODEV; + if (x86_match_cpu(hwp_support_ids) && !no_hwp) { + copy_cpu_funcs(&core_params.funcs); + hwp_active++; + goto hwp_cpu_matched; + } + id = x86_match_cpu(intel_pstate_cpu_ids); if (!id) return -ENODEV; - /* - * The Intel pstate driver will be ignored if the platform - * firmware has its own power management modes. - */ - if (intel_pstate_platform_pwr_mgmt_exists()) - return -ENODEV; - cpu_def = (struct cpu_defaults *)id->driver_data; copy_pid_params(&cpu_def->pid_policy); @@ -1428,17 +1432,20 @@ static int __init intel_pstate_init(void) if (intel_pstate_msrs_not_valid()) return -ENODEV; +hwp_cpu_matched: + /* + * The Intel pstate driver will be ignored if the platform + * firmware has its own power management modes. + */ + if (intel_pstate_platform_pwr_mgmt_exists()) + return -ENODEV; + pr_info("Intel P-state driver initializing.\n"); all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); if (!all_cpu_data) return -ENOMEM; - if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { - pr_info("intel_pstate: HWP enabled\n"); - hwp_active++; - } - if (!hwp_active && hwp_only) goto out; @@ -1449,6 +1456,9 @@ static int __init intel_pstate_init(void) intel_pstate_debug_expose_params(); intel_pstate_sysfs_expose_params(); + if (hwp_active) + pr_info("intel_pstate: HWP enabled\n"); + return rc; out: get_online_cpus(); -- cgit v0.10.2 From f05c966585ec5295516f946505661ec2f9966e5a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 25 Feb 2016 15:09:31 -0800 Subject: cpufreq: intel_pstate: disable HWP notifications Disable HWP Interrupt notification before enabling HWP. Since we don't have HWP interrupt handling for possible performance interrupts, there is not much use in enabling HWP interrupts. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index ebe8506b..9376670 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -534,6 +534,9 @@ static void __init intel_pstate_sysfs_expose_params(void) static void intel_pstate_hwp_enable(struct cpudata *cpudata) { + /* First disable HWP notification interrupt as we don't process them */ + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); + wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); } -- cgit v0.10.2 From c5e29ea7ac144113f60dc540d7bb00cea5056b10 Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Fri, 26 Feb 2016 16:06:51 +0530 Subject: cpufreq: powernv: Fix bugs in powernv_cpufreq_{init/exit} Unregister the notifiers if cpufreq_driver_register() fails in powernv_cpufreq_init(). Re-arrange the unregistration and cleanup routines in powernv_cpufreq_exit() to free all the resources after the driver has unregistered.
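The init path now follows the usual register-then-unwind pattern; a condensed, hedged sketch (simplified names, not the literal driver code):

	static int __init drv_init(void)
	{
		int rc = init_chip_info();	/* acquire resources first */

		if (rc)
			goto out;

		register_reboot_notifier(&nb);	/* then hook up notifiers */
		rc = cpufreq_register_driver(&drv);
		if (!rc)
			return 0;

		unregister_reboot_notifier(&nb);	/* unwind in reverse on failure */
		clean_chip_info();
	out:
		return rc;
	}

The matching exit path unregisters the driver before freeing anything, so no callback can run against freed state.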
Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 1bbc10a..50bf120 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -595,6 +595,19 @@ out: return ret; } +static inline void clean_chip_info(void) +{ + kfree(chips); + kfree(core_to_chip_map); +} + +static inline void unregister_all_notifiers(void) +{ + opal_message_notifier_unregister(OPAL_MSG_OCC, + &powernv_cpufreq_opal_nb); + unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); +} + static int __init powernv_cpufreq_init(void) { int rc = 0; @@ -605,30 +618,35 @@ static int __init powernv_cpufreq_init(void) /* Discover pstates from device tree and init */ rc = init_powernv_pstates(); - if (rc) { - pr_info("powernv-cpufreq disabled. System does not support PState control\n"); - return rc; - } + if (rc) + goto out; /* Populate chip info */ rc = init_chip_info(); if (rc) - return rc; + goto out; register_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); - return cpufreq_register_driver(&powernv_cpufreq_driver); + + rc = cpufreq_register_driver(&powernv_cpufreq_driver); + if (!rc) + return 0; + + pr_info("Failed to register the cpufreq driver (%d)\n", rc); + unregister_all_notifiers(); + clean_chip_info(); +out: + pr_info("Platform driver disabled. System does not support PState control\n"); + return rc; } module_init(powernv_cpufreq_init); static void __exit powernv_cpufreq_exit(void) { - unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); - opal_message_notifier_unregister(OPAL_MSG_OCC, - &powernv_cpufreq_opal_nb); - kfree(chips); - kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); + unregister_all_notifiers(); + clean_chip_info(); } module_exit(powernv_cpufreq_exit); -- cgit v0.10.2 From ed757a2c7bf7aa99d219b78349b4a0334851dc38 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 2 Mar 2016 03:05:22 +0100 Subject: cpufreq: acpi-cpufreq: Make read and write operations more efficient Setting a new CPU frequency and reading the current request value in the ACPI cpufreq driver each involve at least two switch instructions (there are more if the policy is shared). One of them is present in drv_read/write(), which prepares a command structure, and the other happens in the subsequent do_drv_read/write() when that structure is interpreted. However, all of those switches may be avoided by using function pointers. To that end, add two function pointers to struct acpi_cpufreq_data to represent read and write operations on the frequency register and set them up during policy initialization to point to the pair of routines suitable for the given processor (Intel/AMD MSR access or I/O port access). Then, use those pointers in do_drv_read/write() and modify drv_read/write() to prepare the command structure for them without any checks. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 17a8d0c..59a7b38 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -70,6 +70,8 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; unsigned int acpi_perf_cpu; cpumask_var_t freqdomain_cpus; + void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val); + u32 (*cpu_freq_read)(struct acpi_pct_register *reg); }; /* acpi_perf_data is a pointer to percpu data.
*/ @@ -243,125 +245,119 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) } } -struct msr_addr { - u32 reg; -}; +u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) +{ + u32 val, dummy; -struct io_addr { - u16 port; - u8 bit_width; -}; + rdmsr(MSR_IA32_PERF_CTL, val, dummy); + return val; +} + +void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) +{ + u32 lo, hi; + + rdmsr(MSR_IA32_PERF_CTL, lo, hi); + lo = (lo & ~INTEL_MSR_RANGE) | (val & INTEL_MSR_RANGE); + wrmsr(MSR_IA32_PERF_CTL, lo, hi); +} + +u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) +{ + u32 val, dummy; + + rdmsr(MSR_AMD_PERF_CTL, val, dummy); + return val; +} + +void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) +{ + wrmsr(MSR_AMD_PERF_CTL, val, 0); +} + +u32 cpu_freq_read_io(struct acpi_pct_register *reg) +{ + u32 val; + + acpi_os_read_port(reg->address, &val, reg->bit_width); + return val; +} + +void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) +{ + acpi_os_write_port(reg->address, val, reg->bit_width); +} struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; + struct acpi_pct_register *reg; u32 val; + union { + void (*write)(struct acpi_pct_register *reg, u32 val); + u32 (*read)(struct acpi_pct_register *reg); + } func; }; /* Called via smp_call_function_single(), on the target CPU */ static void do_drv_read(void *_cmd) { struct drv_cmd *cmd = _cmd; - u32 h; - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - case SYSTEM_AMD_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } + cmd->val = cmd->func.read(cmd->reg); } -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) +static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask) { - struct drv_cmd *cmd = _cmd; - u32 lo, hi; + struct acpi_processor_performance *perf = to_perf_data(data); + struct drv_cmd cmd = { + .reg = &perf->control_register, + .func.read = data->cpu_freq_read, + }; + int err; - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_AMD_MSR_CAPABLE: - wrmsr(cmd->addr.msr.reg, cmd->val, 0); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } + err = smp_call_function_any(mask, do_drv_read, &cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ + return cmd.val; } -static void drv_read(struct drv_cmd *cmd) +/* Called via smp_call_function_many(), on the target CPUs */ +static void do_drv_write(void *_cmd) { - int err; - cmd->val = 0; + struct drv_cmd *cmd = _cmd; - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? 
*/ + cmd->func.write(cmd->reg, cmd->val); } -static void drv_write(struct drv_cmd *cmd) +static void drv_write(struct acpi_cpufreq_data *data, + const struct cpumask *mask, u32 val) { + struct acpi_processor_performance *perf = to_perf_data(data); + struct drv_cmd cmd = { + .reg = &perf->control_register, + .val = val, + .func.write = data->cpu_freq_write, + }; int this_cpu; this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); + if (cpumask_test_cpu(this_cpu, mask)) + do_drv_write(&cmd); + + smp_call_function_many(mask, do_drv_write, &cmd, 1); put_cpu(); } -static u32 -get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data) +static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data) { - struct acpi_processor_performance *perf; - struct drv_cmd cmd; + u32 val; if (unlikely(cpumask_empty(mask))) return 0; - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - break; - case SYSTEM_AMD_MSR_CAPABLE: - cmd.type = SYSTEM_AMD_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_AMD_PERF_CTL; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = to_perf_data(data); - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); + val = drv_read(data, mask); - pr_debug("get_cur_val = %u\n", cmd.val); + pr_debug("get_cur_val = %u\n", val); - return cmd.val; + return val; } static unsigned int get_cur_freq_on_cpu(unsigned int cpu) @@ -416,7 +412,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, { struct acpi_cpufreq_data *data = policy->driver_data; struct acpi_processor_performance *perf; - struct drv_cmd cmd; + const struct cpumask *mask; unsigned int next_perf_state = 0; /* Index into perf table */ int result = 0; @@ -438,37 +434,17 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_AMD_MSR_CAPABLE: - cmd.type = SYSTEM_AMD_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_AMD_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - default: - return -ENODEV; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); + /* + * The core won't allow CPUs to go away until the governor has been + * stopped, so we can rely on the stability of policy->cpus. + */ + mask = policy->shared_type == CPUFREQ_SHARED_TYPE_ANY ? 
+ cpumask_of(policy->cpu) : policy->cpus; - drv_write(&cmd); + drv_write(data, mask, perf->states[next_perf_state].control); if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, data->freq_table[index].frequency, + if (!check_freqs(mask, data->freq_table[index].frequency, data)) { pr_debug("acpi_cpufreq_target failed (%d)\n", policy->cpu); @@ -738,15 +714,21 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) } pr_debug("SYSTEM IO addr space\n"); data->cpu_feature = SYSTEM_IO_CAPABLE; + data->cpu_freq_read = cpu_freq_read_io; + data->cpu_freq_write = cpu_freq_write_io; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: pr_debug("HARDWARE addr space\n"); if (check_est_cpu(cpu)) { data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; + data->cpu_freq_read = cpu_freq_read_intel; + data->cpu_freq_write = cpu_freq_write_intel; break; } if (check_amd_hwpstate_cpu(cpu)) { data->cpu_feature = SYSTEM_AMD_MSR_CAPABLE; + data->cpu_freq_read = cpu_freq_read_amd; + data->cpu_freq_write = cpu_freq_write_amd; break; } result = -ENODEV; -- cgit v0.10.2 From 1e059e20ac297346512d9581a8cd3f3cca6a29f9 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:17:43 +0800 Subject: ACPICA: Utilities: Update trace mechanism for acquire_object ACPICA commit 0824ab90e03c2e4239e890615f447e7962b1daa2 Was not using the correct macro. Updated a comment in acoutput.h Link: https://github.com/acpica/acpica/commit/0824ab90 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/utcache.c b/drivers/acpi/acpica/utcache.c index c9a720f..f8e9978 100644 --- a/drivers/acpi/acpica/utcache.c +++ b/drivers/acpi/acpica/utcache.c @@ -245,7 +245,7 @@ void *acpi_os_acquire_object(struct acpi_memory_list *cache) acpi_status status; void *object; - ACPI_FUNCTION_NAME(os_acquire_object); + ACPI_FUNCTION_TRACE(os_acquire_object); if (!cache) { return_PTR(NULL); diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index 5bfc619..34f601e 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -262,7 +262,7 @@ #define ACPI_GET_FUNCTION_NAME _acpi_function_name /* - * The Name parameter should be the procedure name as a quoted string. + * The Name parameter should be the procedure name as a non-quoted string. * The function name is also used by the function exit macros below. * Note: (const char) is used to be compatible with the debug interfaces * and macros such as __func__. -- cgit v0.10.2 From 16ccbd87c2e5d95f40f3b7a90f40cc2b34b89a22 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 24 Feb 2016 13:31:35 -0800 Subject: cpumask: export cpumask_any_but Export cpumask_any_but() for module use. This will be used by drivers such as intel_rapl to locate an active CPU on a socket. Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki diff --git a/lib/cpumask.c b/lib/cpumask.c index 5a70f61..81dedaa 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -41,6 +41,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) break; return i; } +EXPORT_SYMBOL(cpumask_any_but); /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK -- cgit v0.10.2 From f14a1396d8f19b6c53593045eba86d10360a0cee Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 24 Feb 2016 13:31:36 -0800 Subject: powercap/rapl: reduce ipi calls Reduce remote CPU calls for MSR access by combining read-modify-write into one function. Suggested-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 6c592dc..a50e644 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -133,6 +133,12 @@ struct rapl_domain_data { unsigned long timestamp; }; +struct msrl_action { + u32 msr_no; + u64 clear_mask; + u64 set_mask; + int err; +}; #define DOMAIN_STATE_INACTIVE BIT(0) #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) @@ -800,35 +806,62 @@ static int rapl_read_data_raw(struct rapl_domain *rd, return 0; } + +static int msrl_update_safe(u32 msr_no, u64 clear_mask, u64 set_mask) +{ + int err; + u64 val; + + err = rdmsrl_safe(msr_no, &val); + if (err) + goto out; + + val &= ~clear_mask; + val |= set_mask; + + err = wrmsrl_safe(msr_no, val); + +out: + return err; +} + +static void msrl_update_func(void *info) +{ + struct msrl_action *ma = info; + + ma->err = msrl_update_safe(ma->msr_no, ma->clear_mask, ma->set_mask); +} + /* Similar use of primitive info in the read counterpart */ static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value) { - u64 msr_val; - u32 msr; struct rapl_primitive_info *rp = &rpi[prim]; int cpu; + u64 bits; + struct msrl_action ma; + int ret; cpu = find_active_cpu_on_package(rd->package_id); if (cpu < 0) return cpu; - msr = rd->msrs[rp->id]; - if (rdmsrl_safe_on_cpu(cpu, msr, &msr_val)) { - dev_dbg(&rd->power_zone.dev, - "failed to read msr 0x%x on cpu %d\n", msr, cpu); - return -EIO; - } - value = rapl_unit_xlate(rd, rd->package_id, rp->unit, value, 1); - msr_val &= ~rp->mask; - msr_val |= value << rp->shift; - if (wrmsrl_safe_on_cpu(cpu, msr, msr_val)) { - dev_dbg(&rd->power_zone.dev, - "failed to write msr 0x%x on cpu %d\n", msr, cpu); - return -EIO; - } - return 0; + bits = rapl_unit_xlate(rd, rd->package_id, rp->unit, value, 1); + bits |= bits << rp->shift; + memset(&ma, 0, sizeof(ma)); + + ma.msr_no = rd->msrs[rp->id]; + ma.clear_mask = rp->mask; + ma.set_mask = bits; + + ret = smp_call_function_single(cpu, msrl_update_func, &ma, 1); + if (ret) + WARN_ON_ONCE(ret); + else + ret = ma.err; + + return ret; } /* @@ -893,6 +926,21 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) return 0; } +static void power_limit_irq_save_cpu(void *info) +{ + u32 l, h = 0; + struct rapl_package *rp = (struct rapl_package *)info; + + /* save the state of PLN irq mask bit before disabling it */ + rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { + rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; + rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; + } + l &= ~PACKAGE_THERM_INT_PLN_ENABLE; + wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); +} + /* REVISIT: * When package power limit is set artificially low by RAPL, LVT @@ -906,7 +954,6 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) static void package_power_limit_irq_save(int package_id) { - u32 l, h = 0; int cpu; struct rapl_package *rp; @@ -920,20 +967,30 @@ static void package_power_limit_irq_save(int package_id) cpu = find_active_cpu_on_package(package_id); if (cpu < 0) return; - /* save the state of PLN irq mask bit before disabling it */ - rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); - if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { - rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; - rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; - } - l &= ~PACKAGE_THERM_INT_PLN_ENABLE; - wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if 
(!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + smp_call_function_single(cpu, power_limit_irq_save_cpu, rp, 1); +} + +static void power_limit_irq_restore_cpu(void *info) +{ + u32 l, h = 0; + struct rapl_package *rp = (struct rapl_package *)info; + + rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); + + if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) + l |= PACKAGE_THERM_INT_PLN_ENABLE; + else + l &= ~PACKAGE_THERM_INT_PLN_ENABLE; + + wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } /* restore per package power limit interrupt enable state */ static void package_power_limit_irq_restore(int package_id) { - u32 l, h; int cpu; struct rapl_package *rp; @@ -951,14 +1008,8 @@ static void package_power_limit_irq_restore(int package_id) /* irq enable state not saved, nothing to restore */ if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) return; - rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); - - if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) - l |= PACKAGE_THERM_INT_PLN_ENABLE; - else - l &= ~PACKAGE_THERM_INT_PLN_ENABLE; - wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + smp_call_function_single(cpu, power_limit_irq_restore_cpu, rp, 1); } static void set_floor_freq_default(struct rapl_domain *rd, bool mode) -- cgit v0.10.2 From 309557f558a6f276e364b08d916c0f644b5bd2e1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 24 Feb 2016 13:31:37 -0800 Subject: powercap/rapl: add package reference per domain This patch adds to each rapl domain a reference to the package it belongs to. At runtime, we can then avoid searching the package data for each access. It simplifies the domain-level operations that depend on package-level information. Signed-off-by: Jacob Pan Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index a50e644..3413692 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -155,6 +155,7 @@ struct rapl_power_limit { static const char pl1_name[] = "long_term"; static const char pl2_name[] = "short_term"; +struct rapl_package; struct rapl_domain { const char *name; enum rapl_domain_type id; @@ -165,7 +166,7 @@ struct rapl_domain { u64 attr_map; /* track capabilities */ unsigned int state; unsigned int domain_energy_unit; - int package_id; + struct rapl_package *rp; }; #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) @@ -237,10 +238,10 @@ static int rapl_read_data_raw(struct rapl_domain *rd, static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); -static u64 rapl_unit_xlate(struct rapl_domain *rd, int package, +static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, u64 value, int to_raw); -static void package_power_limit_irq_save(int package_id); +static void package_power_limit_irq_save(struct rapl_package *rp); static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ @@ -318,25 +319,19 @@ static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) { struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); - *energy = rapl_unit_xlate(rd, 0, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); + *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); return 0; } static int release_zone(struct powercap_zone *power_zone) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - struct rapl_package *rp; + struct rapl_package *rp = rd->rp; /* package zone is the last zone of a package, we can free * memory here since all children has been unregistered. 
*/ if (rd->id == RAPL_DOMAIN_PACKAGE) { - rp = find_package_by_id(rd->package_id); - if (!rp) { - dev_warn(&power_zone->dev, "no package id %s\n", - rd->name); - return -ENODEV; - } kfree(rd); rp->domains = NULL; } @@ -438,11 +433,7 @@ static int set_power_limit(struct powercap_zone *power_zone, int id, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); - rp = find_package_by_id(rd->package_id); - if (!rp) { - ret = -ENODEV; - goto set_exit; - } + rp = rd->rp; if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n", @@ -462,7 +453,7 @@ static int set_power_limit(struct powercap_zone *power_zone, int id, ret = -EINVAL; } if (!ret) - package_power_limit_irq_save(rd->package_id); + package_power_limit_irq_save(rp); set_exit: put_online_cpus(); return ret; @@ -661,24 +652,19 @@ static void rapl_init_domains(struct rapl_package *rp) break; } if (mask) { - rd->package_id = rp->id; + rd->rp = rp; rd++; } } } -static u64 rapl_unit_xlate(struct rapl_domain *rd, int package, - enum unit_type type, u64 value, - int to_raw) +static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, + u64 value, int to_raw) { u64 units = 1; - struct rapl_package *rp; + struct rapl_package *rp = rd->rp; u64 scale = 1; - rp = find_package_by_id(package); - if (!rp) - return value; - switch (type) { case POWER_UNIT: units = rp->power_unit; @@ -776,7 +762,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, if (!msr) return -EINVAL; /* use physical package id to look up active cpus */ - cpu = find_active_cpu_on_package(rd->package_id); + cpu = find_active_cpu_on_package(rd->rp->id); if (cpu < 0) return cpu; @@ -799,7 +785,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, final = value & rp->mask; final = final >> rp->shift; if (xlate) - *data = rapl_unit_xlate(rd, rd->package_id, rp->unit, final, 0); + *data = rapl_unit_xlate(rd, rp->unit, final, 0); else *data = final; @@ -843,11 +829,11 @@ static int rapl_write_data_raw(struct rapl_domain *rd, struct msrl_action ma; int ret; - cpu = find_active_cpu_on_package(rd->package_id); + cpu = find_active_cpu_on_package(rd->rp->id); if (cpu < 0) return cpu; - bits = rapl_unit_xlate(rd, rd->package_id, rp->unit, value, 1); + bits = rapl_unit_xlate(rd, rp->unit, value, 1); bits |= bits << rp->shift; memset(&ma, 0, sizeof(ma)); @@ -952,19 +938,14 @@ static void power_limit_irq_save_cpu(void *info) * to do by adding an atomic notifier. 
*/ -static void package_power_limit_irq_save(int package_id) +static void package_power_limit_irq_save(struct rapl_package *rp) { int cpu; - struct rapl_package *rp; - - rp = find_package_by_id(package_id); - if (!rp) - return; if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) return; - cpu = find_active_cpu_on_package(package_id); + cpu = find_active_cpu_on_package(rp->id); if (cpu < 0) return; if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) @@ -989,19 +970,14 @@ static void power_limit_irq_restore_cpu(void *info) } /* restore per package power limit interrupt enable state */ -static void package_power_limit_irq_restore(int package_id) +static void package_power_limit_irq_restore(struct rapl_package *rp) { int cpu; - struct rapl_package *rp; - - rp = find_package_by_id(package_id); - if (!rp) - return; if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) return; - cpu = find_active_cpu_on_package(package_id); + cpu = find_active_cpu_on_package(rp->id); if (cpu < 0) return; @@ -1192,7 +1168,7 @@ static int rapl_unregister_powercap(void) * hotplug lock held */ list_for_each_entry(rp, &rapl_packages, plist) { - package_power_limit_irq_restore(rp->id); + package_power_limit_irq_restore(rp); for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { -- cgit v0.10.2 From 323ee64aa175a67fbbe744e809777d17e6fb42d7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 24 Feb 2016 13:31:38 -0800 Subject: powercap/rapl: track lead cpu per package The RAPL driver operates on MSRs that have package/socket scope rather than core scope. However, the current code does not keep track of which CPUs are available on each package for MSR access. Therefore, it has to search for an active CPU on a given package each time. This patch optimizes the package-level operations by tracking a per-package lead CPU during initialization and CPU hotplug. This avoids the runtime search for an active CPU. Suggested-by: Thomas Gleixner Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 3413692..cdfd01f0 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -191,6 +191,7 @@ struct rapl_package { * notify interrupt enable status.
*/ struct list_head plist; + int lead_cpu; /* one active cpu per package for access */ }; struct rapl_defaults { @@ -267,20 +268,6 @@ static struct rapl_package *find_package_by_id(int id) return NULL; } -/* caller to ensure CPU hotplug lock is held */ -static int find_active_cpu_on_package(int package_id) -{ - int i; - - for_each_online_cpu(i) { - if (topology_physical_package_id(i) == package_id) - return i; - } - /* all CPUs on this package are offline */ - - return -ENODEV; -} - /* caller must hold cpu hotplug lock */ static void rapl_cleanup_data(void) { @@ -761,10 +748,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd, msr = rd->msrs[rp->id]; if (!msr) return -EINVAL; - /* use physical package id to look up active cpus */ - cpu = find_active_cpu_on_package(rd->rp->id); - if (cpu < 0) - return cpu; + + cpu = rd->rp->lead_cpu; /* special-case package domain, which uses a different bit*/ if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) { @@ -829,10 +814,7 @@ static int rapl_write_data_raw(struct rapl_domain *rd, struct msrl_action ma; int ret; - cpu = find_active_cpu_on_package(rd->rp->id); - if (cpu < 0) - return cpu; - + cpu = rd->rp->lead_cpu; bits = rapl_unit_xlate(rd, rp->unit, value, 1); bits |= bits << rp->shift; memset(&ma, 0, sizeof(ma)); @@ -940,18 +922,10 @@ static void power_limit_irq_save_cpu(void *info) static void package_power_limit_irq_save(struct rapl_package *rp) { - int cpu; - - if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) - return; - - cpu = find_active_cpu_on_package(rp->id); - if (cpu < 0) - return; if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) return; - smp_call_function_single(cpu, power_limit_irq_save_cpu, rp, 1); + smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); } static void power_limit_irq_restore_cpu(void *info) @@ -972,20 +946,14 @@ static void power_limit_irq_restore_cpu(void *info) /* restore per package power limit interrupt enable state */ static void package_power_limit_irq_restore(struct rapl_package *rp) { - int cpu; - if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) return; - cpu = find_active_cpu_on_package(rp->id); - if (cpu < 0) - return; - /* irq enable state not saved, nothing to restore */ if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) return; - smp_call_function_single(cpu, power_limit_irq_restore_cpu, rp, 1); + smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1); } static void set_floor_freq_default(struct rapl_domain *rd, bool mode) @@ -1419,7 +1387,8 @@ static int rapl_detect_topology(void) /* add the new package to the list */ new_package->id = phy_package_id; new_package->nr_cpus = 1; - + /* use the first active cpu of the package to access */ + new_package->lead_cpu = i; /* check if the package contains valid domains */ if (rapl_detect_domains(new_package, i) || rapl_defaults->check_unit(new_package, i)) { @@ -1475,6 +1444,8 @@ static int rapl_add_package(int cpu) /* add the new package to the list */ rp->id = phy_package_id; rp->nr_cpus = 1; + rp->lead_cpu = cpu; + /* check if the package contains valid domains */ if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) { @@ -1507,6 +1478,7 @@ static int rapl_cpu_callback(struct notifier_block *nfb, unsigned long cpu = (unsigned long)hcpu; int phy_package_id; struct rapl_package *rp; + int lead_cpu; phy_package_id = topology_physical_package_id(cpu); switch (action) { @@ -1527,6 +1499,15 @@ static int rapl_cpu_callback(struct 
notifier_block *nfb, break; if (--rp->nr_cpus == 0) rapl_remove_package(rp); + else if (cpu == rp->lead_cpu) { + /* choose another active cpu in the package */ + lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (lead_cpu < nr_cpu_ids) + rp->lead_cpu = lead_cpu; + else /* should never go here */ + pr_err("no active cpu available for package %d\n", + phy_package_id); + } } return NOTIFY_OK; -- cgit v0.10.2 From 6954d43292674aaacc9b1cdbc28b1b2ea0cc6445 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 23 Feb 2016 17:49:17 +0100 Subject: PM / Domains: Restore alignment of slaves in debugfs output The slave domains are no longer aligned with the table header in the /sys/kernel/debug/pm_genpd/pm_genpd_summary output. Worse, the alignment differs depending on the actual name of the state. Format the state name and index into a buffer, and print that like before to restore alignment. Use "%u" for unsigned int while we're at it. Fixes: fc5cbf0c94b6f7fd (PM / Domains: Support for multiple states) Signed-off-by: Geert Uytterhoeven Acked-by: Ulf Hansson Tested-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e8ca290..01015d8 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1882,6 +1882,7 @@ static int pm_genpd_summary_one(struct seq_file *s, struct pm_domain_data *pm_data; const char *kobj_path; struct gpd_link *link; + char state[16]; int ret; ret = mutex_lock_interruptible(&genpd->lock); @@ -1890,12 +1891,13 @@ static int pm_genpd_summary_one(struct seq_file *s, if (WARN_ON(genpd->status >= ARRAY_SIZE(status_lookup))) goto exit; - seq_printf(s, "%-30s %s", genpd->name, status_lookup[genpd->status]); - if (genpd->status == GPD_STATE_POWER_OFF) - seq_printf(s, " %-13d ", genpd->state_idx); + snprintf(state, sizeof(state), "%s %u", + status_lookup[genpd->status], genpd->state_idx); else - seq_printf(s, " %-15s ", ""); + snprintf(state, sizeof(state), "%s", + status_lookup[genpd->status]); + seq_printf(s, "%-30s %-15s ", genpd->name, state); /* * Modifications on the list require holding locks on both -- cgit v0.10.2 From 0ba554e45c674181a3e5c42c7a6cccbbc75a0dd7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 23 Feb 2016 17:49:18 +0100 Subject: PM / Domains: Join state name and index in debugfs output For low-power states, the state index is part of the state, hence join them with a hyphen in the /sys/kernel/debug/pm_genpd/pm_genpd_summary output. E.g. "off 0" becomes "off-0". Signed-off-by: Geert Uytterhoeven Acked-by: Ulf Hansson Tested-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 01015d8..9e59543 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1892,7 +1892,7 @@ static int pm_genpd_summary_one(struct seq_file *s, if (WARN_ON(genpd->status >= ARRAY_SIZE(status_lookup))) goto exit; if (genpd->status == GPD_STATE_POWER_OFF) - snprintf(state, sizeof(state), "%s %u", + snprintf(state, sizeof(state), "%s-%u", status_lookup[genpd->status], genpd->state_idx); else snprintf(state, sizeof(state), "%s", -- cgit v0.10.2 From 076395cae20381340a0e92ad3d76fe3e280f8a1c Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 2 Mar 2016 01:20:38 +0200 Subject: PM / Domains: Propagate start and restore errors during runtime resume During runtime resume the return values of the start and restore steps are ignored. 
As a result drivers are not notified of runtime resume failures and can't propagate them up. Fix it by returning an error if either the start or restore step fails, and clean up properly in the error path. Signed-off-by: Laurent Pinchart Acked-by: Kevin Hilman Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 9e59543..7e44ae3 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -487,8 +487,13 @@ static int pm_genpd_runtime_resume(struct device *dev) if (timed && runtime_pm) time_start = ktime_get(); - genpd_start_dev(genpd, dev); - genpd_restore_dev(genpd, dev); + ret = genpd_start_dev(genpd, dev); + if (ret) + goto err_poweroff; + + ret = genpd_restore_dev(genpd, dev); + if (ret) + goto err_stop; /* Update resume latency value if the measured time exceeds it. */ if (timed && runtime_pm) { @@ -503,6 +508,17 @@ static int pm_genpd_runtime_resume(struct device *dev) } return 0; + +err_stop: + genpd_stop_dev(genpd, dev); +err_poweroff: + if (!dev->power.irq_safe) { + mutex_lock(&genpd->lock); + genpd_poweroff(genpd, 0); + mutex_unlock(&genpd->lock); + } + + return ret; } static bool pd_ignore_unused; -- cgit v0.10.2 From beda5fc1ff9b527059290a97b672d2ee0eb7b92f Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 4 Mar 2016 10:55:14 +0000 Subject: PM / Domains: Fix removal of a subdomain Commit 30e7a65b3fdb (PM / Domains: Ensure subdomain is not in use before removing) added a test to ensure that a subdomain is not a master to another subdomain or if any devices are using the subdomain before removing. This change incorrectly used the "slave_links" list to determine if the subdomain is a master to another subdomain, where it should have been using the "master_links" list instead. The "slave_links" list will never be empty for a subdomain and so a subdomain can never be removed. Fix this by testing if the "master_links" list is empty instead. Fixes: 30e7a65b3fdb (PM / Domains: Ensure subdomain is not in use before removing) Signed-off-by: Jon Hunter Reviewed-by: Thierry Reding Acked-by: Ulf Hansson Acked-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 7e44ae3..79f5d39 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1398,7 +1398,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, mutex_lock(&subdomain->lock); mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); - if (!list_empty(&subdomain->slave_links) || subdomain->device_count) { + if (!list_empty(&subdomain->master_links) || subdomain->device_count) { pr_warn("%s: unable to remove subdomain %s\n", genpd->name, subdomain->name); ret = -EBUSY; -- cgit v0.10.2 From 41795a8a3cff8e4ba54236ca16c3814ba9cd7f39 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 4 Mar 2016 10:55:15 +0000 Subject: PM / Domains: Fix potential NULL pointer dereference In the function of_genpd_get_from_provider(), we never check to see if the argument 'genpdspec' is NULL before dereferencing it. Add error checking to handle any NULL pointers. Signed-off-by: Jon Hunter Acked-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 79f5d39..56705b5 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1702,6 +1702,9 @@ struct generic_pm_domain *of_genpd_get_from_provider( struct generic_pm_domain *genpd = ERR_PTR(-ENOENT); struct of_genpd_provider *provider; + if (!genpdspec) + return ERR_PTR(-EINVAL); + mutex_lock(&of_genpd_mutex); /* Check if we have such a provider in our array */ -- cgit v0.10.2 From e237a5518425155faa508a087f28269f58074b92 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Mon, 15 Feb 2016 12:52:01 +0800 Subject: x86/ACPI/PCI: Recognize that Interrupt Line 255 means "not connected" Per the x86-specific footnote to PCI spec r3.0, sec 6.2.4, the value 255 in the Interrupt Line register means "unknown" or "no connection." Previously, when we couldn't derive an IRQ from the _PRT, we fell back to using the value from Interrupt Line as an IRQ. It's questionable whether we should do that at all, but the spec clearly suggests we shouldn't do it for the value 255 on x86. Calling request_irq() with IRQ 255 may succeed, but the driver won't receive any interrupts. Or, if IRQ 255 is shared with another device, it may succeed, and the driver's ISR will be called at random times when the *other* device interrupts. Or it may fail if another device is using IRQ 255 with incompatible flags. What we *want* is for request_irq() to fail predictably so the driver can fall back to polling. On x86, assume 255 in the Interrupt Line means the INTx line is not connected. In that case, set dev->irq to IRQ_NOTCONNECTED so request_irq() will fail gracefully with -ENOTCONN. We found this problem on a system where Secure Boot firmware assigned Interrupt Line 255 to an i801_smbus device and another device was already using MSI-X IRQ 255. This was in v3.10, where i801_probe() fails if request_irq() fails: i801_smbus 0000:00:1f.3: enabling device (0140 -> 0143) i801_smbus 0000:00:1f.3: can't derive routing for PCI INT C i801_smbus 0000:00:1f.3: PCI INT C: no GSI genirq: Flags mismatch irq 255. 00000080 (i801_smbus) vs. 00000000 (megasa) CPU: 0 PID: 2487 Comm: kworker/0:1 Not tainted 3.10.0-229.el7.x86_64 #1 Hardware name: FUJITSU PRIMEQUEST 2800E2/D3736, BIOS PRIMEQUEST 2000 Serie5 Call Trace: dump_stack+0x19/0x1b __setup_irq+0x54a/0x570 request_threaded_irq+0xcc/0x170 i801_probe+0x32f/0x508 [i2c_i801] local_pci_probe+0x45/0xa0 i801_smbus 0000:00:1f.3: Failed to allocate irq 255: -16 i801_smbus: probe of 0000:00:1f.3 failed with error -16 After aeb8a3d16ae0 ("i2c: i801: Check if interrupts are disabled"), i801_probe() will fall back to polling if request_irq() fails. But we still need this patch because request_irq() may succeed or fail depending on other devices in the system. If request_irq() fails, i801_smbus will work by falling back to polling, but if it succeeds, i801_smbus won't work because it expects interrupts that it may not receive. Signed-off-by: Chen Fan Acked-by: Thomas Gleixner Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index c8e169e..2c45dd3 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -33,6 +33,7 @@ #include #include #include +#include #define PREFIX "ACPI: " @@ -387,6 +388,23 @@ static inline int acpi_isa_register_gsi(struct pci_dev *dev) } #endif +static inline bool acpi_pci_irq_valid(struct pci_dev *dev, u8 pin) +{ +#ifdef CONFIG_X86 + /* + * On x86 irq line 0xff means "unknown" or "no connection" + * (PCI 3.0, Section 6.2.4, footnote on page 223). + */ + if (dev->irq == 0xff) { + dev->irq = IRQ_NOTCONNECTED; + dev_warn(&dev->dev, "PCI INT %c: not connected\n", + pin_name(pin)); + return false; + } +#endif + return true; +} + int acpi_pci_irq_enable(struct pci_dev *dev) { struct acpi_prt_entry *entry; @@ -431,11 +449,14 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } else gsi = -1; - /* - * No IRQ known to the ACPI subsystem - maybe the BIOS / - * driver reported one, then use it. Exit in any case. - */ if (gsi < 0) { + /* + * No IRQ known to the ACPI subsystem - maybe the BIOS / + * driver reported one, then use it. Exit in any case. + */ + if (!acpi_pci_irq_valid(dev, pin)) + return 0; + if (acpi_isa_register_gsi(dev)) dev_warn(&dev->dev, "PCI INT %c: no GSI\n", pin_name(pin)); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0e95fcc..358076e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -125,6 +125,16 @@ struct irqaction { extern irqreturn_t no_action(int cpl, void *dev_id); +/* + * If a (PCI) device interrupt is not connected we set dev->irq to + * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we + * can distingiush that case from other error returns. + * + * 0x80000000 is guaranteed to be outside the available range of interrupts + * and easy to distinguish from other possible incorrect values. + */ +#define IRQ_NOTCONNECTED (1U << 31) + extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8411872..e79e60f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1609,6 +1609,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1702,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit v0.10.2 From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 20:20:42 +0100 Subject: cpufreq: Add mechanism for registering utilization update callbacks Introduce a mechanism by which parts of the cpufreq subsystem ("setpolicy" drivers or the core) can register callbacks to be executed from cpufreq_update_util() which is invoked by the scheduler's update_load_avg() on CPU utilization changes. This allows the "setpolicy" drivers to dispense with their timers and do all of the computations they need and frequency/voltage adjustments in the update_load_avg() code path, among other things. 
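A hedged illustration (not part of this patch): a consumer of the new interface fills in a struct update_util_data and publishes it with cpufreq_set_update_util_data(), which is how the intel_pstate and governor patches later in this series use it. The my_governor_* names and the my_data variable below are invented for the sketch. The callback is invoked under rcu_read_lock() from the scheduler, so it must not sleep, and unregistration is a NULL store followed by synchronize_rcu():

#include <linux/cpufreq.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>

static DEFINE_PER_CPU(struct update_util_data, my_data);

/* Runs from the update_load_avg() path in scheduler context; must not sleep. */
static void my_governor_update(struct update_util_data *data, u64 time,
			       unsigned long util, unsigned long max)
{
	/* React to the utilization change, e.g. by queuing irq_work. */
}

static void my_governor_start(int cpu)
{
	struct update_util_data *data = &per_cpu(my_data, cpu);

	data->func = my_governor_update;
	cpufreq_set_update_util_data(cpu, data);
}

static void my_governor_stop(int cpu)
{
	cpufreq_set_update_util_data(cpu, NULL);
	/* Wait for in-flight callbacks before freeing or reusing the data. */
	synchronize_rcu();
}
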
The update_load_avg() changes were suggested by Peter Zijlstra. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 34b17447..e172b2a 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -102,6 +102,51 @@ static LIST_HEAD(cpufreq_governor_list); static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); + +static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU callbacks to free any memory that might be accessed + * via the old update_util_data pointer or invoke synchronize_rcu() right after + * this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + */ +void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + rcu_read_lock(); + + data = rcu_dereference(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data && data->func) + data->func(data, time, util, max); + + rcu_read_unlock(); +} + DEFINE_MUTEX(cpufreq_governor_lock); /* Flag to suspend/resume CPUFreq governors */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d0bf555..704d85b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -151,6 +151,36 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ +void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
+ */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} + +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); + unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -162,6 +192,10 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else +static inline void cpufreq_update_util(u64 time, unsigned long util, + unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} + static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c97..21a0aa6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b..e2987a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86ab..27f5b03 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). 
*/ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f1637..f042190 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpupri.h" #include "cpudeadline.h" -- cgit v0.10.2 From a4675fbc4a7abe072ac6ba38c252f22a91ebcd94 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Feb 2016 01:45:30 +0100 Subject: cpufreq: intel_pstate: Replace timers with utilization update callbacks Instead of using a per-CPU deferrable timer for utilization sampling and P-states adjustments, register a utilization update callback that will be invoked from the scheduler on utilization changes. The sampling rate is still the same as what was used for the deferrable timers, so the functional impact of this patch should not be significant. Based on an earlier patch from Srinivas Pandruvada. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d47..f4d85c2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -71,7 +71,7 @@ struct sample { u64 mperf; u64 tsc; int freq; - ktime_t time; + u64 time; }; struct pstate_data { @@ -103,13 +103,13 @@ struct _pid { struct cpudata { int cpu; - struct timer_list timer; + struct update_util_data update_util; struct pstate_data pstate; struct vid_data vid; struct _pid pid; - ktime_t last_sample_time; + u64 last_sample_time; u64 prev_aperf; u64 prev_mperf; u64 prev_tsc; @@ -120,6 +120,7 @@ struct cpudata { static struct cpudata **all_cpu_data; struct pstate_adjust_policy { int sample_rate_ms; + s64 sample_rate_ns; int deadband; int setpoint; int p_gain_pct; @@ -712,7 +713,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate) if (limits->no_turbo && !limits->turbo_disabled) val |= (u64)1 << 32; - wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); + wrmsrl(MSR_IA32_PERF_CTL, val); } static int knl_get_turbo_pstate(void) @@ -883,7 +884,7 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) sample->core_pct_busy = (int32_t)core_pct; } -static inline void intel_pstate_sample(struct cpudata *cpu) +static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) { u64 aperf, mperf; unsigned long flags; @@ -900,7 +901,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu) local_irq_restore(flags); cpu->last_sample_time = cpu->sample.time; - cpu->sample.time = ktime_get(); + cpu->sample.time = time; cpu->sample.aperf = aperf; cpu->sample.mperf = mperf; cpu->sample.tsc = tsc; @@ -915,22 +916,6 @@ static inline void intel_pstate_sample(struct cpudata *cpu) cpu->prev_tsc = tsc; } -static inline void intel_hwp_set_sample_time(struct cpudata *cpu) -{ - int delay; - - delay = msecs_to_jiffies(50); - mod_timer_pinned(&cpu->timer, jiffies + delay); -} - -static inline void intel_pstate_set_sample_time(struct cpudata *cpu) -{ - int delay; - - delay = msecs_to_jiffies(pid_params.sample_rate_ms); - mod_timer_pinned(&cpu->timer, jiffies + delay); -} - static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) { struct sample *sample = &cpu->sample; @@ -970,8 +955,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) { int32_t core_busy, max_pstate, 
current_pstate, sample_ratio; - s64 duration_us; - u32 sample_time; + u64 duration_ns; /* * core_busy is the ratio of actual performance to max @@ -990,18 +974,16 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); /* - * Since we have a deferred timer, it will not fire unless - * we are in C0. So, determine if the actual elapsed time - * is significantly greater (3x) than our sample interval. If it - * is, then we were idle for a long enough period of time - * to adjust our busyness. + * Since our utilization update callback will not run unless we are + * in C0, check if the actual elapsed time is significantly greater (3x) + * than our sample interval. If it is, then we were idle for a long + * enough period of time to adjust our busyness. */ - sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC; - duration_us = ktime_us_delta(cpu->sample.time, - cpu->last_sample_time); - if (duration_us > sample_time * 3) { - sample_ratio = div_fp(int_tofp(sample_time), - int_tofp(duration_us)); + duration_ns = cpu->sample.time - cpu->last_sample_time; + if ((s64)duration_ns > pid_params.sample_rate_ns * 3 + && cpu->last_sample_time > 0) { + sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns), + int_tofp(duration_ns)); core_busy = mul_fp(core_busy, sample_ratio); } @@ -1031,23 +1013,17 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) sample->freq); } -static void intel_hwp_timer_func(unsigned long __data) -{ - struct cpudata *cpu = (struct cpudata *) __data; - - intel_pstate_sample(cpu); - intel_hwp_set_sample_time(cpu); -} - -static void intel_pstate_timer_func(unsigned long __data) +static void intel_pstate_update_util(struct update_util_data *data, u64 time, + unsigned long util, unsigned long max) { - struct cpudata *cpu = (struct cpudata *) __data; - - intel_pstate_sample(cpu); + struct cpudata *cpu = container_of(data, struct cpudata, update_util); + u64 delta_ns = time - cpu->sample.time; - intel_pstate_adjust_busy_pstate(cpu); - - intel_pstate_set_sample_time(cpu); + if ((s64)delta_ns >= pid_params.sample_rate_ns) { + intel_pstate_sample(cpu, time); + if (!hwp_active) + intel_pstate_adjust_busy_pstate(cpu); + } } #define ICPU(model, policy) \ @@ -1095,24 +1071,19 @@ static int intel_pstate_init_cpu(unsigned int cpunum) cpu->cpu = cpunum; - if (hwp_active) + if (hwp_active) { intel_pstate_hwp_enable(cpu); + pid_params.sample_rate_ms = 50; + pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC; + } intel_pstate_get_cpu_pstates(cpu); - init_timer_deferrable(&cpu->timer); - cpu->timer.data = (unsigned long)cpu; - cpu->timer.expires = jiffies + HZ/100; - - if (!hwp_active) - cpu->timer.function = intel_pstate_timer_func; - else - cpu->timer.function = intel_hwp_timer_func; - intel_pstate_busy_pid_reset(cpu); - intel_pstate_sample(cpu); + intel_pstate_sample(cpu, 0); - add_timer_on(&cpu->timer, cpunum); + cpu->update_util.func = intel_pstate_update_util; + cpufreq_set_update_util_data(cpunum, &cpu->update_util); pr_debug("intel_pstate: controlling: cpu %d\n", cpunum); @@ -1196,7 +1167,9 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); - del_timer_sync(&all_cpu_data[cpu_num]->timer); + cpufreq_set_update_util_data(cpu_num, NULL); + synchronize_rcu(); + if (hwp_active) return; @@ -1260,6 +1233,7 @@ static int intel_pstate_msrs_not_valid(void) static void copy_pid_params(struct pstate_adjust_policy *policy) { 
pid_params.sample_rate_ms = policy->sample_rate_ms; + pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC; pid_params.p_gain_pct = policy->p_gain_pct; pid_params.i_gain_pct = policy->i_gain_pct; pid_params.d_gain_pct = policy->d_gain_pct; @@ -1451,7 +1425,8 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - del_timer_sync(&all_cpu_data[cpu]->timer); + cpufreq_set_update_util_data(cpu, NULL); + synchronize_rcu(); kfree(all_cpu_data[cpu]); } } -- cgit v0.10.2 From 9be4fd2c7723a3057b0b39676fe4c8d5fd7118a4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Feb 2016 16:53:50 +0100 Subject: cpufreq: governor: Replace timers with utilization update callbacks Instead of using a per-CPU deferrable timer for queuing up governor work items, register a utilization update callback that will be invoked from the scheduler on utilization changes. The sampling rate is still the same as what was used for the deferrable timers and the added irq_work overhead should be offset by the eliminated timers overhead, so in theory the functional impact of this patch should not be significant. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Tested-by: Gautham R. Shenoy diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a..dcb972a 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -3,6 +3,7 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" select SRCU + select IRQ_WORK help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 8504a70..bc002c8 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -112,14 +112,12 @@ static void cs_check_cpu(int cpu, unsigned int load) } } -static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all) +static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - if (modify_all) - dbs_check_cpu(dbs_data, policy->cpu); - + dbs_check_cpu(dbs_data, policy->cpu); return delay_for_sampling_rate(cs_tuners->sampling_rate); } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index e0d1110..6bc2f50 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -128,10 +128,10 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) * dropped down. So we perform the copy only once, upon the * first wake-up from idle.) * - * Detecting this situation is easy: the governor's deferrable - * timer would not have fired during CPU-idle periods. Hence - * an unusually large 'wall_time' (as compared to the sampling - * rate) indicates this scenario. + * Detecting this situation is easy: the governor's utilization + * update handler would not have run during CPU-idle periods. + * Hence, an unusually large 'wall_time' (as compared to the + * sampling rate) indicates this scenario. 
* * prev_load can be zero in two cases and we must recalculate it * for both cases: @@ -161,72 +161,48 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) } EXPORT_SYMBOL_GPL(dbs_check_cpu); -void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay) +void gov_set_update_util(struct cpu_common_dbs_info *shared, + unsigned int delay_us) { + struct cpufreq_policy *policy = shared->policy; struct dbs_data *dbs_data = policy->governor_data; - struct cpu_dbs_info *cdbs; int cpu; + gov_update_sample_delay(shared, delay_us); + shared->last_sample_time = 0; + for_each_cpu(cpu, policy->cpus) { - cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); - cdbs->timer.expires = jiffies + delay; - add_timer_on(&cdbs->timer, cpu); + struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); + + cpufreq_set_update_util_data(cpu, &cdbs->update_util); } } -EXPORT_SYMBOL_GPL(gov_add_timers); +EXPORT_SYMBOL_GPL(gov_set_update_util); -static inline void gov_cancel_timers(struct cpufreq_policy *policy) +static inline void gov_clear_update_util(struct cpufreq_policy *policy) { - struct dbs_data *dbs_data = policy->governor_data; - struct cpu_dbs_info *cdbs; int i; - for_each_cpu(i, policy->cpus) { - cdbs = dbs_data->cdata->get_cpu_cdbs(i); - del_timer_sync(&cdbs->timer); - } + for_each_cpu(i, policy->cpus) + cpufreq_set_update_util_data(i, NULL); + + synchronize_rcu(); } -void gov_cancel_work(struct cpu_common_dbs_info *shared) +static void gov_cancel_work(struct cpu_common_dbs_info *shared) { - /* Tell dbs_timer_handler() to skip queuing up work items. */ + /* Tell dbs_update_util_handler() to skip queuing up work items. */ atomic_inc(&shared->skip_work); /* - * If dbs_timer_handler() is already running, it may not notice the - * incremented skip_work, so wait for it to complete to prevent its work - * item from being queued up after the cancel_work_sync() below. - */ - gov_cancel_timers(shared->policy); - /* - * In case dbs_timer_handler() managed to run and spawn a work item - * before the timers have been canceled, wait for that work item to - * complete and then cancel all of the timers set up by it. If - * dbs_timer_handler() runs again at that point, it will see the - * positive value of skip_work and won't spawn any more work items. + * If dbs_update_util_handler() is already running, it may not notice + * the incremented skip_work, so wait for it to complete to prevent its + * work item from being queued up after the cancel_work_sync() below. 
*/ + gov_clear_update_util(shared->policy); + irq_work_sync(&shared->irq_work); cancel_work_sync(&shared->work); - gov_cancel_timers(shared->policy); atomic_set(&shared->skip_work, 0); } -EXPORT_SYMBOL_GPL(gov_cancel_work); - -/* Will return if we need to evaluate cpu load again or not */ -static bool need_load_eval(struct cpu_common_dbs_info *shared, - unsigned int sampling_rate) -{ - if (policy_is_shared(shared->policy)) { - ktime_t time_now = ktime_get(); - s64 delta_us = ktime_us_delta(time_now, shared->time_stamp); - - /* Do nothing if we recently have sampled */ - if (delta_us < (s64)(sampling_rate / 2)) - return false; - else - shared->time_stamp = time_now; - } - - return true; -} static void dbs_work_handler(struct work_struct *work) { @@ -234,56 +210,70 @@ static void dbs_work_handler(struct work_struct *work) cpu_common_dbs_info, work); struct cpufreq_policy *policy; struct dbs_data *dbs_data; - unsigned int sampling_rate, delay; - bool eval_load; + unsigned int delay; policy = shared->policy; dbs_data = policy->governor_data; - /* Kill all timers */ - gov_cancel_timers(policy); - - if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - - sampling_rate = cs_tuners->sampling_rate; - } else { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; - - sampling_rate = od_tuners->sampling_rate; - } - - eval_load = need_load_eval(shared, sampling_rate); - /* - * Make sure cpufreq_governor_limits() isn't evaluating load in - * parallel. + * Make sure cpufreq_governor_limits() isn't evaluating load or the + * ondemand governor isn't updating the sampling rate in parallel. */ mutex_lock(&shared->timer_mutex); - delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load); + delay = dbs_data->cdata->gov_dbs_timer(policy); + shared->sample_delay_ns = jiffies_to_nsecs(delay); mutex_unlock(&shared->timer_mutex); + /* + * If the atomic operation below is reordered with respect to the + * sample delay modification, the utilization update handler may end + * up using a stale sample delay value. + */ + smp_mb__before_atomic(); atomic_dec(&shared->skip_work); +} + +static void dbs_irq_work(struct irq_work *irq_work) +{ + struct cpu_common_dbs_info *shared; - gov_add_timers(policy, delay); + shared = container_of(irq_work, struct cpu_common_dbs_info, irq_work); + schedule_work(&shared->work); } -static void dbs_timer_handler(unsigned long data) +static inline void gov_queue_irq_work(struct cpu_common_dbs_info *shared) { - struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data; +#ifdef CONFIG_SMP + irq_work_queue_on(&shared->irq_work, smp_processor_id()); +#else + irq_work_queue(&shared->irq_work); +#endif +} + +static void dbs_update_util_handler(struct update_util_data *data, u64 time, + unsigned long util, unsigned long max) +{ + struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util); struct cpu_common_dbs_info *shared = cdbs->shared; /* - * Timer handler may not be allowed to queue the work at the moment, - * because: - * - Another timer handler has done that - * - We are stopping the governor - * - Or we are updating the sampling rate of the ondemand governor + * The work may not be allowed to be queued up right now. + * Possible reasons: + * - Work has already been queued up or is in progress. + * - The governor is being stopped. + * - It is too early (too little time from the previous sample). 
*/ - if (atomic_inc_return(&shared->skip_work) > 1) - atomic_dec(&shared->skip_work); - else - queue_work(system_wq, &shared->work); + if (atomic_inc_return(&shared->skip_work) == 1) { + u64 delta_ns; + + delta_ns = time - shared->last_sample_time; + if ((s64)delta_ns >= shared->sample_delay_ns) { + shared->last_sample_time = time; + gov_queue_irq_work(shared); + return; + } + } + atomic_dec(&shared->skip_work); } static void set_sampling_rate(struct dbs_data *dbs_data, @@ -315,6 +305,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, mutex_init(&shared->timer_mutex); atomic_set(&shared->skip_work, 0); + init_irq_work(&shared->irq_work, dbs_irq_work); INIT_WORK(&shared->work, dbs_work_handler); return 0; } @@ -467,9 +458,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, io_busy = od_tuners->io_is_busy; } - shared->policy = policy; - shared->time_stamp = ktime_get(); - for_each_cpu(j, policy->cpus) { struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j); unsigned int prev_load; @@ -485,10 +473,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - __setup_timer(&j_cdbs->timer, dbs_timer_handler, - (unsigned long)j_cdbs, - TIMER_DEFERRABLE | TIMER_IRQSAFE); + j_cdbs->update_util.func = dbs_update_util_handler; } + shared->policy = policy; if (cdata->governor == GOV_CONSERVATIVE) { struct cs_cpu_dbs_info_s *cs_dbs_info = @@ -505,7 +492,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, od_ops->powersave_bias_init_cpu(cpu); } - gov_add_timers(policy, delay_for_sampling_rate(sampling_rate)); + gov_set_update_util(shared, sampling_rate); return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 91e767a..5417771 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -18,6 +18,7 @@ #define _CPUFREQ_GOVERNOR_H #include +#include #include #include #include @@ -138,11 +139,19 @@ struct cpu_common_dbs_info { */ struct mutex timer_mutex; - ktime_t time_stamp; + u64 last_sample_time; + s64 sample_delay_ns; atomic_t skip_work; + struct irq_work irq_work; struct work_struct work; }; +static inline void gov_update_sample_delay(struct cpu_common_dbs_info *shared, + unsigned int delay_us) +{ + shared->sample_delay_ns = delay_us * NSEC_PER_USEC; +} + /* Per cpu structures */ struct cpu_dbs_info { u64 prev_cpu_idle; @@ -155,7 +164,7 @@ struct cpu_dbs_info { * wake-up from idle. 
*/ unsigned int prev_load; - struct timer_list timer; + struct update_util_data update_util; struct cpu_common_dbs_info *shared; }; @@ -212,8 +221,7 @@ struct common_dbs_data { struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); void *(*get_cpu_dbs_info_s)(int cpu); - unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy, - bool modify_all); + unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); void (*gov_check_cpu)(int cpu, unsigned int load); int (*init)(struct dbs_data *dbs_data, bool notify); void (*exit)(struct dbs_data *dbs_data, bool notify); @@ -270,9 +278,6 @@ static ssize_t show_sampling_rate_min_gov_pol \ } extern struct mutex cpufreq_governor_lock; - -void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay); -void gov_cancel_work(struct cpu_common_dbs_info *shared); void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); int cpufreq_governor_dbs(struct cpufreq_policy *policy, struct common_dbs_data *cdata, unsigned int event); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 929e193..da7f351 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -189,7 +189,7 @@ static void od_check_cpu(int cpu, unsigned int load) } } -static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all) +static unsigned int od_dbs_timer(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; unsigned int cpu = policy->cpu; @@ -198,9 +198,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all) struct od_dbs_tuners *od_tuners = dbs_data->tuners; int delay = 0, sample_type = dbs_info->sample_type; - if (!modify_all) - goto max_delay; - /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = OD_NORMAL_SAMPLE; if (sample_type == OD_SUB_SAMPLE) { @@ -216,7 +213,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all) } } -max_delay: if (!delay) delay = delay_for_sampling_rate(od_tuners->sampling_rate * dbs_info->rate_mult); @@ -262,7 +258,6 @@ static void update_sampling_rate(struct dbs_data *dbs_data, struct od_cpu_dbs_info_s *dbs_info; struct cpu_dbs_info *cdbs; struct cpu_common_dbs_info *shared; - unsigned long next_sampling, appointed_at; dbs_info = &per_cpu(od_cpu_dbs_info, cpu); cdbs = &dbs_info->cdbs; @@ -286,20 +281,28 @@ static void update_sampling_rate(struct dbs_data *dbs_data, * policy will be governed by dbs_data, otherwise there can be * multiple policies that are governed by the same dbs_data. */ - if (dbs_data != policy->governor_data) - continue; - - /* - * Checking this for any CPU should be fine, timers for all of - * them are scheduled together. - */ - next_sampling = jiffies + usecs_to_jiffies(new_rate); - appointed_at = dbs_info->cdbs.timer.expires; - - if (time_before(next_sampling, appointed_at)) { - gov_cancel_work(shared); - gov_add_timers(policy, usecs_to_jiffies(new_rate)); - + if (dbs_data == policy->governor_data) { + mutex_lock(&shared->timer_mutex); + /* + * On 32-bit architectures this may race with the + * sample_delay_ns read in dbs_update_util_handler(), + * but that really doesn't matter. If the read returns + * a value that's too big, the sample will be skipped, + * but the next invocation of dbs_update_util_handler() + * (when the update has been completed) will take a + * sample. If the returned value is too small, the + * sample will be taken immediately, but that isn't a + * problem, as we want the new rate to take effect + * immediately anyway. 
+ * + * If this runs in parallel with dbs_work_handler(), we + * may end up overwriting the sample_delay_ns value that + * it has just written, but the difference should not be + * too big and it will be corrected next time a sample + * is taken, so it shouldn't be significant. + */ + gov_update_sample_delay(shared, new_rate); + mutex_unlock(&shared->timer_mutex); } } -- cgit v0.10.2 From 2bb8d94fb03f808022c620f54b602a1e26d5cbac Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:01:31 +0100 Subject: cpufreq: governor: Use common mutex for dbs_data protection Every governor relying on the common code in cpufreq_governor.c has to provide its own mutex in struct common_dbs_data. However, there actually is no need to have a separate mutex per governor for this purpose, they may be using the same global mutex just fine. Accordingly, introduce a single common mutex for that and drop the mutex field from struct common_dbs_data. That at least will ensure that the mutex is always present and initialized regardless of what the particular governors do. Another benefit is that the common code does not need a pointer to a governor-related structure to get to the mutex which sometimes helps. Finally, it makes the code generally easier to follow. Signed-off-by: Rafael J. Wysocki Acked-by: Saravana Kannan Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index bc002c8..8f0c3db 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -368,7 +368,6 @@ static struct common_dbs_data cs_dbs_cdata = { .gov_check_cpu = cs_check_cpu, .init = cs_init, .exit = cs_exit, - .mutex = __MUTEX_INITIALIZER(cs_dbs_cdata.mutex), }; static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 6bc2f50..f291fdd 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -22,6 +22,9 @@ #include "cpufreq_governor.h" +DEFINE_MUTEX(dbs_data_mutex); +EXPORT_SYMBOL_GPL(dbs_data_mutex); + static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) { if (have_governor_per_policy()) @@ -543,7 +546,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, int ret; /* Lock governor to block concurrent initialization of governor */ - mutex_lock(&cdata->mutex); + mutex_lock(&dbs_data_mutex); if (have_governor_per_policy()) dbs_data = policy->governor_data; @@ -576,7 +579,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, } unlock: - mutex_unlock(&cdata->mutex); + mutex_unlock(&dbs_data_mutex); return ret; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 5417771..a9df62e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -228,11 +228,6 @@ struct common_dbs_data { /* Governor specific ops, see below */ void *gov_ops; - - /* - * Protects governor's data (struct dbs_data and struct common_dbs_data) - */ - struct mutex mutex; }; /* Governor Per policy data */ @@ -277,6 +272,7 @@ static ssize_t show_sampling_rate_min_gov_pol \ return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ } +extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); int cpufreq_governor_dbs(struct cpufreq_policy *policy, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 
da7f351..fac2f8f 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -249,7 +249,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data, /* * Lock governor so that governor start/stop can't execute in parallel. */ - mutex_lock(&od_dbs_cdata.mutex); + mutex_lock(&dbs_data_mutex); cpumask_copy(&cpumask, cpu_online_mask); @@ -306,7 +306,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data, } } - mutex_unlock(&od_dbs_cdata.mutex); + mutex_unlock(&dbs_data_mutex); } static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, @@ -552,7 +552,6 @@ static struct common_dbs_data od_dbs_cdata = { .gov_ops = &od_ops, .init = od_init, .exit = od_exit, - .mutex = __MUTEX_INITIALIZER(od_dbs_cdata.mutex), }; static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, -- cgit v0.10.2 From 5da3dd1e00918a9ac4b83885453bfa9cad732b44 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Feb 2016 03:15:24 +0100 Subject: cpufreq: governor: Avoid passing dbs_data pointers around unnecessarily Do not pass struct dbs_data pointers to the family of functions implementing governor operations in cpufreq_governor.c as they can take that pointer from policy->governor by themselves. The cpufreq_governor_init() case is slightly more complicated, since policy->governor may be NULL when it is invoked, but then it can reach the pointer in question via its cdata argument just fine. While at it, rework cpufreq_governor_dbs() to avoid a pointless policy_governor check in the CPUFREQ_GOV_POLICY_INIT case. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index f291fdd..a329e1b 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -329,9 +329,9 @@ static void free_common_dbs_info(struct cpufreq_policy *policy, } static int cpufreq_governor_init(struct cpufreq_policy *policy, - struct dbs_data *dbs_data, struct common_dbs_data *cdata) { + struct dbs_data *dbs_data = cdata->gdbs_data; unsigned int latency; int ret; @@ -403,9 +403,9 @@ free_dbs_data: return ret; } -static int cpufreq_governor_exit(struct cpufreq_policy *policy, - struct dbs_data *dbs_data) +static int cpufreq_governor_exit(struct cpufreq_policy *policy) { + struct dbs_data *dbs_data = policy->governor_data; struct common_dbs_data *cdata = dbs_data->cdata; struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu); @@ -432,9 +432,9 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy, return 0; } -static int cpufreq_governor_start(struct cpufreq_policy *policy, - struct dbs_data *dbs_data) +static int cpufreq_governor_start(struct cpufreq_policy *policy) { + struct dbs_data *dbs_data = policy->governor_data; struct common_dbs_data *cdata = dbs_data->cdata; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); @@ -499,9 +499,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, return 0; } -static int cpufreq_governor_stop(struct cpufreq_policy *policy, - struct dbs_data *dbs_data) +static int cpufreq_governor_stop(struct cpufreq_policy *policy) { + struct dbs_data *dbs_data = policy->governor_data; struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(policy->cpu); struct cpu_common_dbs_info *shared = cdbs->shared; @@ -515,9 +515,9 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy, return 0; } -static int cpufreq_governor_limits(struct 
cpufreq_policy *policy, - struct dbs_data *dbs_data) +static int cpufreq_governor_limits(struct cpufreq_policy *policy) { + struct dbs_data *dbs_data = policy->governor_data; struct common_dbs_data *cdata = dbs_data->cdata; unsigned int cpu = policy->cpu; struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); @@ -542,45 +542,31 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy, int cpufreq_governor_dbs(struct cpufreq_policy *policy, struct common_dbs_data *cdata, unsigned int event) { - struct dbs_data *dbs_data; - int ret; + int ret = -EINVAL; /* Lock governor to block concurrent initialization of governor */ mutex_lock(&dbs_data_mutex); - if (have_governor_per_policy()) - dbs_data = policy->governor_data; - else - dbs_data = cdata->gdbs_data; - - if (!dbs_data && (event != CPUFREQ_GOV_POLICY_INIT)) { - ret = -EINVAL; - goto unlock; - } - - switch (event) { - case CPUFREQ_GOV_POLICY_INIT: - ret = cpufreq_governor_init(policy, dbs_data, cdata); - break; - case CPUFREQ_GOV_POLICY_EXIT: - ret = cpufreq_governor_exit(policy, dbs_data); - break; - case CPUFREQ_GOV_START: - ret = cpufreq_governor_start(policy, dbs_data); - break; - case CPUFREQ_GOV_STOP: - ret = cpufreq_governor_stop(policy, dbs_data); - break; - case CPUFREQ_GOV_LIMITS: - ret = cpufreq_governor_limits(policy, dbs_data); - break; - default: - ret = -EINVAL; + if (event == CPUFREQ_GOV_POLICY_INIT) { + ret = cpufreq_governor_init(policy, cdata); + } else if (policy->governor_data) { + switch (event) { + case CPUFREQ_GOV_POLICY_EXIT: + ret = cpufreq_governor_exit(policy); + break; + case CPUFREQ_GOV_START: + ret = cpufreq_governor_start(policy); + break; + case CPUFREQ_GOV_STOP: + ret = cpufreq_governor_stop(policy); + break; + case CPUFREQ_GOV_LIMITS: + ret = cpufreq_governor_limits(policy); + break; + } } -unlock: mutex_unlock(&dbs_data_mutex); - return ret; } EXPORT_SYMBOL_GPL(cpufreq_governor_dbs); -- cgit v0.10.2 From af926185231a6e30d11a6035410b61405e203c3b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Feb 2016 03:16:08 +0100 Subject: cpufreq: governor: Put governor structure into common_dbs_data For the ondemand and conservative governors (generally, governors that use the common code in cpufreq_governor.c), there are two static data structures representing the governor, the struct governor structure (the interface to the cpufreq core) and the struct common_dbs_data one (the interface to the cpufreq_governor.c code). There's no fundamental reason why those two structures have to be separate. Moreover, if the struct governor one is included into struct common_dbs_data, it will be possible to reach the latter from the policy via its policy->governor pointer, so it won't be necessary to pass a separate pointer to it around. For this reason, embed struct governor in struct common_dbs_data. Signed-off-by: Rafael J. 
Wysocki Acked-by: Saravana Kannan Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 8f0c3db..4597f74 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -23,16 +23,6 @@ static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); -static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event); - -static struct cpufreq_governor cpufreq_gov_conservative = { - .name = "conservative", - .governor = cs_cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, -}; - static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, struct cpufreq_policy *policy) { @@ -122,30 +112,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) } static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - struct cs_cpu_dbs_info_s *dbs_info = - &per_cpu(cs_cpu_dbs_info, freq->cpu); - struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu); - - if (!policy) - return 0; - - /* policy isn't governed by conservative governor */ - if (policy->governor != &cpufreq_gov_conservative) - return 0; - - /* - * we only care if our internally tracked freq moves outside the 'valid' - * ranges of frequency available to us otherwise we do not change it - */ - if (dbs_info->requested_freq > policy->max - || dbs_info->requested_freq < policy->min) - dbs_info->requested_freq = freq->new; - - return 0; -} + void *data); static struct notifier_block cs_cpufreq_notifier_block = { .notifier_call = dbs_cpufreq_notifier, @@ -358,7 +325,16 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify) define_get_cpu_dbs_routines(cs_cpu_dbs_info); +static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event); + static struct common_dbs_data cs_dbs_cdata = { + .gov = { + .name = "conservative", + .governor = cs_cpufreq_governor_dbs, + .max_transition_latency = TRANSITION_LATENCY_LIMIT, + .owner = THIS_MODULE, + }, .governor = GOV_CONSERVATIVE, .attr_group_gov_sys = &cs_attr_group_gov_sys, .attr_group_gov_pol = &cs_attr_group_gov_pol, @@ -370,20 +346,48 @@ static struct common_dbs_data cs_dbs_cdata = { .exit = cs_exit, }; +#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_cdata.gov) + static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event); } +static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct cs_cpu_dbs_info_s *dbs_info = + &per_cpu(cs_cpu_dbs_info, freq->cpu); + struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu); + + if (!policy) + return 0; + + /* policy isn't governed by conservative governor */ + if (policy->governor != CPU_FREQ_GOV_CONSERVATIVE) + return 0; + + /* + * we only care if our internally tracked freq moves outside the 'valid' + * ranges of frequency available to us otherwise we do not change it + */ + if (dbs_info->requested_freq > policy->max + || dbs_info->requested_freq < policy->min) + dbs_info->requested_freq = freq->new; + + return 0; +} + static int __init cpufreq_gov_dbs_init(void) { - return cpufreq_register_governor(&cpufreq_gov_conservative); + return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE); } static void __exit cpufreq_gov_dbs_exit(void) { - cpufreq_unregister_governor(&cpufreq_gov_conservative); 
+ cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE); } MODULE_AUTHOR("Alexander Clouter "); @@ -395,7 +399,7 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE struct cpufreq_governor *cpufreq_default_governor(void) { - return &cpufreq_gov_conservative; + return CPU_FREQ_GOV_CONSERVATIVE; } fs_initcall(cpufreq_gov_dbs_init); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index a9df62e..2fa3cf1 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -206,7 +206,8 @@ struct cs_dbs_tuners { /* Common Governor data across policies */ struct dbs_data; struct common_dbs_data { - /* Common across governors */ + struct cpufreq_governor gov; + #define GOV_ONDEMAND 0 #define GOV_CONSERVATIVE 1 int governor; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index fac2f8f..836116c 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -31,8 +31,6 @@ static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info); static struct od_ops od_ops; -static struct cpufreq_governor cpufreq_gov_ondemand; - static unsigned int default_powersave_bias; static void ondemand_powersave_bias_init_cpu(int cpu) @@ -541,7 +539,16 @@ static struct od_ops od_ops = { .freq_increase = dbs_freq_increase, }; +static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event); + static struct common_dbs_data od_dbs_cdata = { + .gov = { + .name = "ondemand", + .governor = od_cpufreq_governor_dbs, + .max_transition_latency = TRANSITION_LATENCY_LIMIT, + .owner = THIS_MODULE, + }, .governor = GOV_ONDEMAND, .attr_group_gov_sys = &od_attr_group_gov_sys, .attr_group_gov_pol = &od_attr_group_gov_pol, @@ -554,19 +561,14 @@ static struct common_dbs_data od_dbs_cdata = { .exit = od_exit, }; +#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_cdata.gov) + static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { return cpufreq_governor_dbs(policy, &od_dbs_cdata, event); } -static struct cpufreq_governor cpufreq_gov_ondemand = { - .name = "ondemand", - .governor = od_cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, -}; - static void od_set_powersave_bias(unsigned int powersave_bias) { struct cpufreq_policy *policy; @@ -592,7 +594,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) policy = shared->policy; cpumask_or(&done, &done, policy->cpus); - if (policy->governor != &cpufreq_gov_ondemand) + if (policy->governor != CPU_FREQ_GOV_ONDEMAND) continue; dbs_data = policy->governor_data; @@ -620,12 +622,12 @@ EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler); static int __init cpufreq_gov_dbs_init(void) { - return cpufreq_register_governor(&cpufreq_gov_ondemand); + return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND); } static void __exit cpufreq_gov_dbs_exit(void) { - cpufreq_unregister_governor(&cpufreq_gov_ondemand); + cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND); } MODULE_AUTHOR("Venkatesh Pallipadi "); @@ -637,7 +639,7 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND struct cpufreq_governor *cpufreq_default_governor(void) { - return &cpufreq_gov_ondemand; + return CPU_FREQ_GOV_ONDEMAND; } fs_initcall(cpufreq_gov_dbs_init); -- cgit v0.10.2 From 7bdad34d0890b69c30e8c6a50c9c2311a839fd68 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sun, 7 Feb 2016 16:05:07 +0100 Subject: cpufreq: governor: Rename some data types and variables The ondemand and conservative governors are represented by struct common_dbs_data whose name doesn't reflect the purpose it is used for, so rename it to struct dbs_governor and rename variables of that type accordingly. No functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index f6b79ab..a7d237b 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -48,7 +48,7 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, struct dbs_data *od_data = policy->governor_data; struct od_dbs_tuners *od_tuners = od_data->tuners; struct od_cpu_dbs_info_s *od_info = - od_data->cdata->get_cpu_dbs_info_s(policy->cpu); + od_data->gov->get_cpu_dbs_info_s(policy->cpu); if (!od_info->freq_table) return freq_next; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 4597f74..c65ac36 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -119,7 +119,7 @@ static struct notifier_block cs_cpufreq_notifier_block = { }; /************************** sysfs interface ************************/ -static struct common_dbs_data cs_dbs_cdata; +static struct dbs_governor cs_dbs_gov; static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, const char *buf, size_t count) @@ -328,7 +328,7 @@ define_get_cpu_dbs_routines(cs_cpu_dbs_info); static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); -static struct common_dbs_data cs_dbs_cdata = { +static struct dbs_governor cs_dbs_gov = { .gov = { .name = "conservative", .governor = cs_cpufreq_governor_dbs, @@ -346,12 +346,12 @@ static struct common_dbs_data cs_dbs_cdata = { .exit = cs_exit, }; -#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_cdata.gov) +#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov) static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { - return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event); + return cpufreq_governor_dbs(policy, &cs_dbs_gov, event); } static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index a329e1b..dc5bb29 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -28,14 +28,14 @@ EXPORT_SYMBOL_GPL(dbs_data_mutex); static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) { if (have_governor_per_policy()) - return dbs_data->cdata->attr_group_gov_pol; + return dbs_data->gov->attr_group_gov_pol; else - return dbs_data->cdata->attr_group_gov_sys; + return dbs_data->gov->attr_group_gov_sys; } void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) { - struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(cpu); struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; struct cpufreq_policy *policy = cdbs->shared->policy; @@ -44,9 +44,9 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) unsigned int ignore_nice; unsigned int j; - if (dbs_data->cdata->governor == GOV_ONDEMAND) { + if (dbs_data->gov->governor == GOV_ONDEMAND) { struct od_cpu_dbs_info_s *od_dbs_info = - dbs_data->cdata->get_cpu_dbs_info_s(cpu); + 
dbs_data->gov->get_cpu_dbs_info_s(cpu); /* * Sometimes, the ondemand governor uses an additional @@ -71,7 +71,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) unsigned int load; int io_busy = 0; - j_cdbs = dbs_data->cdata->get_cpu_cdbs(j); + j_cdbs = dbs_data->gov->get_cpu_cdbs(j); /* * For the purpose of ondemand, waiting for disk IO is @@ -79,7 +79,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) * not that the system is actually idle. So do not add * the iowait time to the cpu idle time. */ - if (dbs_data->cdata->governor == GOV_ONDEMAND) + if (dbs_data->gov->governor == GOV_ONDEMAND) io_busy = od_tuners->io_is_busy; cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); @@ -160,7 +160,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) max_load = load; } - dbs_data->cdata->gov_check_cpu(cpu, max_load); + dbs_data->gov->gov_check_cpu(cpu, max_load); } EXPORT_SYMBOL_GPL(dbs_check_cpu); @@ -175,7 +175,7 @@ void gov_set_update_util(struct cpu_common_dbs_info *shared, shared->last_sample_time = 0; for_each_cpu(cpu, policy->cpus) { - struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(cpu); cpufreq_set_update_util_data(cpu, &cdbs->update_util); } @@ -223,7 +223,7 @@ static void dbs_work_handler(struct work_struct *work) * ondemand governor isn't updating the sampling rate in parallel. */ mutex_lock(&shared->timer_mutex); - delay = dbs_data->cdata->gov_dbs_timer(policy); + delay = dbs_data->gov->gov_dbs_timer(policy); shared->sample_delay_ns = jiffies_to_nsecs(delay); mutex_unlock(&shared->timer_mutex); @@ -282,7 +282,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, static void set_sampling_rate(struct dbs_data *dbs_data, unsigned int sampling_rate) { - if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + if (dbs_data->gov->governor == GOV_CONSERVATIVE) { struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; cs_tuners->sampling_rate = sampling_rate; } else { @@ -292,7 +292,7 @@ static void set_sampling_rate(struct dbs_data *dbs_data, } static int alloc_common_dbs_info(struct cpufreq_policy *policy, - struct common_dbs_data *cdata) + struct dbs_governor *gov) { struct cpu_common_dbs_info *shared; int j; @@ -304,7 +304,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, /* Set shared for all CPUs, online+offline */ for_each_cpu(j, policy->related_cpus) - cdata->get_cpu_cdbs(j)->shared = shared; + gov->get_cpu_cdbs(j)->shared = shared; mutex_init(&shared->timer_mutex); atomic_set(&shared->skip_work, 0); @@ -314,24 +314,24 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, } static void free_common_dbs_info(struct cpufreq_policy *policy, - struct common_dbs_data *cdata) + struct dbs_governor *gov) { - struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); struct cpu_common_dbs_info *shared = cdbs->shared; int j; mutex_destroy(&shared->timer_mutex); for_each_cpu(j, policy->cpus) - cdata->get_cpu_cdbs(j)->shared = NULL; + gov->get_cpu_cdbs(j)->shared = NULL; kfree(shared); } static int cpufreq_governor_init(struct cpufreq_policy *policy, - struct common_dbs_data *cdata) + struct dbs_governor *gov) { - struct dbs_data *dbs_data = cdata->gdbs_data; + struct dbs_data *dbs_data = gov->gdbs_data; unsigned int latency; int ret; @@ -343,7 +343,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy, if (WARN_ON(have_governor_per_policy())) return -EINVAL; 
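/*
 * [Illustrative note, not part of any patch in this series] The init
 * path being renamed here covers the case of global (not per-policy)
 * sysfs tunables: a single struct dbs_data is shared by all policies,
 * cached in gov->gdbs_data on first init, and reference-counted
 * through dbs_data->usage_count so that the last exiting policy
 * frees it.
 */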
- ret = alloc_common_dbs_info(policy, cdata); + ret = alloc_common_dbs_info(policy, gov); if (ret) return ret; @@ -356,14 +356,14 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy, if (!dbs_data) return -ENOMEM; - ret = alloc_common_dbs_info(policy, cdata); + ret = alloc_common_dbs_info(policy, gov); if (ret) goto free_dbs_data; - dbs_data->cdata = cdata; + dbs_data->gov = gov; dbs_data->usage_count = 1; - ret = cdata->init(dbs_data, !policy->governor->initialized); + ret = gov->init(dbs_data, !policy->governor->initialized); if (ret) goto free_common_dbs_info; @@ -379,7 +379,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy, latency * LATENCY_MULTIPLIER)); if (!have_governor_per_policy()) - cdata->gdbs_data = dbs_data; + gov->gdbs_data = dbs_data; policy->governor_data = dbs_data; @@ -394,10 +394,10 @@ reset_gdbs_data: policy->governor_data = NULL; if (!have_governor_per_policy()) - cdata->gdbs_data = NULL; - cdata->exit(dbs_data, !policy->governor->initialized); + gov->gdbs_data = NULL; + gov->exit(dbs_data, !policy->governor->initialized); free_common_dbs_info: - free_common_dbs_info(policy, cdata); + free_common_dbs_info(policy, gov); free_dbs_data: kfree(dbs_data); return ret; @@ -406,8 +406,8 @@ free_dbs_data: static int cpufreq_governor_exit(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; - struct common_dbs_data *cdata = dbs_data->cdata; - struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu); + struct dbs_governor *gov = dbs_data->gov; + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); /* State should be equivalent to INIT */ if (!cdbs->shared || cdbs->shared->policy) @@ -420,24 +420,24 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) policy->governor_data = NULL; if (!have_governor_per_policy()) - cdata->gdbs_data = NULL; + gov->gdbs_data = NULL; - cdata->exit(dbs_data, policy->governor->initialized == 1); + gov->exit(dbs_data, policy->governor->initialized == 1); kfree(dbs_data); } else { policy->governor_data = NULL; } - free_common_dbs_info(policy, cdata); + free_common_dbs_info(policy, gov); return 0; } static int cpufreq_governor_start(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; - struct common_dbs_data *cdata = dbs_data->cdata; + struct dbs_governor *gov = dbs_data->gov; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; - struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); struct cpu_common_dbs_info *shared = cdbs->shared; int io_busy = 0; @@ -448,7 +448,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (!shared || shared->policy) return -EBUSY; - if (cdata->governor == GOV_CONSERVATIVE) { + if (gov->governor == GOV_CONSERVATIVE) { struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; sampling_rate = cs_tuners->sampling_rate; @@ -462,7 +462,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) } for_each_cpu(j, policy->cpus) { - struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j); + struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); unsigned int prev_load; j_cdbs->prev_cpu_idle = @@ -480,15 +480,15 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) } shared->policy = policy; - if (cdata->governor == GOV_CONSERVATIVE) { + if (gov->governor == GOV_CONSERVATIVE) { struct cs_cpu_dbs_info_s *cs_dbs_info = - cdata->get_cpu_dbs_info_s(cpu); + gov->get_cpu_dbs_info_s(cpu); cs_dbs_info->down_skip = 0; 
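/*
 * [Illustrative sketch, assuming the layout introduced by the two
 * patches above] Each governor object now embeds its cpufreq_governor
 * as the leading member, and the CPU_FREQ_GOV_* macros simply take
 * that member's address:
 *
 *	struct dbs_governor {
 *		struct cpufreq_governor gov;	// what cpufreq_register_governor() sees
 *		int governor;			// GOV_ONDEMAND or GOV_CONSERVATIVE
 *		struct dbs_data *gdbs_data;	// shared tunables when !have_governor_per_policy()
 *		// ... sysfs attribute groups and dbs callbacks ...
 *	};
 *
 *	#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov)
 */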
cs_dbs_info->requested_freq = policy->cur; } else { - struct od_ops *od_ops = cdata->gov_ops; - struct od_cpu_dbs_info_s *od_dbs_info = cdata->get_cpu_dbs_info_s(cpu); + struct od_ops *od_ops = gov->gov_ops; + struct od_cpu_dbs_info_s *od_dbs_info = gov->get_cpu_dbs_info_s(cpu); od_dbs_info->rate_mult = 1; od_dbs_info->sample_type = OD_NORMAL_SAMPLE; @@ -502,7 +502,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) static int cpufreq_governor_stop(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; - struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(policy->cpu); + struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(policy->cpu); struct cpu_common_dbs_info *shared = cdbs->shared; /* State should be equivalent to START */ @@ -518,9 +518,9 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy) static int cpufreq_governor_limits(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; - struct common_dbs_data *cdata = dbs_data->cdata; + struct dbs_governor *gov = dbs_data->gov; unsigned int cpu = policy->cpu; - struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); /* State should be equivalent to START */ if (!cdbs->shared || !cdbs->shared->policy) @@ -540,7 +540,7 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) } int cpufreq_governor_dbs(struct cpufreq_policy *policy, - struct common_dbs_data *cdata, unsigned int event) + struct dbs_governor *gov, unsigned int event) { int ret = -EINVAL; @@ -548,7 +548,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_data_mutex); if (event == CPUFREQ_GOV_POLICY_INIT) { - ret = cpufreq_governor_init(policy, cdata); + ret = cpufreq_governor_init(policy, gov); } else if (policy->governor_data) { switch (event) { case CPUFREQ_GOV_POLICY_EXIT: diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 2fa3cf1..ed87b84 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -78,7 +78,7 @@ __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) static ssize_t show_##file_name##_gov_sys \ (struct kobject *kobj, struct attribute *attr, char *buf) \ { \ - struct _gov##_dbs_tuners *tuners = _gov##_dbs_cdata.gdbs_data->tuners; \ + struct _gov##_dbs_tuners *tuners = _gov##_dbs_gov.gdbs_data->tuners; \ return sprintf(buf, "%u\n", tuners->file_name); \ } \ \ @@ -94,7 +94,7 @@ static ssize_t show_##file_name##_gov_pol \ static ssize_t store_##file_name##_gov_sys \ (struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \ { \ - struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \ + struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ return store_##file_name(dbs_data, buf, count); \ } \ \ @@ -205,7 +205,7 @@ struct cs_dbs_tuners { /* Common Governor data across policies */ struct dbs_data; -struct common_dbs_data { +struct dbs_governor { struct cpufreq_governor gov; #define GOV_ONDEMAND 0 @@ -233,7 +233,7 @@ struct common_dbs_data { /* Governor Per policy data */ struct dbs_data { - struct common_dbs_data *cdata; + struct dbs_governor *gov; unsigned int min_sampling_rate; int usage_count; void *tuners; @@ -262,7 +262,7 @@ static inline int delay_for_sampling_rate(unsigned int sampling_rate) static ssize_t show_sampling_rate_min_gov_sys \ (struct kobject *kobj, struct attribute *attr, char *buf) \ { \ - struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \ + 
struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ } \ \ @@ -277,7 +277,7 @@ extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); int cpufreq_governor_dbs(struct cpufreq_policy *policy, - struct common_dbs_data *cdata, unsigned int event); + struct dbs_governor *gov, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 836116c..c38a4a1 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -219,7 +219,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) } /************************** sysfs interface ************************/ -static struct common_dbs_data od_dbs_cdata; +static struct dbs_governor od_dbs_gov; /** * update_sampling_rate - update sampling rate effective immediately if needed. @@ -542,7 +542,7 @@ static struct od_ops od_ops = { static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); -static struct common_dbs_data od_dbs_cdata = { +static struct dbs_governor od_dbs_gov = { .gov = { .name = "ondemand", .governor = od_cpufreq_governor_dbs, @@ -561,12 +561,12 @@ static struct common_dbs_data od_dbs_cdata = { .exit = od_exit, }; -#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_cdata.gov) +#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov) static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { - return cpufreq_governor_dbs(policy, &od_dbs_cdata, event); + return cpufreq_governor_dbs(policy, &od_dbs_gov, event); } static void od_set_powersave_bias(unsigned int powersave_bias) -- cgit v0.10.2 From 906a6e5aaef24d3c80bf6a06c794c7541aca64be Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:07:51 +0100 Subject: cpufreq: governor: Rework cpufreq_governor_dbs() Since it is possible to obtain a pointer to struct dbs_governor from a pointer to the struct governor embedded in it via container_of(), the second argument of cpufreq_governor_init() is not necessary. Accordingly, cpufreq_governor_dbs() doesn't need its second argument either and the ->governor callbacks for both the ondemand and conservative governors may be set to cpufreq_governor_dbs() directly. Make that happen. Signed-off-by: Rafael J. 
Wysocki Acked-by: Saravana Kannan Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index c65ac36..20c8291 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -325,13 +325,10 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify) define_get_cpu_dbs_routines(cs_cpu_dbs_info); -static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event); - static struct dbs_governor cs_dbs_gov = { .gov = { .name = "conservative", - .governor = cs_cpufreq_governor_dbs, + .governor = cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }, @@ -348,12 +345,6 @@ static struct dbs_governor cs_dbs_gov = { #define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov) -static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event) -{ - return cpufreq_governor_dbs(policy, &cs_dbs_gov, event); -} - static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index dc5bb29..7e579fc 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -328,9 +328,10 @@ static void free_common_dbs_info(struct cpufreq_policy *policy, kfree(shared); } -static int cpufreq_governor_init(struct cpufreq_policy *policy, - struct dbs_governor *gov) +static int cpufreq_governor_init(struct cpufreq_policy *policy) { + struct dbs_governor *gov = container_of(policy->governor, + struct dbs_governor, gov); struct dbs_data *dbs_data = gov->gdbs_data; unsigned int latency; int ret; @@ -539,8 +540,7 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) return 0; } -int cpufreq_governor_dbs(struct cpufreq_policy *policy, - struct dbs_governor *gov, unsigned int event) +int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { int ret = -EINVAL; @@ -548,7 +548,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_data_mutex); if (event == CPUFREQ_GOV_POLICY_INIT) { - ret = cpufreq_governor_init(policy, gov); + ret = cpufreq_governor_init(policy); } else if (policy->governor_data) { switch (event) { case CPUFREQ_GOV_POLICY_EXIT: diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ed87b84..8e280b8 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -276,8 +276,7 @@ static ssize_t show_sampling_rate_min_gov_pol \ extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); -int cpufreq_governor_dbs(struct cpufreq_policy *policy, - struct dbs_governor *gov, unsigned int event); +int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index c38a4a1..dcbcbf4 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -539,13 +539,10 @@ static struct od_ops od_ops = { .freq_increase = dbs_freq_increase, }; -static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event); - static struct dbs_governor od_dbs_gov = { .gov = { .name = "ondemand", - .governor = od_cpufreq_governor_dbs, + .governor = 
cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }, @@ -563,12 +560,6 @@ static struct dbs_governor od_dbs_gov = { #define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov) -static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, - unsigned int event) -{ - return cpufreq_governor_dbs(policy, &od_dbs_gov, event); -} - static void od_set_powersave_bias(unsigned int powersave_bias) { struct cpufreq_policy *policy; -- cgit v0.10.2 From ea59ee0dc9796a4e879291cc2f4728d04c499313 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:09:51 +0100 Subject: cpufreq: governor: Drop the gov pointer from struct dbs_data Since it is possible to obtain a pointer to struct dbs_governor from a pointer to the struct governor embedded in it with the help of container_of(), the additional gov pointer in struct dbs_data isn't really necessary. Drop that pointer and make the code using it reach the dbs_governor object via policy->governor. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index a7d237b..6395a5f 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -48,7 +48,7 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, struct dbs_data *od_data = policy->governor_data; struct od_dbs_tuners *od_tuners = od_data->tuners; struct od_cpu_dbs_info_s *od_info = - od_data->gov->get_cpu_dbs_info_s(policy->cpu); + dbs_governor_of(policy)->get_cpu_dbs_info_s(policy->cpu); if (!od_info->freq_table) return freq_next; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 20c8291..7d5f181 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -107,7 +107,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy->governor_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - dbs_check_cpu(dbs_data, policy->cpu); + dbs_check_cpu(policy, policy->cpu); return delay_for_sampling_rate(cs_tuners->sampling_rate); } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 7e579fc..d3fa8b3 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -25,28 +25,27 @@ DEFINE_MUTEX(dbs_data_mutex); EXPORT_SYMBOL_GPL(dbs_data_mutex); -static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) +static struct attribute_group *get_sysfs_attr(struct dbs_governor *gov) { - if (have_governor_per_policy()) - return dbs_data->gov->attr_group_gov_pol; - else - return dbs_data->gov->attr_group_gov_sys; + return have_governor_per_policy() ? 
+ gov->attr_group_gov_pol : gov->attr_group_gov_sys; } -void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) +void dbs_check_cpu(struct cpufreq_policy *policy, int cpu) { - struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(cpu); + struct dbs_governor *gov = dbs_governor_of(policy); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); + struct dbs_data *dbs_data = policy->governor_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - struct cpufreq_policy *policy = cdbs->shared->policy; unsigned int sampling_rate; unsigned int max_load = 0; unsigned int ignore_nice; unsigned int j; - if (dbs_data->gov->governor == GOV_ONDEMAND) { + if (gov->governor == GOV_ONDEMAND) { struct od_cpu_dbs_info_s *od_dbs_info = - dbs_data->gov->get_cpu_dbs_info_s(cpu); + gov->get_cpu_dbs_info_s(cpu); /* * Sometimes, the ondemand governor uses an additional @@ -71,7 +70,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) unsigned int load; int io_busy = 0; - j_cdbs = dbs_data->gov->get_cpu_cdbs(j); + j_cdbs = gov->get_cpu_cdbs(j); /* * For the purpose of ondemand, waiting for disk IO is @@ -79,7 +78,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) * not that the system is actually idle. So do not add * the iowait time to the cpu idle time. */ - if (dbs_data->gov->governor == GOV_ONDEMAND) + if (gov->governor == GOV_ONDEMAND) io_busy = od_tuners->io_is_busy; cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); @@ -160,7 +159,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) max_load = load; } - dbs_data->gov->gov_check_cpu(cpu, max_load); + gov->gov_check_cpu(cpu, max_load); } EXPORT_SYMBOL_GPL(dbs_check_cpu); @@ -168,14 +167,14 @@ void gov_set_update_util(struct cpu_common_dbs_info *shared, unsigned int delay_us) { struct cpufreq_policy *policy = shared->policy; - struct dbs_data *dbs_data = policy->governor_data; + struct dbs_governor *gov = dbs_governor_of(policy); int cpu; gov_update_sample_delay(shared, delay_us); shared->last_sample_time = 0; for_each_cpu(cpu, policy->cpus) { - struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); cpufreq_set_update_util_data(cpu, &cdbs->update_util); } @@ -212,18 +211,18 @@ static void dbs_work_handler(struct work_struct *work) struct cpu_common_dbs_info *shared = container_of(work, struct cpu_common_dbs_info, work); struct cpufreq_policy *policy; - struct dbs_data *dbs_data; + struct dbs_governor *gov; unsigned int delay; policy = shared->policy; - dbs_data = policy->governor_data; + gov = dbs_governor_of(policy); /* * Make sure cpufreq_governor_limits() isn't evaluating load or the * ondemand governor isn't updating the sampling rate in parallel. 
*/ mutex_lock(&shared->timer_mutex); - delay = dbs_data->gov->gov_dbs_timer(policy); + delay = gov->gov_dbs_timer(policy); shared->sample_delay_ns = jiffies_to_nsecs(delay); mutex_unlock(&shared->timer_mutex); @@ -280,9 +279,10 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, } static void set_sampling_rate(struct dbs_data *dbs_data, - unsigned int sampling_rate) + struct dbs_governor *gov, + unsigned int sampling_rate) { - if (dbs_data->gov->governor == GOV_CONSERVATIVE) { + if (gov->governor == GOV_CONSERVATIVE) { struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; cs_tuners->sampling_rate = sampling_rate; } else { @@ -330,8 +330,7 @@ static void free_common_dbs_info(struct cpufreq_policy *policy, static int cpufreq_governor_init(struct cpufreq_policy *policy) { - struct dbs_governor *gov = container_of(policy->governor, - struct dbs_governor, gov); + struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data = gov->gdbs_data; unsigned int latency; int ret; @@ -361,7 +360,6 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (ret) goto free_dbs_data; - dbs_data->gov = gov; dbs_data->usage_count = 1; ret = gov->init(dbs_data, !policy->governor->initialized); @@ -376,7 +374,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) /* Bring kernel and HW constraints together */ dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, MIN_LATENCY_MULTIPLIER * latency); - set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate, + set_sampling_rate(dbs_data, gov, max(dbs_data->min_sampling_rate, latency * LATENCY_MULTIPLIER)); if (!have_governor_per_policy()) @@ -385,7 +383,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) policy->governor_data = dbs_data; ret = sysfs_create_group(get_governor_parent_kobj(policy), - get_sysfs_attr(dbs_data)); + get_sysfs_attr(gov)); if (ret) goto reset_gdbs_data; @@ -406,8 +404,8 @@ free_dbs_data: static int cpufreq_governor_exit(struct cpufreq_policy *policy) { + struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data = policy->governor_data; - struct dbs_governor *gov = dbs_data->gov; struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); /* State should be equivalent to INIT */ @@ -416,7 +414,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) if (!--dbs_data->usage_count) { sysfs_remove_group(get_governor_parent_kobj(policy), - get_sysfs_attr(dbs_data)); + get_sysfs_attr(gov)); policy->governor_data = NULL; @@ -435,8 +433,8 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) static int cpufreq_governor_start(struct cpufreq_policy *policy) { + struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data = policy->governor_data; - struct dbs_governor *gov = dbs_data->gov; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); struct cpu_common_dbs_info *shared = cdbs->shared; @@ -502,8 +500,8 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) static int cpufreq_governor_stop(struct cpufreq_policy *policy) { - struct dbs_data *dbs_data = policy->governor_data; - struct cpu_dbs_info *cdbs = dbs_data->gov->get_cpu_cdbs(policy->cpu); + struct dbs_governor *gov = dbs_governor_of(policy); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); struct cpu_common_dbs_info *shared = cdbs->shared; /* State should be equivalent to START */ @@ -518,8 +516,7 @@ static int cpufreq_governor_stop(struct 
cpufreq_policy *policy) static int cpufreq_governor_limits(struct cpufreq_policy *policy) { - struct dbs_data *dbs_data = policy->governor_data; - struct dbs_governor *gov = dbs_data->gov; + struct dbs_governor *gov = dbs_governor_of(policy); unsigned int cpu = policy->cpu; struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); @@ -534,7 +531,7 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) else if (policy->min > cdbs->shared->policy->cur) __cpufreq_driver_target(cdbs->shared->policy, policy->min, CPUFREQ_RELATION_L); - dbs_check_cpu(dbs_data, cpu); + dbs_check_cpu(policy, cpu); mutex_unlock(&cdbs->shared->timer_mutex); return 0; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 8e280b8..c8b7ec2 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -231,9 +231,13 @@ struct dbs_governor { void *gov_ops; }; +static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy) +{ + return container_of(policy->governor, struct dbs_governor, gov); +} + /* Governor Per policy data */ struct dbs_data { - struct dbs_governor *gov; unsigned int min_sampling_rate; int usage_count; void *tuners; @@ -275,7 +279,7 @@ static ssize_t show_sampling_rate_min_gov_pol \ extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; -void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); +void dbs_check_cpu(struct cpufreq_policy *policy, int cpu); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index dcbcbf4..65ad39d 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -203,7 +203,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) __cpufreq_driver_target(policy, dbs_info->freq_lo, CPUFREQ_RELATION_H); } else { - dbs_check_cpu(dbs_data, cpu); + dbs_check_cpu(policy, cpu); if (dbs_info->freq_lo) { /* Setup timer for SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; -- cgit v0.10.2 From e40e7b255e591d0448500c7910ec5693f58026bd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Feb 2016 17:07:44 +0100 Subject: cpufreq: governor: Rename cpu_common_dbs_info to policy_dbs_info The struct cpu_common_dbs_info structure represents the per-policy part of the governor data (for the ondemand and conservative governors), but its name doesn't reflect its purpose. Rename it to struct policy_dbs_info and rename variables related to it accordingly. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7d5f181..b2df5de 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -47,7 +47,7 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, static void cs_check_cpu(int cpu, unsigned int load) { struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); - struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy; + struct cpufreq_policy *policy = dbs_info->cdbs.policy_dbs->policy; struct dbs_data *dbs_data = policy->governor_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index d3fa8b3..b425cd3da 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -163,15 +163,15 @@ void dbs_check_cpu(struct cpufreq_policy *policy, int cpu) } EXPORT_SYMBOL_GPL(dbs_check_cpu); -void gov_set_update_util(struct cpu_common_dbs_info *shared, +void gov_set_update_util(struct policy_dbs_info *policy_dbs, unsigned int delay_us) { - struct cpufreq_policy *policy = shared->policy; + struct cpufreq_policy *policy = policy_dbs->policy; struct dbs_governor *gov = dbs_governor_of(policy); int cpu; - gov_update_sample_delay(shared, delay_us); - shared->last_sample_time = 0; + gov_update_sample_delay(policy_dbs, delay_us); + policy_dbs->last_sample_time = 0; for_each_cpu(cpu, policy->cpus) { struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); @@ -191,40 +191,40 @@ static inline void gov_clear_update_util(struct cpufreq_policy *policy) synchronize_rcu(); } -static void gov_cancel_work(struct cpu_common_dbs_info *shared) +static void gov_cancel_work(struct policy_dbs_info *policy_dbs) { /* Tell dbs_update_util_handler() to skip queuing up work items. */ - atomic_inc(&shared->skip_work); + atomic_inc(&policy_dbs->skip_work); /* * If dbs_update_util_handler() is already running, it may not notice * the incremented skip_work, so wait for it to complete to prevent its * work item from being queued up after the cancel_work_sync() below. */ - gov_clear_update_util(shared->policy); - irq_work_sync(&shared->irq_work); - cancel_work_sync(&shared->work); - atomic_set(&shared->skip_work, 0); + gov_clear_update_util(policy_dbs->policy); + irq_work_sync(&policy_dbs->irq_work); + cancel_work_sync(&policy_dbs->work); + atomic_set(&policy_dbs->skip_work, 0); } static void dbs_work_handler(struct work_struct *work) { - struct cpu_common_dbs_info *shared = container_of(work, struct - cpu_common_dbs_info, work); + struct policy_dbs_info *policy_dbs; struct cpufreq_policy *policy; struct dbs_governor *gov; unsigned int delay; - policy = shared->policy; + policy_dbs = container_of(work, struct policy_dbs_info, work); + policy = policy_dbs->policy; gov = dbs_governor_of(policy); /* * Make sure cpufreq_governor_limits() isn't evaluating load or the * ondemand governor isn't updating the sampling rate in parallel. */ - mutex_lock(&shared->timer_mutex); + mutex_lock(&policy_dbs->timer_mutex); delay = gov->gov_dbs_timer(policy); - shared->sample_delay_ns = jiffies_to_nsecs(delay); - mutex_unlock(&shared->timer_mutex); + policy_dbs->sample_delay_ns = jiffies_to_nsecs(delay); + mutex_unlock(&policy_dbs->timer_mutex); /* * If the atomic operation below is reordered with respect to the @@ -232,23 +232,23 @@ static void dbs_work_handler(struct work_struct *work) * up using a stale sample delay value. 
*/ smp_mb__before_atomic(); - atomic_dec(&shared->skip_work); + atomic_dec(&policy_dbs->skip_work); } static void dbs_irq_work(struct irq_work *irq_work) { - struct cpu_common_dbs_info *shared; + struct policy_dbs_info *policy_dbs; - shared = container_of(irq_work, struct cpu_common_dbs_info, irq_work); - schedule_work(&shared->work); + policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work); + schedule_work(&policy_dbs->work); } -static inline void gov_queue_irq_work(struct cpu_common_dbs_info *shared) +static inline void gov_queue_irq_work(struct policy_dbs_info *policy_dbs) { #ifdef CONFIG_SMP - irq_work_queue_on(&shared->irq_work, smp_processor_id()); + irq_work_queue_on(&policy_dbs->irq_work, smp_processor_id()); #else - irq_work_queue(&shared->irq_work); + irq_work_queue(&policy_dbs->irq_work); #endif } @@ -256,7 +256,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, unsigned long util, unsigned long max) { struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util); - struct cpu_common_dbs_info *shared = cdbs->shared; + struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; /* * The work may not be allowed to be queued up right now. @@ -265,17 +265,17 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * - The governor is being stopped. * - It is too early (too little time from the previous sample). */ - if (atomic_inc_return(&shared->skip_work) == 1) { + if (atomic_inc_return(&policy_dbs->skip_work) == 1) { u64 delta_ns; - delta_ns = time - shared->last_sample_time; - if ((s64)delta_ns >= shared->sample_delay_ns) { - shared->last_sample_time = time; - gov_queue_irq_work(shared); + delta_ns = time - policy_dbs->last_sample_time; + if ((s64)delta_ns >= policy_dbs->sample_delay_ns) { + policy_dbs->last_sample_time = time; + gov_queue_irq_work(policy_dbs); return; } } - atomic_dec(&shared->skip_work); + atomic_dec(&policy_dbs->skip_work); } static void set_sampling_rate(struct dbs_data *dbs_data, @@ -291,41 +291,41 @@ static void set_sampling_rate(struct dbs_data *dbs_data, } } -static int alloc_common_dbs_info(struct cpufreq_policy *policy, +static int alloc_policy_dbs_info(struct cpufreq_policy *policy, struct dbs_governor *gov) { - struct cpu_common_dbs_info *shared; + struct policy_dbs_info *policy_dbs; int j; /* Allocate memory for the common information for policy->cpus */ - shared = kzalloc(sizeof(*shared), GFP_KERNEL); - if (!shared) + policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL); + if (!policy_dbs) return -ENOMEM; - /* Set shared for all CPUs, online+offline */ + /* Set policy_dbs for all CPUs, online+offline */ for_each_cpu(j, policy->related_cpus) - gov->get_cpu_cdbs(j)->shared = shared; + gov->get_cpu_cdbs(j)->policy_dbs = policy_dbs; - mutex_init(&shared->timer_mutex); - atomic_set(&shared->skip_work, 0); - init_irq_work(&shared->irq_work, dbs_irq_work); - INIT_WORK(&shared->work, dbs_work_handler); + mutex_init(&policy_dbs->timer_mutex); + atomic_set(&policy_dbs->skip_work, 0); + init_irq_work(&policy_dbs->irq_work, dbs_irq_work); + INIT_WORK(&policy_dbs->work, dbs_work_handler); return 0; } -static void free_common_dbs_info(struct cpufreq_policy *policy, +static void free_policy_dbs_info(struct cpufreq_policy *policy, struct dbs_governor *gov) { struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); - struct cpu_common_dbs_info *shared = cdbs->shared; + struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; int j; - mutex_destroy(&shared->timer_mutex); + 
mutex_destroy(&policy_dbs->timer_mutex); for_each_cpu(j, policy->cpus) - gov->get_cpu_cdbs(j)->shared = NULL; + gov->get_cpu_cdbs(j)->policy_dbs = NULL; - kfree(shared); + kfree(policy_dbs); } static int cpufreq_governor_init(struct cpufreq_policy *policy) @@ -343,7 +343,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (WARN_ON(have_governor_per_policy())) return -EINVAL; - ret = alloc_common_dbs_info(policy, gov); + ret = alloc_policy_dbs_info(policy, gov); if (ret) return ret; @@ -356,7 +356,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (!dbs_data) return -ENOMEM; - ret = alloc_common_dbs_info(policy, gov); + ret = alloc_policy_dbs_info(policy, gov); if (ret) goto free_dbs_data; @@ -364,7 +364,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) ret = gov->init(dbs_data, !policy->governor->initialized); if (ret) - goto free_common_dbs_info; + goto free_policy_dbs_info; /* policy latency is in ns. Convert it to us first */ latency = policy->cpuinfo.transition_latency / 1000; @@ -395,8 +395,8 @@ reset_gdbs_data: if (!have_governor_per_policy()) gov->gdbs_data = NULL; gov->exit(dbs_data, !policy->governor->initialized); -free_common_dbs_info: - free_common_dbs_info(policy, gov); +free_policy_dbs_info: + free_policy_dbs_info(policy, gov); free_dbs_data: kfree(dbs_data); return ret; @@ -409,7 +409,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); /* State should be equivalent to INIT */ - if (!cdbs->shared || cdbs->shared->policy) + if (!cdbs->policy_dbs || cdbs->policy_dbs->policy) return -EBUSY; if (!--dbs_data->usage_count) { @@ -427,7 +427,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) policy->governor_data = NULL; } - free_common_dbs_info(policy, gov); + free_policy_dbs_info(policy, gov); return 0; } @@ -437,14 +437,14 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy->governor_data; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); - struct cpu_common_dbs_info *shared = cdbs->shared; + struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; int io_busy = 0; if (!policy->cur) return -EINVAL; /* State should be equivalent to INIT */ - if (!shared || shared->policy) + if (!policy_dbs || policy_dbs->policy) return -EBUSY; if (gov->governor == GOV_CONSERVATIVE) { @@ -477,7 +477,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) j_cdbs->update_util.func = dbs_update_util_handler; } - shared->policy = policy; + policy_dbs->policy = policy; if (gov->governor == GOV_CONSERVATIVE) { struct cs_cpu_dbs_info_s *cs_dbs_info = @@ -494,7 +494,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) od_ops->powersave_bias_init_cpu(cpu); } - gov_set_update_util(shared, sampling_rate); + gov_set_update_util(policy_dbs, sampling_rate); return 0; } @@ -502,14 +502,14 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); - struct cpu_common_dbs_info *shared = cdbs->shared; + struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; /* State should be equivalent to START */ - if (!shared || !shared->policy) + if (!policy_dbs || !policy_dbs->policy) return -EBUSY; - gov_cancel_work(shared); - shared->policy = NULL; + gov_cancel_work(policy_dbs); + policy_dbs->policy = NULL; 
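/*
 * [Illustrative note] dbs_governor_of(), relied on throughout these
 * functions, is plain container_of() arithmetic: it recovers the
 * enclosing struct dbs_governor from the struct cpufreq_governor
 * pointer stored in policy->governor, roughly
 *
 *	(struct dbs_governor *)((char *)policy->governor -
 *				offsetof(struct dbs_governor, gov))
 *
 * Since 'gov' is the leading member, the offset happens to be zero in
 * practice, but container_of() remains correct even if the layout
 * changes.
 */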
return 0; } @@ -521,18 +521,18 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); /* State should be equivalent to START */ - if (!cdbs->shared || !cdbs->shared->policy) + if (!cdbs->policy_dbs || !cdbs->policy_dbs->policy) return -EBUSY; - mutex_lock(&cdbs->shared->timer_mutex); - if (policy->max < cdbs->shared->policy->cur) - __cpufreq_driver_target(cdbs->shared->policy, policy->max, + mutex_lock(&cdbs->policy_dbs->timer_mutex); + if (policy->max < cdbs->policy_dbs->policy->cur) + __cpufreq_driver_target(cdbs->policy_dbs->policy, policy->max, CPUFREQ_RELATION_H); - else if (policy->min > cdbs->shared->policy->cur) - __cpufreq_driver_target(cdbs->shared->policy, policy->min, + else if (policy->min > cdbs->policy_dbs->policy->cur) + __cpufreq_driver_target(cdbs->policy_dbs->policy, policy->min, CPUFREQ_RELATION_L); dbs_check_cpu(policy, cpu); - mutex_unlock(&cdbs->shared->timer_mutex); + mutex_unlock(&cdbs->policy_dbs->timer_mutex); return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c8b7ec2..c90a2d3 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -131,7 +131,7 @@ static void *get_cpu_dbs_info_s(int cpu) \ */ /* Common to all CPUs of a policy */ -struct cpu_common_dbs_info { +struct policy_dbs_info { struct cpufreq_policy *policy; /* * Per policy mutex that serializes load evaluation from limit-change @@ -146,10 +146,10 @@ struct cpu_common_dbs_info { struct work_struct work; }; -static inline void gov_update_sample_delay(struct cpu_common_dbs_info *shared, +static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs, unsigned int delay_us) { - shared->sample_delay_ns = delay_us * NSEC_PER_USEC; + policy_dbs->sample_delay_ns = delay_us * NSEC_PER_USEC; } /* Per cpu structures */ @@ -165,7 +165,7 @@ struct cpu_dbs_info { */ unsigned int prev_load; struct update_util_data update_util; - struct cpu_common_dbs_info *shared; + struct policy_dbs_info *policy_dbs; }; struct od_cpu_dbs_info_s { diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 65ad39d..4a23327 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -151,7 +151,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) static void od_check_cpu(int cpu, unsigned int load) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); - struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy; + struct cpufreq_policy *policy = dbs_info->cdbs.policy_dbs->policy; struct dbs_data *dbs_data = policy->governor_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; @@ -255,20 +255,20 @@ static void update_sampling_rate(struct dbs_data *dbs_data, struct cpufreq_policy *policy; struct od_cpu_dbs_info_s *dbs_info; struct cpu_dbs_info *cdbs; - struct cpu_common_dbs_info *shared; + struct policy_dbs_info *policy_dbs; dbs_info = &per_cpu(od_cpu_dbs_info, cpu); cdbs = &dbs_info->cdbs; - shared = cdbs->shared; + policy_dbs = cdbs->policy_dbs; /* - * A valid shared and shared->policy means governor hasn't - * stopped or exited yet. + * A valid policy_dbs and policy_dbs->policy means governor + * hasn't stopped or exited yet. 
*/ - if (!shared || !shared->policy) + if (!policy_dbs || !policy_dbs->policy) continue; - policy = shared->policy; + policy = policy_dbs->policy; /* clear all CPUs of this policy */ cpumask_andnot(&cpumask, &cpumask, policy->cpus); @@ -280,7 +280,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data, * multiple policies that are governed by the same dbs_data. */ if (dbs_data == policy->governor_data) { - mutex_lock(&shared->timer_mutex); + mutex_lock(&policy_dbs->timer_mutex); /* * On 32-bit architectures this may race with the * sample_delay_ns read in dbs_update_util_handler(), @@ -299,8 +299,8 @@ static void update_sampling_rate(struct dbs_data *dbs_data, * too big and it will be corrected next time a sample * is taken, so it shouldn't be significant. */ - gov_update_sample_delay(shared, new_rate); - mutex_unlock(&shared->timer_mutex); + gov_update_sample_delay(policy_dbs, new_rate); + mutex_unlock(&policy_dbs->timer_mutex); } } @@ -573,16 +573,16 @@ static void od_set_powersave_bias(unsigned int powersave_bias) get_online_cpus(); for_each_online_cpu(cpu) { - struct cpu_common_dbs_info *shared; + struct policy_dbs_info *policy_dbs; if (cpumask_test_cpu(cpu, &done)) continue; - shared = per_cpu(od_cpu_dbs_info, cpu).cdbs.shared; - if (!shared) + policy_dbs = per_cpu(od_cpu_dbs_info, cpu).cdbs.policy_dbs; + if (!policy_dbs) continue; - policy = shared->policy; + policy = policy_dbs->policy; cpumask_or(&done, &done, policy->cpus); if (policy->governor != CPU_FREQ_GOV_ONDEMAND) -- cgit v0.10.2 From d10b5eb5fce436ba22443ab83eeb36e195dbf772 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 6 Feb 2016 13:50:24 +0100 Subject: cpufreq: governor: Drop cpu argument from dbs_check_cpu() Since policy->cpu is always passed as the second argument to dbs_check_cpu(), it is not really necessary to pass it, because the function can obtain that value via its first argument just fine. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index b2df5de..b8054e5 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -107,7 +107,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy->governor_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - dbs_check_cpu(policy, policy->cpu); + dbs_check_cpu(policy); return delay_for_sampling_rate(cs_tuners->sampling_rate); } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index b425cd3da..431d81f 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -31,8 +31,9 @@ static struct attribute_group *get_sysfs_attr(struct dbs_governor *gov) gov->attr_group_gov_pol : gov->attr_group_gov_sys; } -void dbs_check_cpu(struct cpufreq_policy *policy, int cpu) +void dbs_check_cpu(struct cpufreq_policy *policy) { + int cpu = policy->cpu; struct dbs_governor *gov = dbs_governor_of(policy); struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); struct dbs_data *dbs_data = policy->governor_data; @@ -517,8 +518,7 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy) static int cpufreq_governor_limits(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); - unsigned int cpu = policy->cpu; - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); /* State should be equivalent to START */ if (!cdbs->policy_dbs || !cdbs->policy_dbs->policy) @@ -531,7 +531,7 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) else if (policy->min > cdbs->policy_dbs->policy->cur) __cpufreq_driver_target(cdbs->policy_dbs->policy, policy->min, CPUFREQ_RELATION_L); - dbs_check_cpu(policy, cpu); + dbs_check_cpu(policy); mutex_unlock(&cdbs->policy_dbs->timer_mutex); return 0; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c90a2d3..63868d7 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -279,7 +279,7 @@ static ssize_t show_sampling_rate_min_gov_pol \ extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; -void dbs_check_cpu(struct cpufreq_policy *policy, int cpu); +void dbs_check_cpu(struct cpufreq_policy *policy); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 4a23327..9ef4402 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -190,9 +190,7 @@ static void od_check_cpu(int cpu, unsigned int load) static unsigned int od_dbs_timer(struct cpufreq_policy *policy) { struct dbs_data *dbs_data = policy->governor_data; - unsigned int cpu = policy->cpu; - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, - cpu); + struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); struct od_dbs_tuners *od_tuners = dbs_data->tuners; int delay = 0, sample_type = dbs_info->sample_type; @@ -203,7 +201,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) __cpufreq_driver_target(policy, dbs_info->freq_lo, CPUFREQ_RELATION_H); } else { - dbs_check_cpu(policy, cpu); + dbs_check_cpu(policy); if (dbs_info->freq_lo) { /* Setup 
timer for SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; -- cgit v0.10.2 From e9751894000af398d5895b3ee96052f57b80cc44 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:23:49 +0100 Subject: cpufreq: governor: Simplify cpufreq_governor_limits() Use the observation that cpufreq_governor_limits() doesn't have to get to the policy object it wants to manipulate by walking the reference chain cdbs->policy_dbs->policy, as the final pointer is actually equal to its argument, and make it access the policy object directly via its argument. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 431d81f..ff247a7 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -519,20 +519,19 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); + struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; /* State should be equivalent to START */ - if (!cdbs->policy_dbs || !cdbs->policy_dbs->policy) + if (!policy_dbs || !policy_dbs->policy) return -EBUSY; - mutex_lock(&cdbs->policy_dbs->timer_mutex); - if (policy->max < cdbs->policy_dbs->policy->cur) - __cpufreq_driver_target(cdbs->policy_dbs->policy, policy->max, - CPUFREQ_RELATION_H); - else if (policy->min > cdbs->policy_dbs->policy->cur) - __cpufreq_driver_target(cdbs->policy_dbs->policy, policy->min, - CPUFREQ_RELATION_L); + mutex_lock(&policy_dbs->timer_mutex); + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); dbs_check_cpu(policy); - mutex_unlock(&cdbs->policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->timer_mutex); return 0; } -- cgit v0.10.2 From bc505475b85de9a9903e84ef0b369d4637354201 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:24:26 +0100 Subject: cpufreq: governor: Rearrange governor data structures The struct policy_dbs_info objects representing per-policy governor data are not accessible directly from the corresponding policy objects. To access them, one has to get a pointer to the struct cpu_dbs_info of policy->cpu and use the policy_dbs field of that, which isn't really straightforward. To address that, rearrange the governor data structures so the governor_data pointer in struct cpufreq_policy will point to struct policy_dbs_info (instead of struct dbs_data) and that will contain a pointer to struct dbs_data. Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index 6395a5f..82ae100 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -45,7 +45,8 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, long d_actual, d_reference; struct msr actual, reference; struct cpu_data_t *data = &per_cpu(cpu_data, policy->cpu); - struct dbs_data *od_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *od_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = od_data->tuners; struct od_cpu_dbs_info_s *od_info = dbs_governor_of(policy)->get_cpu_dbs_info_s(policy->cpu); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index b8054e5..1a899bb 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -48,7 +48,8 @@ static void cs_check_cpu(int cpu, unsigned int load) { struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); struct cpufreq_policy *policy = dbs_info->cdbs.policy_dbs->policy; - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; /* @@ -104,7 +105,8 @@ static void cs_check_cpu(int cpu, unsigned int load) static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; dbs_check_cpu(policy); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index ff247a7..82e50dc 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -35,8 +35,8 @@ void dbs_check_cpu(struct cpufreq_policy *policy) { int cpu = policy->cpu; struct dbs_governor *gov = dbs_governor_of(policy); - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int sampling_rate; @@ -95,6 +95,7 @@ void dbs_check_cpu(struct cpufreq_policy *policy) j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { + struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); u64 cur_nice; unsigned long cur_nice_jiffies; @@ -292,8 +293,8 @@ static void set_sampling_rate(struct dbs_data *dbs_data, } } -static int alloc_policy_dbs_info(struct cpufreq_policy *policy, - struct dbs_governor *gov) +static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy, + struct dbs_governor *gov) { struct policy_dbs_info *policy_dbs; int j; @@ -301,7 +302,7 @@ static int alloc_policy_dbs_info(struct cpufreq_policy *policy, /* Allocate memory for the common information for policy->cpus */ policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL); if (!policy_dbs) - return -ENOMEM; + return NULL; /* Set policy_dbs for all CPUs, online+offline */ for_each_cpu(j, policy->related_cpus) @@ -311,7 +312,7 @@ static int alloc_policy_dbs_info(struct cpufreq_policy *policy, atomic_set(&policy_dbs->skip_work, 0); init_irq_work(&policy_dbs->irq_work, 
dbs_irq_work); INIT_WORK(&policy_dbs->work, dbs_work_handler); - return 0; + return policy_dbs; } static void free_policy_dbs_info(struct cpufreq_policy *policy, @@ -333,6 +334,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data = gov->gdbs_data; + struct policy_dbs_info *policy_dbs; unsigned int latency; int ret; @@ -340,26 +342,26 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; - if (dbs_data) { - if (WARN_ON(have_governor_per_policy())) - return -EINVAL; - - ret = alloc_policy_dbs_info(policy, gov); - if (ret) - return ret; + policy_dbs = alloc_policy_dbs_info(policy, gov); + if (!policy_dbs) + return -ENOMEM; + if (dbs_data) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_policy_dbs_info; + } dbs_data->usage_count++; - policy->governor_data = dbs_data; + policy_dbs->dbs_data = dbs_data; + policy->governor_data = policy_dbs; return 0; } dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL); - if (!dbs_data) - return -ENOMEM; - - ret = alloc_policy_dbs_info(policy, gov); - if (ret) - goto free_dbs_data; + if (!dbs_data) { + ret = -ENOMEM; + goto free_policy_dbs_info; + } dbs_data->usage_count = 1; @@ -381,7 +383,8 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (!have_governor_per_policy()) gov->gdbs_data = dbs_data; - policy->governor_data = dbs_data; + policy_dbs->dbs_data = dbs_data; + policy->governor_data = policy_dbs; ret = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr(gov)); @@ -396,21 +399,21 @@ reset_gdbs_data: if (!have_governor_per_policy()) gov->gdbs_data = NULL; gov->exit(dbs_data, !policy->governor->initialized); + kfree(dbs_data); + free_policy_dbs_info: free_policy_dbs_info(policy, gov); -free_dbs_data: - kfree(dbs_data); return ret; } static int cpufreq_governor_exit(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); - struct dbs_data *dbs_data = policy->governor_data; - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; /* State should be equivalent to INIT */ - if (!cdbs->policy_dbs || cdbs->policy_dbs->policy) + if (policy_dbs->policy) return -EBUSY; if (!--dbs_data->usage_count) { @@ -435,17 +438,16 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) static int cpufreq_governor_start(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); - struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; int io_busy = 0; if (!policy->cur) return -EINVAL; /* State should be equivalent to INIT */ - if (!policy_dbs || policy_dbs->policy) + if (policy_dbs->policy) return -EBUSY; if (gov->governor == GOV_CONSERVATIVE) { @@ -501,12 +503,10 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) static int cpufreq_governor_stop(struct cpufreq_policy *policy) { - struct dbs_governor *gov = dbs_governor_of(policy); - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); - struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; + struct policy_dbs_info *policy_dbs = 
policy->governor_data; /* State should be equivalent to START */ - if (!policy_dbs || !policy_dbs->policy) + if (!policy_dbs->policy) return -EBUSY; gov_cancel_work(policy_dbs); @@ -517,12 +517,10 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy) static int cpufreq_governor_limits(struct cpufreq_policy *policy) { - struct dbs_governor *gov = dbs_governor_of(policy); - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); - struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; + struct policy_dbs_info *policy_dbs = policy->governor_data; /* State should be equivalent to START */ - if (!policy_dbs || !policy_dbs->policy) + if (!policy_dbs->policy) return -EBUSY; mutex_lock(&policy_dbs->timer_mutex); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 63868d7..95e6834 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -85,7 +85,8 @@ static ssize_t show_##file_name##_gov_sys \ static ssize_t show_##file_name##_gov_pol \ (struct cpufreq_policy *policy, char *buf) \ { \ - struct dbs_data *dbs_data = policy->governor_data; \ + struct policy_dbs_info *policy_dbs = policy->governor_data; \ + struct dbs_data *dbs_data = policy_dbs->dbs_data; \ struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ return sprintf(buf, "%u\n", tuners->file_name); \ } @@ -101,8 +102,8 @@ static ssize_t store_##file_name##_gov_sys \ static ssize_t store_##file_name##_gov_pol \ (struct cpufreq_policy *policy, const char *buf, size_t count) \ { \ - struct dbs_data *dbs_data = policy->governor_data; \ - return store_##file_name(dbs_data, buf, count); \ + struct policy_dbs_info *policy_dbs = policy->governor_data; \ + return store_##file_name(policy_dbs->dbs_data, buf, count); \ } #define show_store_one(_gov, file_name) \ @@ -130,6 +131,13 @@ static void *get_cpu_dbs_info_s(int cpu) \ * cs_*: Conservative governor */ +/* Governor demand based switching data (per-policy or global). 
*/ +struct dbs_data { + unsigned int min_sampling_rate; + int usage_count; + void *tuners; +}; + /* Common to all CPUs of a policy */ struct policy_dbs_info { struct cpufreq_policy *policy; @@ -144,6 +152,8 @@ struct policy_dbs_info { atomic_t skip_work; struct irq_work irq_work; struct work_struct work; + /* dbs_data may be shared between multiple policy objects */ + struct dbs_data *dbs_data; }; static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs, @@ -204,7 +214,6 @@ struct cs_dbs_tuners { }; /* Common Governor data across policies */ -struct dbs_data; struct dbs_governor { struct cpufreq_governor gov; @@ -236,13 +245,6 @@ static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy return container_of(policy->governor, struct dbs_governor, gov); } -/* Governor Per policy data */ -struct dbs_data { - unsigned int min_sampling_rate; - int usage_count; - void *tuners; -}; - /* Governor specific ops, will be passed to dbs_data->gov_ops */ struct od_ops { void (*powersave_bias_init_cpu)(int cpu); @@ -273,7 +275,8 @@ static ssize_t show_sampling_rate_min_gov_sys \ static ssize_t show_sampling_rate_min_gov_pol \ (struct cpufreq_policy *policy, char *buf) \ { \ - struct dbs_data *dbs_data = policy->governor_data; \ + struct policy_dbs_info *policy_dbs = policy->governor_data; \ + struct dbs_data *dbs_data = policy_dbs->dbs_data; \ return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ } diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 9ef4402..b7ef2e7 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -78,7 +78,8 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, unsigned int jiffies_total, jiffies_hi, jiffies_lo; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; if (!dbs_info->freq_table) { @@ -130,7 +131,8 @@ static void ondemand_powersave_bias_init(void) static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) { - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; if (od_tuners->powersave_bias) @@ -151,8 +153,9 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) static void od_check_cpu(int cpu, unsigned int load) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); - struct cpufreq_policy *policy = dbs_info->cdbs.policy_dbs->policy; - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = dbs_info->cdbs.policy_dbs; + struct cpufreq_policy *policy = policy_dbs->policy; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; dbs_info->freq_lo = 0; @@ -189,7 +192,8 @@ static void od_check_cpu(int cpu, unsigned int load) static unsigned int od_dbs_timer(struct cpufreq_policy *policy) { - struct dbs_data *dbs_data = policy->governor_data; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); struct od_dbs_tuners *od_tuners = 
dbs_data->tuners; int delay = 0, sample_type = dbs_info->sample_type; @@ -277,7 +281,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data, * policy will be governed by dbs_data, otherwise there can be * multiple policies that are governed by the same dbs_data. */ - if (dbs_data == policy->governor_data) { + if (dbs_data == policy_dbs->dbs_data) { mutex_lock(&policy_dbs->timer_mutex); /* * On 32-bit architectures this may race with the @@ -586,7 +590,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) if (policy->governor != CPU_FREQ_GOV_ONDEMAND) continue; - dbs_data = policy->governor_data; + dbs_data = policy_dbs->dbs_data; od_tuners = dbs_data->tuners; od_tuners->powersave_bias = default_powersave_bias; } -- cgit v0.10.2 From cea6a9e77228c261191bc92df0d24bf5356b99ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 7 Feb 2016 16:25:02 +0100 Subject: cpufreq: governor: Symmetrize cpu_dbs_info initialization and cleanup Make the initialization of struct cpu_dbs_info objects in alloc_policy_dbs_info() and the code that cleans them up in free_policy_dbs_info() more symmetrical. In particular, set/clear the update_util.func field in those functions along with the policy_dbs field. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 82e50dc..7c08d83 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -304,14 +304,18 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli if (!policy_dbs) return NULL; - /* Set policy_dbs for all CPUs, online+offline */ - for_each_cpu(j, policy->related_cpus) - gov->get_cpu_cdbs(j)->policy_dbs = policy_dbs; - mutex_init(&policy_dbs->timer_mutex); atomic_set(&policy_dbs->skip_work, 0); init_irq_work(&policy_dbs->irq_work, dbs_irq_work); INIT_WORK(&policy_dbs->work, dbs_work_handler); + + /* Set policy_dbs for all CPUs, online+offline */ + for_each_cpu(j, policy->related_cpus) { + struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + + j_cdbs->policy_dbs = policy_dbs; + j_cdbs->update_util.func = dbs_update_util_handler; + } return policy_dbs; } @@ -324,9 +328,12 @@ static void free_policy_dbs_info(struct cpufreq_policy *policy, mutex_destroy(&policy_dbs->timer_mutex); - for_each_cpu(j, policy->cpus) - gov->get_cpu_cdbs(j)->policy_dbs = NULL; + for_each_cpu(j, policy->related_cpus) { + struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + j_cdbs->policy_dbs = NULL; + j_cdbs->update_util.func = NULL; + } kfree(policy_dbs); } @@ -477,8 +484,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - - j_cdbs->update_util.func = dbs_update_util_handler; } policy_dbs->policy = policy; -- cgit v0.10.2 From 686cc637c99324ad52a6f8e59181f6407405bfe2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Feb 2016 23:41:10 +0100 Subject: cpufreq: governor: Rename skip_work to work_count The skip_work field in struct policy_dbs_info technically is a counter, so give it a new name to reflect that. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 7c08d83..298be52 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -196,16 +196,16 @@ static inline void gov_clear_update_util(struct cpufreq_policy *policy) static void gov_cancel_work(struct policy_dbs_info *policy_dbs) { /* Tell dbs_update_util_handler() to skip queuing up work items. */ - atomic_inc(&policy_dbs->skip_work); + atomic_inc(&policy_dbs->work_count); /* * If dbs_update_util_handler() is already running, it may not notice - * the incremented skip_work, so wait for it to complete to prevent its + * the incremented work_count, so wait for it to complete to prevent its * work item from being queued up after the cancel_work_sync() below. */ gov_clear_update_util(policy_dbs->policy); irq_work_sync(&policy_dbs->irq_work); cancel_work_sync(&policy_dbs->work); - atomic_set(&policy_dbs->skip_work, 0); + atomic_set(&policy_dbs->work_count, 0); } static void dbs_work_handler(struct work_struct *work) @@ -234,7 +234,7 @@ static void dbs_work_handler(struct work_struct *work) * up using a stale sample delay value. */ smp_mb__before_atomic(); - atomic_dec(&policy_dbs->skip_work); + atomic_dec(&policy_dbs->work_count); } static void dbs_irq_work(struct irq_work *irq_work) @@ -267,7 +267,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * - The governor is being stopped. * - It is too early (too little time from the previous sample). */ - if (atomic_inc_return(&policy_dbs->skip_work) == 1) { + if (atomic_inc_return(&policy_dbs->work_count) == 1) { u64 delta_ns; delta_ns = time - policy_dbs->last_sample_time; @@ -277,7 +277,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, return; } } - atomic_dec(&policy_dbs->skip_work); + atomic_dec(&policy_dbs->work_count); } static void set_sampling_rate(struct dbs_data *dbs_data, @@ -305,7 +305,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli return NULL; mutex_init(&policy_dbs->timer_mutex); - atomic_set(&policy_dbs->skip_work, 0); + atomic_set(&policy_dbs->work_count, 0); init_irq_work(&policy_dbs->irq_work, dbs_irq_work); INIT_WORK(&policy_dbs->work, dbs_work_handler); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 95e6834..3753722 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -149,7 +149,7 @@ struct policy_dbs_info { u64 last_sample_time; s64 sample_delay_ns; - atomic_t skip_work; + atomic_t work_count; struct irq_work irq_work; struct work_struct work; /* dbs_data may be shared between multiple policy objects */ -- cgit v0.10.2 From fafd5e8ab29d965d6c7db326f2d4189dd9f3b002 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Feb 2016 23:57:22 +0100 Subject: cpufreq: governor: Drop pointless goto from cpufreq_governor_init() It is silly to jump around "return 0", so don't do that. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 298be52..d6bd402 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -395,12 +395,11 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) ret = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr(gov)); - if (ret) - goto reset_gdbs_data; + if (!ret) + return 0; - return 0; + /* Failure, so roll back. 
*/ -reset_gdbs_data: policy->governor_data = NULL; if (!have_governor_per_policy()) -- cgit v0.10.2 From d0684d3b8934cfb8171755cdb1fc87f4c0335655 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:31 +0530 Subject: cpufreq: governor: Create generic macro for common tunables Some tunables are present in governor-specific structures, whereas one (min_sampling_rate) is located directly in struct dbs_data. There is a special macro for creating its sysfs attribute and the show/store callbacks, but since more tunables are going to be moved to struct dbs_data, a new generic macro for such cases will be useful, so add it and use it for min_sampling_rate. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 1a899bb..a69eb7e 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -245,7 +245,7 @@ show_store_one(cs, up_threshold); show_store_one(cs, down_threshold); show_store_one(cs, ignore_nice_load); show_store_one(cs, freq_step); -declare_show_sampling_rate_min(cs); +show_one_common(cs, min_sampling_rate); gov_sys_pol_attr_rw(sampling_rate); gov_sys_pol_attr_rw(sampling_down_factor); @@ -253,10 +253,10 @@ gov_sys_pol_attr_rw(up_threshold); gov_sys_pol_attr_rw(down_threshold); gov_sys_pol_attr_rw(ignore_nice_load); gov_sys_pol_attr_rw(freq_step); -gov_sys_pol_attr_ro(sampling_rate_min); +gov_sys_pol_attr_ro(min_sampling_rate); static struct attribute *dbs_attributes_gov_sys[] = { - &sampling_rate_min_gov_sys.attr, + &min_sampling_rate_gov_sys.attr, &sampling_rate_gov_sys.attr, &sampling_down_factor_gov_sys.attr, &up_threshold_gov_sys.attr, @@ -272,7 +272,7 @@ static struct attribute_group cs_attr_group_gov_sys = { }; static struct attribute *dbs_attributes_gov_pol[] = { - &sampling_rate_min_gov_pol.attr, + &min_sampling_rate_gov_pol.attr, &sampling_rate_gov_pol.attr, &sampling_down_factor_gov_pol.attr, &up_threshold_gov_pol.attr, diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 3753722..cdf7536 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -110,6 +110,26 @@ static ssize_t store_##file_name##_gov_pol \ show_one(_gov, file_name); \ store_one(_gov, file_name) +#define show_one_common(_gov, file_name) \ +static ssize_t show_##file_name##_gov_sys \ +(struct kobject *kobj, struct attribute *attr, char *buf) \ +{ \ + struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ + return sprintf(buf, "%u\n", dbs_data->file_name); \ +} \ + \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct policy_dbs_info *policy_dbs = policy->governor_data; \ + struct dbs_data *dbs_data = policy_dbs->dbs_data; \ + return sprintf(buf, "%u\n", dbs_data->file_name); \ +} + +#define show_store_one_common(_gov, file_name) \ +show_one_common(_gov, file_name); \ +store_one(_gov, file_name) + /* create helper routines */ #define define_get_cpu_dbs_routines(_dbs_info) \ static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \ @@ -264,22 +284,6 @@ static inline int delay_for_sampling_rate(unsigned int sampling_rate) return delay; } -#define declare_show_sampling_rate_min(_gov) \ -static ssize_t show_sampling_rate_min_gov_sys \ -(struct kobject *kobj, struct attribute *attr, char *buf) \ -{ \ - struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ - 
return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ -} \ - \ -static ssize_t show_sampling_rate_min_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - struct policy_dbs_info *policy_dbs = policy->governor_data; \ - struct dbs_data *dbs_data = policy_dbs->dbs_data; \ - return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ -} - extern struct mutex dbs_data_mutex; extern struct mutex cpufreq_governor_lock; void dbs_check_cpu(struct cpufreq_policy *policy); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index b7ef2e7..8c44bc3 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -443,7 +443,7 @@ show_store_one(od, up_threshold); show_store_one(od, sampling_down_factor); show_store_one(od, ignore_nice_load); show_store_one(od, powersave_bias); -declare_show_sampling_rate_min(od); +show_one_common(od, min_sampling_rate); gov_sys_pol_attr_rw(sampling_rate); gov_sys_pol_attr_rw(io_is_busy); @@ -451,10 +451,10 @@ gov_sys_pol_attr_rw(up_threshold); gov_sys_pol_attr_rw(sampling_down_factor); gov_sys_pol_attr_rw(ignore_nice_load); gov_sys_pol_attr_rw(powersave_bias); -gov_sys_pol_attr_ro(sampling_rate_min); +gov_sys_pol_attr_ro(min_sampling_rate); static struct attribute *dbs_attributes_gov_sys[] = { - &sampling_rate_min_gov_sys.attr, + &min_sampling_rate_gov_sys.attr, &sampling_rate_gov_sys.attr, &up_threshold_gov_sys.attr, &sampling_down_factor_gov_sys.attr, @@ -470,7 +470,7 @@ static struct attribute_group od_attr_group_gov_sys = { }; static struct attribute *dbs_attributes_gov_pol[] = { - &sampling_rate_min_gov_pol.attr, + &min_sampling_rate_gov_pol.attr, &sampling_rate_gov_pol.attr, &up_threshold_gov_pol.attr, &sampling_down_factor_gov_pol.attr, -- cgit v0.10.2 From ff4b17895e3166084c76ae703cb1c757bcc59799 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:32 +0530 Subject: cpufreq: governor: Move common tunables to 'struct dbs_data' There are a few common tunables shared between the ondemand and conservative governors. Move them to struct dbs_data to simplify code. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Changelog ] Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index a69eb7e..4f640b0 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -60,7 +60,7 @@ static void cs_check_cpu(int cpu, unsigned int load) return; /* Check for frequency increase */ - if (load > cs_tuners->up_threshold) { + if (load > dbs_data->up_threshold) { dbs_info->down_skip = 0; /* if we are already at full speed then break out early */ @@ -78,7 +78,7 @@ static void cs_check_cpu(int cpu, unsigned int load) } /* if sampling_down_factor is active break out early */ - if (++dbs_info->down_skip < cs_tuners->sampling_down_factor) + if (++dbs_info->down_skip < dbs_data->sampling_down_factor) return; dbs_info->down_skip = 0; @@ -107,10 +107,9 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; dbs_check_cpu(policy); - return delay_for_sampling_rate(cs_tuners->sampling_rate); + return delay_for_sampling_rate(dbs_data->sampling_rate); } static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -126,7 +125,6 @@ static struct dbs_governor cs_dbs_gov; static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -134,14 +132,13 @@ static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; - cs_tuners->sampling_down_factor = input; + dbs_data->sampling_down_factor = input; return count; } static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -149,7 +146,7 @@ static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, if (ret != 1) return -EINVAL; - cs_tuners->sampling_rate = max(input, dbs_data->min_sampling_rate); + dbs_data->sampling_rate = max(input, dbs_data->min_sampling_rate); return count; } @@ -164,7 +161,7 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, if (ret != 1 || input > 100 || input <= cs_tuners->down_threshold) return -EINVAL; - cs_tuners->up_threshold = input; + dbs_data->up_threshold = input; return count; } @@ -178,7 +175,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf, /* cannot be lower than 11 otherwise freq will not fall */ if (ret != 1 || input < 11 || input > 100 || - input >= cs_tuners->up_threshold) + input >= dbs_data->up_threshold) return -EINVAL; cs_tuners->down_threshold = input; @@ -188,7 +185,6 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf, static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input, j; int ret; @@ -199,10 +195,10 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, if (input > 1) input = 1; - if (input == cs_tuners->ignore_nice_load) /* nothing to do */ + if (input == dbs_data->ignore_nice_load) /* nothing to do */ return count; - cs_tuners->ignore_nice_load = input; + dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ 
for_each_online_cpu(j) { @@ -210,7 +206,7 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->cdbs.prev_cpu_wall, 0); - if (cs_tuners->ignore_nice_load) + if (dbs_data->ignore_nice_load) dbs_info->cdbs.prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } @@ -239,12 +235,12 @@ static ssize_t store_freq_step(struct dbs_data *dbs_data, const char *buf, return count; } -show_store_one(cs, sampling_rate); -show_store_one(cs, sampling_down_factor); -show_store_one(cs, up_threshold); show_store_one(cs, down_threshold); -show_store_one(cs, ignore_nice_load); show_store_one(cs, freq_step); +show_store_one_common(cs, sampling_rate); +show_store_one_common(cs, sampling_down_factor); +show_store_one_common(cs, up_threshold); +show_store_one_common(cs, ignore_nice_load); show_one_common(cs, min_sampling_rate); gov_sys_pol_attr_rw(sampling_rate); @@ -299,11 +295,11 @@ static int cs_init(struct dbs_data *dbs_data, bool notify) return -ENOMEM; } - tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; tuners->down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD; - tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; - tuners->ignore_nice_load = 0; tuners->freq_step = DEF_FREQUENCY_STEP; + dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; + dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; + dbs_data->ignore_nice_load = 0; dbs_data->tuners = tuners; dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index d6bd402..3569782 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -38,10 +38,9 @@ void dbs_check_cpu(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - unsigned int sampling_rate; + unsigned int sampling_rate = dbs_data->sampling_rate; + unsigned int ignore_nice = dbs_data->ignore_nice_load; unsigned int max_load = 0; - unsigned int ignore_nice; unsigned int j; if (gov->governor == GOV_ONDEMAND) { @@ -54,13 +53,8 @@ void dbs_check_cpu(struct cpufreq_policy *policy) * the 'sampling_rate', so as to keep the wake-up-from-idle * detection logic a bit conservative. 
*/ - sampling_rate = od_tuners->sampling_rate; sampling_rate *= od_dbs_info->rate_mult; - ignore_nice = od_tuners->ignore_nice_load; - } else { - sampling_rate = cs_tuners->sampling_rate; - ignore_nice = cs_tuners->ignore_nice_load; } /* Get Absolute Load */ @@ -280,19 +274,6 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, atomic_dec(&policy_dbs->work_count); } -static void set_sampling_rate(struct dbs_data *dbs_data, - struct dbs_governor *gov, - unsigned int sampling_rate) -{ - if (gov->governor == GOV_CONSERVATIVE) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; - cs_tuners->sampling_rate = sampling_rate; - } else { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; - od_tuners->sampling_rate = sampling_rate; - } -} - static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy, struct dbs_governor *gov) { @@ -384,8 +365,8 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) /* Bring kernel and HW constraints together */ dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, MIN_LATENCY_MULTIPLIER * latency); - set_sampling_rate(dbs_data, gov, max(dbs_data->min_sampling_rate, - latency * LATENCY_MULTIPLIER)); + dbs_data->sampling_rate = max(dbs_data->min_sampling_rate, + LATENCY_MULTIPLIER * latency); if (!have_governor_per_policy()) gov->gdbs_data = dbs_data; @@ -456,16 +437,12 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (policy_dbs->policy) return -EBUSY; - if (gov->governor == GOV_CONSERVATIVE) { - struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + sampling_rate = dbs_data->sampling_rate; + ignore_nice = dbs_data->ignore_nice_load; - sampling_rate = cs_tuners->sampling_rate; - ignore_nice = cs_tuners->ignore_nice_load; - } else { + if (gov->governor == GOV_ONDEMAND) { struct od_dbs_tuners *od_tuners = dbs_data->tuners; - sampling_rate = od_tuners->sampling_rate; - ignore_nice = od_tuners->ignore_nice_load; io_busy = od_tuners->io_is_busy; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index cdf7536..e296362 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -153,9 +153,13 @@ static void *get_cpu_dbs_info_s(int cpu) \ /* Governor demand based switching data (per-policy or global). 
*/ struct dbs_data { - unsigned int min_sampling_rate; int usage_count; void *tuners; + unsigned int min_sampling_rate; + unsigned int ignore_nice_load; + unsigned int sampling_rate; + unsigned int sampling_down_factor; + unsigned int up_threshold; }; /* Common to all CPUs of a policy */ @@ -216,19 +220,11 @@ struct cs_cpu_dbs_info_s { /* Per policy Governors sysfs tunables */ struct od_dbs_tuners { - unsigned int ignore_nice_load; - unsigned int sampling_rate; - unsigned int sampling_down_factor; - unsigned int up_threshold; unsigned int powersave_bias; unsigned int io_is_busy; }; struct cs_dbs_tuners { - unsigned int ignore_nice_load; - unsigned int sampling_rate; - unsigned int sampling_down_factor; - unsigned int up_threshold; unsigned int down_threshold; unsigned int freq_step; }; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 8c44bc3..13c64b6 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -110,7 +110,7 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, dbs_info->freq_lo_jiffies = 0; return freq_lo; } - jiffies_total = usecs_to_jiffies(od_tuners->sampling_rate); + jiffies_total = usecs_to_jiffies(dbs_data->sampling_rate); jiffies_hi = (freq_avg - freq_lo) * jiffies_total; jiffies_hi += ((freq_hi - freq_lo) / 2); jiffies_hi /= (freq_hi - freq_lo); @@ -161,11 +161,10 @@ static void od_check_cpu(int cpu, unsigned int load) dbs_info->freq_lo = 0; /* Check for frequency increase */ - if (load > od_tuners->up_threshold) { + if (load > dbs_data->up_threshold) { /* If switching to max speed, apply sampling_down_factor */ if (policy->cur < policy->max) - dbs_info->rate_mult = - od_tuners->sampling_down_factor; + dbs_info->rate_mult = dbs_data->sampling_down_factor; dbs_freq_increase(policy, policy->max); } else { /* Calculate the next frequency proportional to load */ @@ -195,7 +194,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); - struct od_dbs_tuners *od_tuners = dbs_data->tuners; int delay = 0, sample_type = dbs_info->sample_type; /* Common NORMAL_SAMPLE setup */ @@ -214,7 +212,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) } if (!delay) - delay = delay_for_sampling_rate(od_tuners->sampling_rate + delay = delay_for_sampling_rate(dbs_data->sampling_rate * dbs_info->rate_mult); return delay; @@ -239,11 +237,10 @@ static struct dbs_governor od_dbs_gov; static void update_sampling_rate(struct dbs_data *dbs_data, unsigned int new_rate) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cpumask cpumask; int cpu; - od_tuners->sampling_rate = new_rate = max(new_rate, + dbs_data->sampling_rate = new_rate = max(new_rate, dbs_data->min_sampling_rate); /* @@ -348,7 +345,6 @@ static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -358,21 +354,20 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, return -EINVAL; } - od_tuners->up_threshold = input; + dbs_data->up_threshold = input; return count; } static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, const char *buf, size_t 
count) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input, j; int ret; ret = sscanf(buf, "%u", &input); if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; - od_tuners->sampling_down_factor = input; + dbs_data->sampling_down_factor = input; /* Reset down sampling multiplier in case it was active */ for_each_online_cpu(j) { @@ -399,10 +394,10 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, if (input > 1) input = 1; - if (input == od_tuners->ignore_nice_load) { /* nothing to do */ + if (input == dbs_data->ignore_nice_load) { /* nothing to do */ return count; } - od_tuners->ignore_nice_load = input; + dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { @@ -410,7 +405,7 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy); - if (od_tuners->ignore_nice_load) + if (dbs_data->ignore_nice_load) dbs_info->cdbs.prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; @@ -437,12 +432,12 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf, return count; } -show_store_one(od, sampling_rate); show_store_one(od, io_is_busy); -show_store_one(od, up_threshold); -show_store_one(od, sampling_down_factor); -show_store_one(od, ignore_nice_load); show_store_one(od, powersave_bias); +show_store_one_common(od, sampling_rate); +show_store_one_common(od, up_threshold); +show_store_one_common(od, sampling_down_factor); +show_store_one_common(od, ignore_nice_load); show_one_common(od, min_sampling_rate); gov_sys_pol_attr_rw(sampling_rate); @@ -504,7 +499,7 @@ static int od_init(struct dbs_data *dbs_data, bool notify) put_cpu(); if (idle_time != -1ULL) { /* Idle micro accounting is supported. Use finer thresholds */ - tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; + dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; /* * In nohz/micro accounting case we set the minimum frequency * not depending on HZ, but fixed (very low). The deferred @@ -512,15 +507,15 @@ static int od_init(struct dbs_data *dbs_data, bool notify) */ dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { - tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; + dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; /* For correct statistics, we need 10 ticks for each measure */ dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10); } - tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; - tuners->ignore_nice_load = 0; + dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; + dbs_data->ignore_nice_load = 0; tuners->powersave_bias = default_powersave_bias; tuners->io_is_busy = should_io_be_busy(); -- cgit v0.10.2 From c4435630361d9bebf7154a0c842dc1fb7ae39c99 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:33 +0530 Subject: cpufreq: governor: New sysfs show/store callbacks for governor tunables The ondemand and conservative governors use the global-attr or freq-attr structures to represent sysfs attributes corresponding to their tunables (which of them is actually used depends on whether or not different policy objects can use the same governor with different tunables at the same time and, consequently, on where those attributes are located in sysfs). 
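Concretely, each tunable currently expands into one callback of each flavor via the show/store macros (a sketch based on the macro definitions removed later in this series, using sampling_rate as the example; signatures abbreviated):

	/* global-attr flavor (struct global_attr): one governor instance
	 * shared by the whole system */
	static ssize_t show_sampling_rate_gov_sys(struct kobject *kobj,
						  struct attribute *attr, char *buf);

	/* freq-attr flavor (struct freq_attr): one governor instance per
	 * policy, reached via policy->governor_data */
	static ssize_t show_sampling_rate_gov_pol(struct cpufreq_policy *policy,
						  char *buf);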
Unfortunately, in the freq-attr case, the standard cpufreq show/store sysfs attribute callbacks are applied to the governor tunable attributes and they always acquire the policy->rwsem lock before carrying out the operation. That may lead to an ABBA deadlock if governor tunable attributes are removed under policy->rwsem while one of them is being accessed concurrently (if sysfs attributes removal wins the race, it will wait for the access to complete with policy->rwsem held while the attribute callback will block on policy->rwsem indefinitely). We attempted to address this issue by dropping policy->rwsem around governor tunable attributes removal (that is, around invocations of the ->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT) in cpufreq_set_policy(), but that opened up race conditions that had not been possible with policy->rwsem held all the time. Therefore policy->rwsem cannot be dropped in cpufreq_set_policy() at any point, but the deadlock situation described above must be avoided too. To that end, use the observation that in principle governor tunables may be represented by the same data type regardless of whether the governor is system-wide or per-policy and introduce a new structure, struct governor_attr, for representing them and new corresponding macros for creating show/store sysfs callbacks for them. Also make their parent kobject use a new kobject type whose default show/store callbacks are not related to the standard core cpufreq ones in any way (and they don't acquire policy->rwsem in particular). Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Subject & changelog + rebase ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 4f640b0..ed081db 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -235,54 +235,33 @@ static ssize_t store_freq_step(struct dbs_data *dbs_data, const char *buf, return count; } -show_store_one(cs, down_threshold); -show_store_one(cs, freq_step); -show_store_one_common(cs, sampling_rate); -show_store_one_common(cs, sampling_down_factor); -show_store_one_common(cs, up_threshold); -show_store_one_common(cs, ignore_nice_load); -show_one_common(cs, min_sampling_rate); - -gov_sys_pol_attr_rw(sampling_rate); -gov_sys_pol_attr_rw(sampling_down_factor); -gov_sys_pol_attr_rw(up_threshold); -gov_sys_pol_attr_rw(down_threshold); -gov_sys_pol_attr_rw(ignore_nice_load); -gov_sys_pol_attr_rw(freq_step); -gov_sys_pol_attr_ro(min_sampling_rate); - -static struct attribute *dbs_attributes_gov_sys[] = { - &min_sampling_rate_gov_sys.attr, - &sampling_rate_gov_sys.attr, - &sampling_down_factor_gov_sys.attr, - &up_threshold_gov_sys.attr, - &down_threshold_gov_sys.attr, - &ignore_nice_load_gov_sys.attr, - &freq_step_gov_sys.attr, +gov_show_one_common(sampling_rate); +gov_show_one_common(sampling_down_factor); +gov_show_one_common(up_threshold); +gov_show_one_common(ignore_nice_load); +gov_show_one_common(min_sampling_rate); +gov_show_one(cs, down_threshold); +gov_show_one(cs, freq_step); + +gov_attr_rw(sampling_rate); +gov_attr_rw(sampling_down_factor); +gov_attr_rw(up_threshold); +gov_attr_rw(ignore_nice_load); +gov_attr_ro(min_sampling_rate); +gov_attr_rw(down_threshold); +gov_attr_rw(freq_step); + +static struct attribute *cs_attributes[] = { + &min_sampling_rate.attr, + &sampling_rate.attr, + &sampling_down_factor.attr, + &up_threshold.attr, + &down_threshold.attr, + 
&ignore_nice_load.attr, + &freq_step.attr, NULL }; -static struct attribute_group cs_attr_group_gov_sys = { - .attrs = dbs_attributes_gov_sys, - .name = "conservative", -}; - -static struct attribute *dbs_attributes_gov_pol[] = { - &min_sampling_rate_gov_pol.attr, - &sampling_rate_gov_pol.attr, - &sampling_down_factor_gov_pol.attr, - &up_threshold_gov_pol.attr, - &down_threshold_gov_pol.attr, - &ignore_nice_load_gov_pol.attr, - &freq_step_gov_pol.attr, - NULL -}; - -static struct attribute_group cs_attr_group_gov_pol = { - .attrs = dbs_attributes_gov_pol, - .name = "conservative", -}; - /************************** sysfs end ************************/ static int cs_init(struct dbs_data *dbs_data, bool notify) @@ -331,8 +310,7 @@ static struct dbs_governor cs_dbs_gov = { .owner = THIS_MODULE, }, .governor = GOV_CONSERVATIVE, - .attr_group_gov_sys = &cs_attr_group_gov_sys, - .attr_group_gov_pol = &cs_attr_group_gov_pol, + .kobj_type = { .default_attrs = cs_attributes }, .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = cs_dbs_timer, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 3569782..00cb468 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -25,12 +25,58 @@ DEFINE_MUTEX(dbs_data_mutex); EXPORT_SYMBOL_GPL(dbs_data_mutex); -static struct attribute_group *get_sysfs_attr(struct dbs_governor *gov) +static inline struct dbs_data *to_dbs_data(struct kobject *kobj) { - return have_governor_per_policy() ? - gov->attr_group_gov_pol : gov->attr_group_gov_sys; + return container_of(kobj, struct dbs_data, kobj); } +static inline struct governor_attr *to_gov_attr(struct attribute *attr) +{ + return container_of(attr, struct governor_attr, attr); +} + +static ssize_t governor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dbs_data *dbs_data = to_dbs_data(kobj); + struct governor_attr *gattr = to_gov_attr(attr); + int ret = -EIO; + + if (gattr->show) + ret = gattr->show(dbs_data, buf); + + return ret; +} + +static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct dbs_data *dbs_data = to_dbs_data(kobj); + struct governor_attr *gattr = to_gov_attr(attr); + int ret = -EIO; + + mutex_lock(&dbs_data->mutex); + + if (gattr->store) + ret = gattr->store(dbs_data, buf, count); + + mutex_unlock(&dbs_data->mutex); + + return ret; +} + +/* + * Sysfs Ops for accessing governor attributes. + * + * All show/store invocations for governor specific sysfs attributes, will first + * call the below show/store callbacks and the attribute specific callback will + * be called from within it. 
+ */ +static const struct sysfs_ops governor_sysfs_ops = { + .show = governor_show, + .store = governor_store, +}; + void dbs_check_cpu(struct cpufreq_policy *policy) { int cpu = policy->cpu; @@ -352,6 +398,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) } dbs_data->usage_count = 1; + mutex_init(&dbs_data->mutex); ret = gov->init(dbs_data, !policy->governor->initialized); if (ret) @@ -374,12 +421,15 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) policy_dbs->dbs_data = dbs_data; policy->governor_data = policy_dbs; - ret = sysfs_create_group(get_governor_parent_kobj(policy), - get_sysfs_attr(gov)); + gov->kobj_type.sysfs_ops = &governor_sysfs_ops; + ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type, + get_governor_parent_kobj(policy), + "%s", gov->gov.name); if (!ret) return 0; /* Failure, so roll back. */ + pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret); policy->governor_data = NULL; @@ -404,8 +454,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) return -EBUSY; if (!--dbs_data->usage_count) { - sysfs_remove_group(get_governor_parent_kobj(policy), - get_sysfs_attr(gov)); + kobject_put(&dbs_data->kobj); policy->governor_data = NULL; @@ -413,6 +462,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) gov->gdbs_data = NULL; gov->exit(dbs_data, policy->governor->initialized == 1); + mutex_destroy(&dbs_data->mutex); kfree(dbs_data); } else { policy->governor_data = NULL; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index e296362..bdb6e49 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -160,8 +160,44 @@ struct dbs_data { unsigned int sampling_rate; unsigned int sampling_down_factor; unsigned int up_threshold; + + struct kobject kobj; + /* Protect concurrent updates to governor tunables from sysfs */ + struct mutex mutex; +}; + +/* Governor's specific attributes */ +struct dbs_data; +struct governor_attr { + struct attribute attr; + ssize_t (*show)(struct dbs_data *dbs_data, char *buf); + ssize_t (*store)(struct dbs_data *dbs_data, const char *buf, + size_t count); }; +#define gov_show_one(_gov, file_name) \ +static ssize_t show_##file_name \ +(struct dbs_data *dbs_data, char *buf) \ +{ \ + struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ + return sprintf(buf, "%u\n", tuners->file_name); \ +} + +#define gov_show_one_common(file_name) \ +static ssize_t show_##file_name \ +(struct dbs_data *dbs_data, char *buf) \ +{ \ + return sprintf(buf, "%u\n", dbs_data->file_name); \ +} + +#define gov_attr_ro(_name) \ +static struct governor_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define gov_attr_rw(_name) \ +static struct governor_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + /* Common to all CPUs of a policy */ struct policy_dbs_info { struct cpufreq_policy *policy; @@ -236,8 +272,7 @@ struct dbs_governor { #define GOV_ONDEMAND 0 #define GOV_CONSERVATIVE 1 int governor; - struct attribute_group *attr_group_gov_sys; /* one governor - system */ - struct attribute_group *attr_group_gov_pol; /* one governor - policy */ + struct kobj_type kobj_type; /* * Common data for platforms that don't set diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 13c64b6..e36792f 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -432,54 +432,33 @@ static ssize_t store_powersave_bias(struct dbs_data 
*dbs_data, const char *buf, return count; } -show_store_one(od, io_is_busy); -show_store_one(od, powersave_bias); -show_store_one_common(od, sampling_rate); -show_store_one_common(od, up_threshold); -show_store_one_common(od, sampling_down_factor); -show_store_one_common(od, ignore_nice_load); -show_one_common(od, min_sampling_rate); - -gov_sys_pol_attr_rw(sampling_rate); -gov_sys_pol_attr_rw(io_is_busy); -gov_sys_pol_attr_rw(up_threshold); -gov_sys_pol_attr_rw(sampling_down_factor); -gov_sys_pol_attr_rw(ignore_nice_load); -gov_sys_pol_attr_rw(powersave_bias); -gov_sys_pol_attr_ro(min_sampling_rate); - -static struct attribute *dbs_attributes_gov_sys[] = { - &min_sampling_rate_gov_sys.attr, - &sampling_rate_gov_sys.attr, - &up_threshold_gov_sys.attr, - &sampling_down_factor_gov_sys.attr, - &ignore_nice_load_gov_sys.attr, - &powersave_bias_gov_sys.attr, - &io_is_busy_gov_sys.attr, +gov_show_one_common(sampling_rate); +gov_show_one_common(up_threshold); +gov_show_one_common(sampling_down_factor); +gov_show_one_common(ignore_nice_load); +gov_show_one_common(min_sampling_rate); +gov_show_one(od, io_is_busy); +gov_show_one(od, powersave_bias); + +gov_attr_rw(sampling_rate); +gov_attr_rw(io_is_busy); +gov_attr_rw(up_threshold); +gov_attr_rw(sampling_down_factor); +gov_attr_rw(ignore_nice_load); +gov_attr_rw(powersave_bias); +gov_attr_ro(min_sampling_rate); + +static struct attribute *od_attributes[] = { + &min_sampling_rate.attr, + &sampling_rate.attr, + &up_threshold.attr, + &sampling_down_factor.attr, + &ignore_nice_load.attr, + &powersave_bias.attr, + &io_is_busy.attr, NULL }; -static struct attribute_group od_attr_group_gov_sys = { - .attrs = dbs_attributes_gov_sys, - .name = "ondemand", -}; - -static struct attribute *dbs_attributes_gov_pol[] = { - &min_sampling_rate_gov_pol.attr, - &sampling_rate_gov_pol.attr, - &up_threshold_gov_pol.attr, - &sampling_down_factor_gov_pol.attr, - &ignore_nice_load_gov_pol.attr, - &powersave_bias_gov_pol.attr, - &io_is_busy_gov_pol.attr, - NULL -}; - -static struct attribute_group od_attr_group_gov_pol = { - .attrs = dbs_attributes_gov_pol, - .name = "ondemand", -}; - /************************** sysfs end ************************/ static int od_init(struct dbs_data *dbs_data, bool notify) @@ -544,8 +523,7 @@ static struct dbs_governor od_dbs_gov = { .owner = THIS_MODULE, }, .governor = GOV_ONDEMAND, - .attr_group_gov_sys = &od_attr_group_gov_sys, - .attr_group_gov_pol = &od_attr_group_gov_pol, + .kobj_type = { .default_attrs = od_attributes }, .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = od_dbs_timer, -- cgit v0.10.2 From fd8ddc482a7a5e015c0613c4d96543d5efad047c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:34 +0530 Subject: cpufreq: governor: Drop unused macros for creating governor tunable attributes The previous commit introduced a new set of macros for creating sysfs attributes that represent governor tunables and the old macros used for this purpose are not needed any more, so drop them. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Subject & changelog ] Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index bdb6e49..0eb66a6 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -41,95 +41,6 @@ /* Ondemand Sampling types */ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; -/* - * Macro for creating governors sysfs routines - * - * - gov_sys: One governor instance per whole system - * - gov_pol: One governor instance per policy - */ - -/* Create attributes */ -#define gov_sys_attr_ro(_name) \ -static struct global_attr _name##_gov_sys = \ -__ATTR(_name, 0444, show_##_name##_gov_sys, NULL) - -#define gov_sys_attr_rw(_name) \ -static struct global_attr _name##_gov_sys = \ -__ATTR(_name, 0644, show_##_name##_gov_sys, store_##_name##_gov_sys) - -#define gov_pol_attr_ro(_name) \ -static struct freq_attr _name##_gov_pol = \ -__ATTR(_name, 0444, show_##_name##_gov_pol, NULL) - -#define gov_pol_attr_rw(_name) \ -static struct freq_attr _name##_gov_pol = \ -__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) - -#define gov_sys_pol_attr_rw(_name) \ - gov_sys_attr_rw(_name); \ - gov_pol_attr_rw(_name) - -#define gov_sys_pol_attr_ro(_name) \ - gov_sys_attr_ro(_name); \ - gov_pol_attr_ro(_name) - -/* Create show/store routines */ -#define show_one(_gov, file_name) \ -static ssize_t show_##file_name##_gov_sys \ -(struct kobject *kobj, struct attribute *attr, char *buf) \ -{ \ - struct _gov##_dbs_tuners *tuners = _gov##_dbs_gov.gdbs_data->tuners; \ - return sprintf(buf, "%u\n", tuners->file_name); \ -} \ - \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - struct policy_dbs_info *policy_dbs = policy->governor_data; \ - struct dbs_data *dbs_data = policy_dbs->dbs_data; \ - struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ - return sprintf(buf, "%u\n", tuners->file_name); \ -} - -#define store_one(_gov, file_name) \ -static ssize_t store_##file_name##_gov_sys \ -(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \ -{ \ - struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ - return store_##file_name(dbs_data, buf, count); \ -} \ - \ -static ssize_t store_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, const char *buf, size_t count) \ -{ \ - struct policy_dbs_info *policy_dbs = policy->governor_data; \ - return store_##file_name(policy_dbs->dbs_data, buf, count); \ -} - -#define show_store_one(_gov, file_name) \ -show_one(_gov, file_name); \ -store_one(_gov, file_name) - -#define show_one_common(_gov, file_name) \ -static ssize_t show_##file_name##_gov_sys \ -(struct kobject *kobj, struct attribute *attr, char *buf) \ -{ \ - struct dbs_data *dbs_data = _gov##_dbs_gov.gdbs_data; \ - return sprintf(buf, "%u\n", dbs_data->file_name); \ -} \ - \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - struct policy_dbs_info *policy_dbs = policy->governor_data; \ - struct dbs_data *dbs_data = policy_dbs->dbs_data; \ - return sprintf(buf, "%u\n", dbs_data->file_name); \ -} - -#define show_store_one_common(_gov, file_name) \ -show_one_common(_gov, file_name); \ -store_one(_gov, file_name) - /* create helper routines */ #define define_get_cpu_dbs_routines(_dbs_info) \ static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \ -- cgit v0.10.2 From 68e80dae09033d778b98dc88e5bfe8fdade188e5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:35 +0530 Subject: Revert "cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT" Earlier, when 
the struct freq-attr was used to represent governor attributes, the standard cpufreq show/store sysfs attribute callbacks were applied to the governor tunable attributes and they always acquire the policy->rwsem lock before carrying out the operation. That could have resulted in an ABBA deadlock if governor tunable attributes are removed under policy->rwsem while one of them is being accessed concurrently (if sysfs attributes removal wins the race, it will wait for the access to complete with policy->rwsem held while the attribute callback will block on policy->rwsem indefinitely). We attempted to address this issue by dropping policy->rwsem around governor tunable attributes removal (that is, around invocations of the ->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT) in cpufreq_set_policy(), but that opened up race conditions that had not been possible with policy->rwsem held all the time. The previous commit, "cpufreq: governor: New sysfs show/store callbacks for governor tunables", fixed the original ABBA deadlock by adding new governor specific show/store callbacks. We don't have to drop rwsem around invocations of governor event CPUFREQ_GOV_POLICY_EXIT anymore, and original fix can be reverted now. Fixes: 955ef4833574 (cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT) Signed-off-by: Viresh Kumar Reported-by: Juri Lelli Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e172b2a..e92e9ea 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2205,10 +2205,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return ret; } - up_write(&policy->rwsem); ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); - down_write(&policy->rwsem); - if (ret) { pr_err("%s: Failed to Exit Governor: %s (%d)\n", __func__, old_gov->name, ret); @@ -2224,9 +2221,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (!ret) goto out; - up_write(&policy->rwsem); __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); - down_write(&policy->rwsem); } /* new governor failed, so re-start old one */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 704d85b..cac3d1b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -100,10 +100,6 @@ struct cpufreq_policy { * - Any routine that will write to the policy structure and/or may take away * the policy altogether (eg. CPU hotplug), will hold this lock in write * mode before doing so. - * - * Additional rules: - * - Lock should not be held across - * __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT); */ struct rw_semaphore rwsem; -- cgit v0.10.2 From c54df0718423ea2941151d8516eb76ca6a32a4b4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Feb 2016 11:00:25 +0530 Subject: cpufreq: governor: Create and traverse list of policy_dbs to avoid deadlock The dbs_data_mutex lock is currently used in two places. First, cpufreq_governor_dbs() uses it to guarantee mutual exclusion between invocations of governor operations from the core. Second, it is used by ondemand governor's update_sampling_rate() to ensure the stability of data structures walked by it. The second usage is quite problematic, because update_sampling_rate() is called from a governor sysfs attribute's ->store callback and that leads to a deadlock scenario involving cpufreq_governor_exit() which runs under dbs_data_mutex. 
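Schematically, the two paths described above can interlock as follows (call chains simplified; a sketch for illustration, not taken from any patch):

	sysfs write to a tunable                 governor teardown
	------------------------                 -----------------
	governor_store()                         cpufreq_governor_dbs(...POLICY_EXIT)
	  mutex_lock(&dbs_data->mutex)             mutex_lock(&dbs_data_mutex)
	  update_sampling_rate()                   cpufreq_governor_exit()
	    mutex_lock(&dbs_data_mutex)              kobject_put(&dbs_data->kobj)
	      /* blocks behind teardown */             /* waits for the sysfs op */

Each side holds the lock or kobject reference the other needs, so neither can make progress.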
Thus it is better to rework the code so that update_sampling_rate() doesn't need to acquire dbs_data_mutex. To that end, rework update_sampling_rate() to walk a list of policy_dbs objects supported by the dbs_data object it has been called for (instead of walking the cpu_dbs_info objects for all CPUs). Since the list manipulation is protected with dbs_data->mutex, which is also held around the execution of update_sampling_rate(), it is not necessary to hold dbs_data_mutex in that function any more. Reported-by: Juri Lelli Reported-by: Shilpasri G Bhat Signed-off-by: Viresh Kumar [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 00cb468..2f35270 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -385,9 +385,14 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) ret = -EINVAL; goto free_policy_dbs_info; } - dbs_data->usage_count++; policy_dbs->dbs_data = dbs_data; policy->governor_data = policy_dbs; + + mutex_lock(&dbs_data->mutex); + dbs_data->usage_count++; + list_add(&policy_dbs->list, &dbs_data->policy_dbs_list); + mutex_unlock(&dbs_data->mutex); + return 0; } @@ -397,7 +402,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) goto free_policy_dbs_info; } - dbs_data->usage_count = 1; + INIT_LIST_HEAD(&dbs_data->policy_dbs_list); mutex_init(&dbs_data->mutex); ret = gov->init(dbs_data, !policy->governor->initialized); @@ -418,9 +423,12 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (!have_governor_per_policy()) gov->gdbs_data = dbs_data; - policy_dbs->dbs_data = dbs_data; policy->governor_data = policy_dbs; + policy_dbs->dbs_data = dbs_data; + dbs_data->usage_count = 1; + list_add(&policy_dbs->list, &dbs_data->policy_dbs_list); + gov->kobj_type.sysfs_ops = &governor_sysfs_ops; ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type, get_governor_parent_kobj(policy), @@ -448,12 +456,18 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; + int count; /* State should be equivalent to INIT */ if (policy_dbs->policy) return -EBUSY; - if (!--dbs_data->usage_count) { + mutex_lock(&dbs_data->mutex); + list_del(&policy_dbs->list); + count = --dbs_data->usage_count; + mutex_unlock(&dbs_data->mutex); + + if (!count) { kobject_put(&dbs_data->kobj); policy->governor_data = NULL; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 0eb66a6..8bf4775 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -73,7 +73,11 @@ struct dbs_data { unsigned int up_threshold; struct kobject kobj; - /* Protect concurrent updates to governor tunables from sysfs */ + struct list_head policy_dbs_list; + /* + * Protect concurrent updates to governor tunables from sysfs, + * policy_dbs_list and usage_count.
+ */ struct mutex mutex; }; @@ -125,6 +129,7 @@ struct policy_dbs_info { struct work_struct work; /* dbs_data may be shared between multiple policy objects */ struct dbs_data *dbs_data; + struct list_head list; }; static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index e36792f..38301c6 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -226,84 +226,55 @@ static struct dbs_governor od_dbs_gov; * @new_rate: new sampling rate * * If new rate is smaller than the old, simply updating - * dbs_tuners_int.sampling_rate might not be appropriate. For example, if the + * dbs.sampling_rate might not be appropriate. For example, if the * original sampling_rate was 1 second and the requested new sampling rate is 10 * ms because the user needs immediate reaction from ondemand governor, but not * sure if higher frequency will be required or not, then, the governor may * change the sampling rate too late; up to 1 second later. Thus, if we are * reducing the sampling rate, we need to make the new value effective * immediately. + * + * On the other hand, if new rate is larger than the old, then we may evaluate + * the load too soon, and it might be worth updating sample_delay_ns then as + * well. + * + * This must be called with dbs_data->mutex held, otherwise traversing + * policy_dbs_list isn't safe. */ static void update_sampling_rate(struct dbs_data *dbs_data, unsigned int new_rate) { - struct cpumask cpumask; - int cpu; + struct policy_dbs_info *policy_dbs; dbs_data->sampling_rate = new_rate = max(new_rate, dbs_data->min_sampling_rate); /* - * Lock governor so that governor start/stop can't execute in parallel. + * We are operating under dbs_data->mutex and so the list and its + * entries can't be freed concurrently. */ - mutex_lock(&dbs_data_mutex); - - cpumask_copy(&cpumask, cpu_online_mask); - - for_each_cpu(cpu, &cpumask) { - struct cpufreq_policy *policy; - struct od_cpu_dbs_info_s *dbs_info; - struct cpu_dbs_info *cdbs; - struct policy_dbs_info *policy_dbs; - - dbs_info = &per_cpu(od_cpu_dbs_info, cpu); - cdbs = &dbs_info->cdbs; - policy_dbs = cdbs->policy_dbs; - + list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) { + mutex_lock(&policy_dbs->timer_mutex); /* - * A valid policy_dbs and policy_dbs->policy means governor - * hasn't stopped or exited yet. + * On 32-bit architectures this may race with the + * sample_delay_ns read in dbs_update_util_handler(), but that + * really doesn't matter. If the read returns a value that's + * too big, the sample will be skipped, but the next invocation + * of dbs_update_util_handler() (when the update has been + * completed) will take a sample. If the returned value is too + * small, the sample will be taken immediately, but that isn't a + * problem, as we want the new rate to take effect immediately + * anyway. + * + * If this runs in parallel with dbs_work_handler(), we may end + * up overwriting the sample_delay_ns value that it has just + * written, but the difference should not be too big and it will + * be corrected next time a sample is taken, so it shouldn't be + * significant. */ - if (!policy_dbs || !policy_dbs->policy) - continue; - - policy = policy_dbs->policy; - - /* clear all CPUs of this policy */ - cpumask_andnot(&cpumask, &cpumask, policy->cpus); - - /* - * Update sampling rate for CPUs whose policy is governed by - * dbs_data.
In case of governor_per_policy, only a single - * policy will be governed by dbs_data, otherwise there can be - * multiple policies that are governed by the same dbs_data. - */ - if (dbs_data == policy_dbs->dbs_data) { - mutex_lock(&policy_dbs->timer_mutex); - /* - * On 32-bit architectures this may race with the - * sample_delay_ns read in dbs_update_util_handler(), - * but that really doesn't matter. If the read returns - * a value that's too big, the sample will be skipped, - * but the next invocation of dbs_update_util_handler() - * (when the update has been completed) will take a - * sample. If the returned value is too small, the - * sample will be taken immediately, but that isn't a - * problem, as we want the new rate to take effect - * immediately anyway. - * - * If this runs in parallel with dbs_work_handler(), we - * may end up overwriting the sample_delay_ns value that - * it has just written, but the difference should not be - * too big and it will be corrected next time a sample - * is taken, so it shouldn't be significant. - */ - gov_update_sample_delay(policy_dbs, new_rate); - mutex_unlock(&policy_dbs->timer_mutex); - } + gov_update_sample_delay(policy_dbs, new_rate); + mutex_unlock(&policy_dbs->timer_mutex); } - - mutex_unlock(&dbs_data_mutex); } static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, -- cgit v0.10.2 From 69cee7147b4a4ea02085d571cd2d9974d4a4d8d5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:11 +0530 Subject: cpufreq: Merge cpufreq_offline_prepare/finish routines Commit 1aee40ac9c86 (cpufreq: Invoke __cpufreq_remove_dev_finish() after releasing cpu_hotplug.lock) split cpufreq's CPU offline routine into two pieces, one of them to be run with CPU offline/online locked and the other to be called later. The reason for that split was a possible deadlock scenario involving cpufreq sysfs attributes and CPU offline. However, the handling of CPU offline in cpufreq has changed since then. Policy sysfs attributes are never removed during CPU offline, so there's no need to worry about accessing them during CPU offline, because that can't lead to any deadlocks now. Governor sysfs attributes are still removed in __cpufreq_governor(_EXIT), but there is a new kobject type for them now and its show/store callbacks don't lock CPU offline/online (they don't need to do that). This means that the CPU offline code in cpufreq doesn't need to be split any more, so combine cpufreq_offline_prepare() with cpufreq_offline_finish(). Signed-off-by: Viresh Kumar [ rjw: Changelog ] Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e92e9ea..f65553d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1362,9 +1362,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) return ret; } -static void cpufreq_offline_prepare(unsigned int cpu) +static void cpufreq_offline(unsigned int cpu) { struct cpufreq_policy *policy; + int ret; pr_debug("%s: unregistering CPU %u\n", __func__, cpu); @@ -1375,7 +1376,7 @@ static void cpufreq_offline_prepare(unsigned int cpu) } if (has_target()) { - int ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) pr_err("%s: Failed to stop governor\n", __func__); } @@ -1398,34 +1399,23 @@ static void cpufreq_offline_prepare(unsigned int cpu) /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { if (has_target()) { - int ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); if (ret) pr_err("%s: Failed to start governor\n", __func__); } - } else if (cpufreq_driver->stop_cpu) { - cpufreq_driver->stop_cpu(policy); - } -} -static void cpufreq_offline_finish(unsigned int cpu) -{ - struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); - - if (!policy) { - pr_debug("%s: No cpu_data found\n", __func__); return; } - /* Only proceed for inactive policies */ - if (!policy_is_inactive(policy)) - return; + if (cpufreq_driver->stop_cpu) + cpufreq_driver->stop_cpu(policy); /* If cpu is last user of policy, free policy */ if (has_target()) { - int ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); + ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); if (ret) pr_err("%s: Failed to exit governor\n", __func__); } @@ -1454,10 +1444,8 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) if (!policy) return; - if (cpu_online(cpu)) { - cpufreq_offline_prepare(cpu); - cpufreq_offline_finish(cpu); - } + if (cpu_online(cpu)) + cpufreq_offline(cpu); cpumask_clear_cpu(cpu, policy->real_cpus); remove_cpu_dev_symlink(policy, cpu); @@ -2305,11 +2293,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_PREPARE: - cpufreq_offline_prepare(cpu); - break; - - case CPU_POST_DEAD: - cpufreq_offline_finish(cpu); + cpufreq_offline(cpu); break; case CPU_DOWN_FAILED: -- cgit v0.10.2 From 49f18560f8bac5315047edfb673dd13d56cbcbc9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:12 +0530 Subject: cpufreq: Call __cpufreq_governor() with policy->rwsem held The cpufreq core code is not consistent with respect to invoking __cpufreq_governor() under policy->rwsem. Changing all code to always hold policy->rwsem around __cpufreq_governor() invocations will allow us to remove cpufreq_governor_lock that is used today because we can't guarantee that __cpufreq_governor() isn't executed twice in parallel for the same policy. We should also ensure that policy->rwsem is held across governor state changes. For example, while adding a CPU to the policy in the CPU online path, we need to stop the governor, change policy->cpus, start the governor and then refresh its limits. The complete sequence must be guaranteed to complete without interruptions by concurrent governor state updates. That can be achieved by holding policy->rwsem around those sequences of operations. 
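As an illustration, the CPU online sequence described above ends up looking roughly like this (a simplified sketch of the cpufreq_add_policy_cpu() change in the diff below, with error handling omitted):

	down_write(&policy->rwsem);
	__cpufreq_governor(policy, CPUFREQ_GOV_STOP);
	cpumask_set_cpu(cpu, policy->cpus);	/* modify the policy */
	__cpufreq_governor(policy, CPUFREQ_GOV_START);
	__cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
	up_write(&policy->rwsem);

Previously only the cpumask update was carried out with the lock held, so concurrent governor state updates could interleave with the STOP/START/LIMITS sequence.
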
Also note that after this patch cpufreq_driver->stop_cpu() and ->exit() will get called under policy->rwsem which wasn't the case earlier. That shouldn't have any side effects, though. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f65553d..6928768 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1049,30 +1049,29 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp if (cpumask_test_cpu(cpu, policy->cpus)) return 0; + down_write(&policy->rwsem); if (has_target()) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) { pr_err("%s: Failed to stop governor\n", __func__); - return ret; + goto unlock; } } - down_write(&policy->rwsem); cpumask_set_cpu(cpu, policy->cpus); - up_write(&policy->rwsem); if (has_target()) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - if (ret) { + if (ret) pr_err("%s: Failed to start governor\n", __func__); - return ret; - } } - return 0; +unlock: + up_write(&policy->rwsem); + return ret; } static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) @@ -1375,13 +1374,13 @@ static void cpufreq_offline(unsigned int cpu) return; } + down_write(&policy->rwsem); if (has_target()) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) pr_err("%s: Failed to stop governor\n", __func__); } - down_write(&policy->rwsem); cpumask_clear_cpu(cpu, policy->cpus); if (policy_is_inactive(policy)) { @@ -1394,7 +1393,6 @@ static void cpufreq_offline(unsigned int cpu) /* Nominate new CPU */ policy->cpu = cpumask_any(policy->cpus); } - up_write(&policy->rwsem); /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { @@ -1407,7 +1405,7 @@ static void cpufreq_offline(unsigned int cpu) pr_err("%s: Failed to start governor\n", __func__); } - return; + goto unlock; } if (cpufreq_driver->stop_cpu) @@ -1429,6 +1427,9 @@ static void cpufreq_offline(unsigned int cpu) cpufreq_driver->exit(policy); policy->freq_table = NULL; } + +unlock: + up_write(&policy->rwsem); } /** @@ -1625,6 +1626,7 @@ EXPORT_SYMBOL(cpufreq_generic_suspend); void cpufreq_suspend(void) { struct cpufreq_policy *policy; + int ret; if (!cpufreq_driver) return; @@ -1635,7 +1637,11 @@ void cpufreq_suspend(void) pr_debug("%s: Suspending Governors\n", __func__); for_each_active_policy(policy) { - if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP)) + down_write(&policy->rwsem); + ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + up_write(&policy->rwsem); + + if (ret) pr_err("%s: Failed to stop governor for policy: %p\n", __func__, policy); else if (cpufreq_driver->suspend @@ -1657,6 +1663,7 @@ suspend: void cpufreq_resume(void) { struct cpufreq_policy *policy; + int ret; if (!cpufreq_driver) return; @@ -1669,13 +1676,20 @@ void cpufreq_resume(void) pr_debug("%s: Resuming Governors\n", __func__); for_each_active_policy(policy) { - if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) + if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) { pr_err("%s: Failed to resume driver: %p\n", __func__, policy); - else if (__cpufreq_governor(policy, CPUFREQ_GOV_START) - || __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS)) - pr_err("%s: Failed to start governor for policy: %p\n", - __func__, policy); + } else { + down_write(&policy->rwsem); + ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + if 
(!ret) + __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + up_write(&policy->rwsem); + + if (ret) + pr_err("%s: Failed to start governor for policy: %p\n", + __func__, policy); + } } /* @@ -2326,8 +2340,11 @@ static int cpufreq_boost_set_sw(int state) __func__); break; } + + down_write(&policy->rwsem); policy->user_policy.max = policy->max; __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + up_write(&policy->rwsem); } } -- cgit v0.10.2 From 99522fe6788f5bf627dce7c20ed9484c933511a3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:13 +0530 Subject: cpufreq: Remove cpufreq_governor_lock We used to drop policy->rwsem just before calling __cpufreq_governor() in some cases earlier and so it was possible that __cpufreq_governor() ran concurrently via separate threads for the same policy. In order to guarantee valid state transitions for governors, 'governor_enabled' was required to be protected using some locking and cpufreq_governor_lock was added for that. But now __cpufreq_governor() is always called under policy->rwsem, and 'governor_enabled' is protected against races even without cpufreq_governor_lock. Get rid of the extra lock now. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw : Changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6928768..bc93272 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -147,8 +147,6 @@ void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) rcu_read_unlock(); } -DEFINE_MUTEX(cpufreq_governor_lock); - /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; @@ -2015,11 +2013,9 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event); - mutex_lock(&cpufreq_governor_lock); if ((policy->governor_enabled && event == CPUFREQ_GOV_START) || (!policy->governor_enabled && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { - mutex_unlock(&cpufreq_governor_lock); return -EBUSY; } @@ -2028,8 +2024,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, else if (event == CPUFREQ_GOV_START) policy->governor_enabled = true; - mutex_unlock(&cpufreq_governor_lock); - ret = policy->governor->governor(policy, event); if (!ret) { @@ -2039,12 +2033,10 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, policy->governor->initialized--; } else { /* Restore original values */ - mutex_lock(&cpufreq_governor_lock); if (event == CPUFREQ_GOV_STOP) policy->governor_enabled = true; else if (event == CPUFREQ_GOV_START) policy->governor_enabled = false; - mutex_unlock(&cpufreq_governor_lock); } if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) || diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 8bf4775..e9ec411 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -232,7 +232,6 @@ static inline int delay_for_sampling_rate(unsigned int sampling_rate) } extern struct mutex dbs_data_mutex; -extern struct mutex cpufreq_governor_lock; void dbs_check_cpu(struct cpufreq_policy *policy); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) -- cgit v0.10.2 From 581c214b21e4faba06d913952e38e80635d9ada5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:14 +0530 Subject: cpufreq: governor: No need to manage state machine now The 
cpufreq core now guarantees that policy->rwsem won't be dropped while running the ->governor callback for the CPUFREQ_GOV_POLICY_EXIT event and that it will be held until the complete sequence of governor state changes has finished. This allows governor state machine checks to be dropped from multiple functions in cpufreq_governor.c. This also means that policy_dbs->policy can be initialized upfront, so the entire initialization of struct policy_dbs can be carried out in one place. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 2f35270..a34de9d 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -233,8 +233,10 @@ static inline void gov_clear_update_util(struct cpufreq_policy *policy) synchronize_rcu(); } -static void gov_cancel_work(struct policy_dbs_info *policy_dbs) +static void gov_cancel_work(struct cpufreq_policy *policy) { + struct policy_dbs_info *policy_dbs = policy->governor_data; + /* Tell dbs_update_util_handler() to skip queuing up work items. */ atomic_inc(&policy_dbs->work_count); /* @@ -331,6 +333,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli if (!policy_dbs) return NULL; + policy_dbs->policy = policy; mutex_init(&policy_dbs->timer_mutex); atomic_set(&policy_dbs->work_count, 0); init_irq_work(&policy_dbs->irq_work, dbs_irq_work); @@ -458,10 +461,6 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy_dbs->dbs_data; int count; - /* State should be equivalent to INIT */ - if (policy_dbs->policy) - return -EBUSY; - mutex_lock(&dbs_data->mutex); list_del(&policy_dbs->list); count = --dbs_data->usage_count; @@ -497,10 +496,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (!policy->cur) return -EINVAL; - /* State should be equivalent to INIT */ - if (policy_dbs->policy) - return -EBUSY; - sampling_rate = dbs_data->sampling_rate; ignore_nice = dbs_data->ignore_nice_load; @@ -525,7 +520,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } - policy_dbs->policy = policy; if (gov->governor == GOV_CONSERVATIVE) { struct cs_cpu_dbs_info_s *cs_dbs_info = @@ -548,14 +542,7 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy) { - struct policy_dbs_info *policy_dbs = policy->governor_data; - - /* State should be equivalent to START */ - if (!policy_dbs->policy) - return -EBUSY; - - gov_cancel_work(policy_dbs); - policy_dbs->policy = NULL; + gov_cancel_work(policy); return 0; } @@ -564,10 +551,6 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; - /* State should be equivalent to START */ - if (!policy_dbs->policy) - return -EBUSY; - mutex_lock(&policy_dbs->timer_mutex); if (policy->max < policy->cur) __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); -- cgit v0.10.2 From aded387b94b69aeab10e1d112bab7f82c9241527 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:15 +0530 Subject: cpufreq: conservative: Update sample_delay_ns immediately The ondemand governor already updates sample_delay_ns immediately on updates to the sampling rate, but conservative doesn't do that.
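The difference being eliminated can be sketched as follows (simplified from the old conservative store_sampling_rate() and the new common code, both visible in the diffs below):

	/* conservative (old): the new rate only matters at the next sample */
	dbs_data->sampling_rate = max(input, dbs_data->min_sampling_rate);

	/* ondemand (old) and common code (new): additionally refresh the
	 * pending sample delay of every policy served by this dbs_data so
	 * that the new rate takes effect right away */
	list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
		mutex_lock(&policy_dbs->timer_mutex);
		gov_update_sample_delay(policy_dbs, dbs_data->sampling_rate);
		mutex_unlock(&policy_dbs->timer_mutex);
	}
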
It was left out earlier as the code was really too complex to get that done easily. Things are sorted out very well now, however, and the conservative governor can be modified to follow ondemand in that respect. Moreover, since the code needed to implement that in the conservative governor would be identical to the corresponding ondemand governor's code, make that code common and change both governors to use it. Signed-off-by: Viresh Kumar Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index ed081db..6243502 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -136,20 +136,6 @@ static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, return count; } -static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, - size_t count) -{ - unsigned int input; - int ret; - ret = sscanf(buf, "%u", &input); - - if (ret != 1) - return -EINVAL; - - dbs_data->sampling_rate = max(input, dbs_data->min_sampling_rate); - return count; -} - static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, size_t count) { diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index a34de9d..d41db19 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -25,6 +25,69 @@ DEFINE_MUTEX(dbs_data_mutex); EXPORT_SYMBOL_GPL(dbs_data_mutex); +/* Common sysfs tunables */ +/** + * store_sampling_rate - update sampling rate effective immediately if needed. + * + * If new rate is smaller than the old, simply updating + * dbs.sampling_rate might not be appropriate. For example, if the + * original sampling_rate was 1 second and the requested new sampling rate is 10 + * ms because the user needs immediate reaction from ondemand governor, but not + * sure if higher frequency will be required or not, then, the governor may + * change the sampling rate too late; up to 1 second later. Thus, if we are + * reducing the sampling rate, we need to make the new value effective + * immediately. + * + * On the other hand, if new rate is larger than the old, then we may evaluate + * the load too soon, and it might be worth updating sample_delay_ns then as + * well. + * + * This must be called with dbs_data->mutex held, otherwise traversing + * policy_dbs_list isn't safe. + */ +ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, + size_t count) +{ + struct policy_dbs_info *policy_dbs; + unsigned int rate; + int ret; + ret = sscanf(buf, "%u", &rate); + if (ret != 1) + return -EINVAL; + + dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate); + + /* + * We are operating under dbs_data->mutex and so the list and its + * entries can't be freed concurrently. + */ + list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) { + mutex_lock(&policy_dbs->timer_mutex); + /* + * On 32-bit architectures this may race with the + * sample_delay_ns read in dbs_update_util_handler(), but that + * really doesn't matter. If the read returns a value that's + * too big, the sample will be skipped, but the next invocation + * of dbs_update_util_handler() (when the update has been + * completed) will take a sample. If the returned value is too + * small, the sample will be taken immediately, but that isn't a + * problem, as we want the new rate to take effect immediately + * anyway.
+ * If this runs in parallel with dbs_work_handler(), we may end + * up overwriting the sample_delay_ns value that it has just + * written, but the difference should not be too big and it will + * be corrected next time a sample is taken, so it shouldn't be + * significant. + */ + gov_update_sample_delay(policy_dbs, dbs_data->sampling_rate); + mutex_unlock(&policy_dbs->timer_mutex); + } + + return count; +} +EXPORT_SYMBOL_GPL(store_sampling_rate); + static inline struct dbs_data *to_dbs_data(struct kobject *kobj) { return container_of(kobj, struct dbs_data, kobj); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index e9ec411..8138eff 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -238,4 +238,6 @@ void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); void od_unregister_powersave_bias_handler(void); +ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, + size_t count); #endif /* _CPUFREQ_GOVERNOR_H */ diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 38301c6..1221382 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -221,75 +221,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) /************************** sysfs interface ************************/ static struct dbs_governor od_dbs_gov; -/** - * update_sampling_rate - update sampling rate effective immediately if needed. - * @new_rate: new sampling rate - * - * If new rate is smaller than the old, simply updating - * dbs.sampling_rate might not be appropriate. For example, if the - * original sampling_rate was 1 second and the requested new sampling rate is 10 - * ms because the user needs immediate reaction from ondemand governor, but not - * sure if higher frequency will be required or not, then, the governor may - * change the sampling rate too late; up to 1 second later. Thus, if we are - * reducing the sampling rate, we need to make the new value effective - * immediately. - * - * On the other hand, if new rate is larger than the old, then we may evaluate - * the load too soon, and it might be worth updating sample_delay_ns then as - * well. - * - * This must be called with dbs_data->mutex held, otherwise traversing - * policy_dbs_list isn't safe. - */ -static void update_sampling_rate(struct dbs_data *dbs_data, - unsigned int new_rate) -{ - struct policy_dbs_info *policy_dbs; - - dbs_data->sampling_rate = new_rate = max(new_rate, - dbs_data->min_sampling_rate); - - /* - * We are operating under dbs_data->mutex and so the list and its - * entries can't be freed concurrently. - */ - list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) { - mutex_lock(&policy_dbs->timer_mutex); - /* - * On 32-bit architectures this may race with the - * sample_delay_ns read in dbs_update_util_handler(), but that - * really doesn't matter. If the read returns a value that's - * too big, the sample will be skipped, but the next invocation - * of dbs_update_util_handler() (when the update has been - * completed) will take a sample. If the returned value is too - * small, the sample will be taken immediately, but that isn't a - * problem, as we want the new rate to take effect immediately - * anyway.
- * - * If this runs in parallel with dbs_work_handler(), we may end - * up overwriting the sample_delay_ns value that it has just - * written, but the difference should not be too big and it will - * be corrected next time a sample is taken, so it shouldn't be - * significant. - */ - gov_update_sample_delay(policy_dbs, new_rate); - mutex_unlock(&policy_dbs->timer_mutex); - } -} - -static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, - size_t count) -{ - unsigned int input; - int ret; - ret = sscanf(buf, "%u", &input); - if (ret != 1) - return -EINVAL; - - update_sampling_rate(dbs_data, input); - return count; -} - static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, size_t count) { -- cgit v0.10.2 From a23d6d180914dd91e320283c81e4f84f028e24f4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Feb 2016 17:31:16 +0530 Subject: cpufreq: ondemand: Rearrange od_dbs_timer() to avoid updating delay Avoid extra checks in od_dbs_timer() by rearranging updates to the local delay variable in it. Signed-off-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1221382..0b79f14 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -194,7 +194,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); - int delay = 0, sample_type = dbs_info->sample_type; + int delay, sample_type = dbs_info->sample_type; /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = OD_NORMAL_SAMPLE; @@ -208,13 +208,12 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) /* Setup timer for SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; delay = dbs_info->freq_hi_jiffies; + } else { + delay = delay_for_sampling_rate(dbs_data->sampling_rate + * dbs_info->rate_mult); } } - if (!delay) - delay = delay_for_sampling_rate(dbs_data->sampling_rate - * dbs_info->rate_mult); - return delay; } -- cgit v0.10.2 From b9db42730aeb23f91d7585786de25a260ab04098 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 22:15:34 +0100 Subject: cpufreq: governor: Avoid irq_work_queue_on() crash on non-SMP ARM As it turns out, irq_work_queue_on() will crash if invoked on non-SMP ARM platforms, but in fact it is not necessary to use that function in the cpufreq governor code (as it doesn't matter to that code which CPU will handle the irq_work), so change it to always use irq_work_queue(). Fixes: 8fb47ff100af (cpufreq: governor: Replace timers with utilization update callbacks) Reported-and-tested-by: Guenter Roeck Reported-and-tested-by: Tony Lindgren Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index d41db19..580b692 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -350,15 +350,6 @@ static void dbs_irq_work(struct irq_work *irq_work) schedule_work(&policy_dbs->work); } -static inline void gov_queue_irq_work(struct policy_dbs_info *policy_dbs) -{ -#ifdef CONFIG_SMP - irq_work_queue_on(&policy_dbs->irq_work, smp_processor_id()); -#else - irq_work_queue(&policy_dbs->irq_work); -#endif -} - static void dbs_update_util_handler(struct update_util_data *data, u64 time, unsigned long util, unsigned long max) { @@ -378,7 +369,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, delta_ns = time - policy_dbs->last_sample_time; if ((s64)delta_ns >= policy_dbs->sample_delay_ns) { policy_dbs->last_sample_time = time; - gov_queue_irq_work(policy_dbs); + irq_work_queue(&policy_dbs->irq_work); return; } } -- cgit v0.10.2 From f62b93740c30d0a3f50258d45415f00b763dd70a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:12:56 +0100 Subject: cpufreq: governor: Simplify gov_cancel_work() slightly The atomic work counter incrementation in gov_cancel_work() is not necessary any more, because work items won't be queued up after gov_clear_update_util() anyway, so drop it along with the comment about how it may be missed by the gov_clear_update_util(). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 580b692..c78af11 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -300,13 +300,6 @@ static void gov_cancel_work(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; - /* Tell dbs_update_util_handler() to skip queuing up work items. */ - atomic_inc(&policy_dbs->work_count); - /* - * If dbs_update_util_handler() is already running, it may not notice - * the incremented work_count, so wait for it to complete to prevent its - * work item from being queued up after the cancel_work_sync() below. - */ gov_clear_update_util(policy_dbs->policy); irq_work_sync(&policy_dbs->irq_work); cancel_work_sync(&policy_dbs->work); @@ -360,7 +353,6 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * The work may not be allowed to be queued up right now. * Possible reasons: * - Work has already been queued up or is in progress. - * - The governor is being stopped. * - It is too early (too little time from the previous sample). */ if (atomic_inc_return(&policy_dbs->work_count) == 1) { -- cgit v0.10.2 From e4db2813d2e558b6b6bee464308678a57732b390 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:13:42 +0100 Subject: cpufreq: governor: Avoid atomic operations in hot paths Rework the handling of work items by dbs_update_util_handler() and dbs_work_handler() so the former (which is executed in scheduler paths) only uses atomic operations when absolutely necessary. That is, when the policy is shared and dbs_update_util_handler() has already decided that this is the time to queue up a work item. In particular, this avoids the atomic ops entirely on platforms where policy objects are never shared. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index c78af11..e5a08a1 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -304,6 +304,7 @@ static void gov_cancel_work(struct cpufreq_policy *policy) irq_work_sync(&policy_dbs->irq_work); cancel_work_sync(&policy_dbs->work); atomic_set(&policy_dbs->work_count, 0); + policy_dbs->work_in_progress = false; } static void dbs_work_handler(struct work_struct *work) @@ -326,13 +327,15 @@ static void dbs_work_handler(struct work_struct *work) policy_dbs->sample_delay_ns = jiffies_to_nsecs(delay); mutex_unlock(&policy_dbs->timer_mutex); + /* Allow the utilization update handler to queue up more work. */ + atomic_set(&policy_dbs->work_count, 0); /* - * If the atomic operation below is reordered with respect to the - * sample delay modification, the utilization update handler may end - * up using a stale sample delay value. + * If the update below is reordered with respect to the sample delay + * modification, the utilization update handler may end up using a stale + * sample delay value. */ - smp_mb__before_atomic(); - atomic_dec(&policy_dbs->work_count); + smp_wmb(); + policy_dbs->work_in_progress = false; } static void dbs_irq_work(struct irq_work *irq_work) @@ -348,6 +351,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, { struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util); struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; + u64 delta_ns; /* * The work may not be allowed to be queued up right now. @@ -355,17 +359,30 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * - Work has already been queued up or is in progress. * - It is too early (too little time from the previous sample). */ - if (atomic_inc_return(&policy_dbs->work_count) == 1) { - u64 delta_ns; - - delta_ns = time - policy_dbs->last_sample_time; - if ((s64)delta_ns >= policy_dbs->sample_delay_ns) { - policy_dbs->last_sample_time = time; - irq_work_queue(&policy_dbs->irq_work); - return; - } - } - atomic_dec(&policy_dbs->work_count); + if (policy_dbs->work_in_progress) + return; + + /* + * If the reads below are reordered before the check above, the value + * of sample_delay_ns used in the computation may be stale. + */ + smp_rmb(); + delta_ns = time - policy_dbs->last_sample_time; + if ((s64)delta_ns < policy_dbs->sample_delay_ns) + return; + + /* + * If the policy is not shared, the irq_work may be queued up right away + * at this point. Otherwise, we need to ensure that only one of the + * CPUs sharing the policy will do that. 
+ */ + if (policy_dbs->is_shared && + !atomic_add_unless(&policy_dbs->work_count, 1, 1)) + return; + + policy_dbs->last_sample_time = time; + policy_dbs->work_in_progress = true; + irq_work_queue(&policy_dbs->irq_work); } static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy, @@ -542,6 +559,8 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) if (!policy->cur) return -EINVAL; + policy_dbs->is_shared = policy_is_shared(policy); + sampling_rate = dbs_data->sampling_rate; ignore_nice = dbs_data->ignore_nice_load; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 8138eff..521daac 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -130,6 +130,9 @@ struct policy_dbs_info { /* dbs_data may be shared between multiple policy objects */ struct dbs_data *dbs_data; struct list_head list; + /* Status indicators */ + bool is_shared; /* This object is used by multiple CPUs */ + bool work_in_progress; /* Work is being queued up or in progress */ }; static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs, -- cgit v0.10.2 From 679b8fe43a6b723787cae1d9599ed776d7ce238b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:15:50 +0100 Subject: cpufreq: governor: Fix nice contribution computation in dbs_check_cpu() The contribution of the CPU nice time to the idle time in dbs_check_cpu() is computed in a bogus way, as the code may subtract current and previous nice values for different CPUs. That doesn't matter for cases when cpufreq policies are not shared, but may lead to problems otherwise. Fix the computation and simplify it to avoid taking unnecessary steps. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index e5a08a1..c546970 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -198,22 +198,10 @@ void dbs_check_cpu(struct cpufreq_policy *policy) j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); - u64 cur_nice; - unsigned long cur_nice_jiffies; + u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - - cdbs->prev_cpu_nice; - /* - * Assumption: nice time between sampling periods will - * be less than 2^32 jiffies for 32 bit sys - */ - cur_nice_jiffies = (unsigned long) - cputime64_to_jiffies64(cur_nice); - - cdbs->prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - idle_time += jiffies_to_usecs(cur_nice_jiffies); + idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice); + j_cdbs->prev_cpu_nice = cur_nice; } if (unlikely(!wall_time || wall_time < idle_time)) -- cgit v0.10.2 From 57eb832f90e645dcb97d651ad052c0537cc1b3a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Feb 2016 00:58:47 +0100 Subject: cpufreq: governor: Clean up load-related computations Clean up some load-related computations in dbs_check_cpu() and cpufreq_governor_start() to get rid of unnecessary operations and type casts and make the code easier to read. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index c546970..1f580cb 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -186,16 +186,15 @@ void dbs_check_cpu(struct cpufreq_policy *policy) io_busy = od_tuners->io_is_busy; cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); - wall_time = (unsigned int) - (cur_wall_time - j_cdbs->prev_cpu_wall); + wall_time = cur_wall_time - j_cdbs->prev_cpu_wall; j_cdbs->prev_cpu_wall = cur_wall_time; - if (cur_idle_time < j_cdbs->prev_cpu_idle) - cur_idle_time = j_cdbs->prev_cpu_idle; - - idle_time = (unsigned int) - (cur_idle_time - j_cdbs->prev_cpu_idle); - j_cdbs->prev_cpu_idle = cur_idle_time; + if (cur_idle_time <= j_cdbs->prev_cpu_idle) { + idle_time = 0; + } else { + idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + j_cdbs->prev_cpu_idle = cur_idle_time; + } if (ignore_nice) { u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; @@ -562,13 +561,10 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); unsigned int prev_load; - j_cdbs->prev_cpu_idle = - get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy); + j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy); - prev_load = (unsigned int)(j_cdbs->prev_cpu_wall - - j_cdbs->prev_cpu_idle); - j_cdbs->prev_load = 100 * prev_load / - (unsigned int)j_cdbs->prev_cpu_wall; + prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle; + j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall; if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; -- cgit v0.10.2 From 4cccf7555770b787fa80791a1407a27301f03920 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:19:31 +0100 Subject: cpufreq: governor: Get rid of the ->gov_check_cpu callback The way the ->gov_check_cpu governor callback is used by the ondemand and conservative governors is not really straightforward. Namely, the governor calls dbs_check_cpu() that updates the load information for the policy and then invokes ->gov_check_cpu() for the governor. To get rid of that entanglement, notice that cpufreq_governor_limits() doesn't need to call dbs_check_cpu() directly. Instead, it can simply reset the sample delay to 0 which will cause a sample to be taken immediately. The result of that is practically equivalent to calling dbs_check_cpu() except that it will trigger a full update of governor internal state and not just the ->gov_check_cpu() part. Following that observation, make cpufreq_governor_limits() reset the sample delay and turn dbs_check_cpu() into a function, called dbs_update(), that will simply evaluate the load and return the result. That function can now be called by governors from the routines that previously were pointed to by ->gov_check_cpu and those routines can be called directly by each governor instead of dbs_check_cpu(). This way ->gov_check_cpu becomes unnecessary, so drop it. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 6243502..2e9040e 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -44,20 +44,20 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, * Any frequency increase takes it to the maximum frequency.
Frequency reduction * happens at minimum steps of 5% (default) of maximum frequency */ -static void cs_check_cpu(int cpu, unsigned int load) +static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { - struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); - struct cpufreq_policy *policy = dbs_info->cdbs.policy_dbs->policy; + struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, policy->cpu); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + unsigned int load = dbs_update(policy); /* * break out if we 'cannot' reduce the speed as the user might * want freq_step to be zero */ if (cs_tuners->freq_step == 0) - return; + goto out; /* Check for frequency increase */ if (load > dbs_data->up_threshold) { @@ -65,7 +65,7 @@ static void cs_check_cpu(int cpu, unsigned int load) /* if we are already at full speed then break out early */ if (dbs_info->requested_freq == policy->max) - return; + goto out; dbs_info->requested_freq += get_freq_target(cs_tuners, policy); @@ -74,12 +74,12 @@ static void cs_check_cpu(int cpu, unsigned int load) __cpufreq_driver_target(policy, dbs_info->requested_freq, CPUFREQ_RELATION_H); - return; + goto out; } /* if sampling_down_factor is active break out early */ if (++dbs_info->down_skip < dbs_data->sampling_down_factor) - return; + goto out; dbs_info->down_skip = 0; /* Check for frequency decrease */ @@ -89,7 +89,7 @@ static void cs_check_cpu(int cpu, unsigned int load) * if we cannot reduce the frequency anymore, break out early */ if (policy->cur == policy->min) - return; + goto out; freq_target = get_freq_target(cs_tuners, policy); if (dbs_info->requested_freq > freq_target) @@ -99,16 +99,9 @@ static void cs_check_cpu(int cpu, unsigned int load) __cpufreq_driver_target(policy, dbs_info->requested_freq, CPUFREQ_RELATION_L); - return; } -} - -static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) -{ - struct policy_dbs_info *policy_dbs = policy->governor_data; - struct dbs_data *dbs_data = policy_dbs->dbs_data; - dbs_check_cpu(policy); + out: return delay_for_sampling_rate(dbs_data->sampling_rate); } @@ -300,7 +293,6 @@ static struct dbs_governor cs_dbs_gov = { .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = cs_dbs_timer, - .gov_check_cpu = cs_check_cpu, .init = cs_init, .exit = cs_exit, }; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 1f580cb..99d25af 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -140,9 +140,8 @@ static const struct sysfs_ops governor_sysfs_ops = { .store = governor_store, }; -void dbs_check_cpu(struct cpufreq_policy *policy) +unsigned int dbs_update(struct cpufreq_policy *policy) { - int cpu = policy->cpu; struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; @@ -154,7 +153,7 @@ void dbs_check_cpu(struct cpufreq_policy *policy) if (gov->governor == GOV_ONDEMAND) { struct od_cpu_dbs_info_s *od_dbs_info = - gov->get_cpu_dbs_info_s(cpu); + gov->get_cpu_dbs_info_s(policy->cpu); /* * Sometimes, the ondemand governor uses an additional @@ -250,10 +249,9 @@ void dbs_check_cpu(struct cpufreq_policy *policy) if (load > max_load) max_load = load; } - - gov->gov_check_cpu(cpu, max_load); + return max_load; } -EXPORT_SYMBOL_GPL(dbs_check_cpu); +EXPORT_SYMBOL_GPL(dbs_update); 
void gov_set_update_util(struct policy_dbs_info *policy_dbs, unsigned int delay_us) @@ -601,11 +599,14 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; mutex_lock(&policy_dbs->timer_mutex); + if (policy->max < policy->cur) __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > policy->cur) __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); - dbs_check_cpu(policy); + + gov_update_sample_delay(policy_dbs, 0); + mutex_unlock(&policy_dbs->timer_mutex); return 0; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 521daac..38b9512 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -202,7 +202,6 @@ struct dbs_governor { struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); void *(*get_cpu_dbs_info_s)(int cpu); unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); - void (*gov_check_cpu)(int cpu, unsigned int load); int (*init)(struct dbs_data *dbs_data, bool notify); void (*exit)(struct dbs_data *dbs_data, bool notify); @@ -235,7 +234,7 @@ static inline int delay_for_sampling_rate(unsigned int sampling_rate) } extern struct mutex dbs_data_mutex; -void dbs_check_cpu(struct cpufreq_policy *policy); +unsigned int dbs_update(struct cpufreq_policy *policy); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 0b79f14..707c017 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -150,13 +150,13 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) * (default), then we try to increase frequency. Else, we adjust the frequency * proportional to load. */ -static void od_check_cpu(int cpu, unsigned int load) +static void od_update(struct cpufreq_policy *policy) { - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); struct policy_dbs_info *policy_dbs = dbs_info->cdbs.policy_dbs; - struct cpufreq_policy *policy = policy_dbs->policy; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; + unsigned int load = dbs_update(policy); dbs_info->freq_lo = 0; @@ -198,12 +198,16 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = OD_NORMAL_SAMPLE; - if (sample_type == OD_SUB_SAMPLE) { + /* + * OD_SUB_SAMPLE doesn't make sense if sample_delay_ns is 0, so ignore + * it then. + */ + if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) { delay = dbs_info->freq_lo_jiffies; __cpufreq_driver_target(policy, dbs_info->freq_lo, CPUFREQ_RELATION_H); } else { - dbs_check_cpu(policy); + od_update(policy); if (dbs_info->freq_lo) { /* Setup timer for SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; @@ -428,7 +432,6 @@ static struct dbs_governor od_dbs_gov = { .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = od_dbs_timer, - .gov_check_cpu = od_check_cpu, .gov_ops = &od_ops, .init = od_init, .exit = od_exit, -- cgit v0.10.2 From 78347cdb89065f9d40ea28596ef2bd8058eb6d12 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 15 Feb 2016 02:20:11 +0100 Subject: cpufreq: governor: Reset sample delay in store_sampling_rate() If store_sampling_rate() updates the sample delay when the ondemand governor is in the middle of its high/low dance (OD_SUB_SAMPLE sample type is set), the governor will still do the bottom half of the previous sample which may take too much time. To prevent that from happening, change store_sampling_rate() to always reset the sample delay to 0 which also is consistent with the new behavior of cpufreq_governor_limits(). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 99d25af..fd4cdc2 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL_GPL(dbs_data_mutex); * reducing the sampling rate, we need to make the new value effective * immediately. * - * On the other hand, if new rate is larger than the old, then we may evaluate - * the load too soon, and it might we worth updating sample_delay_ns then as - * well. - * * This must be called with dbs_data->mutex held, otherwise traversing * policy_dbs_list isn't safe. */ @@ -69,18 +65,14 @@ ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, * really doesn't matter. If the read returns a value that's * too big, the sample will be skipped, but the next invocation * of dbs_update_util_handler() (when the update has been - * completed) will take a sample. If the returned value is too - * small, the sample will be taken immediately, but that isn't a - * problem, as we want the new rate to take effect immediately - * anyway. + * completed) will take a sample. * * If this runs in parallel with dbs_work_handler(), we may end * up overwriting the sample_delay_ns value that it has just - * written, but the difference should not be too big and it will - * be corrected next time a sample is taken, so it shouldn't be - * significant. + * written, but it will be corrected next time a sample is + * taken, so it shouldn't be significant. */ - gov_update_sample_delay(policy_dbs, dbs_data->sampling_rate); + gov_update_sample_delay(policy_dbs, 0); mutex_unlock(&policy_dbs->timer_mutex); } -- cgit v0.10.2 From 57dc3bcd454eb420ddf25d89852993b61b351327 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:20:51 +0100 Subject: cpufreq: governor: Move rate_mult to struct policy_dbs The rate_mult field in struct od_cpu_dbs_info_s is used by the code shared with the conservative governor and to access it that code has to do an ugly governor type check. However, first of all it is ever only used for policy->cpu, so it is per-policy rather than per-CPU and second, it is initialized to 1 by cpufreq_governor_start(), so if the conservative governor never modifies it, it will have no effect on the results of any computations. For these reasons, move rate_mult to struct policy_dbs_info (as a common field). Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index fd4cdc2..b002c0d 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -138,24 +138,17 @@ unsigned int dbs_update(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; - unsigned int sampling_rate = dbs_data->sampling_rate; unsigned int ignore_nice = dbs_data->ignore_nice_load; unsigned int max_load = 0; - unsigned int j; + unsigned int sampling_rate, j; - if (gov->governor == GOV_ONDEMAND) { - struct od_cpu_dbs_info_s *od_dbs_info = - gov->get_cpu_dbs_info_s(policy->cpu); - - /* - * Sometimes, the ondemand governor uses an additional - * multiplier to give long delays. So apply this multiplier to - * the 'sampling_rate', so as to keep the wake-up-from-idle - * detection logic a bit conservative. - */ - sampling_rate *= od_dbs_info->rate_mult; - - } + /* + * Sometimes governors may use an additional multiplier to increase + * sample delays temporarily. Apply that multiplier to sampling_rate + * so as to keep the wake-up-from-idle detection logic a bit + * conservative. + */ + sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult; /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { @@ -537,6 +530,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) return -EINVAL; policy_dbs->is_shared = policy_is_shared(policy); + policy_dbs->rate_mult = 1; sampling_rate = dbs_data->sampling_rate; ignore_nice = dbs_data->ignore_nice_load; @@ -570,7 +564,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) struct od_ops *od_ops = gov->gov_ops; struct od_cpu_dbs_info_s *od_dbs_info = gov->get_cpu_dbs_info_s(cpu); - od_dbs_info->rate_mult = 1; od_dbs_info->sample_type = OD_NORMAL_SAMPLE; od_ops->powersave_bias_init_cpu(cpu); } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 38b9512..f21d1e1 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -130,6 +130,8 @@ struct policy_dbs_info { /* dbs_data may be shared between multiple policy objects */ struct dbs_data *dbs_data; struct list_head list; + /* Multiplier for increasing sample delay temporarily. 
*/ + unsigned int rate_mult; /* Status indicators */ bool is_shared; /* This object is used by multiple CPUs */ bool work_in_progress; /* Work is being queued up or in progress */ @@ -163,7 +165,6 @@ struct od_cpu_dbs_info_s { unsigned int freq_lo; unsigned int freq_lo_jiffies; unsigned int freq_hi_jiffies; - unsigned int rate_mult; unsigned int sample_type:1; }; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 707c017..812d994 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -164,7 +164,7 @@ static void od_update(struct cpufreq_policy *policy) if (load > dbs_data->up_threshold) { /* If switching to max speed, apply sampling_down_factor */ if (policy->cur < policy->max) - dbs_info->rate_mult = dbs_data->sampling_down_factor; + policy_dbs->rate_mult = dbs_data->sampling_down_factor; dbs_freq_increase(policy, policy->max); } else { /* Calculate the next frequency proportional to load */ @@ -175,7 +175,7 @@ static void od_update(struct cpufreq_policy *policy) freq_next = min_f + load * (max_f - min_f) / 100; /* No longer fully busy, reset rate_mult */ - dbs_info->rate_mult = 1; + policy_dbs->rate_mult = 1; if (!od_tuners->powersave_bias) { __cpufreq_driver_target(policy, freq_next, @@ -214,7 +214,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) delay = dbs_info->freq_hi_jiffies; } else { delay = delay_for_sampling_rate(dbs_data->sampling_rate - * dbs_info->rate_mult); + * policy_dbs->rate_mult); } } @@ -266,20 +266,27 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, const char *buf, size_t count) { - unsigned int input, j; + struct policy_dbs_info *policy_dbs; + unsigned int input; int ret; ret = sscanf(buf, "%u", &input); if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; + dbs_data->sampling_down_factor = input; /* Reset down sampling multiplier in case it was active */ - for_each_online_cpu(j) { - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, - j); - dbs_info->rate_mult = 1; + list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) { + /* + * Doing this without locking might lead to using different + * rate_mult values in od_update() and od_dbs_timer(). + */ + mutex_lock(&policy_dbs->timer_mutex); + policy_dbs->rate_mult = 1; + mutex_unlock(&policy_dbs->timer_mutex); } + return count; } -- cgit v0.10.2 From 6e96c5b3ac5181d4b787590e54c4af99d3fa5f2e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:21:35 +0100 Subject: cpufreq: ondemand: Simplify conditionals in od_dbs_timer() Reduce the indentation level in the conditionals in od_dbs_timer() and drop the delay variable from it. No functional changes. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 812d994..cb5a097 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -194,7 +194,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); - int delay, sample_type = dbs_info->sample_type; + int sample_type = dbs_info->sample_type; /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = OD_NORMAL_SAMPLE; @@ -203,22 +203,20 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) * it then. */ if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) { - delay = dbs_info->freq_lo_jiffies; __cpufreq_driver_target(policy, dbs_info->freq_lo, CPUFREQ_RELATION_H); - } else { - od_update(policy); - if (dbs_info->freq_lo) { - /* Setup timer for SUB_SAMPLE */ - dbs_info->sample_type = OD_SUB_SAMPLE; - delay = dbs_info->freq_hi_jiffies; - } else { - delay = delay_for_sampling_rate(dbs_data->sampling_rate - * policy_dbs->rate_mult); - } + return dbs_info->freq_lo_jiffies; + } + + od_update(policy); + + if (dbs_info->freq_lo) { + /* Setup timer for SUB_SAMPLE */ + dbs_info->sample_type = OD_SUB_SAMPLE; + return dbs_info->freq_hi_jiffies; } - return delay; + return delay_for_sampling_rate(dbs_data->sampling_rate * policy_dbs->rate_mult); } /************************** sysfs interface ************************/ -- cgit v0.10.2 From 07aa4402a009bc83194860e7869c491bab854d1c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 02:22:13 +0100 Subject: cpufreq: governor: Use microseconds in sample delay computations Do not convert microseconds to jiffies and the other way around in governor computations related to the sampling rate and sample delay and drop delay_for_sampling_rate() which isn't of any use then. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 2e9040e..4a6f8e1 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -102,7 +102,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) } out: - return delay_for_sampling_rate(dbs_data->sampling_rate); + return dbs_data->sampling_rate; } static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index b002c0d..56dba71 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -282,7 +282,6 @@ static void dbs_work_handler(struct work_struct *work) struct policy_dbs_info *policy_dbs; struct cpufreq_policy *policy; struct dbs_governor *gov; - unsigned int delay; policy_dbs = container_of(work, struct policy_dbs_info, work); policy = policy_dbs->policy; @@ -293,8 +292,7 @@ static void dbs_work_handler(struct work_struct *work) * ondemand governor isn't updating the sampling rate in parallel. */ mutex_lock(&policy_dbs->timer_mutex); - delay = gov->gov_dbs_timer(policy); - policy_dbs->sample_delay_ns = jiffies_to_nsecs(delay); + gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy)); mutex_unlock(&policy_dbs->timer_mutex); /* Allow the utilization update handler to queue up more work. 
*/ diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index f21d1e1..7ae0c71 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -163,8 +163,8 @@ struct od_cpu_dbs_info_s { struct cpu_dbs_info cdbs; struct cpufreq_frequency_table *freq_table; unsigned int freq_lo; - unsigned int freq_lo_jiffies; - unsigned int freq_hi_jiffies; + unsigned int freq_lo_delay_us; + unsigned int freq_hi_delay_us; unsigned int sample_type:1; }; @@ -223,17 +223,6 @@ struct od_ops { void (*freq_increase)(struct cpufreq_policy *policy, unsigned int freq); }; -static inline int delay_for_sampling_rate(unsigned int sampling_rate) -{ - int delay = usecs_to_jiffies(sampling_rate); - - /* We want all CPUs to do sampling nearly on same jiffy */ - if (num_online_cpus() > 1) - delay -= jiffies % delay; - - return delay; -} - extern struct mutex dbs_data_mutex; unsigned int dbs_update(struct cpufreq_policy *policy); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index cb5a097..a3ee745 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -66,8 +66,8 @@ static int should_io_be_busy(void) /* * Find right freq to be set now with powersave_bias on. - * Returns the freq_hi to be used right now and will set freq_hi_jiffies, - * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs. + * Returns the freq_hi to be used right now and will set freq_hi_delay_us, + * freq_lo, and freq_lo_delay_us in percpu area for averaging freqs. */ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_next, unsigned int relation) @@ -75,7 +75,7 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_req, freq_reduc, freq_avg; unsigned int freq_hi, freq_lo; unsigned int index = 0; - unsigned int jiffies_total, jiffies_hi, jiffies_lo; + unsigned int delay_hi_us; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); struct policy_dbs_info *policy_dbs = policy->governor_data; @@ -84,7 +84,7 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; - dbs_info->freq_lo_jiffies = 0; + dbs_info->freq_lo_delay_us = 0; return freq_next; } @@ -107,17 +107,15 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, /* Find out how long we have to be in hi and lo freqs */ if (freq_hi == freq_lo) { dbs_info->freq_lo = 0; - dbs_info->freq_lo_jiffies = 0; + dbs_info->freq_lo_delay_us = 0; return freq_lo; } - jiffies_total = usecs_to_jiffies(dbs_data->sampling_rate); - jiffies_hi = (freq_avg - freq_lo) * jiffies_total; - jiffies_hi += ((freq_hi - freq_lo) / 2); - jiffies_hi /= (freq_hi - freq_lo); - jiffies_lo = jiffies_total - jiffies_hi; + delay_hi_us = (freq_avg - freq_lo) * dbs_data->sampling_rate; + delay_hi_us += (freq_hi - freq_lo) / 2; + delay_hi_us /= freq_hi - freq_lo; + dbs_info->freq_hi_delay_us = delay_hi_us; dbs_info->freq_lo = freq_lo; - dbs_info->freq_lo_jiffies = jiffies_lo; - dbs_info->freq_hi_jiffies = jiffies_hi; + dbs_info->freq_lo_delay_us = dbs_data->sampling_rate - delay_hi_us; return freq_hi; } @@ -205,7 +203,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) { __cpufreq_driver_target(policy, 
dbs_info->freq_lo, CPUFREQ_RELATION_H); - return dbs_info->freq_lo_jiffies; + return dbs_info->freq_lo_delay_us; } od_update(policy); @@ -213,10 +211,10 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) if (dbs_info->freq_lo) { /* Setup timer for SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; - return dbs_info->freq_hi_jiffies; + return dbs_info->freq_hi_delay_us; } - return delay_for_sampling_rate(dbs_data->sampling_rate * policy_dbs->rate_mult); + return dbs_data->sampling_rate * policy_dbs->rate_mult; } /************************** sysfs interface ************************/ -- cgit v0.10.2 From a7f35cffb980f3aec75f74559a4320974c845b78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Feb 2016 21:02:24 +0100 Subject: cpufreq: ondemand: Simplify od_update() slightly Drop some lines of code from od_update() by arranging the statements in there in a more logical way. No functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index a3ee745..34e3a1b 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -175,14 +175,11 @@ static void od_update(struct cpufreq_policy *policy) /* No longer fully busy, reset rate_mult */ policy_dbs->rate_mult = 1; - if (!od_tuners->powersave_bias) { - __cpufreq_driver_target(policy, freq_next, - CPUFREQ_RELATION_C); - return; - } - - freq_next = od_ops.powersave_bias_target(policy, freq_next, - CPUFREQ_RELATION_L); + if (od_tuners->powersave_bias) + freq_next = od_ops.powersave_bias_target(policy, + freq_next, + CPUFREQ_RELATION_L); + __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C); } } -- cgit v0.10.2 From 8eb055d3f53e52805907ea54e2eec0885be91a50 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Feb 2016 21:02:32 +0100 Subject: cpufreq: ondemand: Drop unused callback from struct od_ops The ->freq_increase callback in struct od_ops is never invoked, so drop it. No functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 7ae0c71..675e1cd 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -220,7 +220,6 @@ struct od_ops { void (*powersave_bias_init_cpu)(int cpu); unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, unsigned int freq_next, unsigned int relation); - void (*freq_increase)(struct cpufreq_policy *policy, unsigned int freq); }; extern struct mutex dbs_data_mutex; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 34e3a1b..375fdcf 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -417,7 +417,6 @@ define_get_cpu_dbs_routines(od_cpu_dbs_info); static struct od_ops od_ops = { .powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu, .powersave_bias_target = generic_powersave_bias_target, - .freq_increase = dbs_freq_increase, }; static struct dbs_governor od_dbs_gov = { -- cgit v0.10.2 From 574ef14d5dbcd2743326cc1b28e61a1e7733162a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:19:00 +0100 Subject: cpufreq: governor: Close dbs_data update race condition It is possible for a dbs_data object to be updated after its usage counter has become 0. 
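As a minimal picture of the guard this patch adds, the pattern can be reduced to a userspace analogue; everything in the sketch below (struct obj, its lock, the pthread calls) is a hypothetical stand-in rather than the kernel's actual types, and the changelog text that follows spells out the real race:

#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/types.h>

/* Hypothetical stand-ins for dbs_data and its ->store() callback. */
struct obj {
	pthread_mutex_t lock;
	int usage_count;	/* drops to 0 when the last user goes away */
	ssize_t (*store)(struct obj *o, const char *buf, size_t count);
};

/*
 * Writer path: take the lock, then refuse to invoke the callback if
 * the object no longer has any users, mirroring the usage-count check
 * added to governor_store() in the diff below.
 */
static ssize_t obj_store(struct obj *o, const char *buf, size_t count)
{
	ssize_t ret = -EBUSY;

	pthread_mutex_lock(&o->lock);
	if (o->usage_count && o->store)
		ret = o->store(o, buf, count);
	pthread_mutex_unlock(&o->lock);
	return ret;
}
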
That may happen if governor_store() runs (via a governor tunable sysfs attribute write) in parallel with cpufreq_governor_exit() called for the last cpufreq policy associated with the dbs_data in question. In that case, if governor_store() acquires dbs_data->mutex right after cpufreq_governor_exit() has released it, the ->store() callback invoked by it may operate on dbs_data with no users. Although sysfs will cause the kobject_put() in cpufreq_governor_exit() to block until governor_store() has returned, that situation may lead to some unexpected results, depending on the implementation of the ->store() callback, and therefore it should be avoided. To that end, modify governor_store() to check the dbs_data's usage count before invoking the ->store() callback and return an error if it is 0 at that point. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 56dba71..65ed859 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -112,7 +112,7 @@ static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, mutex_lock(&dbs_data->mutex); - if (gattr->store) + if (dbs_data->usage_count && gattr->store) ret = gattr->store(dbs_data, buf, count); mutex_unlock(&dbs_data->mutex); -- cgit v0.10.2 From 8847e038c1d19c20dda0d7a590e31ffa528da8a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:20:13 +0100 Subject: cpufreq: governor: Move io_is_busy to struct dbs_data The io_is_busy governor tunable is only used by the ondemand governor and is located in the ondemand-specific data structure, but it is looked at by the common governor code that has to do ugly things to get to that value, so move it to struct dbs_data and modify ondemand accordingly. Since the conservative governor never touches that field, it will always be 0 for that governor and it won't have any effect on the results of computations in that case. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 65ed859..6026816 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -137,10 +137,9 @@ unsigned int dbs_update(struct cpufreq_policy *policy) struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; - struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int ignore_nice = dbs_data->ignore_nice_load; unsigned int max_load = 0; - unsigned int sampling_rate, j; + unsigned int sampling_rate, io_busy, j; /* * Sometimes governors may use an additional multiplier to increase @@ -149,6 +148,12 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * conservative. */ sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult; + /* + * For the purpose of ondemand, waiting for disk IO is an indication + * that you're performance critical, and not that the system is actually + * idle, so do not add the iowait time to the CPU idle time then.
+ */ + io_busy = dbs_data->io_is_busy; /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { @@ -156,18 +161,9 @@ unsigned int dbs_update(struct cpufreq_policy *policy) u64 cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; unsigned int load; - int io_busy = 0; j_cdbs = gov->get_cpu_cdbs(j); - /* - * For the purpose of ondemand, waiting for disk IO is - * an indication that you're performance critical, and - * not that the system is actually idle. So do not add - * the iowait time to the cpu idle time. - */ - if (gov->governor == GOV_ONDEMAND) - io_busy = od_tuners->io_is_busy; cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); wall_time = cur_wall_time - j_cdbs->prev_cpu_wall; @@ -522,7 +518,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; - int io_busy = 0; + unsigned int io_busy; if (!policy->cur) return -EINVAL; @@ -532,12 +528,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) sampling_rate = dbs_data->sampling_rate; ignore_nice = dbs_data->ignore_nice_load; - - if (gov->governor == GOV_ONDEMAND) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; - - io_busy = od_tuners->io_is_busy; - } + io_busy = dbs_data->io_is_busy; for_each_cpu(j, policy->cpus) { struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 675e1cd..7b36393 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -71,6 +71,7 @@ struct dbs_data { unsigned int sampling_rate; unsigned int sampling_down_factor; unsigned int up_threshold; + unsigned int io_is_busy; struct kobject kobj; struct list_head policy_dbs_list; @@ -177,7 +178,6 @@ struct cs_cpu_dbs_info_s { /* Per policy Governors sysfs tunables */ struct od_dbs_tuners { unsigned int powersave_bias; - unsigned int io_is_busy; }; struct cs_dbs_tuners { diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 375fdcf..330b588 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -220,7 +220,6 @@ static struct dbs_governor od_dbs_gov; static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; unsigned int j; @@ -228,14 +227,14 @@ static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - od_tuners->io_is_busy = !!input; + dbs_data->io_is_busy = !!input; /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, - &dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy); + &dbs_info->cdbs.prev_cpu_wall, dbs_data->io_is_busy); } return count; } @@ -286,7 +285,6 @@ static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, const char *buf, size_t count) { - struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; @@ -309,7 +307,7 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, struct od_cpu_dbs_info_s *dbs_info; dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, - 
&dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy); + &dbs_info->cdbs.prev_cpu_wall, dbs_data->io_is_busy); if (dbs_data->ignore_nice_load) dbs_info->cdbs.prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; @@ -342,7 +340,7 @@ gov_show_one_common(up_threshold); gov_show_one_common(sampling_down_factor); gov_show_one_common(ignore_nice_load); gov_show_one_common(min_sampling_rate); -gov_show_one(od, io_is_busy); +gov_show_one_common(io_is_busy); gov_show_one(od, powersave_bias); gov_attr_rw(sampling_rate); @@ -401,7 +399,7 @@ static int od_init(struct dbs_data *dbs_data, bool notify) dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; dbs_data->ignore_nice_load = 0; tuners->powersave_bias = default_powersave_bias; - tuners->io_is_busy = should_io_be_busy(); + dbs_data->io_is_busy = should_io_be_busy(); dbs_data->tuners = tuners; return 0; -- cgit v0.10.2 From 702c9e542a25cf95683c08c56e711eddb80020ac Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:21:21 +0100 Subject: cpufreq: governor: Add a ->start callback for governors To avoid having to check the governor type explicitly in the common code in order to initialize data structures specific to the governor type properly, add a ->start callback to struct dbs_governor and use it to initialize those data structures for the ondemand and conservative governors. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 4a6f8e1..c11fe95 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -279,6 +279,14 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify) kfree(dbs_data->tuners); } +static void cs_start(struct cpufreq_policy *policy) +{ + struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, policy->cpu); + + dbs_info->down_skip = 0; + dbs_info->requested_freq = policy->cur; +} + define_get_cpu_dbs_routines(cs_cpu_dbs_info); static struct dbs_governor cs_dbs_gov = { @@ -295,6 +303,7 @@ static struct dbs_governor cs_dbs_gov = { .gov_dbs_timer = cs_dbs_timer, .init = cs_init, .exit = cs_exit, + .start = cs_start, }; #define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 6026816..badbd46 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -517,7 +517,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; - unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; + unsigned int sampling_rate, ignore_nice, j; unsigned int io_busy; if (!policy->cur) @@ -543,19 +543,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } - if (gov->governor == GOV_CONSERVATIVE) { - struct cs_cpu_dbs_info_s *cs_dbs_info = - gov->get_cpu_dbs_info_s(cpu); - - cs_dbs_info->down_skip = 0; - cs_dbs_info->requested_freq = policy->cur; - } else { - struct od_ops *od_ops = gov->gov_ops; - struct od_cpu_dbs_info_s *od_dbs_info = gov->get_cpu_dbs_info_s(cpu); - - od_dbs_info->sample_type = OD_NORMAL_SAMPLE; - od_ops->powersave_bias_init_cpu(cpu); - } + gov->start(policy); gov_set_update_util(policy_dbs, sampling_rate); return 0; diff --git a/drivers/cpufreq/cpufreq_governor.h 
b/drivers/cpufreq/cpufreq_governor.h index 7b36393..2ae0ad5 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -205,6 +205,7 @@ struct dbs_governor { unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); int (*init)(struct dbs_data *dbs_data, bool notify); void (*exit)(struct dbs_data *dbs_data, bool notify); + void (*start)(struct cpufreq_policy *policy); /* Governor specific ops, see below */ void *gov_ops; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 330b588..de069f8 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -410,6 +410,15 @@ static void od_exit(struct dbs_data *dbs_data, bool notify) kfree(dbs_data->tuners); } +static void od_start(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + + dbs_info->sample_type = OD_NORMAL_SAMPLE; + od_ops.powersave_bias_init_cpu(cpu); +} + define_get_cpu_dbs_routines(od_cpu_dbs_info); static struct od_ops od_ops = { @@ -432,6 +441,7 @@ static struct dbs_governor od_dbs_gov = { .gov_ops = &od_ops, .init = od_init, .exit = od_exit, + .start = od_start, }; #define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov) -- cgit v0.10.2 From 8434dadbb457813a127f56d9f0fb7d22035027b9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:22:42 +0100 Subject: cpufreq: governor: Drop unused governor callback and data fields After some previous changes, the ->get_cpu_dbs_info_s governor callback and the "governor" field in struct dbs_governor (whose value represents the governor type) are not used any more, so drop them. Also drop the unused gov_ops field from struct dbs_governor. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index c11fe95..cdc7531 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -296,10 +296,8 @@ static struct dbs_governor cs_dbs_gov = { .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }, - .governor = GOV_CONSERVATIVE, .kobj_type = { .default_attrs = cs_attributes }, .get_cpu_cdbs = get_cpu_cdbs, - .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = cs_dbs_timer, .init = cs_init, .exit = cs_exit, diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 2ae0ad5..ee46f34 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -46,11 +46,6 @@ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \ { \ return &per_cpu(_dbs_info, cpu).cdbs; \ -} \ - \ -static void *get_cpu_dbs_info_s(int cpu) \ -{ \ - return &per_cpu(_dbs_info, cpu); \ } /* @@ -188,10 +183,6 @@ struct cs_dbs_tuners { /* Common Governor data across policies */ struct dbs_governor { struct cpufreq_governor gov; - - #define GOV_ONDEMAND 0 - #define GOV_CONSERVATIVE 1 - int governor; struct kobj_type kobj_type; /* @@ -201,14 +192,10 @@ struct dbs_governor { struct dbs_data *gdbs_data; struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); - void *(*get_cpu_dbs_info_s)(int cpu); unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); int (*init)(struct dbs_data *dbs_data, bool notify); void (*exit)(struct dbs_data *dbs_data, bool notify); void (*start)(struct cpufreq_policy *policy); - - /* Governor specific ops, see below */ - void *gov_ops; }; static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy) @@ -216,7 +203,7 @@ static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy return container_of(policy->governor, struct dbs_governor, gov); } -/* Governor specific ops, will be passed to dbs_data->gov_ops */ +/* Governor specific operations */ struct od_ops { void (*powersave_bias_init_cpu)(int cpu); unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index de069f8..41d239c 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -433,12 +433,9 @@ static struct dbs_governor od_dbs_gov = { .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }, - .governor = GOV_ONDEMAND, .kobj_type = { .default_attrs = od_attributes }, .get_cpu_cdbs = get_cpu_cdbs, - .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = od_dbs_timer, - .gov_ops = &od_ops, .init = od_init, .exit = od_exit, .start = od_start, -- cgit v0.10.2 From 76c5f66aa10720a377dfe8beebd39a0b2a938965 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:24:32 +0100 Subject: cpufreq: ondemand: Drop one more callback from struct od_ops The ->powersave_bias_init_cpu callback in struct od_ops is only used in one place and that invocation may be replaced with a direct call to the function pointed to by that callback, so change the code accordingly and drop the callback. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ee46f34..ec98065 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -205,7 +205,6 @@ static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy /* Governor specific operations */ struct od_ops { - void (*powersave_bias_init_cpu)(int cpu); unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, unsigned int freq_next, unsigned int relation); }; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 41d239c..393fcf1 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -416,13 +416,12 @@ static void od_start(struct cpufreq_policy *policy) struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); dbs_info->sample_type = OD_NORMAL_SAMPLE; - od_ops.powersave_bias_init_cpu(cpu); + ondemand_powersave_bias_init_cpu(cpu); } define_get_cpu_dbs_routines(od_cpu_dbs_info); static struct od_ops od_ops = { - .powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu, .powersave_bias_target = generic_powersave_bias_target, }; -- cgit v0.10.2 From a33cce1c6cc3268d8b4843bf1e4ac1e70b27d107 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:26:55 +0100 Subject: cpufreq: governor: Fix CPU load information updates via ->store The ->store() callbacks of some tunable sysfs attributes of the ondemand and conservative governors trigger immediate updates of the CPU load information for all CPUs "governed" by the given dbs_data by walking the cpu_dbs_info structures for all online CPUs in the system and updating them. This is questionable for two reasons. First, it may lead to a lot of extra overhead on a system with many CPUs if the given dbs_data is only associated with a few of them. Second, if governor tunables are per-policy, the CPUs associated with the other sets of governor tunables should not be updated. To address this issue, use the observation that in all of the places in question the update operation may be carried out in the same way (because all of the tunables involved are now located in struct dbs_data and readily available to the common code) and make the code in those places invoke the same (new) helper function that will carry out the update correctly. That new function always checks the ignore_nice_load tunable value and updates the CPUs' prev_cpu_nice data fields if that's set, which wasn't done by the original code in store_io_is_busy(), but it should have been done in there too. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index cdc7531..876984c 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -23,6 +23,8 @@ static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); +static struct dbs_governor cs_dbs_gov; + static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, struct cpufreq_policy *policy) { @@ -164,7 +166,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf, static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, const char *buf, size_t count) { - unsigned int input, j; + unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -180,15 +182,8 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ - for_each_online_cpu(j) { - struct cs_cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cs_cpu_dbs_info, j); - dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, - &dbs_info->cdbs.prev_cpu_wall, 0); - if (dbs_data->ignore_nice_load) - dbs_info->cdbs.prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - } + gov_update_cpu_data(&cs_dbs_gov, dbs_data); + return count; } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index badbd46..4b14f04 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -80,6 +80,36 @@ ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, } EXPORT_SYMBOL_GPL(store_sampling_rate); +/** + * gov_update_cpu_data - Update CPU load data. + * @gov: Governor whose data is to be updated. + * @dbs_data: Top-level governor data pointer. + * + * Update CPU load data for all CPUs in the domain governed by @dbs_data + * (that may be a single policy or a bunch of them if governor tunables are + * system-wide). + * + * Call under the @dbs_data mutex. 
+ */ +void gov_update_cpu_data(struct dbs_governor *gov, struct dbs_data *dbs_data) +{ + struct policy_dbs_info *policy_dbs; + + list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) { + unsigned int j; + + for_each_cpu(j, policy_dbs->policy->cpus) { + struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + + j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, + dbs_data->io_is_busy); + if (dbs_data->ignore_nice_load) + j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + } + } +} +EXPORT_SYMBOL_GPL(gov_update_cpu_data); + static inline struct dbs_data *to_dbs_data(struct kobject *kobj) { return container_of(kobj, struct dbs_data, kobj); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ec98065..5c7d1ea 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -218,4 +218,5 @@ void od_register_powersave_bias_handler(unsigned int (*f) void od_unregister_powersave_bias_handler(void); ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, size_t count); +void gov_update_cpu_data(struct dbs_governor *gov, struct dbs_data *dbs_data); #endif /* _CPUFREQ_GOVERNOR_H */ diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 393fcf1..216ea44 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -29,6 +29,7 @@ static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info); +static struct dbs_governor od_dbs_gov; static struct od_ops od_ops; static unsigned int default_powersave_bias; @@ -222,7 +223,6 @@ static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, { unsigned int input; int ret; - unsigned int j; ret = sscanf(buf, "%u", &input); if (ret != 1) @@ -230,12 +230,8 @@ static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, dbs_data->io_is_busy = !!input; /* we need to re-evaluate prev_cpu_idle */ - for_each_online_cpu(j) { - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, - j); - dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, - &dbs_info->cdbs.prev_cpu_wall, dbs_data->io_is_busy); - } + gov_update_cpu_data(&od_dbs_gov, dbs_data); + return count; } @@ -288,8 +284,6 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, unsigned int input; int ret; - unsigned int j; - ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; @@ -303,16 +297,8 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ - for_each_online_cpu(j) { - struct od_cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(od_cpu_dbs_info, j); - dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, - &dbs_info->cdbs.prev_cpu_wall, dbs_data->io_is_busy); - if (dbs_data->ignore_nice_load) - dbs_info->cdbs.prev_cpu_nice = - kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + gov_update_cpu_data(&od_dbs_gov, dbs_data); - } return count; } -- cgit v0.10.2 From d1db75fffc22504c586c3fae8d602384ea899340 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:28:24 +0100 Subject: cpufreq: ondemand: Rework the handling of powersave bias updates The ondemand_powersave_bias_init() function used for resetting data fields related to the powersave bias tunable of the ondemand governor works by walking all of the online CPUs in the system and updating the od_cpu_dbs_info_s structures for all of them. 
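In outline, the rework below replaces a system-wide CPU walk with a walk of the policies attached to the given tunables object; this is a condensed sketch of the two loop shapes taken from the diff that follows, not standalone code:

	/* Old shape: reset state for every online CPU in the system. */
	for_each_online_cpu(i)
		ondemand_powersave_bias_init_cpu(i);

	/* New shape: touch only the policies sharing this dbs_data. */
	list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list)
		ondemand_powersave_bias_init(policy_dbs->policy);
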
However, if governor tunables are per policy, the update should not touch the CPUs that are not associated with the given dbs_data. Moreover, since the data fields in question are only ever used for policy->cpu in each policy governed by ondemand, the update can be limited to those specific CPUs. Rework the code to take the above observations into account. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 216ea44..43d89f6 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -34,14 +34,6 @@ static struct od_ops od_ops; static unsigned int default_powersave_bias; -static void ondemand_powersave_bias_init_cpu(int cpu) -{ - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); - - dbs_info->freq_table = cpufreq_frequency_get_table(cpu); - dbs_info->freq_lo = 0; -} - /* * Not all CPUs want IO time to be accounted as busy; this depends on how * efficient idling at a higher frequency/voltage is. @@ -120,12 +112,13 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, return freq_hi; } -static void ondemand_powersave_bias_init(void) +static void ondemand_powersave_bias_init(struct cpufreq_policy *policy) { - int i; - for_each_online_cpu(i) { - ondemand_powersave_bias_init_cpu(i); - } + unsigned int cpu = policy->cpu; + struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + + dbs_info->freq_table = cpufreq_frequency_get_table(cpu); + dbs_info->freq_lo = 0; } static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) @@ -306,6 +299,7 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf, size_t count) { struct od_dbs_tuners *od_tuners = dbs_data->tuners; + struct policy_dbs_info *policy_dbs; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -317,7 +311,10 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf, input = 1000; od_tuners->powersave_bias = input; - ondemand_powersave_bias_init(); + + list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) + ondemand_powersave_bias_init(policy_dbs->policy); + return count; } @@ -398,11 +395,10 @@ static void od_exit(struct dbs_data *dbs_data, bool notify) static void od_start(struct cpufreq_policy *policy) { - unsigned int cpu = policy->cpu; - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); dbs_info->sample_type = OD_NORMAL_SAMPLE; - ondemand_powersave_bias_init_cpu(cpu); + ondemand_powersave_bias_init(policy); } define_get_cpu_dbs_routines(od_cpu_dbs_info); -- cgit v0.10.2 From 7d5a9956af4ccf7d5cc0cd1f8d27d1691321bfc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 18:40:14 +0100 Subject: cpufreq: governor: Make governor private data per-policy Some fields in struct od_cpu_dbs_info_s and struct cs_cpu_dbs_info_s are only used for a limited set of CPUs. Namely, if a policy is shared between multiple CPUs, those fields will only be used for one of them (policy->cpu). This means that they really are per-policy rather than per-CPU and holding room for them in per-CPU data structures is generally wasteful. Also moving those fields into per-policy data structures will allow some significant simplifications to be made going forward. For this reason, introduce struct cs_policy_dbs_info and struct od_policy_dbs_info to hold those fields. 
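The embedding relies on the standard container_of() idiom: the common structure is embedded in a governor-specific one, and a pointer to the embedded member can be converted back to the enclosing object. A self-contained miniature of the idiom (with made-up names base/outer rather than the kernel's types, and a simplified macro where the kernel's container_of() adds extra type checking) is:

#include <stddef.h>

/*
 * Recover the enclosing structure from a pointer to one of its
 * members by subtracting the member's offset within that structure.
 */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct base { int id; };	/* common, embedded part */

struct outer {
	struct base base;	/* embedded; need not be the first member */
	int extra;		/* type-specific part */
};

static inline struct outer *to_outer(struct base *b)
{
	return container_of(b, struct outer, base);
}
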
Define each of the new structures as an extension of struct policy_dbs_info (such that struct policy_dbs_info is embedded in each of them) and introduce new ->alloc and ->free governor callbacks to allocate and free those structures, respectively, such that ->alloc() will return a pointer to the struct policy_dbs_info embedded in the allocated data structure and ->free() will take that pointer as its argument. With that, modify the code accessing the data fields in question in per-CPU data objects to look for them in the new structures via the struct policy_dbs_info pointer available to it and drop them from struct od_cpu_dbs_info_s and struct cs_cpu_dbs_info_s. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index 82ae100..404360c 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -21,7 +21,7 @@ #include #include -#include "cpufreq_governor.h" +#include "cpufreq_ondemand.h" #define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL 0xc0010080 #define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE 0xc0010081 @@ -48,8 +48,7 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *od_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = od_data->tuners; - struct od_cpu_dbs_info_s *od_info = - dbs_governor_of(policy)->get_cpu_dbs_info_s(policy->cpu); + struct od_policy_dbs_info *od_info = to_dbs_info(policy_dbs); if (!od_info->freq_table) return freq_next; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 876984c..ffffda2 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -14,6 +14,17 @@ #include #include "cpufreq_governor.h" +struct cs_policy_dbs_info { + struct policy_dbs_info policy_dbs; + unsigned int down_skip; + unsigned int requested_freq; +}; + +static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs) +{ + return container_of(policy_dbs, struct cs_policy_dbs_info, policy_dbs); +} + /* Conservative governor macros */ #define DEF_FREQUENCY_UP_THRESHOLD (80) #define DEF_FREQUENCY_DOWN_THRESHOLD (20) @@ -48,8 +59,8 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, */ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { - struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, policy->cpu); struct policy_dbs_info *policy_dbs = policy->governor_data; + struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int load = dbs_update(policy); @@ -238,6 +249,19 @@ static struct attribute *cs_attributes[] = { /************************** sysfs end ************************/ +static struct policy_dbs_info *cs_alloc(void) +{ + struct cs_policy_dbs_info *dbs_info; + + dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL); + return dbs_info ? 
&dbs_info->policy_dbs : NULL; +} + +static void cs_free(struct policy_dbs_info *policy_dbs) +{ + kfree(to_dbs_info(policy_dbs)); +} + static int cs_init(struct dbs_data *dbs_data, bool notify) { struct cs_dbs_tuners *tuners; @@ -276,7 +300,7 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify) static void cs_start(struct cpufreq_policy *policy) { - struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, policy->cpu); + struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data); dbs_info->down_skip = 0; dbs_info->requested_freq = policy->cur; @@ -294,6 +318,8 @@ static struct dbs_governor cs_dbs_gov = { .kobj_type = { .default_attrs = cs_attributes }, .get_cpu_cdbs = get_cpu_cdbs, .gov_dbs_timer = cs_dbs_timer, + .alloc = cs_alloc, + .free = cs_free, .init = cs_init, .exit = cs_exit, .start = cs_start, @@ -305,9 +331,8 @@ static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cs_cpu_dbs_info_s *dbs_info = - &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu); + struct cs_policy_dbs_info *dbs_info; if (!policy) return 0; @@ -316,6 +341,7 @@ static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (policy->governor != CPU_FREQ_GOV_CONSERVATIVE) return 0; + dbs_info = to_dbs_info(policy->governor_data); /* * we only care if our internally tracked freq moves outside the 'valid' * ranges of frequency available to us otherwise we do not change it diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 4b14f04..6cbc846 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -385,8 +385,8 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli struct policy_dbs_info *policy_dbs; int j; - /* Allocate memory for the common information for policy->cpus */ - policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL); + /* Allocate memory for per-policy governor data. 
*/ + policy_dbs = gov->alloc(); if (!policy_dbs) return NULL; @@ -421,7 +421,7 @@ static void free_policy_dbs_info(struct cpufreq_policy *policy, j_cdbs->policy_dbs = NULL; j_cdbs->update_util.func = NULL; } - kfree(policy_dbs); + gov->free(policy_dbs); } static int cpufreq_governor_init(struct cpufreq_policy *policy) @@ -582,7 +582,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) static int cpufreq_governor_stop(struct cpufreq_policy *policy) { gov_cancel_work(policy); - return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 5c7d1ea..354e0d3 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -157,17 +157,10 @@ struct cpu_dbs_info { struct od_cpu_dbs_info_s { struct cpu_dbs_info cdbs; - struct cpufreq_frequency_table *freq_table; - unsigned int freq_lo; - unsigned int freq_lo_delay_us; - unsigned int freq_hi_delay_us; - unsigned int sample_type:1; }; struct cs_cpu_dbs_info_s { struct cpu_dbs_info cdbs; - unsigned int down_skip; - unsigned int requested_freq; }; /* Per policy Governors sysfs tunables */ @@ -193,6 +186,8 @@ struct dbs_governor { struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); + struct policy_dbs_info *(*alloc)(void); + void (*free)(struct policy_dbs_info *policy_dbs); int (*init)(struct dbs_data *dbs_data, bool notify); void (*exit)(struct dbs_data *dbs_data, bool notify); void (*start)(struct cpufreq_policy *policy); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 43d89f6..cdf4316 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -16,7 +16,8 @@ #include #include #include -#include "cpufreq_governor.h" + +#include "cpufreq_ondemand.h" /* On-demand governor macros */ #define DEF_FREQUENCY_UP_THRESHOLD (80) @@ -69,9 +70,8 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int delay_hi_us; - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, - policy->cpu); struct policy_dbs_info *policy_dbs = policy->governor_data; + struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; @@ -114,10 +114,9 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, static void ondemand_powersave_bias_init(struct cpufreq_policy *policy) { - unsigned int cpu = policy->cpu; - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data); - dbs_info->freq_table = cpufreq_frequency_get_table(cpu); + dbs_info->freq_table = cpufreq_frequency_get_table(policy->cpu); dbs_info->freq_lo = 0; } @@ -144,8 +143,8 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) */ static void od_update(struct cpufreq_policy *policy) { - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); - struct policy_dbs_info *policy_dbs = dbs_info->cdbs.policy_dbs; + struct policy_dbs_info *policy_dbs = policy->governor_data; + struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); struct dbs_data *dbs_data = policy_dbs->dbs_data; struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int load = dbs_update(policy); @@ -182,7 +181,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy 
*policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); + struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); int sample_type = dbs_info->sample_type; /* Common NORMAL_SAMPLE setup */ @@ -347,6 +346,19 @@ static struct attribute *od_attributes[] = { /************************** sysfs end ************************/ +static struct policy_dbs_info *od_alloc(void) +{ + struct od_policy_dbs_info *dbs_info; + + dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL); + return dbs_info ? &dbs_info->policy_dbs : NULL; +} + +static void od_free(struct policy_dbs_info *policy_dbs) +{ + kfree(to_dbs_info(policy_dbs)); +} + static int od_init(struct dbs_data *dbs_data, bool notify) { struct od_dbs_tuners *tuners; @@ -395,7 +407,7 @@ static void od_exit(struct dbs_data *dbs_data, bool notify) static void od_start(struct cpufreq_policy *policy) { - struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); + struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data); dbs_info->sample_type = OD_NORMAL_SAMPLE; ondemand_powersave_bias_init(policy); @@ -417,6 +429,8 @@ static struct dbs_governor od_dbs_gov = { .kobj_type = { .default_attrs = od_attributes }, .get_cpu_cdbs = get_cpu_cdbs, .gov_dbs_timer = od_dbs_timer, + .alloc = od_alloc, + .free = od_free, .init = od_init, .exit = od_exit, .start = od_start, diff --git a/drivers/cpufreq/cpufreq_ondemand.h b/drivers/cpufreq/cpufreq_ondemand.h new file mode 100644 index 0000000..22403e4 --- /dev/null +++ b/drivers/cpufreq/cpufreq_ondemand.h @@ -0,0 +1,26 @@ +/* + * Header file for CPUFreq ondemand governor and related code. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "cpufreq_governor.h" + +struct od_policy_dbs_info { + struct policy_dbs_info policy_dbs; + struct cpufreq_frequency_table *freq_table; + unsigned int freq_lo; + unsigned int freq_lo_delay_us; + unsigned int freq_hi_delay_us; + unsigned int sample_type:1; +}; + +static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs) +{ + return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs); +} -- cgit v0.10.2 From 8c8f77fd0719a079450f59debed4f69ede825adb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 21 Feb 2016 00:51:27 +0100 Subject: cpufreq: governor: Move per-CPU data to the common code After previous changes there is only one piece of code in the ondemand governor making references to per-CPU data structures, but it can be easily modified to avoid doing that, so modify it accordingly and move the definition of per-CPU data used by the ondemand and conservative governors to the common code. Next, change that code to access the per-CPU data structures directly rather than via a governor callback. This causes the ->get_cpu_cdbs governor callback to become unnecessary, so drop it along with the macro and function definitions related to it. Finally, drop the definitions of struct od_cpu_dbs_info_s and struct cs_cpu_dbs_info_s that aren't necessary any more. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index ffffda2..5d1edc5 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -32,10 +32,6 @@ static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *pol #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (10) -static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); - -static struct dbs_governor cs_dbs_gov; - static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, struct cpufreq_policy *policy) { @@ -193,7 +189,7 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ - gov_update_cpu_data(&cs_dbs_gov, dbs_data); + gov_update_cpu_data(dbs_data); return count; } @@ -306,8 +302,6 @@ static void cs_start(struct cpufreq_policy *policy) dbs_info->requested_freq = policy->cur; } -define_get_cpu_dbs_routines(cs_cpu_dbs_info); - static struct dbs_governor cs_dbs_gov = { .gov = { .name = "conservative", @@ -316,7 +310,6 @@ static struct dbs_governor cs_dbs_gov = { .owner = THIS_MODULE, }, .kobj_type = { .default_attrs = cs_attributes }, - .get_cpu_cdbs = get_cpu_cdbs, .gov_dbs_timer = cs_dbs_timer, .alloc = cs_alloc, .free = cs_free, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 6cbc846..75217b8 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -22,6 +22,8 @@ #include "cpufreq_governor.h" +static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs); + DEFINE_MUTEX(dbs_data_mutex); EXPORT_SYMBOL_GPL(dbs_data_mutex); @@ -82,7 +84,6 @@ EXPORT_SYMBOL_GPL(store_sampling_rate); /** * gov_update_cpu_data - Update CPU load data. - * @gov: Governor whose data is to be updated. * @dbs_data: Top-level governor data pointer. * * Update CPU load data for all CPUs in the domain governed by @dbs_data @@ -91,7 +92,7 @@ EXPORT_SYMBOL_GPL(store_sampling_rate); * * Call under the @dbs_data mutex. 
*/ -void gov_update_cpu_data(struct dbs_governor *gov, struct dbs_data *dbs_data) +void gov_update_cpu_data(struct dbs_data *dbs_data) { struct policy_dbs_info *policy_dbs; @@ -99,7 +100,7 @@ void gov_update_cpu_data(struct dbs_governor *gov, struct dbs_data *dbs_data) unsigned int j; for_each_cpu(j, policy_dbs->policy->cpus) { - struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, dbs_data->io_is_busy); @@ -164,7 +165,6 @@ static const struct sysfs_ops governor_sysfs_ops = { unsigned int dbs_update(struct cpufreq_policy *policy) { - struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; unsigned int ignore_nice = dbs_data->ignore_nice_load; @@ -187,13 +187,11 @@ unsigned int dbs_update(struct cpufreq_policy *policy) /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { - struct cpu_dbs_info *j_cdbs; + struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); u64 cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; unsigned int load; - j_cdbs = gov->get_cpu_cdbs(j); - cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); wall_time = cur_wall_time - j_cdbs->prev_cpu_wall; @@ -268,14 +266,13 @@ void gov_set_update_util(struct policy_dbs_info *policy_dbs, unsigned int delay_us) { struct cpufreq_policy *policy = policy_dbs->policy; - struct dbs_governor *gov = dbs_governor_of(policy); int cpu; gov_update_sample_delay(policy_dbs, delay_us); policy_dbs->last_sample_time = 0; for_each_cpu(cpu, policy->cpus) { - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = &per_cpu(cpu_dbs, cpu); cpufreq_set_update_util_data(cpu, &cdbs->update_util); } @@ -398,7 +395,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli /* Set policy_dbs for all CPUs, online+offline */ for_each_cpu(j, policy->related_cpus) { - struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); j_cdbs->policy_dbs = policy_dbs; j_cdbs->update_util.func = dbs_update_util_handler; @@ -406,17 +403,15 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli return policy_dbs; } -static void free_policy_dbs_info(struct cpufreq_policy *policy, +static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, struct dbs_governor *gov) { - struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu); - struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; int j; mutex_destroy(&policy_dbs->timer_mutex); - for_each_cpu(j, policy->related_cpus) { - struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j); + for_each_cpu(j, policy_dbs->policy->related_cpus) { + struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); j_cdbs->policy_dbs = NULL; j_cdbs->update_util.func = NULL; @@ -507,7 +502,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) kfree(dbs_data); free_policy_dbs_info: - free_policy_dbs_info(policy, gov); + free_policy_dbs_info(policy_dbs, gov); return ret; } @@ -538,7 +533,7 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) policy->governor_data = NULL; } - free_policy_dbs_info(policy, gov); + free_policy_dbs_info(policy_dbs, gov); return 0; } @@ -561,7 +556,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) io_busy = dbs_data->io_is_busy; for_each_cpu(j, policy->cpus) { - struct cpu_dbs_info *j_cdbs = 
gov->get_cpu_cdbs(j); + struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); unsigned int prev_load; j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 354e0d3..58749da 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -41,13 +41,6 @@ /* Ondemand Sampling types */ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; -/* create helper routines */ -#define define_get_cpu_dbs_routines(_dbs_info) \ -static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \ -{ \ - return &per_cpu(_dbs_info, cpu).cdbs; \ -} - /* * Abbreviations: * dbs: used as a shortform for demand based switching It helps to keep variable @@ -155,14 +148,6 @@ struct cpu_dbs_info { struct policy_dbs_info *policy_dbs; }; -struct od_cpu_dbs_info_s { - struct cpu_dbs_info cdbs; -}; - -struct cs_cpu_dbs_info_s { - struct cpu_dbs_info cdbs; -}; - /* Per policy Governors sysfs tunables */ struct od_dbs_tuners { unsigned int powersave_bias; @@ -184,7 +169,6 @@ struct dbs_governor { */ struct dbs_data *gdbs_data; - struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); struct policy_dbs_info *(*alloc)(void); void (*free)(struct policy_dbs_info *policy_dbs); @@ -213,5 +197,5 @@ void od_register_powersave_bias_handler(unsigned int (*f) void od_unregister_powersave_bias_handler(void); ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, size_t count); -void gov_update_cpu_data(struct dbs_governor *gov, struct dbs_data *dbs_data); +void gov_update_cpu_data(struct dbs_data *dbs_data); #endif /* _CPUFREQ_GOVERNOR_H */ diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index cdf4316..acd8027 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -28,9 +28,6 @@ #define MIN_FREQUENCY_UP_THRESHOLD (11) #define MAX_FREQUENCY_UP_THRESHOLD (100) -static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info); - -static struct dbs_governor od_dbs_gov; static struct od_ops od_ops; static unsigned int default_powersave_bias; @@ -222,7 +219,7 @@ static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, dbs_data->io_is_busy = !!input; /* we need to re-evaluate prev_cpu_idle */ - gov_update_cpu_data(&od_dbs_gov, dbs_data); + gov_update_cpu_data(dbs_data); return count; } @@ -289,7 +286,7 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, dbs_data->ignore_nice_load = input; /* we need to re-evaluate prev_cpu_idle */ - gov_update_cpu_data(&od_dbs_gov, dbs_data); + gov_update_cpu_data(dbs_data); return count; } @@ -413,8 +410,6 @@ static void od_start(struct cpufreq_policy *policy) ondemand_powersave_bias_init(policy); } -define_get_cpu_dbs_routines(od_cpu_dbs_info); - static struct od_ops od_ops = { .powersave_bias_target = generic_powersave_bias_target, }; @@ -427,7 +422,6 @@ static struct dbs_governor od_dbs_gov = { .owner = THIS_MODULE, }, .kobj_type = { .default_attrs = od_attributes }, - .get_cpu_cdbs = get_cpu_cdbs, .gov_dbs_timer = od_dbs_timer, .alloc = od_alloc, .free = od_free, @@ -440,9 +434,6 @@ static struct dbs_governor od_dbs_gov = { static void od_set_powersave_bias(unsigned int powersave_bias) { - struct cpufreq_policy *policy; - struct dbs_data *dbs_data; - struct od_dbs_tuners *od_tuners; unsigned int cpu; cpumask_t done; @@ -451,21 +442,24 @@ static void od_set_powersave_bias(unsigned int powersave_bias) 
get_online_cpus(); for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; struct policy_dbs_info *policy_dbs; + struct dbs_data *dbs_data; + struct od_dbs_tuners *od_tuners; if (cpumask_test_cpu(cpu, &done)) continue; - policy_dbs = per_cpu(od_cpu_dbs_info, cpu).cdbs.policy_dbs; + policy = cpufreq_cpu_get_raw(cpu); + if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND) + continue; + + policy_dbs = policy->governor_data; if (!policy_dbs) continue; - policy = policy_dbs->policy; cpumask_or(&done, &done, policy->cpus); - if (policy->governor != CPU_FREQ_GOV_ONDEMAND) - continue; - dbs_data = policy_dbs->dbs_data; od_tuners = dbs_data->tuners; od_tuners->powersave_bias = default_powersave_bias; -- cgit v0.10.2 From 47ebaac1f32dc606262be48a72f9cea6af376414 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 18:41:36 +0100 Subject: cpufreq: governor: Relocate definitions of tuners structures Move the definitions of struct od_dbs_tuners and struct cs_dbs_tuners from the common governor header to the ondemand and conservative governor code, respectively, as they don't need to be in the common header any more. No functional changes. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 5d1edc5..bf4913f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -25,6 +25,11 @@ static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *pol return container_of(policy_dbs, struct cs_policy_dbs_info, policy_dbs); } +struct cs_dbs_tuners { + unsigned int down_threshold; + unsigned int freq_step; +}; + /* Conservative governor macros */ #define DEF_FREQUENCY_UP_THRESHOLD (80) #define DEF_FREQUENCY_DOWN_THRESHOLD (20) diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 58749da..ece70ab 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -148,16 +148,6 @@ struct cpu_dbs_info { struct policy_dbs_info *policy_dbs; }; -/* Per policy Governors sysfs tunables */ -struct od_dbs_tuners { - unsigned int powersave_bias; -}; - -struct cs_dbs_tuners { - unsigned int down_threshold; - unsigned int freq_step; -}; - /* Common Governor data across policies */ struct dbs_governor { struct cpufreq_governor gov; diff --git a/drivers/cpufreq/cpufreq_ondemand.h b/drivers/cpufreq/cpufreq_ondemand.h index 22403e4..f0121db 100644 --- a/drivers/cpufreq/cpufreq_ondemand.h +++ b/drivers/cpufreq/cpufreq_ondemand.h @@ -24,3 +24,7 @@ static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *pol { return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs); } + +struct od_dbs_tuners { + unsigned int powersave_bias; +}; -- cgit v0.10.2 From e3f5ed9393042188a1716d3873415ef44161addf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Feb 2016 02:33:43 +0100 Subject: cpufreq: governor: Make dbs_data_mutex static That mutex is only used by cpufreq_governor_dbs() and it doesn't need to be exported to modules, so make it static and drop the export incantation. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 75217b8..4f0bd48 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -24,8 +24,7 @@ static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs); -DEFINE_MUTEX(dbs_data_mutex); -EXPORT_SYMBOL_GPL(dbs_data_mutex); +static DEFINE_MUTEX(dbs_data_mutex); /* Common sysfs tunables */ /** diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ece70ab..61ff82f 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -178,7 +178,6 @@ struct od_ops { unsigned int freq_next, unsigned int relation); }; -extern struct mutex dbs_data_mutex; unsigned int dbs_update(struct cpufreq_policy *policy); int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) -- cgit v0.10.2 From 1112e9d83e5cd153b35dfbb52721f8b3d3163016 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 21 Feb 2016 00:53:06 +0100 Subject: cpufreq: governor: Narrow down the dbs_data_mutex coverage Since cpufreq_governor_dbs() is now always called with policy->rwsem held, it cannot be executed twice in parallel for the same policy. Thus it is not necessary to hold dbs_data_mutex around the invocations of cpufreq_governor_start/stop/limits() from it as those functions never modify any data that can be shared between different policies. However, cpufreq_governor_dbs() may be executed twice in parallel for different policies using the same gov->gdbs_data object and dbs_data_mutex is still necessary to protect that object against concurrent updates. For this reason, narrow down the dbs_data_mutex locking to cpufreq_governor_init/exit() where it is needed and rename the mutex to gov_dbs_data_mutex to reflect its purpose. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 4f0bd48..542c9ca 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -24,7 +24,7 @@ static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs); -static DEFINE_MUTEX(dbs_data_mutex); +static DEFINE_MUTEX(gov_dbs_data_mutex); /* Common sysfs tunables */ /** @@ -421,10 +421,10 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, static int cpufreq_governor_init(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); - struct dbs_data *dbs_data = gov->gdbs_data; + struct dbs_data *dbs_data; struct policy_dbs_info *policy_dbs; unsigned int latency; - int ret; + int ret = 0; /* State should be equivalent to EXIT */ if (policy->governor_data) @@ -434,6 +434,10 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) if (!policy_dbs) return -ENOMEM; + /* Protect gov->gdbs_data against concurrent updates.
*/ + mutex_lock(&gov_dbs_data_mutex); + + dbs_data = gov->gdbs_data; if (dbs_data) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; @@ -446,8 +450,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) dbs_data->usage_count++; list_add(&policy_dbs->list, &dbs_data->policy_dbs_list); mutex_unlock(&dbs_data->mutex); - - return 0; + goto out; } dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL); @@ -488,7 +491,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) get_governor_parent_kobj(policy), "%s", gov->gov.name); if (!ret) - return 0; + goto out; /* Failure, so roll back. */ pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret); @@ -502,6 +505,9 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy) free_policy_dbs_info: free_policy_dbs_info(policy_dbs, gov); + +out: + mutex_unlock(&gov_dbs_data_mutex); return ret; } @@ -512,6 +518,9 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy_dbs->dbs_data; int count; + /* Protect gov->gdbs_data against concurrent updates. */ + mutex_lock(&gov_dbs_data_mutex); + mutex_lock(&dbs_data->mutex); list_del(&policy_dbs->list); count = --dbs_data->usage_count; @@ -533,6 +542,8 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) } free_policy_dbs_info(policy_dbs, gov); + + mutex_unlock(&gov_dbs_data_mutex); return 0; } @@ -599,31 +610,20 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { - int ret = -EINVAL; - - /* Lock governor to block concurrent initialization of governor */ - mutex_lock(&dbs_data_mutex); - if (event == CPUFREQ_GOV_POLICY_INIT) { - ret = cpufreq_governor_init(policy); + return cpufreq_governor_init(policy); } else if (policy->governor_data) { switch (event) { case CPUFREQ_GOV_POLICY_EXIT: - ret = cpufreq_governor_exit(policy); - break; + return cpufreq_governor_exit(policy); case CPUFREQ_GOV_START: - ret = cpufreq_governor_start(policy); - break; + return cpufreq_governor_start(policy); case CPUFREQ_GOV_STOP: - ret = cpufreq_governor_stop(policy); - break; + return cpufreq_governor_stop(policy); case CPUFREQ_GOV_LIMITS: - ret = cpufreq_governor_limits(policy); - break; + return cpufreq_governor_limits(policy); } } - - mutex_unlock(&dbs_data_mutex); - return ret; + return -EINVAL; } EXPORT_SYMBOL_GPL(cpufreq_governor_dbs); -- cgit v0.10.2 From 94ab5e030fe10cfcc700050cc21535b824943077 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 21 Feb 2016 03:15:34 +0100 Subject: cpufreq: governor: Make gov_set_update_util() static The gov_set_update_util() routine is only used internally by the common governor code and it doesn't need to be exported, so make it static. No functional changes. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 542c9ca..c9a571f 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -261,8 +261,8 @@ unsigned int dbs_update(struct cpufreq_policy *policy) } EXPORT_SYMBOL_GPL(dbs_update); -void gov_set_update_util(struct policy_dbs_info *policy_dbs, - unsigned int delay_us) +static void gov_set_update_util(struct policy_dbs_info *policy_dbs, + unsigned int delay_us) { struct cpufreq_policy *policy = policy_dbs->policy; int cpu; @@ -276,7 +276,6 @@ void gov_set_update_util(struct policy_dbs_info *policy_dbs, cpufreq_set_update_util_data(cpu, &cdbs->update_util); } } -EXPORT_SYMBOL_GPL(gov_set_update_util); static inline void gov_clear_update_util(struct cpufreq_policy *policy) { -- cgit v0.10.2 From 27de34823984e844f5dc042d39bb43f5dc98966f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Feb 2016 14:14:34 +0100 Subject: cpufreq: governor: Fix race in dbs_update_util_handler() There is a scenario that may lead to undesired results in dbs_update_util_handler(). Namely, if two CPUs sharing a policy enter the function at the same time, pass the sample delay check and then one of them is stalled until dbs_work_handler() (queued up by the other CPU) clears the work counter, it may update the work counter and queue up another work item prematurely. To prevent that from happening, use the observation that the CPU queuing up a work item in dbs_update_util_handler() updates the last sample time. This means that if another CPU was stalling after passing the sample delay check and now successfully updated the work counter as a result of the race described above, it will see the new value of the last sample time which is different from what it used in the sample delay check before. If that happens, the sample delay check passed previously is not valid any more, so the CPU should not continue. Fixes: f17cbb53783c (cpufreq: governor: Avoid atomic operations in hot paths) Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar
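As an editor's illustration (not part of the patch), the shape of the fixed handler is roughly the following; the sketch strips out the smp_rmb() and the work_in_progress fast path that the hunks below keep:

	u64 lst = READ_ONCE(policy_dbs->last_sample_time);

	if ((s64)(time - lst) < policy_dbs->sample_delay_ns)
		return;

	if (policy_dbs->is_shared) {
		/* Only one CPU sharing the policy may queue up the work. */
		if (!atomic_add_unless(&policy_dbs->work_count, 1, 1))
			return;

		/*
		 * If last_sample_time moved while this CPU was stalled, the
		 * delay check above used a stale value: undo and bail out.
		 */
		if (unlikely(lst != READ_ONCE(policy_dbs->last_sample_time))) {
			atomic_set(&policy_dbs->work_count, 0);
			return;
		}
	}
	policy_dbs->last_sample_time = time;
	/* ... mark work in progress and queue up the work item ... */
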
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index c9a571f..064582a 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -340,7 +340,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, { struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util); struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; - u64 delta_ns; + u64 delta_ns, lst; /* * The work may not be allowed to be queued up right now. @@ -356,7 +356,8 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * of sample_delay_ns used in the computation may be stale. */ smp_rmb(); - delta_ns = time - policy_dbs->last_sample_time; + lst = READ_ONCE(policy_dbs->last_sample_time); + delta_ns = time - lst; if ((s64)delta_ns < policy_dbs->sample_delay_ns) return; @@ -365,9 +366,19 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, * at this point. Otherwise, we need to ensure that only one of the * CPUs sharing the policy will do that. */ - if (policy_dbs->is_shared && - !atomic_add_unless(&policy_dbs->work_count, 1, 1)) - return; + if (policy_dbs->is_shared) { + if (!atomic_add_unless(&policy_dbs->work_count, 1, 1)) + return; + + /* + * If another CPU updated last_sample_time in the meantime, we + * shouldn't be here, so clear the work counter and bail out. + */ + if (unlikely(lst != READ_ONCE(policy_dbs->last_sample_time))) { + atomic_set(&policy_dbs->work_count, 0); + return; + } + } policy_dbs->last_sample_time = time; policy_dbs->work_in_progress = true; -- cgit v0.10.2 From f737236b128cac7c355d0650a98c42ae4313f3f1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 14:18:20 +0530 Subject: cpufreq: governor: Drop unnecessary checks from show() and store() The show() and store() routines in the cpufreq-governor core don't need to check if the struct governor_attr they want to use really provides the callbacks they need as expected (if that's not the case, it means a bug in the code anyway), so change them to avoid doing that. Also change the error value to -EBUSY if the governor is getting removed and we aren't allowed to store any more changes. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 064582a..70079e2 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -125,12 +125,8 @@ static ssize_t governor_show(struct kobject *kobj, struct attribute *attr, { struct dbs_data *dbs_data = to_dbs_data(kobj); struct governor_attr *gattr = to_gov_attr(attr); - int ret = -EIO; - if (gattr->show) - ret = gattr->show(dbs_data, buf); - - return ret; + return gattr->show(dbs_data, buf); } static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, @@ -138,11 +134,11 @@ static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, { struct dbs_data *dbs_data = to_dbs_data(kobj); struct governor_attr *gattr = to_gov_attr(attr); - int ret = -EIO; + int ret = -EBUSY; mutex_lock(&dbs_data->mutex); - if (dbs_data->usage_count && gattr->store) + if (dbs_data->usage_count) ret = gattr->store(dbs_data, buf, count); mutex_unlock(&dbs_data->mutex); -- cgit v0.10.2
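For context on the commit above: the checks could be dropped because every governor tunable in this series defines both callbacks. A minimal sketch of the convention (the gov_attr_rw() helper shown here is assumed from the series' governor header, slightly simplified):

	struct governor_attr {
		struct attribute attr;
		ssize_t (*show)(struct dbs_data *dbs_data, char *buf);
		ssize_t (*store)(struct dbs_data *dbs_data, const char *buf,
				 size_t count);
	};

	/* A read-write tunable always supplies show_##_name and store_##_name. */
	#define gov_attr_rw(_name)					\
	static struct governor_attr _name =				\
	__ATTR(_name, 0644, show_##_name, store_##_name)
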
From 11eb69b984aae216ae43c79d2d43441ee68a63ca Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 16:36:42 +0530 Subject: cpufreq: Relocate handle_update() to kill its declaration handle_update() is declared at the top of the file as its users appear before its definition. Relocate the routine to get rid of this. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bc93272..316beff 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -159,7 +159,6 @@ static inline bool has_target(void) static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); static unsigned int __cpufreq_get(struct cpufreq_policy *policy); -static void handle_update(struct work_struct *work); /** * Two notifier lists: the "policy" list is involved in the @@ -1072,6 +1071,15 @@ unlock: return ret; } +static void handle_update(struct work_struct *work) +{ + struct cpufreq_policy *policy = + container_of(work, struct cpufreq_policy, update); + unsigned int cpu = policy->cpu; + pr_debug("handle_update for cpu %u called\n", cpu); + cpufreq_update_policy(cpu); +} + static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); @@ -1453,15 +1461,6 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) cpufreq_policy_free(policy, true); } -static void handle_update(struct work_struct *work) -{ - struct cpufreq_policy *policy = - container_of(work, struct cpufreq_policy, update); - unsigned int cpu = policy->cpu; - pr_debug("handle_update for cpu %u called\n", cpu); - cpufreq_update_policy(cpu); -} - /** * cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're * in deep trouble. -- cgit v0.10.2 From a1317e091ab1386812ee8ab4e3bbd89f2811bc74 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 16:36:43 +0530 Subject: cpufreq: Rename __cpufreq_governor() to cpufreq_governor() The __ at the beginning of the routine name isn't really necessary at all. Rename it to cpufreq_governor() instead. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 316beff..b3d05a9 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -156,8 +156,7 @@ static inline bool has_target(void) } /* internal prototypes */ -static int __cpufreq_governor(struct cpufreq_policy *policy, - unsigned int event); +static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); static unsigned int __cpufreq_get(struct cpufreq_policy *policy); /** @@ -1048,7 +1047,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp down_write(&policy->rwsem); if (has_target()) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) { pr_err("%s: Failed to stop governor\n", __func__); goto unlock; @@ -1058,9 +1057,9 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp cpumask_set_cpu(cpu, policy->cpus); if (has_target()) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) - ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); if (ret) pr_err("%s: Failed to start governor\n", __func__); @@ -1382,7 +1381,7 @@ static void cpufreq_offline(unsigned int cpu) down_write(&policy->rwsem); if (has_target()) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) pr_err("%s: Failed to stop governor\n", __func__); } @@ -1403,9 +1402,9 @@ static void cpufreq_offline(unsigned int cpu) /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { if (has_target()) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) - ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); if (ret) pr_err("%s: Failed to start governor\n", __func__); @@ -1419,7 +1418,7 @@ static void cpufreq_offline(unsigned int cpu) /* If cpu is last user of policy, free policy */ if (has_target()) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); + ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); if (ret) pr_err("%s: Failed to exit governor\n", __func__); } @@ -1635,7 +1634,7 @@ void cpufreq_suspend(void) for_each_active_policy(policy) { down_write(&policy->rwsem); - ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP); up_write(&policy->rwsem); if (ret) @@ -1678,9 +1677,9 @@ void cpufreq_resume(void) policy); } else { down_write(&policy->rwsem); - ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) - __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); up_write(&policy->rwsem); if (ret) @@ -1977,8 +1976,7 @@ __weak struct cpufreq_governor *cpufreq_fallback_governor(void) return NULL; } -static int __cpufreq_governor(struct cpufreq_policy *policy, - unsigned int event) +static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) { int ret; @@ -2190,7 +2188,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, old_gov = policy->governor; /* end old governor */ if (old_gov) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); + ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) { /* This can happen due to race with other operations */ pr_debug("%s: Failed to Stop 
Governor: %s (%d)\n", @@ -2198,7 +2196,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return ret; } - ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); + ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); if (ret) { pr_err("%s: Failed to Exit Governor: %s (%d)\n", __func__, old_gov->name, ret); @@ -2208,30 +2206,30 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, /* start new governor */ policy->governor = new_policy->governor; - ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); + ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { - ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) goto out; - __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); + cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); } /* new governor failed, so re-start old one */ pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; - if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) + if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else - __cpufreq_governor(policy, CPUFREQ_GOV_START); + cpufreq_governor(policy, CPUFREQ_GOV_START); } return ret; out: pr_debug("governor: change or update limits\n"); - return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } /** @@ -2334,7 +2332,7 @@ static int cpufreq_boost_set_sw(int state) down_write(&policy->rwsem); policy->user_policy.max = policy->max; - __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); up_write(&policy->rwsem); } } -- cgit v0.10.2 From 242aa883a64d8c54cfeee47f3603b21bc705e081 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 16:36:44 +0530 Subject: cpufreq: Remove 'policy->governor_enabled' The entire sequence of events (like INIT/START or STOP/EXIT) for which cpufreq_governor() is called, is guaranteed to be protected by policy->rwsem now. The additional checks that were added earlier (as we were forced to drop policy->rwsem before calling cpufreq_governor() for EXIT event), aren't required anymore. Over that, they weren't sufficient really. They just take care of START/STOP events, but not INIT/EXIT and the state machine was never maintained properly by them. Kill the unnecessary checks and policy->governor_enabled field. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b3d05a9..dd568aa 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2010,17 +2010,6 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event); - if ((policy->governor_enabled && event == CPUFREQ_GOV_START) - || (!policy->governor_enabled - && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { - return -EBUSY; - } - - if (event == CPUFREQ_GOV_STOP) - policy->governor_enabled = false; - else if (event == CPUFREQ_GOV_START) - policy->governor_enabled = true; - ret = policy->governor->governor(policy, event); if (!ret) { @@ -2028,12 +2017,6 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) policy->governor->initialized++; else if (event == CPUFREQ_GOV_POLICY_EXIT) policy->governor->initialized--; - } else { - /* Restore original values */ - if (event == CPUFREQ_GOV_STOP) - policy->governor_enabled = true; - else if (event == CPUFREQ_GOV_START) - policy->governor_enabled = false; } if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) || diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cac3d1b..a50c5b2 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -80,7 +80,6 @@ struct cpufreq_policy { unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; - bool governor_enabled; /* governor start/stop flag */ char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ struct work_struct update; /* if update_policy() needs to be -- cgit v0.10.2 From e6f036571e1f65021a442ec7aad087a6a239ecfb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 28 Feb 2016 02:33:29 +0100 Subject: cpufreq: Select IRQ_WORK if CPU_FREQ_GOV_COMMON is set Commit 0eb463be3436 (cpufreq: governor: Replace timers with utilization update callbacks) made CPU_FREQ select IRQ_WORK, but that's not necessary, as it is sufficient for IRQ_WORK to be selected by CPU_FREQ_GOV_COMMON, so modify the cpufreq Kconfig to that effect. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index dcb972a..aa403aa 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -3,7 +3,6 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" select SRCU - select IRQ_WORK help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because @@ -20,6 +19,7 @@ config CPU_FREQ if CPU_FREQ config CPU_FREQ_GOV_COMMON + select IRQ_WORK bool config CPU_FREQ_BOOST_SW -- cgit v0.10.2 From 08f511fd41c3afe303eb9b41bff0570f7c1b6937 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Mar 2016 03:58:22 +0100 Subject: cpufreq: Reduce cpufreq_update_util() overhead a bit Use the observation that cpufreq_update_util() is only called by the scheduler with rq->lock held, so the callers of cpufreq_set_update_util_data() can use synchronize_sched() instead of synchronize_rcu() to wait for cpufreq_update_util() to complete. Moreover, if they are updated to do that, rcu_read_(un)lock() calls in cpufreq_update_util() might be replaced with rcu_read_(un)lock_sched(), respectively, but those aren't really necessary, because the scheduler calls that function from RCU-sched read-side critical sections already. 
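Schematically, the teardown contract this relies on looks like the editor's sketch below (example_stop_cpu() is hypothetical; the real instances are the gov_clear_update_util() and intel_pstate hunks in this patch):

	static void example_stop_cpu(int cpu, struct update_util_data *data)
	{
		/* Detach the per-CPU utilization update hook... */
		cpufreq_set_update_util_data(cpu, NULL);
		/*
		 * ...then wait for in-flight cpufreq_update_util() calls,
		 * which run in RCU-sched read-side critical sections under
		 * rq->lock, before freeing anything they might dereference.
		 */
		synchronize_sched();
		kfree(data);
	}
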
In addition to that, if cpufreq_set_update_util_data() checks the func field in the struct update_util_data before setting the per-CPU pointer to it, the data->func check may be dropped from cpufreq_update_util() as well. Make the above changes to reduce the overhead from cpufreq_update_util() in the scheduler paths invoking it and to make the cleanup after removing its callbacks less heavy-weight somewhat. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index dd568aa..6eca12a 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -115,12 +115,15 @@ static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * to call from cpufreq_update_util(). That function will be called from an RCU * read-side critical section, so it must not sleep. * - * Callers must use RCU callbacks to free any memory that might be accessed - * via the old update_util_data pointer or invoke synchronize_rcu() right after - * this function to avoid use-after-free. + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. */ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) { + if (WARN_ON(data && !data->func)) + return; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); } EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); @@ -133,18 +136,24 @@ EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); * * This function is called by the scheduler on every invocation of * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. */ void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) { struct update_util_data *data; - rcu_read_lock(); +#ifdef CONFIG_LOCKDEP + WARN_ON(debug_locks && !rcu_read_lock_sched_held()); +#endif - data = rcu_dereference(*this_cpu_ptr(&cpufreq_update_util_data)); - if (data && data->func) + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + /* + * If this isn't inside of an RCU-sched read-side critical section, data + * may become NULL after the check below. 
+ */ + if (data) data->func(data, time, util, max); - - rcu_read_unlock(); } /* Flag to suspend/resume CPUFreq governors */ diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 70079e2..db46190 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -280,7 +280,7 @@ static inline void gov_clear_update_util(struct cpufreq_policy *policy) for_each_cpu(i, policy->cpus) cpufreq_set_update_util_data(i, NULL); - synchronize_rcu(); + synchronize_sched(); } static void gov_cancel_work(struct cpufreq_policy *policy) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f4d85c2..2165d2b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1168,7 +1168,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); cpufreq_set_update_util_data(cpu_num, NULL); - synchronize_rcu(); + synchronize_sched(); if (hwp_active) return; @@ -1426,7 +1426,7 @@ out: for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { cpufreq_set_update_util_data(cpu, NULL); - synchronize_rcu(); + synchronize_sched(); kfree(all_cpu_data[cpu]); } } -- cgit v0.10.2 From edd4a893e097d744e8069acf585f8b02dbbc9134 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Mar 2016 14:51:33 +0530 Subject: Revert "cpufreq: postfix policy directory with the first CPU in related_cpus" Revert commit 3510fac45492 (cpufreq: postfix policy directory with the first CPU in related_cpus). Earlier, the policy->kobj was added to the kobject core before the ->init() callback was called for the cpufreq drivers, which allowed those drivers to add or remove driver-dependent sysfs files/directories to the same kobj from their ->init() and ->exit() callbacks. That isn't possible anymore after commit 3510fac45492. Now, there is no other clean alternative that people can adopt. It's better to revert the earlier commit to allow cpufreq drivers to create/remove sysfs files from ->init() and ->exit() callbacks. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bdf258e..abca44c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -985,6 +985,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); struct cpufreq_policy *policy; + int ret; if (WARN_ON(!dev)) return NULL; @@ -1002,7 +1003,13 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL)) goto err_free_rcpumask; - kobject_init(&policy->kobj, &ktype_cpufreq); + ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, + cpufreq_global_kobject, "policy%u", cpu); + if (ret) { + pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret); + goto err_free_real_cpus; + } + INIT_LIST_HEAD(&policy->policy_list); init_rwsem(&policy->rwsem); spin_lock_init(&policy->transition_lock); @@ -1013,6 +1020,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) policy->cpu = cpu; return policy; +err_free_real_cpus: + free_cpumask_var(policy->real_cpus); err_free_rcpumask: free_cpumask_var(policy->related_cpus); err_free_cpumask: @@ -1117,16 +1126,6 @@ static int cpufreq_online(unsigned int cpu) cpumask_copy(policy->related_cpus, policy->cpus); /* Remember CPUs present at the policy creation time.
*/ cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask); - - /* Name and add the kobject */ - ret = kobject_add(&policy->kobj, cpufreq_global_kobject, - "policy%u", - cpumask_first(policy->related_cpus)); - if (ret) { - pr_err("%s: failed to add policy->kobj: %d\n", __func__, - ret); - goto out_exit_policy; - } } /* -- cgit v0.10.2 From 1d751584e00b2538fe15a615fdad2a60a5a90a4e Mon Sep 17 00:00:00 2001 From: Kaiyen Chang Date: Wed, 10 Feb 2016 15:23:27 -0800 Subject: ACPI / fan: Make struct dev_pm_ops const Silence the following checkpatch warning: WARNING: struct dev_pm_ops should normally be const. Signed-off-by: Kaiyen Chang Signed-off-by: Brian Norris Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 6322db6..384cfc3 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -46,7 +46,7 @@ MODULE_DEVICE_TABLE(acpi, fan_device_ids); #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev); static int acpi_fan_resume(struct device *dev); -static struct dev_pm_ops acpi_fan_pm = { +static const struct dev_pm_ops acpi_fan_pm = { .resume = acpi_fan_resume, .freeze = acpi_fan_suspend, .thaw = acpi_fan_resume, -- cgit v0.10.2 From 1b6e75ee295990d866ded77c29ffc6f88b831e26 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Sat, 6 Feb 2016 02:08:08 -0500 Subject: ACPI / EC: Deny write access unless requested by module param In debugfs it's not enough to just set file mode to read-only to deny write access to a file, instead just don't provide the write method unless write access is really requested. Signed-off-by: Oleg Drokin Acked-by: Thomas Renninger Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c index bea8e42..6c7dd7a 100644 --- a/drivers/acpi/ec_sys.c +++ b/drivers/acpi/ec_sys.c @@ -73,6 +73,9 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf, loff_t init_off = *off; int err = 0; + if (!write_support) + return -EINVAL; + if (*off >= EC_SPACE_SIZE) return 0; if (*off + count >= EC_SPACE_SIZE) { -- cgit v0.10.2 From 0f9aeb7a6a84b1b72efe4e6ac32eb5028e25bb28 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 15 Feb 2016 22:40:04 +0000 Subject: ACPI / video: remove unused device_decode array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit device_decode is now no longer used, so we may as well remove it. Fixes gcc 6 warning: drivers/acpi/acpi_video.c:221:19: warning: ‘device_decode’ defined but not used [-Wunused-const-variable] static const char device_decode[][30] = { ^~~~~~~~~~~~~ Signed-off-by: Colin Ian King Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index a76f8be..4361bc9 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -218,13 +218,6 @@ struct acpi_video_device { struct thermal_cooling_device *cooling_dev; }; -static const char device_decode[][30] = { - "motherboard VGA device", - "PCI VGA device", - "AGP VGA device", - "UNKNOWN", -}; - static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data); static void acpi_video_device_rebind(struct acpi_video_bus *video); static void acpi_video_device_bind(struct acpi_video_bus *video, -- cgit v0.10.2 From ad62e1e67751d21a3e15bca9d3ededc73de2cfa0 Mon Sep 17 00:00:00 2001 From: Ashwin Chaugule Date: Wed, 17 Feb 2016 13:20:59 -0700 Subject: ACPI / CPPC: Optimize PCC Read Write operations Previously the send_pcc_cmd() code checked if the PCC operation had completed before returning from the function. This check was performed regardless of the PCC op type (i.e. Read/Write). Knowing the type of cmd can be used to optimize the check and avoid needless waiting. e.g. with Write ops, the actual Writing is done before calling send_pcc_cmd(). And the subsequent Writes will check if the channel is free at the entry of send_pcc_cmd() anyway. However, for Read cmds, we need to wait for the cmd completion bit to be flipped, since the actual Read ops follow after returning from the send_pcc_cmd(). So, only do the looping check at the end for Read ops. Also, instead of using udelay() calls, use ktime as a means to check for deadlines. The current deadline in which the Remote should flip the cmd completion bit is defined as N * Nominal latency. Where N is arbitrary and large enough to work on slow emulators and Nominal latency comes from the ACPI table (PCCT). This helps in working around the CONFIG_HZ effects on udelay() and also avoids needing different ACPI tables for Silicon and Emulation platforms. Signed-off-by: Ashwin Chaugule Signed-off-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6730f96..026cdd4 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -39,6 +39,7 @@ #include #include +#include #include /* @@ -63,25 +64,53 @@ static struct mbox_chan *pcc_channel; static void __iomem *pcc_comm_addr; static u64 comm_base_addr; static int pcc_subspace_idx = -1; -static u16 pcc_cmd_delay; static bool pcc_channel_acquired; +static ktime_t deadline; /* * Arbitrary Retries in case the remote processor is slow to respond - * to PCC commands. + * to PCC commands. Keeping it high enough to cover emulators where + * the processors run painfully slow. */ #define NUM_RETRIES 500 +static int check_pcc_chan(void) +{ + int ret = -EIO; + struct acpi_pcct_shared_memory __iomem *generic_comm_base = pcc_comm_addr; + ktime_t next_deadline = ktime_add(ktime_get(), deadline); + + /* Retry in case the remote processor was too slow to catch up. */ + while (!ktime_after(ktime_get(), next_deadline)) { + if (readw_relaxed(&generic_comm_base->status) & PCC_CMD_COMPLETE) { + ret = 0; + break; + } + /* + * Reducing the bus traffic in case this loop takes longer than + * a few retries. 
+ */ + udelay(3); + } + + return ret; +} + static int send_pcc_cmd(u16 cmd) { - int retries, result = -EIO; - struct acpi_pcct_hw_reduced *pcct_ss = pcc_channel->con_priv; + int ret = -EIO; struct acpi_pcct_shared_memory *generic_comm_base = (struct acpi_pcct_shared_memory *) pcc_comm_addr; - u32 cmd_latency = pcct_ss->latency; - /* Min time OS should wait before sending next command. */ - udelay(pcc_cmd_delay); + /* + * For CMD_WRITE we know for a fact the caller should have checked + * the channel before writing to PCC space + */ + if (cmd == CMD_READ) { + ret = check_pcc_chan(); + if (ret) + return ret; + } /* Write to the shared comm region. */ writew(cmd, &generic_comm_base->command); @@ -90,31 +119,30 @@ static int send_pcc_cmd(u16 cmd) writew(0, &generic_comm_base->status); /* Ring doorbell */ - result = mbox_send_message(pcc_channel, &cmd); - if (result < 0) { + ret = mbox_send_message(pcc_channel, &cmd); + if (ret < 0) { pr_err("Err sending PCC mbox message. cmd:%d, ret:%d\n", - cmd, result); - return result; + cmd, ret); + return ret; } - /* Wait for a nominal time to let platform process command. */ - udelay(cmd_latency); - - /* Retry in case the remote processor was too slow to catch up. */ - for (retries = NUM_RETRIES; retries > 0; retries--) { - if (readw_relaxed(&generic_comm_base->status) & PCC_CMD_COMPLETE) { - result = 0; - break; - } - } + /* + * For READs we need to ensure the cmd completed to ensure + * the ensuing read()s can proceed. For WRITEs we dont care + * because the actual write()s are done before coming here + * and the next READ or WRITE will check if the channel + * is busy/free at the entry of this call. + */ + if (cmd == CMD_READ) + ret = check_pcc_chan(); - mbox_client_txdone(pcc_channel, result); - return result; + mbox_client_txdone(pcc_channel, ret); + return ret; } static void cppc_chan_tx_done(struct mbox_client *cl, void *msg, int ret) { - if (ret) + if (ret < 0) pr_debug("TX did not complete: CMD sent:%x, ret:%d\n", *(u16 *)msg, ret); else @@ -306,6 +334,7 @@ static int register_pcc_channel(int pcc_subspace_idx) { struct acpi_pcct_hw_reduced *cppc_ss; unsigned int len; + u64 usecs_lat; if (pcc_subspace_idx >= 0) { pcc_channel = pcc_mbox_request_channel(&cppc_mbox_cl, @@ -335,7 +364,14 @@ static int register_pcc_channel(int pcc_subspace_idx) */ comm_base_addr = cppc_ss->base_address; len = cppc_ss->length; - pcc_cmd_delay = cppc_ss->min_turnaround_time; + + /* + * cppc_ss->latency is just a Nominal value. In reality + * the remote processor could be much slower to reply. + * So add an arbitrary amount of wait on top of Nominal. 
+ */ + usecs_lat = NUM_RETRIES * cppc_ss->latency; + deadline = ns_to_ktime(usecs_lat * NSEC_PER_USEC); pcc_comm_addr = acpi_os_ioremap(comm_base_addr, len); if (!pcc_comm_addr) { @@ -604,7 +640,7 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) (ref_perf->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) || (nom_perf->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM)) { /* Ring doorbell once to update PCC subspace */ - if (send_pcc_cmd(CMD_READ)) { + if (send_pcc_cmd(CMD_READ) < 0) { ret = -EIO; goto out_err; } @@ -662,7 +698,7 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) if ((delivered_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) || (reference_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM)) { /* Ring doorbell once to update PCC subspace */ - if (send_pcc_cmd(CMD_READ)) { + if (send_pcc_cmd(CMD_READ) < 0) { ret = -EIO; goto out_err; } @@ -713,6 +749,13 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) spin_lock(&pcc_lock); + /* If this is PCC reg, check if channel is free before writing */ + if (desired_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + ret = check_pcc_chan(); + if (ret) + goto busy_channel; + } + /* * Skip writing MIN/MAX until Linux knows how to come up with * useful values. @@ -722,10 +765,10 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) /* Is this a PCC reg ?*/ if (desired_reg->cpc_entry.reg.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { /* Ring doorbell so Remote can get our perf request. */ - if (send_pcc_cmd(CMD_WRITE)) + if (send_pcc_cmd(CMD_WRITE) < 0) ret = -EIO; } - +busy_channel: spin_unlock(&pcc_lock); return ret; -- cgit v0.10.2 From 77e3d86f2f8c533c9c72fb5585b623b447d9bd57 Mon Sep 17 00:00:00 2001 From: "Prakash, Prashanth" Date: Wed, 17 Feb 2016 13:21:00 -0700 Subject: ACPI / CPPC: optimized cpc_read and cpc_write cpc_read and cpc_write are used while holding the pcc_lock spin_lock, so they need to be as fast as possible. acpi_os_read/write_memory APIs linearly search through a list for cached mapping which is quite expensive. Since the PCC subspace is already mapped into virtual address space during initialization, we can just add the offset and access the necessary CPPC registers. This patch + similar changes to PCC driver reduce the time per freq. transition from around 200us to about 20us for the CPPC cpufreq driver. Signed-off-by: Prashanth Prakash Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 026cdd4..6970b44 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -67,6 +67,9 @@ static int pcc_subspace_idx = -1; static bool pcc_channel_acquired; static ktime_t deadline; +/* pcc mapped address + header size + offset within PCC subspace */ +#define GET_PCC_VADDR(offs) (pcc_comm_addr + 0x8 + (offs)) + /* * Arbitrary Retries in case the remote processor is slow to respond * to PCC commands. Keeping it high enough to cover emulators where @@ -582,29 +585,74 @@ void acpi_cppc_processor_exit(struct acpi_processor *pr) } EXPORT_SYMBOL_GPL(acpi_cppc_processor_exit); -static u64 get_phys_addr(struct cpc_reg *reg) -{ - /* PCC communication addr space begins at byte offset 0x8. 
*/ - if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) - return (u64)comm_base_addr + 0x8 + reg->address; - else - return reg->address; -} +/* + * Since cpc_read and cpc_write are called while holding pcc_lock, it should be + * as fast as possible. We have already mapped the PCC subspace during init, so + * we can directly write to it. + */ -static void cpc_read(struct cpc_reg *reg, u64 *val) +static int cpc_read(struct cpc_reg *reg, u64 *val) { - u64 addr = get_phys_addr(reg); + int ret_val = 0; + + *val = 0; + if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + void __iomem *vaddr = GET_PCC_VADDR(reg->address); - acpi_os_read_memory((acpi_physical_address)addr, - val, reg->bit_width); + switch (reg->bit_width) { + case 8: + *val = readb(vaddr); + break; + case 16: + *val = readw(vaddr); + break; + case 32: + *val = readl(vaddr); + break; + case 64: + *val = readq(vaddr); + break; + default: + pr_debug("Error: Cannot read %u bit width from PCC\n", + reg->bit_width); + ret_val = -EFAULT; + } + } else + ret_val = acpi_os_read_memory((acpi_physical_address)reg->address, + val, reg->bit_width); + return ret_val; } -static void cpc_write(struct cpc_reg *reg, u64 val) +static int cpc_write(struct cpc_reg *reg, u64 val) { - u64 addr = get_phys_addr(reg); + int ret_val = 0; + + if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM) { + void __iomem *vaddr = GET_PCC_VADDR(reg->address); - acpi_os_write_memory((acpi_physical_address)addr, - val, reg->bit_width); + switch (reg->bit_width) { + case 8: + writeb(val, vaddr); + break; + case 16: + writew(val, vaddr); + break; + case 32: + writel(val, vaddr); + break; + case 64: + writeq(val, vaddr); + break; + default: + pr_debug("Error: Cannot write %u bit width to PCC\n", + reg->bit_width); + ret_val = -EFAULT; + break; + } + } else + ret_val = acpi_os_write_memory((acpi_physical_address)reg->address, + val, reg->bit_width); + return ret_val; } /** -- cgit v0.10.2 From 8b0f57889843af6304b80724e36bd3d93b6484b1 Mon Sep 17 00:00:00 2001 From: "Prakash, Prashanth" Date: Wed, 17 Feb 2016 13:21:01 -0700 Subject: mailbox: pcc: optimized pcc_send_data pcc_send_data() can be invoked during the execution of performance critical code as in cppc_cpufreq driver. With acpi_* APIs, the doorbell register accessed in pcc_send_data() if present in system memory will be searched (in cached virt to phys addr mapping), mapped, read/written and then unmapped. These operations take significant amount of time. This patch maps the performance critical doorbell register during init and then reads/writes to it directly using the mapped virtual address. This patch + similar changes to CPPC acpi driver reduce the time per freq. transition from around 200us to about 20us for the CPPC cpufreq driver Signed-off-by: Prashanth Prakash Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 45d85ae..ac11a9b 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "mailbox.h" @@ -70,6 +71,9 @@ static struct mbox_chan *pcc_mbox_channels; +/* Array of cached virtual address for doorbell registers */ +static void __iomem **pcc_doorbell_vaddr; + static struct mbox_controller pcc_mbox_ctrl = {}; /** * get_pcc_channel - Given a PCC subspace idx, get @@ -166,6 +170,66 @@ void pcc_mbox_free_channel(struct mbox_chan *chan) } EXPORT_SYMBOL_GPL(pcc_mbox_free_channel); +/* + * PCC can be used with perf critical drivers such as CPPC + * So it makes sense to locally cache the virtual address and + * use it to read/write to PCC registers such as doorbell register + * + * The below read_register and write_registers are used to read and + * write from perf critical registers such as PCC doorbell register + */ +static int read_register(void __iomem *vaddr, u64 *val, unsigned int bit_width) +{ + int ret_val = 0; + + switch (bit_width) { + case 8: + *val = readb(vaddr); + break; + case 16: + *val = readw(vaddr); + break; + case 32: + *val = readl(vaddr); + break; + case 64: + *val = readq(vaddr); + break; + default: + pr_debug("Error: Cannot read register of %u bit width", + bit_width); + ret_val = -EFAULT; + break; + } + return ret_val; +} + +static int write_register(void __iomem *vaddr, u64 val, unsigned int bit_width) +{ + int ret_val = 0; + + switch (bit_width) { + case 8: + writeb(val, vaddr); + break; + case 16: + writew(val, vaddr); + break; + case 32: + writel(val, vaddr); + break; + case 64: + writeq(val, vaddr); + break; + default: + pr_debug("Error: Cannot write register of %u bit width", + bit_width); + ret_val = -EFAULT; + break; + } + return ret_val; +} + /** * pcc_send_data - Called from Mailbox Controller code. Used * here only to ring the channel doorbell. The PCC client @@ -181,21 +245,39 @@ EXPORT_SYMBOL_GPL(pcc_mbox_free_channel); static int pcc_send_data(struct mbox_chan *chan, void *data) { struct acpi_pcct_hw_reduced *pcct_ss = chan->con_priv; - struct acpi_generic_address doorbell; + struct acpi_generic_address *doorbell; u64 doorbell_preserve; u64 doorbell_val; u64 doorbell_write; + u32 id = chan - pcc_mbox_channels; + int ret = 0; + + if (id >= pcc_mbox_ctrl.num_chans) { + pr_debug("pcc_send_data: Invalid mbox_chan passed\n"); + return -ENOENT; + } - doorbell = pcct_ss->doorbell_register; + doorbell = &pcct_ss->doorbell_register; doorbell_preserve = pcct_ss->preserve_mask; doorbell_write = pcct_ss->write_mask; /* Sync notification from OS to Platform. 
*/ - acpi_read(&doorbell_val, &doorbell); - acpi_write((doorbell_val & doorbell_preserve) | doorbell_write, - &doorbell); - - return 0; + if (pcc_doorbell_vaddr[id]) { + ret = read_register(pcc_doorbell_vaddr[id], &doorbell_val, + doorbell->bit_width); + if (ret) + return ret; + ret = write_register(pcc_doorbell_vaddr[id], + (doorbell_val & doorbell_preserve) | doorbell_write, + doorbell->bit_width); + } else { + ret = acpi_read(&doorbell_val, doorbell); + if (ret) + return ret; + ret = acpi_write((doorbell_val & doorbell_preserve) | doorbell_write, + doorbell); + } + return ret; } static const struct mbox_chan_ops pcc_chan_ops = { @@ -271,14 +353,29 @@ static int __init acpi_pcc_probe(void) return -ENOMEM; } + pcc_doorbell_vaddr = kcalloc(count, sizeof(void *), GFP_KERNEL); + if (!pcc_doorbell_vaddr) { + kfree(pcc_mbox_channels); + return -ENOMEM; + } + /* Point to the first PCC subspace entry */ pcct_entry = (struct acpi_subtable_header *) ( (unsigned long) pcct_tbl + sizeof(struct acpi_table_pcct)); for (i = 0; i < count; i++) { + struct acpi_generic_address *db_reg; + struct acpi_pcct_hw_reduced *pcct_ss; pcc_mbox_channels[i].con_priv = pcct_entry; pcct_entry = (struct acpi_subtable_header *) ((unsigned long) pcct_entry + pcct_entry->length); + + /* If doorbell is in system memory cache the virt address */ + pcct_ss = (struct acpi_pcct_hw_reduced *)pcct_entry; + db_reg = &pcct_ss->doorbell_register; + if (db_reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) + pcc_doorbell_vaddr[i] = acpi_os_ioremap(db_reg->address, + db_reg->bit_width/8); } pcc_mbox_ctrl.num_chans = count; -- cgit v0.10.2 From beee23aebc6650609ef1547f6d813fa5065f74aa Mon Sep 17 00:00:00 2001 From: "Prakash, Prashanth" Date: Wed, 17 Feb 2016 13:21:02 -0700 Subject: ACPI / CPPC: replace writeX/readX to PCC with relaxed version We do not have a strict read/write order requirement while accessing PCC subspace. The only requirement is all access should be committed before triggering the PCC doorbell to transfer the ownership of PCC to the platform and this requirement is enforced by the PCC driver. Profiling on a many core system shows improvement of about 1.8us on average per freq change request(about 10% improvement on average). Since these operations are executed while holding the pcc_lock, reducing this time helps the CPPC implementation to scale much better as the number of cores increases. Signed-off-by: Prashanth Prakash Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6970b44..8a0911a 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -116,10 +116,10 @@ static int send_pcc_cmd(u16 cmd) } /* Write to the shared comm region. 
*/ - writew(cmd, &generic_comm_base->command); + writew_relaxed(cmd, &generic_comm_base->command); /* Flip CMD COMPLETE bit */ - writew(0, &generic_comm_base->status); + writew_relaxed(0, &generic_comm_base->status); /* Ring doorbell */ ret = mbox_send_message(pcc_channel, &cmd); @@ -601,16 +601,16 @@ static int cpc_read(struct cpc_reg *reg, u64 *val) switch (reg->bit_width) { case 8: - *val = readb(vaddr); + *val = readb_relaxed(vaddr); break; case 16: - *val = readw(vaddr); + *val = readw_relaxed(vaddr); break; case 32: - *val = readl(vaddr); + *val = readl_relaxed(vaddr); break; case 64: - *val = readq(vaddr); + *val = readq_relaxed(vaddr); break; default: pr_debug("Error: Cannot read %u bit width from PCC\n", @@ -632,16 +632,16 @@ static int cpc_write(struct cpc_reg *reg, u64 val) switch (reg->bit_width) { case 8: - writeb(val, vaddr); + writeb_relaxed(val, vaddr); break; case 16: - writew(val, vaddr); + writew_relaxed(val, vaddr); break; case 32: - writel(val, vaddr); + writel_relaxed(val, vaddr); break; case 64: - writeq(val, vaddr); + writeq_relaxed(val, vaddr); break; default: pr_debug("Error: Cannot write %u bit width to PCC\n", -- cgit v0.10.2 From f387e5b901adb8352fcdf031bb2c539e390b92e2 Mon Sep 17 00:00:00 2001 From: "Prakash, Prashanth" Date: Wed, 17 Feb 2016 13:21:03 -0700 Subject: ACPI / CPPC: use MRTT/MPAR to decide if/when a req can be sent The ACPI spec defines Minimum Request Turnaround Time (MRTT) and Maximum Periodic Access Rate (MPAR) to prevent the OSPM from sending more requests than the platform can handle. For further details on these parameters please refer to section 14.1.3 of the ACPI 6.0 spec. This patch includes MRTT/MPAR in deciding if or when a CPPC request can be sent to the platform to make sure the CPPC implementation is compliant with the spec. Signed-off-by: Prashanth Prakash Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki
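Condensed to its two rules, the throttling added by this patch can be sketched as follows (editor's illustration reusing the patch's variable names; locking and the actual command path are omitted):

	/* MRTT: wait out the remainder of the minimum turnaround time. */
	if (pcc_mrtt) {
		time_delta = ktime_us_delta(ktime_get(), last_cmd_cmpl_time);
		if (pcc_mrtt > time_delta)
			udelay(pcc_mrtt - time_delta);
	}

	/* MPAR: at most pcc_mpar commands in any 60 second window. */
	if (pcc_mpar) {
		if (mpar_count == 0) {
			if (ktime_ms_delta(ktime_get(), last_mpar_reset) <
			    60 * MSEC_PER_SEC)
				return -EIO;	/* budget exhausted, drop it */
			last_mpar_reset = ktime_get();
			mpar_count = pcc_mpar;
		}
		mpar_count--;
	}
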
*/ while (!ktime_after(ktime_get(), next_deadline)) { + /* + * Per spec, prior to boot the PCC space will be initialized by + * the platform, which should have set the command completion bit when + * PCC can be used by the OSPM + */ if (readw_relaxed(&generic_comm_base->status) & PCC_CMD_COMPLETE) { ret = 0; break; @@ -104,6 +110,9 @@ static int send_pcc_cmd(u16 cmd) int ret = -EIO; struct acpi_pcct_shared_memory *generic_comm_base = (struct acpi_pcct_shared_memory *) pcc_comm_addr; + static ktime_t last_cmd_cmpl_time, last_mpar_reset; + static int mpar_count; + unsigned int time_delta; /* * For CMD_WRITE we know for a fact the caller should have checked @@ -115,6 +124,41 @@ static int send_pcc_cmd(u16 cmd) return ret; } + /* + * Handle the Minimum Request Turnaround Time (MRTT) + * "The minimum amount of time that OSPM must wait after the completion + * of a command before issuing the next command, in microseconds" + */ + if (pcc_mrtt) { + time_delta = ktime_us_delta(ktime_get(), last_cmd_cmpl_time); + if (pcc_mrtt > time_delta) + udelay(pcc_mrtt - time_delta); + } + + /* + * Handle the non-zero Maximum Periodic Access Rate (MPAR) + * "The maximum number of periodic requests that the subspace channel can + * support, reported in commands per minute. 0 indicates no limitation." + * + * This parameter should ideally be zero or large enough so that it can + * handle the maximum number of requests that all the cores in the system can + * collectively generate. If it is not, we will follow the spec and just + * not send the request to the platform after hitting the MPAR limit in + * any 60s window + */ + if (pcc_mpar) { + if (mpar_count == 0) { + time_delta = ktime_ms_delta(ktime_get(), last_mpar_reset); + if (time_delta < 60 * MSEC_PER_SEC) { + pr_debug("PCC cmd not sent due to MPAR limit"); + return -EIO; + } + last_mpar_reset = ktime_get(); + mpar_count = pcc_mpar; + } + mpar_count--; + } + /* Write to the shared comm region. */ writew_relaxed(cmd, &generic_comm_base->command); @@ -135,9 +179,17 @@ * because the actual write()s are done before coming here * and the next READ or WRITE will check if the channel * is busy/free at the entry of this call. + * + * If the Minimum Request Turnaround Time is non-zero, we need + * to record the completion time of both READ and WRITE + * commands for proper handling of MRTT, so we need to check + * for pcc_mrtt in addition to CMD_READ */ - if (cmd == CMD_READ) + if (cmd == CMD_READ || pcc_mrtt) { ret = check_pcc_chan(); + if (pcc_mrtt) + last_cmd_cmpl_time = ktime_get(); + } mbox_client_txdone(pcc_channel, ret); return ret; @@ -375,6 +427,8 @@ static int register_pcc_channel(int pcc_subspace_idx) */ usecs_lat = NUM_RETRIES * cppc_ss->latency; deadline = ns_to_ktime(usecs_lat * NSEC_PER_USEC); + pcc_mrtt = cppc_ss->min_turnaround_time; + pcc_mpar = cppc_ss->max_access_rate; pcc_comm_addr = acpi_os_ioremap(comm_base_addr, len); if (!pcc_comm_addr) { -- cgit v0.10.2 From cc079f8cf729aaa9e704bbaba3d21a20a6abc036 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Feb 2016 00:27:49 -0500 Subject: drivers/acpi: make bgrt driver explicitly non-modular The Kconfig for this driver is currently: config ACPI_BGRT bool "Boottime Graphics Resource Table support" ...meaning that it currently is not being built as a module by anyone. Let's remove all modular references so that when reading the driver there is no doubt it is builtin-only.
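For reference, the equivalence used throughout these non-modular conversions comes from the initcall macros: in a non-modular (!MODULE) build, module_init() collapses into the default device-level initcall. A minimal sketch of the expansion, simplified from include/linux/module.h and include/linux/init.h (exact definitions vary by kernel version):

	/* With MODULE undefined, module_init() is just an alias: */
	#define module_init(x)	__initcall(x);
	#define __initcall(fn)	device_initcall(fn)

Hence module_init(bgrt_init) and device_initcall(bgrt_init) register the same function at the same initcall level.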
Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index a83e3c6..75f128e 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -1,4 +1,6 @@ /* + * BGRT boot graphic support + * Authors: Matthew Garrett, Josh Triplett * Copyright 2012 Red Hat, Inc * Copyright 2012 Intel Corporation * @@ -8,7 +10,6 @@ */ #include -#include #include #include #include @@ -103,9 +104,4 @@ out_kobject: kobject_put(bgrt_kobj); return ret; } - -module_init(bgrt_init); - -MODULE_AUTHOR("Matthew Garrett, Josh Triplett "); -MODULE_DESCRIPTION("BGRT boot graphic support"); -MODULE_LICENSE("GPL"); +device_initcall(bgrt_init); -- cgit v0.10.2 From 020bf066a6019807fc52a83e8bea5e0ad6a285e1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Feb 2016 00:27:50 -0500 Subject: drivers/acpi: make apei/ghes.c more explicitly non-modular The Kconfig currently controlling compilation of this code is: config ACPI_APEI_GHES bool "APEI Generic Hardware Error Source" ...meaning that it currently is not being built as a module by anyone. Let's remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We replace module.h with moduleparam.h as we are keeping the pre-existing module_param that the file has, as currently that is the easiest way to maintain compatibility with the existing boot arg use cases. Signed-off-by: Paul Gortmaker Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 3dd9c46..60746ef 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -26,7 +26,7 @@ */ #include -#include <linux/module.h> +#include <linux/moduleparam.h> #include #include #include @@ -79,6 +79,11 @@ ((struct acpi_hest_generic_status *) \ ((struct ghes_estatus_node *)(estatus_node) + 1)) +/* + * This driver isn't really modular, however for the time being, + * continuing to use module_param is the easiest way to remain + * compatible with existing boot arg use cases. + */ bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); @@ -1148,18 +1153,4 @@ err_ioremap_exit: err: return rc; } - -static void __exit ghes_exit(void) -{ - platform_driver_unregister(&ghes_platform_driver); - ghes_estatus_pool_exit(); - ghes_ioremap_exit(); -} - -module_init(ghes_init); -module_exit(ghes_exit); - -MODULE_AUTHOR("Huang Ying"); -MODULE_DESCRIPTION("APEI Generic Hardware Error Source support"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:GHES"); +device_initcall(ghes_init); -- cgit v0.10.2 From 75829dcf10862966f52716f2d67ac1c1b1eb486b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Feb 2016 00:27:51 -0500 Subject: drivers/acpi: make pmic/intel_pmic_crc.c explicitly non-modular The Kconfig currently controlling compilation of this code is: drivers/acpi/Kconfig:config CRC_PMIC_OPREGION drivers/acpi/Kconfig: bool "ACPI operation region support for CrystalCove PMIC" ...meaning that it currently is not being built as a module by anyone. Let's remove the couple of modular references so that when reading the driver there is no doubt it is builtin-only.
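A note on the boot-arg compatibility preserved in the GHES change above: module_param_named() also works for built-in code, where the parameter is exposed under the object's name ("ghes" here) instead of a loadable module's. A sketch of the effect, with the parameter taken from the ghes.c hunk above and the command-line syntax as documented for module parameters:

	bool ghes_disable;
	module_param_named(disable, ghes_disable, bool, 0);
	/* built in, this is still set from the kernel command line
	 * as: ghes.disable=1 */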
Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Acked-by: Aaron Lu Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/pmic/intel_pmic_crc.c b/drivers/acpi/pmic/intel_pmic_crc.c index 42df46a..fcd1852 100644 --- a/drivers/acpi/pmic/intel_pmic_crc.c +++ b/drivers/acpi/pmic/intel_pmic_crc.c @@ -13,7 +13,7 @@ * GNU General Public License for more details. */ -#include +#include #include #include #include @@ -205,7 +205,4 @@ static int __init intel_crc_pmic_opregion_driver_init(void) { return platform_driver_register(&intel_crc_pmic_opregion_driver); } -module_init(intel_crc_pmic_opregion_driver_init); - -MODULE_DESCRIPTION("CrystalCove ACPI operation region driver"); -MODULE_LICENSE("GPL"); +device_initcall(intel_crc_pmic_opregion_driver_init); -- cgit v0.10.2 From b2ca5dae31d61c4610ea3228f43f7cc66312897c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 21 Jan 2016 17:05:47 +0000 Subject: ACPI: Add acpi_force_32bit_fadt_addr option to force 32 bit FADT addresses Some HP laptops seem to have invalid 64 bit FADT X_PM* addresses which are causing various boot issues. In these cases, it would be useful to force ACPI to use the valid legacy 32 bit equivalent PM addresses. Add an acpi_force_32bit_fadt_addr option to set the ACPICA acpi_gbl_use32_bit_fadt_addresses flag to TRUE to force this override. Link: https://bugs.launchpad.net/bugs/1529381 Signed-off-by: Colin Ian King Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9a53c92..64b5431 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -193,6 +193,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. (e.g. thinkpad_acpi, sony_acpi, etc.) instead of the ACPI video.ko driver. + acpi_force_32bit_fadt_addr + force FADT to use 32 bit addresses rather than the + 64 bit X_* addresses. Some firmware has broken 64 + bit addresses, so force ACPI to ignore these and use + the older legacy 32 bit addresses. + acpica_no_return_repair [HW, ACPI] Disable AML predefined validation mechanism This mechanism can repair the evaluation result to make diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 6c0f079..ebdf564 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -484,3 +484,13 @@ static int __init acpi_force_table_verification_setup(char *s) } early_param("acpi_force_table_verification", acpi_force_table_verification_setup); + +static int __init acpi_force_32bit_fadt_addr(char *s) +{ + pr_info("Forcing 32 Bit FADT addresses\n"); + acpi_gbl_use32_bit_fadt_addresses = TRUE; + + return 0; +} + +early_param("acpi_force_32bit_fadt_addr", acpi_force_32bit_fadt_addr); -- cgit v0.10.2 From bea3c377c286f4f63f67f2e061c1d8851af76b7e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 28 Feb 2016 20:31:49 +0000 Subject: ACPI / util: remove redundant check if element is NULL element is &package->package.elements[i], which can never be NULL, so the check to see if it is NULL is redundant and can be removed. Detected with static analysis by CoverityScan. Signed-off-by: Colin Ian King Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index f2f9873..f12a724 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -201,10 +201,6 @@ acpi_extract_package(union acpi_object *package, u8 **pointer = NULL; union acpi_object *element = &(package->package.elements[i]); - if (!element) { - return AE_BAD_DATA; - } - switch (element->type) { case ACPI_TYPE_INTEGER: -- cgit v0.10.2 From 86e75410f067e4ff09d6a94d6bb6800ec956ccfe Mon Sep 17 00:00:00 2001 From: Harb Abdulhamid Date: Tue, 1 Mar 2016 13:31:45 -0600 Subject: PNP / ACPI: add ACPI_RESOURCE_TYPE_SERIAL_BUS as a valid type An error message is printed for resources of type 19, which is a valid supported resource type. The Firmware Test Suite tool (fwts) reports this as a test failure. This change fixes the false test failures for ASL that use type 19 (ACPI_RESOURCE_TYPE_SERIAL_BUS) resources. Signed-off-by: Harb Abdulhamid Signed-off-by: Timur Tabi Signed-off-by: Rafael J. Wysocki diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 0579649..4b717c6 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -252,6 +252,10 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, case ACPI_RESOURCE_TYPE_GENERIC_REGISTER: break; + case ACPI_RESOURCE_TYPE_SERIAL_BUS: + /* serial bus connections (I2C/SPI/UART) are not pnp */ + break; + default: dev_warn(&dev->dev, "unknown resource type %d in _CRS\n", res->type); -- cgit v0.10.2 From a543132ee04d1599a0abe913eaf43d1986021d24 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Wed, 2 Mar 2016 14:16:17 +0800 Subject: ACPI / OSL: Clean up initrd table override code This patch cleans up the initrd table override code by merging redundant logics and re-ordering code blocks. No functional changes. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 67da6fb..a8c417c 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -602,6 +602,14 @@ acpi_os_predefined_override(const struct acpi_predefined_names *init_val, return AE_OK; } +static void acpi_table_taint(struct acpi_table_header *table) +{ + pr_warn(PREFIX + "Override [%4.4s-%8.8s], this is unsafe: tainting kernel\n", + table->signature, table->oem_table_id); + add_taint(TAINT_OVERRIDDEN_ACPI_TABLE, LOCKDEP_NOW_UNRELIABLE); +} + #ifdef CONFIG_ACPI_INITRD_TABLE_OVERRIDE #include #include @@ -746,96 +754,78 @@ void __init acpi_initrd_override(void *data, size_t size) } } } -#endif /* CONFIG_ACPI_INITRD_TABLE_OVERRIDE */ - -static void acpi_table_taint(struct acpi_table_header *table) -{ - pr_warn(PREFIX - "Override [%4.4s-%8.8s], this is unsafe: tainting kernel\n", - table->signature, table->oem_table_id); - add_taint(TAINT_OVERRIDDEN_ACPI_TABLE, LOCKDEP_NOW_UNRELIABLE); -} - - -acpi_status -acpi_os_table_override(struct acpi_table_header * existing_table, - struct acpi_table_header ** new_table) -{ - if (!existing_table || !new_table) - return AE_BAD_PARAMETER; - - *new_table = NULL; - -#ifdef CONFIG_ACPI_CUSTOM_DSDT - if (strncmp(existing_table->signature, "DSDT", 4) == 0) - *new_table = (struct acpi_table_header *)AmlCode; -#endif - if (*new_table != NULL) - acpi_table_taint(existing_table); - return AE_OK; -} acpi_status acpi_os_physical_table_override(struct acpi_table_header *existing_table, - acpi_physical_address *address, - u32 *table_length) + acpi_physical_address *address, u32 *length) { -#ifndef CONFIG_ACPI_INITRD_TABLE_OVERRIDE - *table_length = 0; - *address = 0; - return AE_OK; -#else int table_offset = 0; struct acpi_table_header *table; + u32 table_length; - *table_length = 0; + *length = 0; *address = 0; - if (!acpi_tables_addr) return AE_OK; - do { - if (table_offset + ACPI_HEADER_SIZE > all_tables_size) { - WARN_ON(1); - return AE_OK; - } - + while (table_offset + ACPI_HEADER_SIZE <= all_tables_size) { table = acpi_os_map_memory(acpi_tables_addr + table_offset, ACPI_HEADER_SIZE); - if (table_offset + table->length > all_tables_size) { acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); WARN_ON(1); return AE_OK; } - table_offset += table->length; + table_length = table->length; - if (memcmp(existing_table->signature, table->signature, 4)) { - acpi_os_unmap_memory(table, - ACPI_HEADER_SIZE); - continue; - } - - /* Only override tables with matching oem id */ - if (memcmp(table->oem_table_id, existing_table->oem_table_id, + /* Only override tables matched */ + if (memcmp(existing_table->signature, table->signature, 4) || + memcmp(table->oem_table_id, existing_table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE)) { - acpi_os_unmap_memory(table, - ACPI_HEADER_SIZE); - continue; + acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + goto next_table; } - table_offset -= table->length; - *table_length = table->length; - acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + *length = table_length; *address = acpi_tables_addr + table_offset; + acpi_table_taint(existing_table); + acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); break; - } while (table_offset + ACPI_HEADER_SIZE < all_tables_size); - if (*address != 0) - acpi_table_taint(existing_table); +next_table: + table_offset += table_length; + } return AE_OK; +} +#else +acpi_status +acpi_os_physical_table_override(struct acpi_table_header *existing_table, + acpi_physical_address *address, + u32 *table_length) +{ + *table_length = 0; + *address = 0; + return AE_OK; +} 
+#endif /* CONFIG_ACPI_INITRD_TABLE_OVERRIDE */ + +acpi_status +acpi_os_table_override(struct acpi_table_header *existing_table, + struct acpi_table_header **new_table) +{ + if (!existing_table || !new_table) + return AE_BAD_PARAMETER; + + *new_table = NULL; + +#ifdef CONFIG_ACPI_CUSTOM_DSDT + if (strncmp(existing_table->signature, "DSDT", 4) == 0) + *new_table = (struct acpi_table_header *)AmlCode; #endif + if (*new_table != NULL) + acpi_table_taint(existing_table); + return AE_OK; } static irqreturn_t acpi_irq(int irq, void *dev_id) -- cgit v0.10.2 From c85cc817e5b6c45a78c3b34170dfeb6469b56d82 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Wed, 2 Mar 2016 14:16:25 +0800 Subject: ACPI / OSL: Add support to install tables via initrd This patch adds support to install tables from initrd. If a table in the initrd wasn't used by the override mechanism, the table would be installed after initializing all RSDT/XSDT tables. Link: https://lkml.org/lkml/2014/2/28/368 Reported-by: Thomas Renninger Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 1e6833a..27c2cb9 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -20,6 +20,7 @@ #define PREFIX "ACPI: " +void acpi_initrd_initialize_tables(void); acpi_status acpi_os_initialize1(void); void init_acpi_device_notify(void); int acpi_scan_init(void); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index a8c417c..814d5f8 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -644,6 +644,7 @@ static const char * const table_sigs[] = { #define ACPI_OVERRIDE_TABLES 64 static struct cpio_data __initdata acpi_initrd_files[ACPI_OVERRIDE_TABLES]; +static DECLARE_BITMAP(acpi_initrd_installed, ACPI_OVERRIDE_TABLES); #define MAP_CHUNK_SIZE (NR_FIX_BTMAPS << PAGE_SHIFT) @@ -760,6 +761,7 @@ acpi_os_physical_table_override(struct acpi_table_header *existing_table, acpi_physical_address *address, u32 *length) { int table_offset = 0; + int table_index = 0; struct acpi_table_header *table; u32 table_length; @@ -780,7 +782,8 @@ acpi_os_physical_table_override(struct acpi_table_header *existing_table, table_length = table->length; /* Only override tables matched */ - if (memcmp(existing_table->signature, table->signature, 4) || + if (test_bit(table_index, acpi_initrd_installed) || + memcmp(existing_table->signature, table->signature, 4) || memcmp(table->oem_table_id, existing_table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE)) { acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); @@ -791,13 +794,54 @@ acpi_os_physical_table_override(struct acpi_table_header *existing_table, *address = acpi_tables_addr + table_offset; acpi_table_taint(existing_table); acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + set_bit(table_index, acpi_initrd_installed); break; next_table: table_offset += table_length; + table_index++; } return AE_OK; } + +void __init acpi_initrd_initialize_tables(void) +{ + int table_offset = 0; + int table_index = 0; + u32 table_length; + struct acpi_table_header *table; + + if (!acpi_tables_addr) + return; + + while (table_offset + ACPI_HEADER_SIZE <= all_tables_size) { + table = acpi_os_map_memory(acpi_tables_addr + table_offset, + ACPI_HEADER_SIZE); + if (table_offset + table->length > all_tables_size) { + acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + WARN_ON(1); + return; + } + + table_length = table->length; + + /* Skip RSDT/XSDT which should only be used for override */ + if (test_bit(table_index, acpi_initrd_installed) || + ACPI_COMPARE_NAME(table->signature, 
ACPI_SIG_RSDT) || + ACPI_COMPARE_NAME(table->signature, ACPI_SIG_XSDT)) { + acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + goto next_table; + } + + acpi_table_taint(table); + acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); + acpi_install_table(acpi_tables_addr + table_offset, TRUE); + set_bit(table_index, acpi_initrd_installed); +next_table: + table_offset += table_length; + table_index++; + } +} #else acpi_status acpi_os_physical_table_override(struct acpi_table_header *existing_table, @@ -808,6 +852,10 @@ acpi_os_physical_table_override(struct acpi_table_header *existing_table, *address = 0; return AE_OK; } + +void __init acpi_initrd_initialize_tables(void) +{ +} #endif /* CONFIG_ACPI_INITRD_TABLE_OVERRIDE */ acpi_status diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 6c0f079..57c0a45 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -32,6 +32,7 @@ #include #include #include +#include "internal.h" #define ACPI_MAX_TABLES 128 @@ -456,6 +457,7 @@ int __init acpi_table_init(void) status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0); if (ACPI_FAILURE(status)) return -EINVAL; + acpi_initrd_initialize_tables(); check_multiple_madt(); return 0; -- cgit v0.10.2 From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 20:44:47 +0100 Subject: cpufreq: Move scheduler-related code to the sched directory Create cpufreq.c under kernel/sched/ and move the cpufreq code related to the scheduler to that file and to sched.h. Redefine cpufreq_update_util() as a static inline function to avoid function calls at its call sites in the scheduler code (as suggested by Peter Zijlstra). Also move the definition of struct update_util_data and declaration of cpufreq_set_update_util_data() from include/linux/cpufreq.h to include/linux/sched.h. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6eca12a..58e1a39 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -103,59 +103,6 @@ static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); -static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); - -/** - * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. - * @cpu: The CPU to set the pointer for. - * @data: New pointer value. - * - * Set and publish the update_util_data pointer for the given CPU. That pointer - * points to a struct update_util_data object containing a callback function - * to call from cpufreq_update_util(). That function will be called from an RCU - * read-side critical section, so it must not sleep. - * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() - * right after this function to avoid use-after-free. - */ -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) -{ - if (WARN_ON(data && !data->func)) - return; - - rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); -} -EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); - -/** - * cpufreq_update_util - Take a note about CPU utilization changes. - * @time: Current time. - * @util: Current utilization. - * @max: Utilization ceiling. 
- * - * This function is called by the scheduler on every invocation of - * update_load_avg() on the CPU whose utilization is being updated. - * - * It can only be called from RCU-sched read-side critical sections. - */ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) -{ - struct update_util_data *data; - -#ifdef CONFIG_LOCKDEP - WARN_ON(debug_locks && !rcu_read_lock_sched_held()); -#endif - - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - /* - * If this isn't inside of an RCU-sched read-side critical section, data - * may become NULL after the check below. - */ - if (data) - data->func(data, time, util, max); -} - /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index db46190..1c25ef4 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -18,6 +18,7 @@ #include #include +#include #include #include "cpufreq_governor.h" diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a50c5b2..a5ea52f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -146,36 +146,6 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); - -/** - * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. - * @time: Current time. - * - * The way cpufreq is currently arranged requires it to evaluate the CPU - * performance state (frequency/voltage) on a regular basis to prevent it from - * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). - * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, - * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. 
- */ -static inline void cpufreq_trigger_update(u64 time) -{ - cpufreq_update_util(time, ULONG_MAX, 0); -} - -struct update_util_data { - void (*func)(struct update_util_data *data, - u64 time, unsigned long util, unsigned long max); -}; - -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); - unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -187,10 +157,6 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else -static inline void cpufreq_update_util(u64 time, unsigned long util, - unsigned long max) {} -static inline void cpufreq_trigger_update(u64 time) {} - static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a..913e755 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#ifdef CONFIG_CPU_FREQ +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); +#endif /* CONFIG_CPU_FREQ */ + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6768797..9507522 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 0000000..928c4ba --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. 
+ */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f042190..faf7e27 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,7 +9,6 @@ #include #include #include -#include #include "cpupri.h" #include "cpudeadline.h" @@ -1739,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ -- cgit v0.10.2 From 5508df89756f8378024828e185724a9bd2348985 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Thu, 10 Mar 2016 10:54:29 +0800 Subject: ACPICA / Interpreter: Fix a regression triggered because of wrong Linux ECDT support It is reported that the following commit triggers regressions: Linux commit: efaed9be998b5ae0afb7458e057e5f4402b43fa0 ACPICA commit: 31178590dde82368fdb0f6b0e466b6c0add96c57 Subject: ACPICA: Events: Enhance acpi_ev_execute_reg_method() to ensure no _REG evaluations can happen during OS early boot stages This is because ECDT support is not correct in Linux: Linux requires _REG to be executed for the ECDT (though this sounds wrong), so we need to ensure acpi_gbl_namespace_initialized is set before ECDT probing in order for _REG to be executed. Since we have to move "acpi_gbl_namespace_initialized = TRUE" to the initialization step happening before ECDT probing, acpi_load_tables() is the best candidate for now. Thus this patch fixes the regression by doing so.
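In outline, the boot-time ordering after this change looks as follows (a simplified sketch assembled from the diffs below, not literal ACPICA code; error handling and flag details omitted):

	acpi_initialize_subsystem();
	acpi_load_tables();
		/* now also: install the default region handlers, load the
		 * namespace, run acpi_ns_initialize_objects() when
		 * acpi_gbl_group_module_level_code is FALSE, and set
		 * acpi_gbl_namespace_initialized = TRUE */
	/* ECDT probing, and the _REG evaluation it requires, happens after this */
	acpi_enable_subsystem(flags);
	acpi_initialize_objects(flags);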
But if the ECDT support is fixed, Linux will not execute _REG for ECDT, and ECDT probing will happen before acpi_load_tables(). At that time, we still want to ensure acpi_gbl_namespace_initialized is set after executing acpi_ns_initialize_objects() (under the condition of acpi_gbl_group_module_level_code = FALSE), this patch also moves acpi_ns_initialize_objects() to acpi_load_tables() accordingly. Since acpi_ns_initialize_objects() doesn't seem to be skippable, this patch also removes ACPI_NO_OBJECT_INIT for the one invoked in acpi_load_tables(). And since the default region handlers should always be installed before loading the tables, this patch also removes useless acpi_gbl_group_module_level_code check accordingly. Reported by Chris Bainbridge, Fixed by Lv Zheng. Fixes: efaed9be998b (ACPICA: Events: Enhance acpi_ev_execute_reg_method() to ensure no _REG evaluations can happen during OS early boot stages) Reported-and-tested-by: Chris Bainbridge Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index f029a3d..d4aa8b6 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -84,6 +84,8 @@ acpi_status acpi_ns_initialize_objects(void) ACPI_FUNCTION_TRACE(ns_initialize_objects); + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "[Init] Completing Initialization of ACPI Objects\n")); ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "**** Starting initialization of namespace objects ****\n")); ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c index 9496b84..3151968 100644 --- a/drivers/acpi/acpica/tbxfload.c +++ b/drivers/acpi/acpica/tbxfload.c @@ -81,13 +81,11 @@ acpi_status __init acpi_load_tables(void) * between acpi_initialize_subsystem() and acpi_load_tables() to use * their customized default region handlers. */ - if (acpi_gbl_group_module_level_code) { - status = acpi_ev_install_region_handlers(); - if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { - ACPI_EXCEPTION((AE_INFO, status, - "During Region initialization")); - return_ACPI_STATUS(status); - } + status = acpi_ev_install_region_handlers(); + if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { + ACPI_EXCEPTION((AE_INFO, status, + "During Region initialization")); + return_ACPI_STATUS(status); } /* Load the namespace from the tables */ @@ -105,6 +103,20 @@ acpi_status __init acpi_load_tables(void) "While loading namespace from ACPI tables")); } + if (!acpi_gbl_group_module_level_code) { + /* + * Initialize the objects that remain uninitialized. This + * runs the executable AML that may be part of the + * declaration of these objects: + * operation_regions, buffer_fields, Buffers, and Packages. + */ + status = acpi_ns_initialize_objects(); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + } + + acpi_gbl_namespace_initialized = TRUE; return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index d70649d..75b5f27 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -154,23 +154,6 @@ acpi_status __init acpi_enable_subsystem(u32 flags) */ acpi_gbl_early_initialization = FALSE; - /* - * Install the default operation region handlers. These are the - * handlers that are defined by the ACPI specification to be - * "always accessible" -- namely, system_memory, system_IO, and - * PCI_Config. This also means that no _REG methods need to be - * run for these address spaces. 
We need to have these handlers - * installed before any AML code can be executed, especially any - * module-level code (11/2015). - */ - if (!acpi_gbl_group_module_level_code) { - status = acpi_ev_install_region_handlers(); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "During Region initialization")); - return_ACPI_STATUS(status); - } - } #if (!ACPI_REDUCED_HARDWARE) /* Enable ACPI mode */ @@ -284,25 +267,21 @@ acpi_status __init acpi_initialize_objects(u32 flags) */ if (acpi_gbl_group_module_level_code) { acpi_ns_exec_module_code_list(); - } - /* - * Initialize the objects that remain uninitialized. This runs the - * executable AML that may be part of the declaration of these objects: - * operation_regions, buffer_fields, Buffers, and Packages. - */ - if (!(flags & ACPI_NO_OBJECT_INIT)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "[Init] Completing Initialization of ACPI Objects\n")); - - status = acpi_ns_initialize_objects(); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); + /* + * Initialize the objects that remain uninitialized. This + * runs the executable AML that may be part of the + * declaration of these objects: + * operation_regions, buffer_fields, Buffers, and Packages. + */ + if (!(flags & ACPI_NO_OBJECT_INIT)) { + status = acpi_ns_initialize_objects(); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } } } - acpi_gbl_namespace_initialized = TRUE; - /* * Initialize all device/region objects in the namespace. This runs * the device _STA and _INI methods and region _REG methods. -- cgit v0.10.2 From 0d67e0fa1664ad6aaba0552e170608bafb4b6196 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 10 Mar 2016 13:03:18 +0200 Subject: device property: fix for a case of use-after-free In device_remove_property_set(), the secondary fwnode needs to be cleared before the pset is freed. This fixes a use-after-free when a property set is providing the primary fwnode. As a result of the fix, the primary fwnode may end up containing ERR_PTR(-ENODEV), so also adding checks for it to the property handling code. Reported-by: John Youn Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/base/property.c b/drivers/base/property.c index a163f2c5..76628a7 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -218,7 +218,8 @@ bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname) bool ret; ret = __fwnode_property_present(fwnode, propname); - if (ret == false && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) + if (ret == false && !IS_ERR_OR_NULL(fwnode) && + !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_present(fwnode->secondary, propname); return ret; } @@ -423,7 +424,8 @@ EXPORT_SYMBOL_GPL(device_property_match_string); int _ret_; \ _ret_ = FWNODE_PROP_READ(_fwnode_, _propname_, _type_, _proptype_, \ _val_, _nval_); \ - if (_ret_ == -EINVAL && _fwnode_ && !IS_ERR_OR_NULL(_fwnode_->secondary)) \ + if (_ret_ == -EINVAL && !IS_ERR_OR_NULL(_fwnode_) && \ + !IS_ERR_OR_NULL(_fwnode_->secondary)) \ _ret_ = FWNODE_PROP_READ(_fwnode_->secondary, _propname_, _type_, \ _proptype_, _val_, _nval_); \ _ret_; \ @@ -593,7 +595,8 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode, int ret; ret = __fwnode_property_read_string_array(fwnode, propname, val, nval); - if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) + if (ret == -EINVAL && !IS_ERR_OR_NULL(fwnode) && + !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_read_string_array(fwnode->secondary, propname, val, nval); return ret; @@ -621,7 +624,8 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode, int ret; ret = __fwnode_property_read_string(fwnode, propname, val); - if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) + if (ret == -EINVAL && !IS_ERR_OR_NULL(fwnode) && + !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_read_string(fwnode->secondary, propname, val); return ret; @@ -820,11 +824,16 @@ void device_remove_property_set(struct device *dev) * the pset. If there is no real firmware node (ACPI/DT) primary * will hold the pset. */ - if (!is_pset_node(fwnode)) - fwnode = fwnode->secondary; - if (!IS_ERR(fwnode) && is_pset_node(fwnode)) + if (is_pset_node(fwnode)) { + set_primary_fwnode(dev, NULL); pset_free_set(to_pset_node(fwnode)); - set_secondary_fwnode(dev, NULL); + } else { + fwnode = fwnode->secondary; + if (!IS_ERR(fwnode) && is_pset_node(fwnode)) { + set_secondary_fwnode(dev, NULL); + pset_free_set(to_pset_node(fwnode)); + } + } } EXPORT_SYMBOL_GPL(device_remove_property_set); -- cgit v0.10.2 From e4e666ba74d4f6eff04d3c567b9f2d50a46837e4 Mon Sep 17 00:00:00 2001 From: Xiangliang Yu Date: Thu, 10 Mar 2016 19:34:52 +0800 Subject: i2c: designware: Add device HID for future AMD I2C controller Add device HID AMDI0010 to match the AMD ACPI Vendor ID (AMDI) that was registered in http://www.uefi.org/acpi_id_list, and the I2C controller on future AMD platforms will use this HID instead of AMD0010. Signed-off-by: Xiangliang Yu Acked-by: Jarkko Nikula Signed-off-by: Rafael J.
Wysocki diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index d507cf6..d0aad06 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -143,6 +143,7 @@ static const struct acpi_device_id acpi_apd_device_ids[] = { /* Generic apd devices */ #ifdef CONFIG_X86_AMD_PLATFORM_DEVICE { "AMD0010", APD_ADDR(cz_i2c_desc) }, + { "AMDI0010", APD_ADDR(cz_i2c_desc) }, { "AMD0020", APD_ADDR(cz_uart_desc) }, { "AMD0030", }, #endif diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 438f1b4..d656657 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -123,6 +123,7 @@ static const struct acpi_device_id dw_i2c_acpi_match[] = { { "80860F41", 0 }, { "808622C1", 0 }, { "AMD0010", ACCESS_INTR_MASK }, + { "AMDI0010", ACCESS_INTR_MASK }, { "AMDI0510", 0 }, { "APMC0D0F", 0 }, { } -- cgit v0.10.2 From b54a0dfd56d5c314302987e0ff173b3f1bfcb555 Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Tue, 8 Mar 2016 10:31:14 +0100 Subject: intel_pstate: Remove extra conversions in pid calculation pid->setpoint and pid->deadband can be initialized in fixed point, so we can avoid the int_tofp in pid_calc. Signed-off-by: Philippe Longepe Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 23bb798..864214d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -198,8 +198,8 @@ static struct perf_limits *limits = &powersave_limits; static inline void pid_reset(struct _pid *pid, int setpoint, int busy, int deadband, int integral) { - pid->setpoint = setpoint; - pid->deadband = deadband; + pid->setpoint = int_tofp(setpoint); + pid->deadband = int_tofp(deadband); pid->integral = int_tofp(integral); pid->last_err = int_tofp(setpoint) - int_tofp(busy); } @@ -225,9 +225,9 @@ static signed int pid_calc(struct _pid *pid, int32_t busy) int32_t pterm, dterm, fp_error; int32_t integral_limit; - fp_error = int_tofp(pid->setpoint) - busy; + fp_error = pid->setpoint - busy; - if (abs(fp_error) <= int_tofp(pid->deadband)) + if (abs(fp_error) <= pid->deadband) return 0; pterm = mul_fp(pid->p_gain, fp_error); -- cgit v0.10.2 From a158bed5dc92bd83338225135d448958e0b3745d Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Sun, 6 Mar 2016 08:34:04 +0100 Subject: intel_pstate: Optimize calculation for max/min_perf_adj mul_fp(int_tofp(A), B) expands to: ((A << FRAC_BITS) * B) >> FRAC_BITS, so the same result can be obtained via simple multiplication A * B. Apply this observation to max_perf * limits->max_perf and max_perf * limits->min_perf in intel_pstate_get_min_max()." Signed-off-by: Philippe Longepe Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 864214d..5b5bfc1 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -831,11 +831,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) * policy, or by cpu specific default values determined through * experimentation. 
*/ - max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits->max_perf)); + max_perf_adj = fp_toint(max_perf * limits->max_perf); *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); - min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits->min_perf)); + min_perf = fp_toint(max_perf * limits->min_perf); *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } -- cgit v0.10.2 From 7349ec0470b62820ae226e30770b9d84a53ced9d Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Sun, 6 Mar 2016 08:34:05 +0100 Subject: intel_pstate: Move intel_pstate_calc_busy() into get_target_pstate_use_performance() The cpu_load algorithm doesn't need to invoke intel_pstate_calc_busy(), so move that call from intel_pstate_sample() to get_target_pstate_use_performance(). Signed-off-by: Philippe Longepe Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5b5bfc1..95cc217 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -915,8 +915,6 @@ static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->sample.mperf -= cpu->prev_mperf; cpu->sample.tsc -= cpu->prev_tsc; - intel_pstate_calc_busy(cpu); - cpu->prev_aperf = aperf; cpu->prev_mperf = mperf; cpu->prev_tsc = tsc; @@ -945,7 +943,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) mperf = cpu->sample.mperf + delta_iowait_mperf; cpu->prev_cummulative_iowait = cummulative_iowait; - /* * The load can be estimated as the ratio of the mperf counter * running at a constant frequency during active periods @@ -963,6 +960,8 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) int32_t core_busy, max_pstate, current_pstate, sample_ratio; u64 duration_ns; + intel_pstate_calc_busy(cpu); + /* * core_busy is the ratio of actual performance to max * max_pstate is the max non turbo pstate available -- cgit v0.10.2 From 8fa520af50817d5f30d293f507c937f561b3e6b9 Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Sun, 6 Mar 2016 08:34:06 +0100 Subject: intel_pstate: Remove freq calculation from intel_pstate_calc_busy() Use a helper function to compute the average pstate and call it only where it is needed (only when tracing or in intel_pstate_get). Signed-off-by: Philippe Longepe Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 95cc217..4acb904 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -881,12 +881,6 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) core_pct = int_tofp(sample->aperf) * int_tofp(100); core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); - sample->freq = fp_toint( - mul_fp(int_tofp( - cpu->pstate.max_pstate_physical * - cpu->pstate.scaling / 100), - core_pct)); - sample->core_pct_busy = (int32_t)core_pct; } @@ -920,6 +914,12 @@ static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->prev_tsc = tsc; } +static inline int32_t get_avg_frequency(struct cpudata *cpu) +{ + return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf * + cpu->pstate.scaling, cpu->sample.mperf); +} + static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) { struct sample *sample = &cpu->sample; @@ -1015,7 +1015,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) sample->mperf, sample->aperf, sample->tsc, - sample->freq); + get_avg_frequency(cpu)); } static void intel_pstate_update_util(struct update_util_data *data, u64 time, @@ -1104,7 +1104,7 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) if (!cpu) return 0; sample = &cpu->sample; - return sample->freq; + return get_avg_frequency(cpu); } static int intel_pstate_set_policy(struct cpufreq_policy *policy) -- cgit v0.10.2 From 4fec7ad5f637159525265a45f66482cf8817b45f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 23:45:19 +0100 Subject: intel_pstate: Do not skip samples partially If the current value of MPERF or the current value of TSC is the same as the previous one, respectively, intel_pstate_sample() bails out early and skips the sample. However, intel_pstate_adjust_busy_pstate() is still called in that case which is not correct, so modify intel_pstate_sample() to return a bool value indicating whether or not the sample has been taken and use it to decide whether or not to call intel_pstate_adjust_busy_pstate(). While at it, remove redundant parentheses from the MPERF/TSC check in intel_pstate_sample(). Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4acb904..cb56074 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -884,7 +884,7 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) sample->core_pct_busy = (int32_t)core_pct; } -static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) +static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) { u64 aperf, mperf; unsigned long flags; @@ -894,9 +894,9 @@ static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); tsc = rdtsc(); - if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) { + if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { local_irq_restore(flags); - return; + return false; } local_irq_restore(flags); @@ -912,6 +912,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->prev_aperf = aperf; cpu->prev_mperf = mperf; cpu->prev_tsc = tsc; + return true; } static inline int32_t get_avg_frequency(struct cpudata *cpu) @@ -1025,8 +1026,9 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, u64 delta_ns = time - cpu->sample.time; if ((s64)delta_ns >= pid_params.sample_rate_ns) { - intel_pstate_sample(cpu, time); - if (!hwp_active) + bool sample_taken = intel_pstate_sample(cpu, time); + + if (sample_taken && !hwp_active) intel_pstate_adjust_busy_pstate(cpu); } } -- cgit v0.10.2 From e56c92565dfe2c8de203f42f7d5237d14784b93d Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Tue, 8 Mar 2016 10:52:11 -0500 Subject: ACPI / APEI: Fix leaked resources We leak the NVS and arch resources (if used), in apei_resources_request. They are allocated to make sure we exclude them from the APEI resources, but they are never freed at the end of the function. Free them now. Signed-off-by: Josh Hunt Reviewed-by: Chen, Gong Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index a2c8d7a..da370e1 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -536,7 +536,8 @@ int apei_resources_request(struct apei_resources *resources, goto err_unmap_ioport; } - return 0; + goto arch_res_fini; + err_unmap_ioport: list_for_each_entry(res, &resources->ioport, list) { if (res == res_bak) @@ -551,7 +552,8 @@ err_unmap_iomem: release_mem_region(res->start, res->end - res->start); } arch_res_fini: - apei_resources_fini(&arch_res); + if (arch_apei_filter_addr) + apei_resources_fini(&arch_res); nvs_res_fini: apei_resources_fini(&nvs_resources); return rc; -- cgit v0.10.2 From 3a03d126ece4388d05937090a87bade560117751 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Tue, 8 Mar 2016 10:52:12 -0500 Subject: ACPI / APEI: ERST: Fixed leaked resources in erst_init erst_init currently leaks resources allocated from its call to apei_resources_init(). The data allocated there gets copied into apei_resources_all and can be freed when we're done with it. Signed-off-by: Josh Hunt Reviewed-by: Chen, Gong Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6e6bc10..006c389 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -1207,6 +1207,9 @@ static int __init erst_init(void) "Failed to allocate %lld bytes for persistent store error log.\n", erst_erange.size); + /* Cleanup ERST Resources */ + apei_resources_fini(&erst_resources); + return 0; err_release_erange: -- cgit v0.10.2 From 2a0609c02e6558df6075f258af98a54a74b050ff Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 12 Feb 2016 22:44:48 -0500 Subject: tools/power turbostat: allow sub-sec intervals turbostat -i interval_sec will sample and display statistics every interval_sec. interval_sec used to be a whole number of seconds, but now we accept a decimal, as small as 0.001 sec (1 ms). Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 622db68..4d8f198 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -34,7 +34,7 @@ name as necessary to disambiguate it from others is necessary. Note that option \fB--debug\fP displays additional system configuration information. Invoking this parameter more than once may also enable internal turbostat debug information. .PP -\fB--interval seconds\fP overrides the default 5-second measurement interval. +\fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--help\fP displays usage for the most common parameters. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c600340..e411cc4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,12 +38,13 @@ #include #include #include +#include #include #include #include char *proc_stat = "/proc/stat"; -unsigned int interval_sec = 5; +struct timespec interval_ts = {5, 0}; unsigned int debug; unsigned int rapl_joules; unsigned int summary_only; @@ -1728,7 +1729,7 @@ restart: re_initialize(); goto restart; } - sleep(interval_sec); + nanosleep(&interval_ts, NULL); retval = for_all_cpus(get_counters, ODD_COUNTERS); if (retval < -1) { exit(retval); @@ -1742,7 +1743,7 @@ restart: compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); flush_stdout(); - sleep(interval_sec); + nanosleep(&interval_ts, NULL); retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { exit(retval); @@ -3347,7 +3348,18 @@ void cmdline(int argc, char **argv) help(); exit(1); case 'i': - interval_sec = atoi(optarg); + { + double interval = strtod(optarg, NULL); + + if (interval < 0.001) { + fprintf(stderr, "interval %f seconds is too small\n", + interval); + exit(2); + } + + interval_ts.tv_sec = interval; + interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000; + } break; case 'J': rapl_joules++; -- cgit v0.10.2 From 121b48bb77187cf2ed3053e147d2e6c1e864083c Mon Sep 17 00:00:00 2001 From: "Chrzaniuk, Hubert" Date: Wed, 10 Feb 2016 16:35:17 +0100 Subject: tools/power turbostat: Intel Xeon x200: fix erroneous bclk value x200 does not provide any way to programmatically obtain the bus clock speed. Bclk for the architecture has a fixed value of 100 MHz. At the same time, x200 cannot be included in has_snb_msrs() since it does not support the C7 idle state. Prior to this patch, MHz values reported on this chip were erroneously calculated using a bclk of 133 MHz, causing MHz values to be reported 33% higher than actual.
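To make the error concrete (the ratio of 14 is hypothetical, chosen only for the arithmetic): a max turbo ratio of 14 would be reported as 14 * 133 = 1862 MHz with the erroneous bclk, whereas the correct value is 14 * 100 = 1400 MHz, i.e. 33% too high.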
Signed-off-by: Hubert Chrzaniuk Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e411cc4..652d08d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2697,7 +2697,7 @@ double slm_bclk(void) double discover_bclk(unsigned int family, unsigned int model) { - if (has_snb_msrs(family, model)) + if (has_snb_msrs(family, model) || is_knl(family, model)) return 100.00; else if (is_slm(family, model)) return slm_bclk(); -- cgit v0.10.2 From cbf97abaf3689652bcddc0741dc49629d1838142 Mon Sep 17 00:00:00 2001 From: Hubert Chrzaniuk Date: Wed, 10 Feb 2016 14:55:22 +0100 Subject: tools/power turbostat: Intel Xeon x200: fix turbo-ratio decoding Following changes have been made: - changed MSR_NHM_TURBO_RATIO_LIMIT to MSR_TURBO_RATIO_LIMIT in debug print for consistency with Developer Manual - updated definition of bitfields in MSR_TURBO_RATIO_LIMIT and appropriate parsing code - added x200 to list of architectures that do not support Nahlem compatible definition of MSR_TURBO_RATIO_LIMIT register (x200 has the register but bits definition is custom) - fixed typo in code that parses MSR_TURBO_RATIO_LIMIT (logical instead of bitwise operator) - changed MSR_TURBO_RATIO_LIMIT parsing algorithm so the print out had the same order as implementations for other platforms Signed-off-by: Hubert Chrzaniuk Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 652d08d..4e5386d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1328,21 +1328,23 @@ dump_nhm_turbo_ratio_limits(void) static void dump_knl_turbo_ratio_limits(void) { - int cores; - unsigned int ratio; + const unsigned int buckets_no = 7; + unsigned long long msr; - int delta_cores; - int delta_ratio; - int i; + int delta_cores, delta_ratio; + int i, b_nr; + unsigned int cores[buckets_no]; + unsigned int ratio[buckets_no]; get_msr(base_cpu, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - fprintf(stderr, "cpu%d: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", + fprintf(stderr, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); /** * Turbo encoding in KNL is as follows: - * [7:0] -- Base value of number of active cores of bucket 1. + * [0] -- Reserved + * [7:1] -- Base value of number of active cores of bucket 1. * [15:8] -- Base value of freq ratio of bucket 1. * [20:16] -- +ve delta of number of active cores of bucket 2. * i.e. active cores of bucket 2 = @@ -1361,29 +1363,25 @@ dump_knl_turbo_ratio_limits(void) * [60:56]-- +ve delta of number of active cores of bucket 7. * [63:61]-- -ve delta of freq ratio of bucket 7. */ - cores = msr & 0xFF; - ratio = (msr >> 8) && 0xFF; - if (ratio > 0) - fprintf(stderr, - "%d * %.0f = %.0f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, cores); - - for (i = 16; i < 64; i = i + 8) { + + b_nr = 0; + cores[b_nr] = (msr & 0xFF) >> 1; + ratio[b_nr] = (msr >> 8) & 0xFF; + + for (i = 16; i < 64; i += 8) { delta_cores = (msr >> i) & 0x1F; - delta_ratio = (msr >> (i + 5)) && 0x7; - if (!delta_cores || !delta_ratio) - return; - cores = cores + delta_cores; - ratio = ratio - delta_ratio; - - /** -ve ratios will make successive ratio calculations - * negative. Hence return instead of carrying on. 
- */ - if (ratio > 0) + delta_ratio = (msr >> (i + 5)) & 0x7; + + cores[b_nr + 1] = cores[b_nr] + delta_cores; + ratio[b_nr + 1] = ratio[b_nr] - delta_ratio; + b_nr++; + } + + for (i = buckets_no - 1; i >= 0; i--) + if (i > 0 ? ratio[i] != ratio[i - 1] : 1) fprintf(stderr, "%d * %.0f = %.0f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, cores); - } + ratio[i], bclk, ratio[i] * bclk, cores[i]); } static void @@ -1896,6 +1894,7 @@ int has_nhm_turbo_ratio_limit(unsigned int family, unsigned int model) /* Nehalem compatible, but do not include turbo-ratio limit support */ case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ + case 0x57: /* PHI - Knights Landing (different MSR definition) */ return 0; default: return 1; -- cgit v0.10.2 From 75d2e44e60490ba1fee076a5f4dcfbdc8598e8c4 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 13 Feb 2016 23:41:53 -0500 Subject: tools/power turbostat: re-name "%Busy" field to "Busy%" some tools processing turbostat output have difficulty with items that begin with %... Reported-by: Jacob Pan Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 4d8f198..7771eea 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -61,7 +61,7 @@ displays the statistics gathered since it was forked. .nf \fBCPU\fP Linux CPU (logical processor) number. Yes, it is okay that on many systems the CPUs are not listed in numerical order -- for efficiency reasons, turbostat runs in topology order, so HT siblings appear together. \fBAVG_MHz\fP number of cycles executed divided by time elapsed. -\fB%Busy\fP percent of the interval that the CPU retired instructions, aka. % of time in "C0" state. +\fBBusy%\fP percent of the interval that the CPU retired instructions, aka. % of time in "C0" state. \fBBzy_MHz\fP average clock rate while the CPU was busy (in "c0" state). \fBTSC_MHz\fP average MHz that the TSC ran during the entire interval. .fi @@ -89,7 +89,7 @@ Without any parameters, turbostat displays statistics ever 5 seconds. for turbostat to fork). .nf [root@hsw]# ./turbostat - CPU Avg_MHz %Busy Bzy_MHz TSC_MHz + CPU Avg_MHz Busy% Bzy_MHz TSC_MHz - 488 12.51 3898 3498 0 0 0.01 3885 3498 4 3897 99.99 3898 3498 @@ -145,7 +145,7 @@ cpu0: MSR_IA32_THERM_STATUS: 0x88340000 (48 C +/- 1) cpu1: MSR_IA32_THERM_STATUS: 0x88440000 (32 C +/- 1) cpu2: MSR_IA32_THERM_STATUS: 0x88450000 (31 C +/- 1) cpu3: MSR_IA32_THERM_STATUS: 0x88490000 (27 C +/- 1) - Core CPU Avg_MHz %Busy Bzy_MHz TSC_MHz SMI CPU%c1 CPU%c3 CPU%c6 CPU%c7 CoreTmp PkgTmp PkgWatt CorWatt GFXWatt + Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz SMI CPU%c1 CPU%c3 CPU%c6 CPU%c7 CoreTmp PkgTmp PkgWatt CorWatt GFXWatt - - 493 12.64 3898 3498 0 12.64 0.00 0.00 74.72 47 47 21.62 13.74 0.00 0 0 4 0.11 3894 3498 0 99.89 0.00 0.00 0.00 47 47 21.62 13.74 0.00 0 4 3897 99.98 3898 3498 0 0.02 @@ -178,7 +178,7 @@ until ^C while the other CPUs are mostly idle: .nf root@hsw: turbostat cat /dev/zero > /dev/null ^C - CPU Avg_MHz %Busy Bzy_MHz TSC_MHz + CPU Avg_MHz Busy% Bzy_MHz TSC_MHz - 482 12.51 3854 3498 0 0 0.01 1960 3498 4 0 0.00 2128 3498 @@ -192,12 +192,12 @@ root@hsw: turbostat cat /dev/zero > /dev/null .fi Above the cycle soaker drives cpu5 up its 3.9 GHz turbo limit. -The first row shows the average MHz and %Busy across all the processors in the system. +The first row shows the average MHz and Busy% across all the processors in the system. 
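(A consistency check using the cpu4 row in the sample above: 3898 Bzy_MHz * 99.99% Busy% works out to 3897, matching the Avg_MHz column, so the three frequency columns agree row by row.)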
Note that the Avg_MHz column reflects the total number of cycles executed -divided by the measurement interval. If the %Busy column is 100%, +divided by the measurement interval. If the Busy% column is 100%, then the processor was running at that speed the entire interval. -The Avg_MHz multiplied by the %Busy results in the Bzy_MHz -- +The Avg_MHz multiplied by the Busy% results in the Bzy_MHz -- which is the average frequency while the processor was executing -- not including any non-busy idle time. @@ -233,7 +233,7 @@ in the brand string in /proc/cpuinfo. On a system where the TSC stops in idle, TSC_MHz will drop below the processor's base frequency. -%Busy = MPERF_delta/TSC_delta +Busy% = MPERF_delta/TSC_delta Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4e5386d..947239b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -293,7 +293,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) /* * Example Format w/ field column widths: * - * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz SMI %Busy CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz SMI Busy% CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt * 123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678 */ @@ -308,7 +308,7 @@ void print_header(void) if (has_aperf) outp += sprintf(outp, " Avg_MHz"); if (has_aperf) - outp += sprintf(outp, " %%Busy"); + outp += sprintf(outp, " Busy%%"); if (has_aperf) outp += sprintf(outp, " Bzy_MHz"); outp += sprintf(outp, " TSC_MHz"); @@ -511,7 +511,7 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%8.0f", 1.0 / units * t->aperf / interval_float); - /* %Busy */ + /* Busy% */ if (has_aperf) { if (!skip_c0) outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc/tsc_tweak); -- cgit v0.10.2 From b7d8c1483bbf6ec9d2dd76d6a1c91a38c3f6ac35 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 13 Feb 2016 23:36:17 -0500 Subject: tools/power turbostat: add --out option for saving output in a file By default... Turbostat --debug configuration info goes to stderr. In FORK mode, turbostat statistics go to stderr. In PERIODIC mode, turbostat statistics go to stdout. These defaults do not change, but an option "--out file" will send all output above only to the specified file. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 7771eea..89a55d5 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -36,6 +36,9 @@ more than once may also enable internal turbostat debug information. .PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP +\fB--out output_file\fP turbostat output is written to the specified output_file. +The file is truncated if it already exists, and it is created if it does not exist. +.PP \fB--help\fP displays usage for the most common parameters. .PP \fB--Joules\fP displays energy in Joules, rather than dividing Joules by time to print power in Watts.
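A typical combined invocation (the file name here is illustrative): "turbostat --out ts.log --interval 0.5" samples every half second and sends all turbostat output to ts.log rather than to the terminal.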
@@ -83,10 +86,11 @@ Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading T \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .fi .PP -.SH EXAMPLE +.SH PERIODIC EXAMPLE Without any parameters, turbostat displays statistics ever 5 seconds. -(override interval with "-i sec" option, or specify a command -for turbostat to fork). +Periodic output goes to stdout, by default, unless --out is used to specify an output file. +The 5-second interval can be changed with th "-i sec" option. +Or a command may be specified as in "FORK EXAMPLE" below. .nf [root@hsw]# ./turbostat CPU Avg_MHz Busy% Bzy_MHz TSC_MHz @@ -171,7 +175,9 @@ The --debug option adds additional columns to the measurement ouput, including C See the field definitions above. .SH FORK EXAMPLE If turbostat is invoked with a command, it will fork that command -and output the statistics gathered when the command exits. +and output the statistics gathered after the command exits. +In this case, turbostat output goes to stderr, by default. +Output can instead be saved to a file using the --out option. eg. Here a cycle soaker is run on 1 CPU (see %c0) for a few seconds until ^C while the other CPUs are mostly idle: diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 947239b..5f9f41a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -44,6 +44,7 @@ #include char *proc_stat = "/proc/stat"; +FILE *outf; struct timespec interval_ts = {5, 0}; unsigned int debug; unsigned int rapl_joules; @@ -652,15 +653,24 @@ done: return 0; } -void flush_stdout() +void flush_output_stdout(void) { - fputs(output_buffer, stdout); - fflush(stdout); + FILE *filep; + + if (outf == stderr) + filep = stdout; + else + filep = outf; + + fputs(output_buffer, filep); + fflush(filep); + outp = output_buffer; } -void flush_stderr() +void flush_output_stderr(void) { - fputs(output_buffer, stderr); + fputs(output_buffer, outf); + fflush(outf); outp = output_buffer; } void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -752,9 +762,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, } else { if (!aperf_mperf_unstable) { - fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname); - fprintf(stderr, "* Frequency results do not cover entire interval *\n"); - fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n"); + fprintf(outf, "%s: APERF or MPERF went backwards *\n", progname); + fprintf(outf, "* Frequency results do not cover entire interval *\n"); + fprintf(outf, "* fix this by running Linux-2.6.30 or later *\n"); aperf_mperf_unstable = 1; } @@ -789,7 +799,8 @@ delta_thread(struct thread_data *new, struct thread_data *old, } if (old->mperf == 0) { - if (debug > 1) fprintf(stderr, "cpu%d MPERF 0!\n", old->cpu_id); + if (debug > 1) + fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id); old->mperf = 1; /* divide by 0 protection */ } @@ -989,7 +1000,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) unsigned long long msr; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } @@ -1182,18 +1193,18 @@ dump_nhm_platform_info(void) get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); - fprintf(stderr, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 40) & 0xFF; - 
fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency frequency\n", + fprintf(outf, "%d * %.0f = %.0f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; - fprintf(stderr, "%d * %.0f = %.0f MHz base frequency\n", + fprintf(outf, "%d * %.0f = %.0f MHz base frequency\n", ratio, bclk, ratio * bclk); get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); - fprintf(stderr, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", + fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); return; @@ -1207,16 +1218,16 @@ dump_hsw_turbo_ratio_limits(void) get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr); - fprintf(stderr, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 18 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 17 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk); return; } @@ -1229,46 +1240,46 @@ dump_ivt_turbo_ratio_limits(void) get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr); - fprintf(stderr, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 56) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 16 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 48) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 15 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 40) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 14 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 32) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 13 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 24) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 12 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 11 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 10 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 9 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk); return; } @@ -1281,46 +1292,46 @@ dump_nhm_turbo_ratio_limits(void) get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr); - fprintf(stderr, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 56) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 8 active 
cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 8 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 48) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 7 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 7 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 40) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 6 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 6 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 32) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 5 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 5 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 24) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 4 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 3 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 2 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(stderr, "%d * %.0f = %.0f MHz max turbo 1 active cores\n", + fprintf(outf, "%d * %.0f = %.0f MHz max turbo 1 active cores\n", ratio, bclk, ratio * bclk); return; } @@ -1338,7 +1349,7 @@ dump_knl_turbo_ratio_limits(void) get_msr(base_cpu, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - fprintf(stderr, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); /** @@ -1379,7 +1390,7 @@ dump_knl_turbo_ratio_limits(void) for (i = buckets_no - 1; i >= 0; i--) if (i > 0 ? ratio[i] != ratio[i - 1] : 1) - fprintf(stderr, + fprintf(outf, "%d * %.0f = %.0f MHz max turbo %d active cores\n", ratio[i], bclk, ratio[i] * bclk, cores[i]); } @@ -1394,9 +1405,9 @@ dump_nhm_cst_cfg(void) #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) - fprintf(stderr, "cpu%d: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x%08llx", base_cpu, msr); - fprintf(stderr, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: %s)\n", + fprintf(outf, " (%s%s%s%s%slocked: pkg-cstate-limit=%d: %s)\n", (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "", (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", (msr & NHM_C3_AUTO_DEMOTE) ? 
"demote-C3, " : "", @@ -1413,41 +1424,41 @@ dump_config_tdp(void) unsigned long long msr; get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr); - fprintf(stderr, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr); - fprintf(stderr, " (base_ratio=%d)\n", (unsigned int)msr & 0xEF); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr); + fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xEF); get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr); - fprintf(stderr, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr); if (msr) { - fprintf(stderr, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0xEFFF); - fprintf(stderr, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0xEFFF); - fprintf(stderr, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); - fprintf(stderr, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0xEFFF); + fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0xEFFF); + fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0xEFFF); + fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); + fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0xEFFF); } - fprintf(stderr, ")\n"); + fprintf(outf, ")\n"); get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr); - fprintf(stderr, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr); if (msr) { - fprintf(stderr, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0xEFFF); - fprintf(stderr, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0xEFFF); - fprintf(stderr, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); - fprintf(stderr, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0xEFFF); + fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0xEFFF); + fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0xEFFF); + fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); + fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0xEFFF); } - fprintf(stderr, ")\n"); + fprintf(outf, ")\n"); get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr); - fprintf(stderr, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr); if ((msr) & 0x3) - fprintf(stderr, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3); - fprintf(stderr, " lock=%d", (unsigned int)(msr >> 31) & 1); - fprintf(stderr, ")\n"); + fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3); + fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); + fprintf(outf, ")\n"); get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr); - fprintf(stderr, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr); - fprintf(stderr, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0x7F); - fprintf(stderr, " lock=%d", (unsigned int)(msr >> 31) & 1); - fprintf(stderr, ")\n"); + fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr); + fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0x7F); + fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); + fprintf(outf, ")\n"); } void free_all_buffers(void) @@ -1486,7 +1497,7 @@ void free_all_buffers(void) */ FILE *fopen_or_die(const char *path, const char *mode) { - FILE *filep = fopen(path, "r"); + FILE *filep = fopen(path, mode); if (!filep) err(1, "%s: open failed", path); return filep; @@ -1740,7 +1751,7 @@ restart: for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS); 
compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); - flush_stdout(); + flush_output_stdout(); nanosleep(&interval_ts, NULL); retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { @@ -1754,7 +1765,7 @@ restart: for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS); compute_average(ODD_COUNTERS); format_all_counters(ODD_COUNTERS); - flush_stdout(); + flush_output_stdout(); } } @@ -2022,7 +2033,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } @@ -2043,7 +2054,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) epb_string = "custom"; break; } - fprintf(stderr, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); + fprintf(outf, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); return 0; } @@ -2066,14 +2077,14 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } if (get_msr(cpu, MSR_PM_ENABLE, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", + fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-"); /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ @@ -2083,7 +2094,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx " + fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx " "(high 0x%x guar 0x%x eff 0x%x low 0x%x)\n", cpu, msr, (unsigned int)HWP_HIGHEST_PERF(msr), @@ -2094,7 +2105,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_HWP_REQUEST, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_HWP_REQUEST: 0x%08llx " + fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx " "(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x pkg 0x%x)\n", cpu, msr, (unsigned int)(((msr) >> 0) & 0xff), @@ -2108,7 +2119,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx " + fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx " "(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x)\n", cpu, msr, (unsigned int)(((msr) >> 0) & 0xff), @@ -2121,7 +2132,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx " + fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx " "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n", cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", @@ -2130,7 +2141,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_HWP_STATUS, &msr)) return 0; - fprintf(stderr, "cpu%d: MSR_HWP_STATUS: 0x%08llx " + fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx " "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", cpu, msr, ((msr) & 0x1) ? 
"" : "No-", @@ -2154,14 +2165,14 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return 0; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } if (do_core_perf_limit_reasons) { get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr); - fprintf(stderr, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); - fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (msr & 1 << 15) ? "bit15, " : "", (msr & 1 << 14) ? "bit14, " : "", (msr & 1 << 13) ? "Transitions, " : "", @@ -2176,7 +2187,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 2) ? "bit2, " : "", (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : ""); - fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", + fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", (msr & 1 << 31) ? "bit31, " : "", (msr & 1 << 30) ? "bit30, " : "", (msr & 1 << 29) ? "Transitions, " : "", @@ -2195,8 +2206,8 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data } if (do_gfx_perf_limit_reasons) { get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr); - fprintf(stderr, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); - fprintf(stderr, " (Active: %s%s%s%s%s%s%s%s)", + fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)", (msr & 1 << 0) ? "PROCHOT, " : "", (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 4) ? "Graphics, " : "", @@ -2205,7 +2216,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 9) ? "GFXPwr, " : "", (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); - fprintf(stderr, " (Logged: %s%s%s%s%s%s%s%s)\n", + fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 20) ? "Graphics, " : "", @@ -2217,15 +2228,15 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data } if (do_ring_perf_limit_reasons) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); - fprintf(stderr, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); - fprintf(stderr, " (Active: %s%s%s%s%s%s)", + fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); + fprintf(outf, " (Active: %s%s%s%s%s%s)", (msr & 1 << 0) ? "PROCHOT, " : "", (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); - fprintf(stderr, " (Logged: %s%s%s%s%s%s)\n", + fprintf(outf, " (Logged: %s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 22) ? "VR-Therm, " : "", @@ -2348,7 +2359,7 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (debug) - fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); + fprintf(outf, "RAPL: %.0f sec. 
Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); return; } @@ -2390,7 +2401,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } @@ -2399,7 +2410,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; dts = (msr >> 16) & 0x7F; - fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tcc_activation_temp - dts); #ifdef THERM_DEBUG @@ -2408,7 +2419,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; - fprintf(stderr, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); #endif } @@ -2422,7 +2433,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; resolution = (msr >> 27) & 0xF; - fprintf(stderr, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", cpu, msr, tcc_activation_temp - dts, resolution); #ifdef THERM_DEBUG @@ -2431,7 +2442,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; - fprintf(stderr, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); #endif } @@ -2441,7 +2452,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p void print_power_limit_msr(int cpu, unsigned long long msr, char *label) { - fprintf(stderr, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n", + fprintf(outf, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n", cpu, label, ((msr >> 15) & 1) ? 
"EN" : "DIS", ((msr >> 0) & 0x7FFF) * rapl_power_units, @@ -2465,7 +2476,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } @@ -2473,7 +2484,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; if (debug) { - fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " + fprintf(outf, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " "(%f Watts, %f Joules, %f sec.)\n", cpu, msr, rapl_power_units, rapl_energy_units, rapl_time_units); } @@ -2483,7 +2494,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; - fprintf(stderr, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, @@ -2496,11 +2507,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; - fprintf(stderr, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", + fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 63) & 1 ? "": "UN"); print_power_limit_msr(cpu, msr, "PKG Limit #1"); - fprintf(stderr, "cpu%d: PKG Limit #2: %sabled (%f Watts, %f* sec, clamp %sabled)\n", + fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%f Watts, %f* sec, clamp %sabled)\n", cpu, ((msr >> 47) & 1) ? "EN" : "DIS", ((msr >> 32) & 0x7FFF) * rapl_power_units, @@ -2512,7 +2523,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) return -6; - fprintf(stderr, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", + fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, @@ -2522,7 +2533,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (do_rapl & RAPL_DRAM) { if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; - fprintf(stderr, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", + fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "": "UN"); print_power_limit_msr(cpu, msr, "DRAM Limit"); @@ -2532,7 +2543,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; - fprintf(stderr, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); + fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } } if (do_rapl & RAPL_CORES) { @@ -2540,7 +2551,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; - fprintf(stderr, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", + fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? 
"": "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } @@ -2550,11 +2561,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP1_POLICY, &msr)) return -8; - fprintf(stderr, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF); + fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF); if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) return -9; - fprintf(stderr, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", + fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "": "UN"); print_power_limit_msr(cpu, msr, "GFX Limit"); } @@ -2680,16 +2691,16 @@ double slm_bclk(void) double freq; if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) - fprintf(stderr, "SLM BCLK: unknown\n"); + fprintf(outf, "SLM BCLK: unknown\n"); i = msr & 0xf; if (i >= SLM_BCLK_FREQS) { - fprintf(stderr, "SLM BCLK[%d] invalid\n", i); + fprintf(outf, "SLM BCLK[%d] invalid\n", i); msr = 3; } freq = slm_freq_table[i]; - fprintf(stderr, "SLM BCLK: %.1f Mhz\n", freq); + fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); return freq; } @@ -2732,13 +2743,13 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk cpu = t->cpu_id; if (cpu_migrate(cpu)) { - fprintf(stderr, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } if (tcc_activation_temp_override != 0) { tcc_activation_temp = tcc_activation_temp_override; - fprintf(stderr, "cpu%d: Using cmdline TCC Target (%d C)\n", + fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tcc_activation_temp); return 0; } @@ -2753,7 +2764,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk target_c_local = (msr >> 16) & 0xFF; if (debug) - fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, target_c_local); if (!target_c_local) @@ -2765,7 +2776,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk guess: tcc_activation_temp = TJMAX_DEFAULT; - fprintf(stderr, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", + fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tcc_activation_temp); return 0; @@ -2776,7 +2787,7 @@ void decode_misc_enable_msr(void) unsigned long long msr; if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr)) - fprintf(stderr, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%s %s %s)\n", + fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%s %s %s)\n", base_cpu, msr, msr & (1 << 3) ? "TCC" : "", msr & (1 << 16) ? "EIST" : "", @@ -2798,7 +2809,7 @@ void decode_misc_pwr_mgmt_msr(void) return; if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) - fprintf(stderr, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB)\n", + fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB)\n", base_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? 
"EN" : "DIS"); @@ -2817,7 +2828,7 @@ void process_cpuid() genuine_intel = 1; if (debug) - fprintf(stderr, "CPUID(0): %.4s%.4s%.4s ", + fprintf(outf, "CPUID(0): %.4s%.4s%.4s ", (char *)&ebx, (char *)&edx, (char *)&ecx); __get_cpuid(1, &fms, &ebx, &ecx, &edx); @@ -2828,9 +2839,9 @@ void process_cpuid() model += ((fms >> 16) & 0xf) << 4; if (debug) { - fprintf(stderr, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", + fprintf(outf, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", max_level, family, model, stepping, family, model, stepping); - fprintf(stderr, "CPUID(1): %s %s %s %s %s %s %s %s\n", + fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s\n", ecx & (1 << 0) ? "SSE3" : "-", ecx & (1 << 3) ? "MONITOR" : "-", ecx & (1 << 7) ? "EIST" : "-", @@ -2879,7 +2890,7 @@ void process_cpuid() has_epb = ecx & (1 << 3); if (debug) - fprintf(stderr, "CPUID(6): %sAPERF, %sDTS, %sPTM, %sHWP, " + fprintf(outf, "CPUID(6): %sAPERF, %sDTS, %sPTM, %sHWP, " "%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n", has_aperf ? "" : "No-", do_dts ? "" : "No-", @@ -2907,7 +2918,7 @@ void process_cpuid() if (ebx_tsc != 0) { if (debug && (ebx != 0)) - fprintf(stderr, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n", + fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n", eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) @@ -2923,7 +2934,7 @@ void process_cpuid() if (crystal_hz) { tsc_hz = (unsigned long long) crystal_hz * ebx_tsc / eax_crystal; if (debug) - fprintf(stderr, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", + fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); } } @@ -2938,7 +2949,7 @@ void process_cpuid() __get_cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &edx); if (debug) - fprintf(stderr, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", + fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); } @@ -2973,7 +2984,7 @@ void process_cpuid() void help() { - fprintf(stderr, + fprintf(outf, "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n" "\n" "Turbostat forks the specified COMMAND and prints statistics\n" @@ -2985,6 +2996,7 @@ void help() "--help print this help message\n" "--counter msr print 32-bit counter at address \"msr\"\n" "--Counter msr print 64-bit Counter at address \"msr\"\n" + "--out file create or truncate \"file\" for all output\n" "--msr msr print 32-bit value at address \"msr\"\n" "--MSR msr print 64-bit Value at address \"msr\"\n" "--version print version information\n" @@ -3029,7 +3041,7 @@ void topology_probe() show_cpu = 1; if (debug > 1) - fprintf(stderr, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num); + fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num); cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); if (cpus == NULL) @@ -3064,7 +3076,7 @@ void topology_probe() if (cpu_is_not_present(i)) { if (debug > 1) - fprintf(stderr, "cpu%d NOT PRESENT\n", i); + fprintf(outf, "cpu%d NOT PRESENT\n", i); continue; } cpus[i].core_id = get_core_id(i); @@ -3079,26 +3091,26 @@ void topology_probe() if (siblings > max_siblings) max_siblings = siblings; if (debug > 1) - fprintf(stderr, "cpu %d pkg %d core %d\n", + fprintf(outf, "cpu %d pkg %d core %d\n", i, cpus[i].physical_package_id, cpus[i].core_id); } topo.num_cores_per_pkg = max_core_id + 1; if (debug > 1) - fprintf(stderr, "max_core_id %d, sizing for %d cores per package\n", 
+ fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.num_cores_per_pkg); if (debug && !summary_only && topo.num_cores_per_pkg > 1) show_core = 1; topo.num_packages = max_package_id + 1; if (debug > 1) - fprintf(stderr, "max_package_id %d, sizing for %d packages\n", + fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages); if (debug && !summary_only && topo.num_packages > 1) show_pkg = 1; topo.num_threads_per_core = max_siblings; if (debug > 1) - fprintf(stderr, "max_siblings %d\n", max_siblings); + fprintf(outf, "max_siblings %d\n", max_siblings); free(cpus); } @@ -3207,7 +3219,7 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); if (debug > 1) - fprintf(stderr, "base_cpu = %d\n", base_cpu); + fprintf(outf, "base_cpu = %d\n", base_cpu); } void turbostat_init() @@ -3274,9 +3286,10 @@ int fork_it(char **argv) for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS); compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); - flush_stderr(); - fprintf(stderr, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0); + fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0); + + flush_output_stderr(); return status; } @@ -3293,13 +3306,13 @@ int get_and_dump_counters(void) if (status) return status; - flush_stdout(); + flush_output_stdout(); return status; } void print_version() { - fprintf(stderr, "turbostat version 4.10 10 Dec, 2015" + fprintf(outf, "turbostat version 4.10 10 Dec, 2015" " - Len Brown \n"); } @@ -3317,6 +3330,7 @@ void cmdline(int argc, char **argv) {"Joules", no_argument, 0, 'J'}, {"MSR", required_argument, 0, 'M'}, {"msr", required_argument, 0, 'm'}, + {"out", required_argument, 0, 'o'}, {"Package", no_argument, 0, 'p'}, {"processor", no_argument, 0, 'p'}, {"Summary", no_argument, 0, 'S'}, @@ -3327,7 +3341,7 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Ddhi:JM:m:PpST:v", + while ((opt = getopt_long_only(argc, argv, "+C:c:Ddhi:JM:m:o:PpST:v", long_options, &option_index)) != -1) { switch (opt) { case 'C': @@ -3351,7 +3365,7 @@ void cmdline(int argc, char **argv) double interval = strtod(optarg, NULL); if (interval < 0.001) { - fprintf(stderr, "interval %f seconds is too small\n", + fprintf(outf, "interval %f seconds is too small\n", interval); exit(2); } @@ -3369,6 +3383,9 @@ void cmdline(int argc, char **argv) case 'm': sscanf(optarg, "%x", &extra_msr_offset32); break; + case 'o': + outf = fopen_or_die(optarg, "w"); + break; case 'P': show_pkg_only++; break; @@ -3391,6 +3408,8 @@ void cmdline(int argc, char **argv) int main(int argc, char **argv) { + outf = stderr; + cmdline(argc, argv); if (debug) -- cgit v0.10.2 From 58cc30a4e6086d542302e04eea39b2a438e4c5dd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 24 Feb 2016 18:16:40 -0500 Subject: tools/power turbostat: fix compiler warnings Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f9f41a..ad5bd56 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1987,7 +1987,7 @@ int has_config_tdp(unsigned int family, unsigned int model) } static void -dump_cstate_pstate_config_info(family, model) +dump_cstate_pstate_config_info(int family, int model) { if (!do_nhm_platform_info) return; @@ -2250,7 +2250,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 
bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ -double get_tdp(model) +double get_tdp(int model) { unsigned long long msr; @@ -2364,7 +2364,7 @@ void rapl_probe(unsigned int family, unsigned int model) return; } -void perf_limit_reasons_probe(family, model) +void perf_limit_reasons_probe(int family, int model) { if (!genuine_intel) return; @@ -2974,7 +2974,7 @@ void process_cpuid() perf_limit_reasons_probe(family, model); if (debug) - dump_cstate_pstate_config_info(); + dump_cstate_pstate_config_info(family, model); if (has_skl_msrs(family, model)) calculate_tsc_tweak(); -- cgit v0.10.2 From 36229897ba966bb0dc9e060222ff17b198252367 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 26 Feb 2016 20:51:02 -0500 Subject: tools/power turbostat: make fewer system calls skip the open(2)/close(2) on each msr read by keeping the /dev/cpu/*/msr files open. The remaining read(2) generally costs far fewer cycles than the removed open(2) system call. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ad5bd56..2e47c2b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -45,6 +45,7 @@ char *proc_stat = "/proc/stat"; FILE *outf; +int *fd_percpu; struct timespec interval_ts = {5, 0}; unsigned int debug; unsigned int rapl_joules; @@ -270,23 +271,34 @@ int cpu_migrate(int cpu) else return 0; } - -int get_msr(int cpu, off_t offset, unsigned long long *msr) +int get_msr_fd(int cpu) { - ssize_t retval; char pathname[32]; int fd; + fd = fd_percpu[cpu]; + + if (fd) + return fd; + sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + fd_percpu[cpu] = fd; + + return fd; +} + +int get_msr(int cpu, off_t offset, unsigned long long *msr) +{ + ssize_t retval; + + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) - err(-1, "%s offset 0x%llx read failed", pathname, (unsigned long long)offset); + err(-1, "msr %d offset 0x%llx read failed", cpu, (unsigned long long)offset); return 0; } @@ -1453,19 +1465,30 @@ dump_config_tdp(void) fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3); fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); - + get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr); fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr); fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0x7F); fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); } +void free_fd_percpu(void) +{ + int i; + + for (i = 0; i < topo.max_cpu_num; ++i) { + if (fd_percpu[i] != 0) + close(fd_percpu[i]); + } + + free(fd_percpu); +} void free_all_buffers(void) { CPU_FREE(cpu_present_set); cpu_present_set = NULL; - cpu_present_set = 0; + cpu_present_setsize = 0; CPU_FREE(cpu_affinity_set); cpu_affinity_set = NULL; @@ -1490,6 +1513,8 @@ void free_all_buffers(void) free(output_buffer); output_buffer = NULL; outp = NULL; + + free_fd_percpu(); } /* @@ -2449,7 +2474,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } - + void print_power_limit_msr(int cpu, unsigned long long msr, char *label) { fprintf(outf, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n", @@ -3202,10 +3227,16 @@ void allocate_output_buffer() if (outp == NULL)
err(-1, "calloc output buffer"); } - +void allocate_fd_percpu(void) +{ + fd_percpu = calloc(topo.max_cpu_num, sizeof(int)); + if (fd_percpu == NULL) + err(-1, "calloc fd_percpu"); +} void setup_all_buffers(void) { topology_probe(); + allocate_fd_percpu(); allocate_counters(&thread_even, &core_even, &package_even); allocate_counters(&thread_odd, &core_odd, &package_odd); allocate_output_buffer(); -- cgit v0.10.2 From 562a2d377bb9882c49debc9e1be7127a1717e242 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 26 Feb 2016 23:48:05 -0500 Subject: tools/power turbostat: show IRQs per CPU The new IRQ column shows how many interrupts have occurred on each CPU during the measurement interval. This information comes from the difference between /proc/interrupts snapshots made before and after the measurement interval. The first row, the system summary, shows the sum of the IRQs for all CPUs during that interval. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2e47c2b..c679326 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -75,6 +75,7 @@ unsigned int extra_msr_offset64; unsigned int extra_delta_offset32; unsigned int extra_delta_offset64; unsigned int aperf_mperf_multiplier = 1; +int do_irq = 1; int do_smi; double bclk; double base_hz; @@ -154,6 +155,7 @@ struct thread_data { unsigned long long extra_delta64; unsigned long long extra_msr32; unsigned long long extra_delta32; + unsigned int irq_count; unsigned int smi_count; unsigned int cpu_id; unsigned int flags; @@ -221,6 +223,9 @@ struct topo_params { struct timeval tv_even, tv_odd, tv_delta; +int *irq_column_2_cpu; /* /proc/interrupts column numbers */ +int *irqs_per_cpu; /* indexed by cpu_num */ + void setup_all_buffers(void); int cpu_is_not_present(int cpu) @@ -306,8 +311,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) /* * Example Format w/ field column widths: * - * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz SMI Busy% CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt - * 123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678 + * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz IRQ SMI Busy% CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + * 12345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678 */ void print_header(void) @@ -338,6 +343,8 @@ void print_header(void) if (!debug) goto done; + if (do_irq) + outp += sprintf(outp, " IRQ"); + if (do_smi) outp += sprintf(outp, " SMI"); @@ -429,6 +436,8 @@ int dump_counters(struct thread_data *t, struct core_data *c, extra_msr_offset32, t->extra_msr32); outp += sprintf(outp, "msr0x%x: %016llX\n", extra_msr_offset64, t->extra_msr64); + if (do_irq) + outp += sprintf(outp, "IRQ: %08X\n", t->irq_count); + if (do_smi) outp += sprintf(outp, "SMI: %08X\n", t->smi_count); } @@ -562,6 +571,10 @@ int format_counters(struct thread_data *t, struct core_data *c, if (!debug) goto done; + /* IRQ */ + if (do_irq) + outp += sprintf(outp, "%8d", t->irq_count); + /* SMI */ if (do_smi) outp += sprintf(outp, "%8d", t->smi_count); @@ -827,6 +840,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, old->extra_msr32 = new->extra_msr32;
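/*
 * An aside on the snapshot code below (the layout shown is an
 * illustrative example, not part of the patch): /proc/interrupts has a
 * header column only for each online CPU, so column position need not
 * equal CPU number. With cpu1 offline, the file begins:
 *
 *            CPU0       CPU2
 *   0:        143          0   IO-APIC   2-edge      timer
 *
 * This is why snapshot_proc_interrupts() first parses the "CPU%d"
 * headers into irq_column_2_cpu[] and only then sums each interrupt
 * line into irqs_per_cpu[] through that mapping.
 */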
old->extra_msr64 = new->extra_msr64; + if (do_irq) + old->irq_count = new->irq_count - old->irq_count; + if (do_smi) old->smi_count = new->smi_count - old->smi_count; } @@ -856,10 +872,12 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->mperf = 0; t->c1 = 0; - t->smi_count = 0; t->extra_delta32 = 0; t->extra_delta64 = 0; + t->irq_count = 0; + t->smi_count = 0; + /* tells format_counters to dump all fields from this set */ t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE; @@ -903,6 +921,9 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.threads.extra_delta32 += t->extra_delta32; average.threads.extra_delta64 += t->extra_delta64; + average.threads.irq_count += t->irq_count; + average.threads.smi_count += t->smi_count; + /* sum per-core values only for 1st thread in core */ if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) return 0; @@ -1000,7 +1021,6 @@ static unsigned long long rdtsc(void) return low | ((unsigned long long)high) << 32; } - /* * get_counters(...) * migrate to cpu @@ -1027,6 +1047,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->mperf = t->mperf * aperf_mperf_multiplier; } + if (do_irq) + t->irq_count = irqs_per_cpu[cpu]; if (do_smi) { if (get_msr(cpu, MSR_SMI_COUNT, &msr)) return -5; @@ -1515,6 +1537,9 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + + free(irq_column_2_cpu); + free(irqs_per_cpu); } /* @@ -1737,6 +1762,83 @@ int mark_cpu_present(int cpu) return 0; } +/* + * snapshot_proc_interrupts() + * + * read and record summary of /proc/interrupts + * + * return 1 if config change requires a restart, else return 0 + */ +int snapshot_proc_interrupts(void) +{ + static FILE *fp; + int column, retval; + + if (fp == NULL) + fp = fopen_or_die("/proc/interrupts", "r"); + else + rewind(fp); + + /* read 1st line of /proc/interrupts to get cpu* name for each column */ + for (column = 0; column < topo.num_cpus; ++column) { + int cpu_number; + + retval = fscanf(fp, " CPU%d", &cpu_number); + if (retval != 1) + break; + + if (cpu_number > topo.max_cpu_num) { + warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num); + return 1; + } + + irq_column_2_cpu[column] = cpu_number; + irqs_per_cpu[cpu_number] = 0; + } + + /* read /proc/interrupt count lines and sum up irqs per cpu */ + while (1) { + int column; + char buf[64]; + + retval = fscanf(fp, " %s:", buf); /* flush irq# "N:" */ + if (retval != 1) + break; + + /* read the count per cpu */ + for (column = 0; column < topo.num_cpus; ++column) { + + int cpu_number, irq_count; + + retval = fscanf(fp, " %d", &irq_count); + if (retval != 1) + break; + + cpu_number = irq_column_2_cpu[column]; + irqs_per_cpu[cpu_number] += irq_count; + + } + + while (getc(fp) != '\n') + ; /* flush interrupt description */ + + } + return 0; +} + +/* + * snapshot /proc and /sys files + * + * return 1 if configuration restart needed, else return 0 + */ +int snapshot_proc_sysfs_files(void) +{ + if (snapshot_proc_interrupts()) + return 1; + + return 0; +} + void turbostat_loop() { int retval; @@ -1745,6 +1847,7 @@ void turbostat_loop() restart: restarted++; + snapshot_proc_sysfs_files(); retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { exit(retval); @@ -1764,6 +1867,8 @@ restart: goto restart; } nanosleep(&interval_ts, NULL); + if (snapshot_proc_sysfs_files()) + goto restart; retval = for_all_cpus(get_counters, ODD_COUNTERS); if (retval < -1) { exit(retval); @@ -1778,6 +1883,8 @@ restart: 
format_all_counters(EVEN_COUNTERS); flush_output_stdout(); nanosleep(&interval_ts, NULL); + if (snapshot_proc_sysfs_files()) + goto restart; retval = for_all_cpus(get_counters, EVEN_COUNTERS); if (retval < -1) { exit(retval); @@ -3233,9 +3340,20 @@ void allocate_fd_percpu(void) if (fd_percpu == NULL) err(-1, "calloc fd_percpu"); } +void allocate_irq_buffers(void) +{ + irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int)); + if (irq_column_2_cpu == NULL) + err(-1, "calloc %d", topo.num_cpus); + + irqs_per_cpu = calloc(topo.max_cpu_num, sizeof(int)); + if (irqs_per_cpu == NULL) + err(-1, "calloc %d", topo.max_cpu_num); +} void setup_all_buffers(void) { topology_probe(); + allocate_irq_buffers(); allocate_fd_percpu(); allocate_counters(&thread_even, &core_even, &package_even); allocate_counters(&thread_odd, &core_odd, &package_odd); -- cgit v0.10.2 From 27d47356b6dfa92042a17a0b474f08910d4c8e8f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 27 Feb 2016 00:37:54 -0500 Subject: tools/power turbostat: show GFXMHz Under the column "GFXMHz", show a snapshot of this attribute: /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz This is an instantaneous snapshot of what sysfs presents at the end of the measurement interval. turbostat does not average or otherwise perform any math on this value. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c679326..d599f91 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -90,6 +90,8 @@ char *output_buffer, *outp; unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; +unsigned int do_gfx_mhz; +unsigned int gfx_cur_mhz; unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; double rapl_power_units, rapl_time_units; @@ -183,6 +185,7 @@ struct pkg_data { unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; unsigned long long pkg_both_core_gfxe_c0; + unsigned int gfx_mhz; unsigned int package_id; unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */ @@ -311,7 +314,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) /* * Example Format w/ field column widths: * - * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz IRQ SMI Busy% CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz IRQ SMI Busy% CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp GFXMHz Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt * 12345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678123456781234567812345678 */ @@ -362,6 +365,9 @@ void print_header(void) if (do_ptm) outp += sprintf(outp, " PkgTmp"); + if (do_gfx_mhz) + outp += sprintf(outp, " GFXMHz"); + if (do_skl_residency) { outp += sprintf(outp, " Totl%%C0"); outp += sprintf(outp, " Any%%C0"); @@ -608,6 +614,10 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_ptm) outp += sprintf(outp, "%8d", p->pkg_temp_c); + /* GFXMHz */ + if (do_gfx_mhz) + outp += sprintf(outp, "%8d", p->gfx_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (do_skl_residency) { outp += sprintf(outp, "%8.2f", 100.0 * p->pkg_wtd_core_c0/t->tsc); @@ -746,6 +756,8 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pc10 = new->pc10 - old->pc10; old->pkg_temp_c = new->pkg_temp_c; + old->gfx_mhz = new->gfx_mhz; + 
DELTA_WRAP32(new->energy_pkg, old->energy_pkg); DELTA_WRAP32(new->energy_cores, old->energy_cores); DELTA_WRAP32(new->energy_gfx, old->energy_gfx); @@ -909,6 +921,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->rapl_pkg_perf_status = 0; p->rapl_dram_perf_status = 0; p->pkg_temp_c = 0; + + p->gfx_mhz = 0; } int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -961,6 +975,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.energy_cores += p->energy_cores; average.packages.energy_gfx += p->energy_gfx; + average.packages.gfx_mhz = p->gfx_mhz; + average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; @@ -1176,6 +1192,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -17; p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); } + if (do_gfx_mhz) + p->gfx_mhz = gfx_cur_mhz; + return 0; } @@ -1825,6 +1844,30 @@ int snapshot_proc_interrupts(void) } return 0; } +/* + * snapshot_gfx_mhz() + * + * record snapshot of + * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * + * return 1 if config change requires a restart, else return 0 + */ +int snapshot_gfx_mhz(void) +{ + static FILE *fp; + int retval; + + if (fp == NULL) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + else + rewind(fp); + + retval = fscanf(fp, "%d", &gfx_cur_mhz); + if (retval != 1) + err(1, "GFX MHz"); + + return 0; +} /* * snapshot /proc and /sys files @@ -1836,6 +1879,9 @@ int snapshot_proc_sysfs_files(void) if (snapshot_proc_interrupts()) return 1; + if (do_gfx_mhz) + snapshot_gfx_mhz(); + return 0; } @@ -3111,6 +3157,8 @@ void process_cpuid() if (has_skl_msrs(family, model)) calculate_tsc_tweak(); + do_gfx_mhz = !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK); + return; } -- cgit v0.10.2 From fdf676e51f301d207586d9bac509b8ce055bae8a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 27 Feb 2016 01:28:12 -0500 Subject: tools/power turbostat: show GFX%rc6 The column "GFX%rc6" shows the percentage of time the GPU is in the "render C6" state, rc6. Deep package C-states on several systems depend on the GPU being in RC6. This information comes from the counter /sys/class/drm/card0/power/rc6_residency_ms, as read before and after the measurement interval.
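(In other words, the column is just the counter delta scaled to a percentage: GFX%rc6 = 100 * rc6_ms_delta / 1000 / interval_seconds. With illustrative numbers, a 4500 ms increase in rc6_residency_ms across a 5-second interval prints as 90.00.)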
Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d599f91..9896619 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -90,6 +90,8 @@ char *output_buffer, *outp; unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; +unsigned int do_gfx_rc6_ms; +unsigned long long gfx_cur_rc6_ms; unsigned int do_gfx_mhz; unsigned int gfx_cur_mhz; unsigned int tcc_activation_temp; @@ -185,6 +187,7 @@ struct pkg_data { unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; unsigned long long pkg_both_core_gfxe_c0; + unsigned long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int package_id; unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ @@ -365,6 +368,9 @@ void print_header(void) if (do_ptm) outp += sprintf(outp, " PkgTmp"); + if (do_gfx_rc6_ms) + outp += sprintf(outp, " GFX%%rc6"); + if (do_gfx_mhz) outp += sprintf(outp, " GFXMHz"); @@ -614,6 +620,10 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_ptm) outp += sprintf(outp, "%8d", p->pkg_temp_c); + /* GFXrc6 */ + if (do_gfx_rc6_ms) + outp += sprintf(outp, "%8.2f", 100.0 * p->gfx_rc6_ms / 1000.0 / interval_float); + /* GFXMHz */ if (do_gfx_mhz) outp += sprintf(outp, "%8d", p->gfx_mhz); @@ -756,6 +766,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pc10 = new->pc10 - old->pc10; old->pkg_temp_c = new->pkg_temp_c; + old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; old->gfx_mhz = new->gfx_mhz; DELTA_WRAP32(new->energy_pkg, old->energy_pkg); @@ -922,6 +933,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->rapl_dram_perf_status = 0; p->pkg_temp_c = 0; + p->gfx_rc6_ms = 0; p->gfx_mhz = 0; } int sum_counters(struct thread_data *t, struct core_data *c, @@ -975,6 +987,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.energy_cores += p->energy_cores; average.packages.energy_gfx += p->energy_gfx; + average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.gfx_mhz = p->gfx_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -1192,6 +1205,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -17; p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); } + + if (do_gfx_rc6_ms) + p->gfx_rc6_ms = gfx_cur_rc6_ms; + if (do_gfx_mhz) p->gfx_mhz = gfx_cur_mhz; @@ -1845,6 +1862,29 @@ int snapshot_proc_interrupts(void) return 0; } /* + * snapshot_gfx_rc6_ms() + * + * record snapshot of + * /sys/class/drm/card0/power/rc6_residency_ms + * + * return 1 if config change requires a restart, else return 0 + */ +int snapshot_gfx_rc6_ms(void) +{ + FILE *fp; + int retval; + + fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); + + retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); + if (retval != 1) + err(1, "GFX rc6"); + + fclose(fp); + + return 0; +} +/* * snapshot_gfx_mhz() * * record snapshot of @@ -1879,6 +1919,9 @@ int snapshot_proc_sysfs_files(void) if (snapshot_proc_interrupts()) return 1; + if (do_gfx_rc6_ms) + snapshot_gfx_rc6_ms(); + if (do_gfx_mhz) snapshot_gfx_mhz(); @@ -3157,6 +3200,8 @@ void process_cpuid() if (has_skl_msrs(family, model)) calculate_tsc_tweak(); + do_gfx_rc6_ms = !access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK); + do_gfx_mhz = !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK); return; -- cgit v0.10.2 From 0102b06747c7d24e334d2b27c4b43eed693676f1 Mon Sep 17 
00:00:00 2001 From: Len Brown Date: Sat, 27 Feb 2016 03:11:29 -0500 Subject: tools/power turbostat: detect and work around syscall jitter The accuracy of Bzy_MHz and Busy% depends on reading the TSC, APERF, and MPERF close together in time. When there is a very short measurement interval, or a large system is profoundly idle, the changes in APERF and MPERF may be very small. They can be small enough that an expensive interrupt between reading APERF and MPERF can cause the APERF/MPERF ratio to become inaccurate, resulting in invalid calculation and display of Bzy_MHz. A dummy read of APERF makes this problem much rarer. Apparently, this first system call after exiting a long stretch of idle is when we typically see expensive timer interrupts that cause large jitter. For the cases that the dummy APERF read fails to prevent, we compare the latency of the APERF and MPERF reads. If they differ by more than 2x, we re-issue them. Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9896619..43a6dda 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1059,19 +1059,68 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; + int aperf_mperf_retry_count = 0; if (cpu_migrate(cpu)) { fprintf(outf, "Could not migrate to CPU %d\n", cpu); return -1; } +retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (has_aperf) { + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system calls, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. + */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) return -3; + + tsc_between = rdtsc(); + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differs by more than 2x, then try again. + */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", + cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + t->aperf = t->aperf * aperf_mperf_multiplier; t->mperf = t->mperf * aperf_mperf_multiplier; } @@ -3554,7 +3603,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 4.10 10 Dec, 2015" + fprintf(outf, "turbostat version 4.11 27 Feb 2016" " - Len Brown \n"); } -- cgit v0.10.2 From aa8d8cc79af16e16da04efff1c1a72b1ea4a9e7e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 11 Mar 2016 13:26:03 -0500 Subject: tools/power turbostat: indicate SMX and SGX support SGX presence is related to a SKL power workaround, so let's show when that is enabled.
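A minimal standalone sketch of the leaf-7 probe the patch below adds, assuming GCC's <cpuid.h> macros; the bit positions (CPUID(1).ECX bit 6 = SMX, CPUID(7).EBX bit 2 = SGX) follow the patch, while main() here is only a test harness, not turbostat's control flow:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int max_level;

	/* leaf 0: EAX returns the highest supported basic CPUID leaf */
	__cpuid(0, max_level, ebx, ecx, edx);

	/* leaf 1: feature flags; ECX bit 6 advertises SMX */
	__cpuid(1, eax, ebx, ecx, edx);
	printf("SMX: %s\n", (ecx & (1 << 6)) ? "yes" : "no");

	/* leaf 7 is only valid if the CPU reports it; EBX bit 2 is SGX */
	if (max_level >= 0x7) {
		__cpuid_count(0x7, 0, eax, ebx, ecx, edx);
		printf("SGX: %s\n", (ebx & (1 << 2)) ? "yes" : "no");
	}
	return 0;
}

On an SGX-capable part the patch then goes one step further and decodes MSR_IA32_FEATURE_CONTROL, since SGX must also be enabled (and the MSR locked) by firmware.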
Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 43a6dda..db9c9d1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3052,6 +3052,17 @@ guess: return 0; } +void decode_feature_control_msr(void) +{ + unsigned long long msr; + + if (!get_msr(base_cpu, MSR_IA32_FEATURE_CONTROL, &msr)) + fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", + base_cpu, msr, + msr & FEATURE_CONTROL_LOCKED ? "" : "UN-", + msr & (1 << 18) ? "SGX" : ""); +} + void decode_misc_enable_msr(void) { unsigned long long msr; @@ -3111,9 +3122,10 @@ void process_cpuid() if (debug) { fprintf(outf, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", max_level, family, model, stepping, family, model, stepping); - fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s\n", + fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s\n", ecx & (1 << 0) ? "SSE3" : "-", ecx & (1 << 3) ? "MONITOR" : "-", + ecx & (1 << 6) ? "SMX" : "-", ecx & (1 << 7) ? "EIST" : "-", ecx & (1 << 8) ? "TM2" : "-", edx & (1 << 4) ? "TSC" : "-", @@ -3175,6 +3187,20 @@ void process_cpuid() if (debug) decode_misc_enable_msr(); + if (max_level >= 0x7) { + int has_sgx; + + ecx = 0; + + __cpuid_count(0x7, 0, eax, ebx, ecx, edx); + + has_sgx = ebx & (1 << 2); + fprintf(outf, "CPUID(7): %sSGX\n", has_sgx ? "" : "No-"); + + if (has_sgx) + decode_feature_control_msr(); + } + if (max_level >= 0x15) { unsigned int eax_crystal; unsigned int ebx_tsc; -- cgit v0.10.2 From 5aea2f7f645b27635b856311dee5b775d277c686 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 13 Mar 2016 03:14:35 -0400 Subject: tools/power turbostat: call __cpuid() instead of __get_cpuid() turbostat already checks whether calling each cpuid leaf is legal, and it doesn't look at the function return value, so call the simpler gcc intrinsic __cpuid() instead of __get_cpuid(). syntax only, no functional change Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index db9c9d1..b34241c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3103,7 +3103,7 @@ void process_cpuid() eax = ebx = ecx = edx = 0; - __get_cpuid(0, &max_level, &ebx, &ecx, &edx); + __cpuid(0, max_level, ebx, ecx, edx); if (ebx == 0x756e6547 && edx == 0x49656e69 && ecx == 0x6c65746e) genuine_intel = 1; @@ -3112,7 +3112,7 @@ void process_cpuid() fprintf(outf, "CPUID(0): %.4s%.4s%.4s ", (char *)&ebx, (char *)&edx, (char *)&ecx); - __get_cpuid(1, &fms, &ebx, &ecx, &edx); + __cpuid(1, fms, ebx, ecx, edx); family = (fms >> 8) & 0xf; model = (fms >> 4) & 0xf; stepping = fms & 0xf; @@ -3143,7 +3143,7 @@ void process_cpuid() * This check is valid for both Intel and AMD.
*/ ebx = ecx = edx = 0; - __get_cpuid(0x80000000, &max_extended_level, &ebx, &ecx, &edx); + __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (max_extended_level >= 0x80000007) { @@ -3151,7 +3151,7 @@ void process_cpuid() * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8 * this check is valid for both Intel and AMD */ - __get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + __cpuid(0x80000007, eax, ebx, ecx, edx); has_invariant_tsc = edx & (1 << 8); } @@ -3160,7 +3160,7 @@ void process_cpuid() * this check is valid for both Intel and AMD */ - __get_cpuid(0x6, &eax, &ebx, &ecx, &edx); + __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); do_dts = eax & (1 << 0); do_ptm = eax & (1 << 6); @@ -3209,7 +3209,7 @@ void process_cpuid() * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz */ eax_crystal = ebx_tsc = crystal_hz = edx = 0; - __get_cpuid(0x15, &eax_crystal, &ebx_tsc, &crystal_hz, &edx); + __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx); if (ebx_tsc != 0) { @@ -3243,7 +3243,7 @@ void process_cpuid() */ base_mhz = max_mhz = bus_mhz = edx = 0; - __get_cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &edx); + __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); if (debug) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); -- cgit v0.10.2 From 6c34f160df82ae07bfff5b4c51f50622e9c2759e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 13 Mar 2016 03:21:22 -0400 Subject: tools/power turbostat: correct output for MSR_NHM_SNB_PKG_CST_CFG_CTL dump MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008008 (...pkg-cstate-limit=0: unlimited) should print as MSR_NHM_SNB_PKG_CST_CFG_CTL: 0x1e008008 (...pkg-cstate-limit=8: unlimited) Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b34241c..a551630 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1532,7 +1532,7 @@ dump_nhm_cst_cfg(void) (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", (msr & (1 << 15)) ? "" : "UN", - (unsigned int)msr & 7, + (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); return; } -- cgit v0.10.2 From 685b535b2cdb9cdf354321f8af9ed17dcf19d19f Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 13 Dec 2015 21:09:31 +0800 Subject: tools/power turbostat: bugfix: TDP MSRs print bits fixing MSR_CONFIG_TDP_NOMINAL: should print all 8 bits of base_ratio (bit 0:7) 0xFF MSR_CONFIG_TDP_LEVEL_1: should print all 15 bits of PKG_MIN_PWR_LVL1 (bit 48:62) 0x7FFF should print all 15 bits of PKG_MAX_PWR_LVL1 (bit 32:46) 0x7FFF should print all 8 bits of LVL1_RATIO (bit 16:23) 0xFF should print all 15 bits of PKG_TDP_LVL1 (bit 0:14) 0x7FFF And the same modification to MSR_CONFIG_TDP_LEVEL_2. 
MSR_TURBO_ACTIVATION_RATIO: should print all 8 bits of MAX_NON_TURBO_RATIO (bit 0:7) 0xFF Signed-off-by: Chen Yu Signed-off-by: Len Brown diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a551630..ee1551b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1544,25 +1544,25 @@ dump_config_tdp(void) get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr); fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr); - fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xEF); + fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF); get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr); fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr); if (msr) { - fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0xEFFF); - fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0xEFFF); - fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); - fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0xEFFF); + fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF); + fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF); + fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF); + fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF); } fprintf(outf, ")\n"); get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr); fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr); if (msr) { - fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0xEFFF); - fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0xEFFF); - fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xEF); - fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0xEFFF); + fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF); + fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF); + fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF); + fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF); } fprintf(outf, ")\n"); @@ -1575,7 +1575,7 @@ dump_config_tdp(void) get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr); fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr); - fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0x7F); + fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF); fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1); fprintf(outf, ")\n"); } -- cgit v0.10.2
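For reference, a small self-contained sketch (not turbostat code) of why the old masks truncated these fields: 0xEF is binary 11101111, so it silently drops bit 4 of an 8-bit field, and 0xEFFF (binary 1110111111111111) drops bit 12 while admitting stray bit 15; deriving the mask from the field width avoids such typos. The msr_field() helper and the test value are illustrative assumptions:

#include <stdio.h>

/* extract a width-bit field starting at bit 'shift' of an MSR value */
static unsigned int msr_field(unsigned long long msr, int shift, int width)
{
	return (msr >> shift) & ((1ULL << width) - 1);
}

int main(void)
{
	unsigned long long msr = 0x7fff7fff00ff7fffULL;	/* made-up test value */

	/* MSR_CONFIG_TDP_LEVEL_1 layout per the patch above */
	printf("PKG_MIN_PWR_LVL1=%u\n", msr_field(msr, 48, 15));	/* bits 48:62 */
	printf("PKG_MAX_PWR_LVL1=%u\n", msr_field(msr, 32, 15));	/* bits 32:46 */
	printf("LVL1_RATIO=%u\n",       msr_field(msr, 16, 8));	/* bits 16:23 */
	printf("PKG_TDP_LVL1=%u\n",     msr_field(msr,  0, 15));	/* bits 0:14 */
	return 0;
}

With the old 0xEFFF mask the first field would read 28671 instead of 32767 for this test value; the width-derived mask returns the full field.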