From deaa51465105a7eda19a627b10372f4f7c51a4df Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 07:59:01 +0530
Subject: PM / OPP: Add debugfs support

This patch adds debugfs support to OPP layer to export OPPs and their
properties for all the devices.

This creates a top level directory: /sys/kernel/debug/opp and then
device specific directories (based on device names) inside it. For
example: 'cpu0', 'cpu1', etc.

If multiple devices share the OPP table, then the real directory is
created only for the first device. For all others, links are created to
the real directory.

Inside the device specific directory, a separate directory is created
for each OPP, and within that directory one file per OPP property.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile
index 33c1e18..19837ef 100644
--- a/drivers/base/power/opp/Makefile
+++ b/drivers/base/power/opp/Makefile
@@ -1,2 +1,3 @@
 ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
 obj-y				+= core.o cpu.o
+obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index b8e76f7..6aa172b 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -463,6 +463,7 @@ static void _kfree_list_dev_rcu(struct rcu_head *head)
 static void _remove_list_dev(struct device_list_opp *list_dev,
 			     struct device_opp *dev_opp)
 {
+	opp_debug_unregister(list_dev, dev_opp);
 	list_del(&list_dev->node);
 	call_srcu(&dev_opp->srcu_head.srcu, &list_dev->rcu_head,
 		  _kfree_list_dev_rcu);
@@ -472,6 +473,7 @@ struct device_list_opp *_add_list_dev(const struct device *dev,
 				      struct device_opp *dev_opp)
 {
 	struct device_list_opp *list_dev;
+	int ret;
 
 	list_dev = kzalloc(sizeof(*list_dev), GFP_KERNEL);
 	if (!list_dev)
@@ -481,6 +483,12 @@ struct device_list_opp *_add_list_dev(const struct device *dev,
 	list_dev->dev = dev;
 	list_add_rcu(&list_dev->node, &dev_opp->dev_list);
 
+	/* Create debugfs entries for the dev_opp */
+	ret = opp_debug_register(list_dev, dev_opp);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+			__func__, ret);
+
 	return list_dev;
 }
 
@@ -596,6 +604,7 @@ static void _opp_remove(struct device_opp *dev_opp,
 	 */
 	if (notify)
 		srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp);
+	opp_debug_remove_one(opp);
 	list_del_rcu(&opp->node);
 	call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
 
@@ -673,6 +682,7 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 {
 	struct dev_pm_opp *opp;
 	struct list_head *head = &dev_opp->opp_list;
+	int ret;
 
 	/*
 	 * Insert new OPP in order of increasing frequency and discard if
@@ -703,6 +713,11 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 	new_opp->dev_opp = dev_opp;
 	list_add_rcu(&new_opp->node, head);
 
+	ret = opp_debug_create_one(new_opp, dev_opp);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+			__func__, ret);
+
 	return 0;
 }
 
@@ -889,12 +904,14 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 
 	/* OPP to select on device suspend */
 	if (of_property_read_bool(np, "opp-suspend")) {
-		if (dev_opp->suspend_opp)
+		if (dev_opp->suspend_opp) {
 			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
 				 __func__, dev_opp->suspend_opp->rate,
 				 new_opp->rate);
-		else
+		} else {
+			new_opp->suspend = true;
 			dev_opp->suspend_opp = new_opp;
+		}
 	}
 
 	if (new_opp->clock_latency_ns > dev_opp->clock_latency_ns_max)
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
new file mode 100644
index 0000000..ddfe477
--- /dev/null
+++ b/drivers/base/power/opp/debugfs.c
@@ -0,0 +1,264 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+
+#include "opp.h"
+
+/* Top level debugfs directory ("opp"), set up by opp_debug_init() */
+static struct dentry *rootdir;
+
+/*
+ * Build the debugfs name for @dev: "<parent>-<dev>" when the device has a
+ * parent, plain "<dev>" otherwise. @name must have room for NAME_MAX bytes.
+ */
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+	if (dev->parent)
+		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+			 dev_name(dev));
+	else
+		snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+/* Remove the per-OPP directory (and the files in it) created for @opp */
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+	debugfs_remove_recursive(opp->dentry);
+}
+
+/**
+ * opp_debug_create_one - create the debugfs directory for a single OPP
+ * @opp: OPP to export
+ * @dev_opp: device-opp the OPP belongs to; its dentry is the parent directory
+ *
+ * Creates "opp:<rate>" below the device directory and one read-only file per
+ * OPP property inside it. The files are handed pointers to the fields of @opp.
+ *
+ * If any file cannot be created the whole per-OPP directory is removed again,
+ * so that no entries referring to @opp are left behind without opp->dentry
+ * pointing at them.
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+int opp_debug_create_one(struct dev_pm_opp *opp, struct device_opp *dev_opp)
+{
+	struct dentry *pdentry = dev_opp->dentry;
+	struct dentry *d;
+	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
+
+	/* Rate is unique to each OPP, use it to give opp-name */
+	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+	/* Create per-opp directory */
+	d = debugfs_create_dir(name, pdentry);
+	if (!d)
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+		goto remove_dir;
+
+	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+		goto remove_dir;
+
+	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+		goto remove_dir;
+
+	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+		goto remove_dir;
+
+	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+				  &opp->clock_latency_ns))
+		goto remove_dir;
+
+	opp->dentry = d;
+	return 0;
+
+remove_dir:
+	/* Don't leave a half populated directory that nobody will remove */
+	debugfs_remove_recursive(d);
+	return -ENOMEM;
+}
+
+/*
+ * Create the real debugfs directory for @dev_opp, named after the device in
+ * @list_dev, and record it in both @list_dev and @dev_opp.
+ */
+static int device_opp_debug_create_dir(struct device_list_opp *list_dev,
+				       struct device_opp *dev_opp)
+{
+	const struct device *dev = list_dev->dev;
+	struct dentry *d;
+
+	opp_set_dev_name(dev, dev_opp->dentry_name);
+
+	/* Create device specific directory */
+	d = debugfs_create_dir(dev_opp->dentry_name, rootdir);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+		return -ENOMEM;
+	}
+
+	list_dev->dentry = d;
+	dev_opp->dentry = d;
+
+	return 0;
+}
+
+/*
+ * Create a symlink, named after the device in @list_dev, that points at the
+ * real directory of @dev_opp (dev_opp->dentry_name).
+ */
+static int device_opp_debug_create_link(struct device_list_opp *list_dev,
+					struct device_opp *dev_opp)
+{
+	const struct device *dev = list_dev->dev;
+	char name[NAME_MAX];
+	struct dentry *d;
+
+	opp_set_dev_name(list_dev->dev, name);
+
+	/* Create device specific directory link */
+	d = debugfs_create_symlink(name, rootdir, dev_opp->dentry_name);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create link\n", __func__);
+		return -ENOMEM;
+	}
+
+	list_dev->dentry = d;
+
+	return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @list_dev: list-dev pointer for device
+ * @dev_opp: the device-opp being added
+ *
+ * Dynamically adds device specific directory in debugfs 'opp' directory. If the
+ * device-opp is shared with other devices, then links will be created for all
+ * devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct device_list_opp *list_dev,
+		       struct device_opp *dev_opp)
+{
+	if (!rootdir) {
+		pr_debug("%s: Uninitialized rootdir\n", __func__);
+		return -EINVAL;
+	}
+
+	/* A real directory already exists, only a link is needed */
+	if (dev_opp->dentry)
+		return device_opp_debug_create_link(list_dev, dev_opp);
+
+	return device_opp_debug_create_dir(list_dev, dev_opp);
+}
+
+/*
+ * Hand the real directory of @dev_opp over from @list_dev, which is going
+ * away, to another device on the list: that device's link is dropped and the
+ * directory is renamed after it.
+ */
+static void opp_migrate_dentry(struct device_list_opp *list_dev,
+			       struct device_opp *dev_opp)
+{
+	struct device_list_opp *new_dev;
+	const struct device *dev;
+	struct dentry *dentry;
+
+	/* Look for next list-dev */
+	list_for_each_entry(new_dev, &dev_opp->dev_list, node)
+		if (new_dev != list_dev)
+			break;
+
+	/* new_dev is guaranteed to be valid here */
+	dev = new_dev->dev;
+
+	/* Drop the link; clear the pointer so it can't be removed twice */
+	debugfs_remove_recursive(new_dev->dentry);
+	new_dev->dentry = NULL;
+
+	opp_set_dev_name(dev, dev_opp->dentry_name);
+
+	dentry = debugfs_rename(rootdir, list_dev->dentry, rootdir,
+				dev_opp->dentry_name);
+	if (!dentry) {
+		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+			__func__, dev_name(list_dev->dev), dev_name(dev));
+		/* The directory kept its old name, keep dentry_name in sync */
+		opp_set_dev_name(list_dev->dev, dev_opp->dentry_name);
+		return;
+	}
+
+	new_dev->dentry = dentry;
+	dev_opp->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @list_dev: list-dev pointer for device
+ * @dev_opp: the device-opp being removed
+ *
+ * Dynamically removes device specific directory from debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct device_list_opp *list_dev,
+			  struct device_opp *dev_opp)
+{
+	if (list_dev->dentry == dev_opp->dentry) {
+		/* Move the real dentry object under another device */
+		if (!list_is_singular(&dev_opp->dev_list)) {
+			opp_migrate_dentry(list_dev, dev_opp);
+			goto out;
+		}
+		dev_opp->dentry = NULL;
+	}
+
+	debugfs_remove_recursive(list_dev->dentry);
+
+out:
+	list_dev->dentry = NULL;
+}
+
+/* Create the debugfs root directory all device directories live in */
+static int __init opp_debug_init(void)
+{
+	/* Create /sys/kernel/debug/opp directory */
+	rootdir = debugfs_create_dir("opp", NULL);
+	if (!rootdir) {
+		pr_err("%s: Failed to create root directory\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+core_initcall(opp_debug_init);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 7366b2a..a6bd8d2c 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -17,6 +17,7 @@
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/limits.h>
 #include <linux/pm_opp.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -53,6 +54,7 @@ extern struct mutex dev_opp_list_lock;
  * @dynamic:	not-created from static DT entries.
  * @available:	true/false - marks if this OPP as available or not
  * @turbo:	true if turbo (boost) OPP
+ * @suspend:	true if suspend OPP
  * @rate:	Frequency in hertz
  * @u_volt:	Target voltage in microvolts corresponding to this OPP
  * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
@@ -63,6 +65,7 @@ extern struct mutex dev_opp_list_lock;
  * @dev_opp:	points back to the device_opp struct this opp belongs to
  * @rcu_head:	RCU callback head used for deferred freeing
  * @np:		OPP's device node.
+ * @dentry:	debugfs dentry pointer (per opp)
  *
  * This structure stores the OPP information for a given device.
  */
@@ -72,6 +75,7 @@ struct dev_pm_opp {
 	bool available;
 	bool dynamic;
 	bool turbo;
+	bool suspend;
 	unsigned long rate;
 
 	unsigned long u_volt;
@@ -84,6 +88,10 @@ struct dev_pm_opp {
 	struct rcu_head rcu_head;
 
 	struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
 };
 
 /**
@@ -91,6 +99,7 @@ struct dev_pm_opp {
  * @node:	list node
  * @dev:	device to which the struct object belongs
  * @rcu_head:	RCU callback head used for deferred freeing
+ * @dentry:	debugfs dentry pointer (per device)
  *
  * This is an internal data structure maintaining the list of devices that are
  * managed by 'struct device_opp'.
@@ -99,6 +108,10 @@ struct device_list_opp {
 	struct list_head node;
 	const struct device *dev;
 	struct rcu_head rcu_head;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
 };
 
 /**
@@ -114,6 +127,8 @@ struct device_list_opp {
  * @opp_list:	list of opps
  * @np:		struct device_node pointer for opp's DT node.
  * @shared_opp: OPP is shared between multiple devices.
+ * @dentry:	debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
  *
  * This is an internal data structure maintaining the link to opps attached to
  * a device. This structure is not meant to be shared to users as it is
@@ -135,6 +150,11 @@ struct device_opp {
 	unsigned long clock_latency_ns_max;
 	bool shared_opp;
 	struct dev_pm_opp *suspend_opp;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+	char dentry_name[NAME_MAX];
+#endif
 };
 
 /* Routines internal to opp core */
@@ -143,4 +163,26 @@ struct device_list_opp *_add_list_dev(const struct device *dev,
 				      struct device_opp *dev_opp);
 struct device_node *_of_get_opp_desc_node(struct device *dev);
 
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct device_opp *dev_opp);
+int opp_debug_register(struct device_list_opp *list_dev,
+		       struct device_opp *dev_opp);
+void opp_debug_unregister(struct device_list_opp *list_dev,
+			  struct device_opp *dev_opp);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+				       struct device_opp *dev_opp)
+{ return 0; }
+static inline int opp_debug_register(struct device_list_opp *list_dev,
+				     struct device_opp *dev_opp)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct device_list_opp *list_dev,
+					struct device_opp *dev_opp)
+{ }
+#endif		/* DEBUG_FS */
+
 #endif		/* __DRIVER_OPP_H__ */
-- 
cgit v0.10.2


From 1c4d12de2719dfdf27c6dab31e7a5641ee293c94 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 08:10:54 +0530
Subject: PM / OPP: Add "opp-supported-hw" binding

We may want to enable only a subset of OPPs, from the bigger list of
OPPs, based on what version of the hardware we are running on. This
would enable us to not duplicate OPP tables for every version of the
hardware we support.

To enable that, this patch defines a new property 'opp-supported-hw'. It
can support any number of hierarchy levels of the versions the hardware
follows. And based on the selected hardware versions, we can pick only
the relevant OPPs at runtime.

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index 0cb44dc..d072fa0 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -123,6 +123,26 @@ Optional properties:
 - opp-suspend: Marks the OPP to be used during device suspend. Only one OPP in
   the table should have this.
 
+- opp-supported-hw: This enables us to select only a subset of OPPs from the
+  larger OPP table, based on what version of the hardware we are running on. We
+  still can't have multiple nodes with the same opp-hz value in OPP table.
+
+  It's a user-defined array containing a hierarchy of hardware version numbers,
+  supported by the OPP. For example: a platform with hierarchy of three levels
+  of versions (A, B and C), this field should be like <X Y Z>, where X
+  corresponds to Version hierarchy A, Y corresponds to version hierarchy B and Z
+  corresponds to version hierarchy C.
+
+  Each level of hierarchy is represented by a 32 bit value, and so there can be
+  only 32 different supported versions per hierarchy, i.e. 1 bit per version. A
+  value of 0xFFFFFFFF will enable the OPP for all versions for that hierarchy
+  level. And a value of 0x00000000 will disable the OPP completely, and so we
+  never want that to happen.
+
+  If 32 values aren't sufficient for a version hierarchy, then that version
+  hierarchy can be contained in multiple 32 bit values. i.e. <X Y Z1 Z2> in the
+  above example, Z1 & Z2 refer to the version hierarchy Z.
+
 - status: Marks the node enabled/disabled.
 
 Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
@@ -463,3 +483,48 @@ Example 5: Multiple OPP tables
 		};
 	};
 };
+
+Example 6: opp-supported-hw
+(example: three level hierarchy of versions: cuts, substrate and process)
+
+/ {
+	cpus {
+		cpu@0 {
+			compatible = "arm,cortex-a7";
+			...
+
+			cpu-supply = <&cpu_supply>;
+			operating-points-v2 = <&cpu0_opp_table_slow>;
+		};
+	};
+
+	opp_table {
+		compatible = "operating-points-v2";
+		status = "okay";
+		opp-shared;
+
+		opp00 {
+			/*
+			 * Supports all substrate and process versions for 0xF
+			 * cuts, i.e. only first four cuts.
+			 */
+			opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF>;
+			opp-hz = /bits/ 64 <600000000>;
+			opp-microvolt = <900000 915000 925000>;
+			...
+		};
+
+		opp01 {
+			/*
+			 * Supports:
+			 * - cuts: only one, 6th cut (represented by 6th bit).
+			 * - substrate: supports 16 different substrate versions
+			 * - process: supports 9 different process versions
+			 */
+			opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0>;
+			opp-hz = /bits/ 64 <800000000>;
+			opp-microvolt = <900000 915000 925000>;
+			...
+		};
+	};
+};
-- 
cgit v0.10.2


From ffdb8cc7a27c89175e541e68e2a73f1f63ab8c6b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 08:10:55 +0530
Subject: PM / OPP: Add {opp-microvolt|opp-microamp}-<name> binding

Depending on the version of hardware or its properties, which are only
known at runtime, various properties of the OPP can change. For example,
an OPP with frequency 1.2 GHz, may have different voltage/current
requirements based on the version of the hardware it is running on.

In order to not replicate the same OPP tables for varying values of all
such fields, this commit introduces the concept of opp-property-<name>.
The <name> can be chosen by the platform at runtime, and OPPs will be
initialized depending on that name string. Currently support is extended
for the following properties:
- opp-microvolt-<name>
- opp-microamp-<name>

If the name string isn't provided by the platform, or if it is provided
but doesn't match the properties present in the OPP node, we will fall
back to the original properties without the -<name> string, if they are
available.

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index d072fa0..a3e7f0d 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -100,6 +100,14 @@ Optional properties:
   Entries for multiple regulators must be present in the same order as
   regulators are specified in device's DT node.
 
+- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to
+  the above opp-microvolt property, but allows multiple voltage ranges to be
+  provided for the same OPP. At runtime, the platform can pick a <name> and
+  matching opp-microvolt-<name> property will be enabled for all OPPs. If the
+  platform doesn't pick a specific <name> or the <name> doesn't match with any
+  opp-microvolt-<name> properties, then opp-microvolt property shall be used, if
+  present.
+
 - opp-microamp: The maximum current drawn by the device in microamperes
   considering system specific parameters (such as transients, process, aging,
   maximum operating temperature range etc.) as necessary. This may be used to
@@ -112,6 +120,9 @@ Optional properties:
   for few regulators, then this should be marked as zero for them. If it isn't
   required for any regulator, then this property need not be present.
 
+- opp-microamp-<name>: Named opp-microamp property. Similar to
+  opp-microvolt-<name> property, but for microamp instead.
+
 - clock-latency-ns: Specifies the maximum possible transition latency (in
   nanoseconds) for switching to this OPP from any other OPP.
 
@@ -528,3 +539,39 @@ Example 6: opp-supported-hw
 		};
 	};
 };
+
+Example 7: opp-microvolt-<name>, opp-microamp-<name>:
+(example: device with two possible microvolt ranges: slow and fast)
+
+/ {
+	cpus {
+		cpu@0 {
+			compatible = "arm,cortex-a7";
+			...
+
+			operating-points-v2 = <&cpu0_opp_table>;
+		};
+	};
+
+	cpu0_opp_table: opp_table0 {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp00 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt-slow = <900000 915000 925000>;
+			opp-microvolt-fast = <970000 975000 985000>;
+			opp-microamp-slow =  <70000>;
+			opp-microamp-fast =  <71000>;
+		};
+
+		opp01 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */
+					      <910000 925000 935000>; /* Supply vcc1 */
+			opp-microvolt-fast = <970000 975000 985000>, /* Supply vcc0 */
+					     <960000 965000 975000>; /* Supply vcc1 */
+			opp-microamp =  <70000>; /* Will be used for both slow/fast */
+		};
+	};
+};
-- 
cgit v0.10.2


From af87a39a5f7cf6ef252b1aec3e2e6508a40e51f1 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 08:10:56 +0530
Subject: PM / OPP: Remove 'operating-points-names' binding

These aren't used by any DT files so far, and won't be needed in the
future as we have a better scheme in place now, i.e. opp-property-<name>
properties.

Remove the (useless) binding without breaking ABI.

Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index a3e7f0d..24eac9a 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -45,21 +45,10 @@ Devices supporting OPPs must set their "operating-points-v2" property with
 phandle to a OPP table in their DT node. The OPP core will use this phandle to
 find the operating points for the device.
 
-Devices may want to choose OPP tables at runtime and so can provide a list of
-phandles here. But only *one* of them should be chosen at runtime. This must be
-accompanied by a corresponding "operating-points-names" property, to uniquely
-identify the OPP tables.
-
 If required, this can be extended for SoC vendor specfic bindings. Such bindings
 should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt
 and should have a compatible description like: "operating-points-v2-<vendor>".
 
-Optional properties:
-- operating-points-names: Names of OPP tables (required if multiple OPP
-  tables are present), to uniquely identify them. The same list must be present
-  for all the CPUs which are sharing clock/voltage rails and hence the OPP
-  tables.
-
 * OPP Table Node
 
 This describes the OPPs belonging to a device. This node can have following
@@ -448,54 +437,7 @@ Example 4: Handling multiple regulators
 	};
 };
 
-Example 5: Multiple OPP tables
-
-/ {
-	cpus {
-		cpu@0 {
-			compatible = "arm,cortex-a7";
-			...
-
-			cpu-supply = <&cpu_supply>
-			operating-points-v2 = <&cpu0_opp_table_slow>, <&cpu0_opp_table_fast>;
-			operating-points-names = "slow", "fast";
-		};
-	};
-
-	cpu0_opp_table_slow: opp_table_slow {
-		compatible = "operating-points-v2";
-		status = "okay";
-		opp-shared;
-
-		opp00 {
-			opp-hz = /bits/ 64 <600000000>;
-			...
-		};
-
-		opp01 {
-			opp-hz = /bits/ 64 <800000000>;
-			...
-		};
-	};
-
-	cpu0_opp_table_fast: opp_table_fast {
-		compatible = "operating-points-v2";
-		status = "okay";
-		opp-shared;
-
-		opp10 {
-			opp-hz = /bits/ 64 <1000000000>;
-			...
-		};
-
-		opp11 {
-			opp-hz = /bits/ 64 <1100000000>;
-			...
-		};
-	};
-};
-
-Example 6: opp-supported-hw
+Example 5: opp-supported-hw
 (example: three level hierarchy of versions: cuts, substrate and process)
 
 / {
@@ -540,7 +482,7 @@ Example 6: opp-supported-hw
 	};
 };
 
-Example 7: opp-microvolt-<name>, opp-microamp-<name>:
+Example 6: opp-microvolt-<name>, opp-microamp-<name>:
 (example: device with two possible microvolt ranges: slow and fast)
 
 / {
-- 
cgit v0.10.2


From 754dcf35f34698661801ae1d391efa02affe83a7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 08:10:57 +0530
Subject: PM / OPP: Rename OPP nodes as opp@<opp-hz>

It would be better to name OPP nodes as opp@<opp-hz> as that will ensure
that multiple DT nodes don't contain the same frequency. Of course we
expect the writer to name the node with its opp-hz frequency and not any
other frequency.

And that will let the compiler error out if multiple nodes are using the
same opp-hz frequency.

Suggested-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index 24eac9a..601256f 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -177,20 +177,20 @@ Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000 975000 985000>;
 			opp-microamp = <70000>;
 			clock-latency-ns = <300000>;
 			opp-suspend;
 		};
-		opp01 {
+		opp@1100000000 {
 			opp-hz = /bits/ 64 <1100000000>;
 			opp-microvolt = <980000 1000000 1010000>;
 			opp-microamp = <80000>;
 			clock-latency-ns = <310000>;
 		};
-		opp02 {
+		opp@1200000000 {
 			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt = <1025000>;
 			clock-latency-ns = <290000>;
@@ -256,20 +256,20 @@ independently.
 		 * independently.
 		 */
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000 975000 985000>;
 			opp-microamp = <70000>;
 			clock-latency-ns = <300000>;
 			opp-suspend;
 		};
-		opp01 {
+		opp@1100000000 {
 			opp-hz = /bits/ 64 <1100000000>;
 			opp-microvolt = <980000 1000000 1010000>;
 			opp-microamp = <80000>;
 			clock-latency-ns = <310000>;
 		};
-		opp02 {
+		opp@1200000000 {
 			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt = <1025000>;
 			opp-microamp = <90000;
@@ -332,20 +332,20 @@ DVFS state together.
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000 975000 985000>;
 			opp-microamp = <70000>;
 			clock-latency-ns = <300000>;
 			opp-suspend;
 		};
-		opp01 {
+		opp@1100000000 {
 			opp-hz = /bits/ 64 <1100000000>;
 			opp-microvolt = <980000 1000000 1010000>;
 			opp-microamp = <80000>;
 			clock-latency-ns = <310000>;
 		};
-		opp02 {
+		opp@1200000000 {
 			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt = <1025000>;
 			opp-microamp = <90000>;
@@ -358,20 +358,20 @@ DVFS state together.
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp10 {
+		opp@1300000000 {
 			opp-hz = /bits/ 64 <1300000000>;
 			opp-microvolt = <1045000 1050000 1055000>;
 			opp-microamp = <95000>;
 			clock-latency-ns = <400000>;
 			opp-suspend;
 		};
-		opp11 {
+		opp@1400000000 {
 			opp-hz = /bits/ 64 <1400000000>;
 			opp-microvolt = <1075000>;
 			opp-microamp = <100000>;
 			clock-latency-ns = <400000>;
 		};
-		opp12 {
+		opp@1500000000 {
 			opp-hz = /bits/ 64 <1500000000>;
 			opp-microvolt = <1010000 1100000 1110000>;
 			opp-microamp = <95000>;
@@ -398,7 +398,7 @@ Example 4: Handling multiple regulators
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000>, /* Supply 0 */
 					<960000>, /* Supply 1 */
@@ -411,7 +411,7 @@ Example 4: Handling multiple regulators
 
 		/* OR */
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000 975000 985000>, /* Supply 0 */
 					<960000 965000 975000>, /* Supply 1 */
@@ -424,7 +424,7 @@ Example 4: Handling multiple regulators
 
 		/* OR */
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <970000 975000 985000>, /* Supply 0 */
 					<960000 965000 975000>, /* Supply 1 */
@@ -456,7 +456,7 @@ Example 5: opp-supported-hw
 		status = "okay";
 		opp-shared;
 
-		opp00 {
+		opp@600000000 {
 			/*
 			 * Supports all substrate and process versions for 0xF
 			 * cuts, i.e. only first four cuts.
@@ -467,7 +467,7 @@ Example 5: opp-supported-hw
 			...
 		};
 
-		opp01 {
+		opp@800000000 {
 			/*
 			 * Supports:
 			 * - cuts: only one, 6th cut (represented by 6th bit).
@@ -499,7 +499,7 @@ Example 6: opp-microvolt-<name>, opp-microamp-<name>:
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp00 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt-slow = <900000 915000 925000>;
 			opp-microvolt-fast = <970000 975000 985000>;
@@ -507,7 +507,7 @@ Example 6: opp-microvolt-<name>, opp-microamp-<name>:
 			opp-microamp-fast =  <71000>;
 		};
 
-		opp01 {
+		opp@1200000000 {
 			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */
 					      <910000 925000 935000>; /* Supply vcc1 */
-- 
cgit v0.10.2


From 2aae9915267e60b6ab7af3777f92ca793992e9b9 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Nov 2015 08:10:58 +0530
Subject: ARM: dts: exynos4412: Rename OPP nodes as opp@<opp-hz>

OPP bindings got updated to name OPP nodes this way, make changes
according to that.

Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/arch/arm/boot/dts/exynos4412.dtsi b/arch/arm/boot/dts/exynos4412.dtsi
index 294cfe4..40beede 100644
--- a/arch/arm/boot/dts/exynos4412.dtsi
+++ b/arch/arm/boot/dts/exynos4412.dtsi
@@ -64,73 +64,73 @@
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp00 {
+		opp@200000000 {
 			opp-hz = /bits/ 64 <200000000>;
 			opp-microvolt = <900000>;
 			clock-latency-ns = <200000>;
 		};
-		opp01 {
+		opp@300000000 {
 			opp-hz = /bits/ 64 <300000000>;
 			opp-microvolt = <900000>;
 			clock-latency-ns = <200000>;
 		};
-		opp02 {
+		opp@400000000 {
 			opp-hz = /bits/ 64 <400000000>;
 			opp-microvolt = <925000>;
 			clock-latency-ns = <200000>;
 		};
-		opp03 {
+		opp@500000000 {
 			opp-hz = /bits/ 64 <500000000>;
 			opp-microvolt = <950000>;
 			clock-latency-ns = <200000>;
 		};
-		opp04 {
+		opp@600000000 {
 			opp-hz = /bits/ 64 <600000000>;
 			opp-microvolt = <975000>;
 			clock-latency-ns = <200000>;
 		};
-		opp05 {
+		opp@700000000 {
 			opp-hz = /bits/ 64 <700000000>;
 			opp-microvolt = <987500>;
 			clock-latency-ns = <200000>;
 		};
-		opp06 {
+		opp@800000000 {
 			opp-hz = /bits/ 64 <800000000>;
 			opp-microvolt = <1000000>;
 			clock-latency-ns = <200000>;
 			opp-suspend;
 		};
-		opp07 {
+		opp@900000000 {
 			opp-hz = /bits/ 64 <900000000>;
 			opp-microvolt = <1037500>;
 			clock-latency-ns = <200000>;
 		};
-		opp08 {
+		opp@1000000000 {
 			opp-hz = /bits/ 64 <1000000000>;
 			opp-microvolt = <1087500>;
 			clock-latency-ns = <200000>;
 		};
-		opp09 {
+		opp@1100000000 {
 			opp-hz = /bits/ 64 <1100000000>;
 			opp-microvolt = <1137500>;
 			clock-latency-ns = <200000>;
 		};
-		opp10 {
+		opp@1200000000 {
 			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt = <1187500>;
 			clock-latency-ns = <200000>;
 		};
-		opp11 {
+		opp@1300000000 {
 			opp-hz = /bits/ 64 <1300000000>;
 			opp-microvolt = <1250000>;
 			clock-latency-ns = <200000>;
 		};
-		opp12 {
+		opp@1400000000 {
 			opp-hz = /bits/ 64 <1400000000>;
 			opp-microvolt = <1287500>;
 			clock-latency-ns = <200000>;
 		};
-		opp13 {
+		opp@1500000000 {
 			opp-hz = /bits/ 64 <1500000000>;
 			opp-microvolt = <1350000>;
 			clock-latency-ns = <200000>;
-- 
cgit v0.10.2


From dc4e7b1fa20a840d2317fcfdaa1064fc09d2afcb Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 19 Nov 2015 09:13:56 +0530
Subject: PM / OPP: Add missing doc comments

A few doc-style comments were missing; add them. Rearrange another one to
match the sequence within the structure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index a6bd8d2c..b8880c7 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -51,8 +51,8 @@ extern struct mutex dev_opp_list_lock;
  *		are protected by the dev_opp_list_lock for integrity.
  *		IMPORTANT: the opp nodes should be maintained in increasing
  *		order.
- * @dynamic:	not-created from static DT entries.
  * @available:	true/false - marks if this OPP as available or not
+ * @dynamic:	not-created from static DT entries.
  * @turbo:	true if turbo (boost) OPP
  * @suspend:	true if suspend OPP
  * @rate:	Frequency in hertz
@@ -126,7 +126,9 @@ struct device_list_opp {
  * @dev_list:	list of devices that share these OPPs
  * @opp_list:	list of opps
  * @np:		struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
  * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
-- 
cgit v0.10.2


From e128c864070055e062f6c90c64c03aad18452ac3 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:49 +0530
Subject: cpufreq: ondemand: Update sampling rate only for concerned policies

We are comparing policy->governor against cpufreq_gov_ondemand to make
sure that we update sampling rate only for the concerned CPUs. But that
isn't enough.

In case of governor_per_policy, there can be multiple instances of
ondemand governor and we will always end up updating all of them with
current code. What we rather need to do, is to compare dbs_data with
policy->governor_data, which will match only for the policies governed
by dbs_data.

This code is also racy as the governor might be getting stopped at that
time and we may end up scheduling work for a policy, which we have just
disabled.

Fix that by protecting the entire function with &od_dbs_cdata.mutex,
which will protect against races with policy START/STOP/etc.

After these locks are in place, we can safely get the policy via per-cpu
dbs_info.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 03ac6ce..089ca6a 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -252,20 +252,39 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 	od_tuners->sampling_rate = new_rate = max(new_rate,
 			dbs_data->min_sampling_rate);
 
+	/*
+	 * Lock governor so that governor start/stop can't execute in parallel.
+	 */
+	mutex_lock(&od_dbs_cdata.mutex);
+
 	for_each_online_cpu(cpu) {
 		struct cpufreq_policy *policy;
 		struct od_cpu_dbs_info_s *dbs_info;
+		struct cpu_dbs_info *cdbs;
+		struct cpu_common_dbs_info *shared;
 		unsigned long next_sampling, appointed_at;
 
-		policy = cpufreq_cpu_get(cpu);
-		if (!policy)
+		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+		cdbs = &dbs_info->cdbs;
+		shared = cdbs->shared;
+
+		/*
+		 * A valid shared and shared->policy means governor hasn't
+		 * stopped or exited yet.
+		 */
+		if (!shared || !shared->policy)
 			continue;
-		if (policy->governor != &cpufreq_gov_ondemand) {
-			cpufreq_cpu_put(policy);
+
+		policy = shared->policy;
+
+		/*
+		 * Update sampling rate for CPUs whose policy is governed by
+		 * dbs_data. In case of governor_per_policy, only a single
+		 * policy will be governed by dbs_data, otherwise there can be
+		 * multiple policies that are governed by the same dbs_data.
+		 */
+		if (dbs_data != policy->governor_data)
 			continue;
-		}
-		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
-		cpufreq_cpu_put(policy);
 
 		if (!delayed_work_pending(&dbs_info->cdbs.dwork))
 			continue;
@@ -281,6 +300,8 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 
 		}
 	}
+
+	mutex_unlock(&od_dbs_cdata.mutex);
 }
 
 static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
-- 
cgit v0.10.2


From e68fe18c5b5442baca162ccf3b273326e6132a51 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:50 +0530
Subject: cpufreq: ondemand: Work is guaranteed to be pending

We are guaranteed to have works scheduled for policy->cpus, as the
policy isn't stopped yet. And so there is no need to check that again.
Drop it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 089ca6a..08f2aa6 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -286,9 +286,6 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		if (dbs_data != policy->governor_data)
 			continue;
 
-		if (!delayed_work_pending(&dbs_info->cdbs.dwork))
-			continue;
-
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
 		appointed_at = dbs_info->cdbs.dwork.timer.expires;
 
-- 
cgit v0.10.2


From affde5d06af1e39c2929e36a063e3912f02fc58f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:51 +0530
Subject: cpufreq: governor: Pass policy as argument to ->gov_dbs_timer()

Pass 'policy' as argument to ->gov_dbs_timer() instead of cdbs and
dbs_data.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 1fa1deb..606ad74ab 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -115,13 +115,13 @@ static void cs_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int cs_dbs_timer(struct cpu_dbs_info *cdbs,
-				 struct dbs_data *dbs_data, bool modify_all)
+static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 {
+	struct dbs_data *dbs_data = policy->governor_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
 	if (modify_all)
-		dbs_check_cpu(dbs_data, cdbs->shared->policy->cpu);
+		dbs_check_cpu(dbs_data, policy->cpu);
 
 	return delay_for_sampling_rate(cs_tuners->sampling_rate);
 }
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index b260576..cdcb56a 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -253,7 +253,7 @@ static void dbs_timer(struct work_struct *work)
 	if (!need_load_eval(cdbs->shared, sampling_rate))
 		modify_all = false;
 
-	delay = dbs_data->cdata->gov_dbs_timer(cdbs, dbs_data, modify_all);
+	delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all);
 	gov_queue_work(dbs_data, policy, delay, modify_all);
 
 unlock:
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 5621bb0..0c75890 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -209,8 +209,7 @@ struct common_dbs_data {
 
 	struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu);
 	void *(*get_cpu_dbs_info_s)(int cpu);
-	unsigned int (*gov_dbs_timer)(struct cpu_dbs_info *cdbs,
-				      struct dbs_data *dbs_data,
+	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy,
 				      bool modify_all);
 	void (*gov_check_cpu)(int cpu, unsigned int load);
 	int (*init)(struct dbs_data *dbs_data, bool notify);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 08f2aa6..fc0384b 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -191,10 +191,9 @@ static void od_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpu_dbs_info *cdbs,
-				 struct dbs_data *dbs_data, bool modify_all)
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 {
-	struct cpufreq_policy *policy = cdbs->shared->policy;
+	struct dbs_data *dbs_data = policy->governor_data;
 	unsigned int cpu = policy->cpu;
 	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
 			cpu);
-- 
cgit v0.10.2


From 5e4500d8dba16d88b528cf037566b84747ec23f0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:52 +0530
Subject: cpufreq: governor: initialize/destroy timer_mutex with 'shared'

timer_mutex is required to be initialized only while memory for 'shared'
is allocated and in a similar way it is required to be destroyed only
when memory for 'shared' is freed.

There is no need to do the same every time we start/stop the governor.
Move code to initialize/destroy timer_mutex to the relevant places.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index cdcb56a..999e1f6 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -287,6 +287,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 	for_each_cpu(j, policy->related_cpus)
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
+	mutex_init(&shared->timer_mutex);
 	return 0;
 }
 
@@ -297,6 +298,8 @@ static void free_common_dbs_info(struct cpufreq_policy *policy,
 	struct cpu_common_dbs_info *shared = cdbs->shared;
 	int j;
 
+	mutex_destroy(&shared->timer_mutex);
+
 	for_each_cpu(j, policy->cpus)
 		cdata->get_cpu_cdbs(j)->shared = NULL;
 
@@ -433,7 +436,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 
 	shared->policy = policy;
 	shared->time_stamp = ktime_get();
-	mutex_init(&shared->timer_mutex);
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j);
@@ -493,8 +495,6 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy,
 	mutex_unlock(&shared->timer_mutex);
 
 	gov_cancel_work(dbs_data, policy);
-
-	mutex_destroy(&shared->timer_mutex);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 70f43e5e798c8818d97d8d6a9bd4cd3235af9686 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 9 Dec 2015 07:34:42 +0530
Subject: cpufreq: governor: replace per-CPU delayed work with timers

cpufreq governors evaluate load at sampling rate and based on that they
update frequency for a group of CPUs belonging to the same cpufreq
policy.

This is required to be done in a single thread for all policy->cpus, but
because we don't want to wakeup idle CPUs to do just that, we use
deferrable work for this. If we had used a single delayed deferrable
work for the entire policy, the CPU required to run the handler could
have been idle, and we might have ended up not changing the frequency
for the entire group as the load varied.

And so we were forced to keep per-cpu works, where only the one that
expires first needs to do the real work and the others are rescheduled
for the next sampling time.

We have been using the more complex solution until now, where we used a
delayed deferrable work for this, which is a combination of a timer and
a work.

This could be made lightweight by keeping per-cpu deferred timers with a
single work item, which is scheduled by the first timer that expires.

This patch does just that and here are important changes:
- The timer handler will run in irq context and so we need to use a
  spin_lock instead of the timer_mutex. And so a separate timer_lock is
  created. This also makes the use of the mutex and lock quite clear, as
  we know what exactly they are protecting.
- A new field 'skip_work' is added to track when the timer handlers can
  queue a work. More comments present in code.

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 999e1f6..2d61eae 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -158,47 +158,53 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 }
 EXPORT_SYMBOL_GPL(dbs_check_cpu);
 
-static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
-		unsigned int delay)
+void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay)
 {
-	struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
-
-	mod_delayed_work_on(cpu, system_wq, &cdbs->dwork, delay);
-}
-
-void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
-		unsigned int delay, bool all_cpus)
-{
-	int i;
+	struct dbs_data *dbs_data = policy->governor_data;
+	struct cpu_dbs_info *cdbs;
+	int cpu;
 
-	if (!all_cpus) {
-		/*
-		 * Use raw_smp_processor_id() to avoid preemptible warnings.
-		 * We know that this is only called with all_cpus == false from
-		 * works that have been queued with *_work_on() functions and
-		 * those works are canceled during CPU_DOWN_PREPARE so they
-		 * can't possibly run on any other CPU.
-		 */
-		__gov_queue_work(raw_smp_processor_id(), dbs_data, delay);
-	} else {
-		for_each_cpu(i, policy->cpus)
-			__gov_queue_work(i, dbs_data, delay);
+	for_each_cpu(cpu, policy->cpus) {
+		cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
+		cdbs->timer.expires = jiffies + delay;
+		add_timer_on(&cdbs->timer, cpu);
 	}
 }
-EXPORT_SYMBOL_GPL(gov_queue_work);
+EXPORT_SYMBOL_GPL(gov_add_timers);
 
-static inline void gov_cancel_work(struct dbs_data *dbs_data,
-		struct cpufreq_policy *policy)
+static inline void gov_cancel_timers(struct cpufreq_policy *policy)
 {
+	struct dbs_data *dbs_data = policy->governor_data;
 	struct cpu_dbs_info *cdbs;
 	int i;
 
 	for_each_cpu(i, policy->cpus) {
 		cdbs = dbs_data->cdata->get_cpu_cdbs(i);
-		cancel_delayed_work_sync(&cdbs->dwork);
+		del_timer_sync(&cdbs->timer);
 	}
 }
 
+void gov_cancel_work(struct cpu_common_dbs_info *shared)
+{
+	unsigned long flags;
+
+	/*
+	 * No work will be queued from timer handlers after skip_work is
+	 * updated. And so we can safely cancel the work first and then the
+	 * timers.
+	 */
+	spin_lock_irqsave(&shared->timer_lock, flags);
+	shared->skip_work++;
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
+
+	cancel_work_sync(&shared->work);
+
+	gov_cancel_timers(shared->policy);
+
+	shared->skip_work = 0;
+}
+EXPORT_SYMBOL_GPL(gov_cancel_work);
+
 /* Will return if we need to evaluate cpu load again or not */
 static bool need_load_eval(struct cpu_common_dbs_info *shared,
 			   unsigned int sampling_rate)
@@ -217,29 +223,22 @@ static bool need_load_eval(struct cpu_common_dbs_info *shared,
 	return true;
 }
 
-static void dbs_timer(struct work_struct *work)
+static void dbs_work_handler(struct work_struct *work)
 {
-	struct cpu_dbs_info *cdbs = container_of(work, struct cpu_dbs_info,
-						 dwork.work);
-	struct cpu_common_dbs_info *shared = cdbs->shared;
+	struct cpu_common_dbs_info *shared = container_of(work, struct
+					cpu_common_dbs_info, work);
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
 	unsigned int sampling_rate, delay;
-	bool modify_all = true;
-
-	mutex_lock(&shared->timer_mutex);
+	unsigned long flags;
+	bool eval_load;
 
 	policy = shared->policy;
-
-	/*
-	 * Governor might already be disabled and there is no point continuing
-	 * with the work-handler.
-	 */
-	if (!policy)
-		goto unlock;
-
 	dbs_data = policy->governor_data;
 
+	/* Kill all timers */
+	gov_cancel_timers(policy);
+
 	if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
 		struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
@@ -250,14 +249,43 @@ static void dbs_timer(struct work_struct *work)
 		sampling_rate = od_tuners->sampling_rate;
 	}
 
-	if (!need_load_eval(cdbs->shared, sampling_rate))
-		modify_all = false;
+	eval_load = need_load_eval(shared, sampling_rate);
 
-	delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all);
-	gov_queue_work(dbs_data, policy, delay, modify_all);
-
-unlock:
+	/*
+	 * Make sure cpufreq_governor_limits() isn't evaluating load in
+	 * parallel.
+	 */
+	mutex_lock(&shared->timer_mutex);
+	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
 	mutex_unlock(&shared->timer_mutex);
+
+	spin_lock_irqsave(&shared->timer_lock, flags);
+	shared->skip_work--;
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
+
+	gov_add_timers(policy, delay);
+}
+
+static void dbs_timer_handler(unsigned long data)
+{
+	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
+	struct cpu_common_dbs_info *shared = cdbs->shared;
+	unsigned long flags;
+
+	spin_lock_irqsave(&shared->timer_lock, flags);
+
+	/*
+	 * Timer handler isn't allowed to queue work at the moment, because:
+	 * - Another timer handler has done that
+	 * - We are stopping the governor
+	 * - Or we are updating the sampling rate of ondemand governor
+	 */
+	if (!shared->skip_work) {
+		shared->skip_work++;
+		queue_work(system_wq, &shared->work);
+	}
+
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -288,6 +316,8 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
 	mutex_init(&shared->timer_mutex);
+	spin_lock_init(&shared->timer_lock);
+	INIT_WORK(&shared->work, dbs_work_handler);
 	return 0;
 }
 
@@ -452,7 +482,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		if (ignore_nice)
 			j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
-		INIT_DEFERRABLE_WORK(&j_cdbs->dwork, dbs_timer);
+		__setup_timer(&j_cdbs->timer, dbs_timer_handler,
+			      (unsigned long)j_cdbs,
+			      TIMER_DEFERRABLE | TIMER_IRQSAFE);
 	}
 
 	if (cdata->governor == GOV_CONSERVATIVE) {
@@ -470,8 +502,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		od_ops->powersave_bias_init_cpu(cpu);
 	}
 
-	gov_queue_work(dbs_data, policy, delay_for_sampling_rate(sampling_rate),
-		       true);
+	gov_add_timers(policy, delay_for_sampling_rate(sampling_rate));
 	return 0;
 }
 
@@ -485,16 +516,9 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy,
 	if (!shared || !shared->policy)
 		return -EBUSY;
 
-	/*
-	 * Work-handler must see this updated, as it should not proceed any
-	 * further after governor is disabled. And so timer_mutex is taken while
-	 * updating this value.
-	 */
-	mutex_lock(&shared->timer_mutex);
+	gov_cancel_work(shared);
 	shared->policy = NULL;
-	mutex_unlock(&shared->timer_mutex);
 
-	gov_cancel_work(dbs_data, policy);
 	return 0;
 }
 
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 0c75890..7674290 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -132,12 +132,20 @@ static void *get_cpu_dbs_info_s(int cpu)				\
 struct cpu_common_dbs_info {
 	struct cpufreq_policy *policy;
 	/*
-	 * percpu mutex that serializes governor limit change with dbs_timer
-	 * invocation. We do not want dbs_timer to run when user is changing
-	 * the governor or limits.
+	 * Per policy mutex that serializes load evaluation from limit-change
+	 * and work-handler.
 	 */
 	struct mutex timer_mutex;
+
+	/*
+	 * Per policy lock that serializes access to queuing work from timer
+	 * handlers.
+	 */
+	spinlock_t timer_lock;
+
 	ktime_t time_stamp;
+	unsigned int skip_work;
+	struct work_struct work;
 };
 
 /* Per cpu structures */
@@ -152,7 +160,7 @@ struct cpu_dbs_info {
 	 * wake-up from idle.
 	 */
 	unsigned int prev_load;
-	struct delayed_work dwork;
+	struct timer_list timer;
 	struct cpu_common_dbs_info *shared;
 };
 
@@ -268,11 +276,11 @@ static ssize_t show_sampling_rate_min_gov_pol				\
 
 extern struct mutex cpufreq_governor_lock;
 
+void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
+void gov_cancel_work(struct cpu_common_dbs_info *shared);
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
 int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		struct common_dbs_data *cdata, unsigned int event);
-void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
-		unsigned int delay, bool all_cpus);
 void od_register_powersave_bias_handler(unsigned int (*f)
 		(struct cpufreq_policy *, unsigned int, unsigned int),
 		unsigned int powersave_bias);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index fc0384b..f879012 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -286,13 +286,11 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 			continue;
 
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
-		appointed_at = dbs_info->cdbs.dwork.timer.expires;
+		appointed_at = dbs_info->cdbs.timer.expires;
 
 		if (time_before(next_sampling, appointed_at)) {
-			cancel_delayed_work_sync(&dbs_info->cdbs.dwork);
-
-			gov_queue_work(dbs_data, policy,
-				       usecs_to_jiffies(new_rate), true);
+			gov_cancel_work(shared);
+			gov_add_timers(policy, usecs_to_jiffies(new_rate));
 
 		}
 	}
-- 
cgit v0.10.2


From f08f638b9c7f1bf3cb9006d3d26bf568d807ede0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:54 +0530
Subject: cpufreq: ondemand: update update_sampling_rate() to make it more
 efficient

Currently update_sampling_rate() runs over each online CPU and
cancels/queues timers on all policy->cpus every time. This should be
done just once for any cpu belonging to a policy.

Create a cpumask and keep on clearing it as and when we process
policies, so that we don't have to traverse through all CPUs of the same
policy.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index f879012..eae5107 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -246,6 +246,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		unsigned int new_rate)
 {
 	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
+	struct cpumask cpumask;
 	int cpu;
 
 	od_tuners->sampling_rate = new_rate = max(new_rate,
@@ -256,7 +257,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 	 */
 	mutex_lock(&od_dbs_cdata.mutex);
 
-	for_each_online_cpu(cpu) {
+	cpumask_copy(&cpumask, cpu_online_mask);
+
+	for_each_cpu(cpu, &cpumask) {
 		struct cpufreq_policy *policy;
 		struct od_cpu_dbs_info_s *dbs_info;
 		struct cpu_dbs_info *cdbs;
@@ -276,6 +279,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 
 		policy = shared->policy;
 
+		/* clear all CPUs of this policy */
+		cpumask_andnot(&cpumask, &cpumask, policy->cpus);
+
 		/*
 		 * Update sampling rate for CPUs whose policy is governed by
 		 * dbs_data. In case of governor_per_policy, only a single
@@ -285,6 +291,10 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		if (dbs_data != policy->governor_data)
 			continue;
 
+		/*
+		 * Checking this for any CPU should be fine, timers for all of
+		 * them are scheduled together.
+		 */
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
 		appointed_at = dbs_info->cdbs.timer.expires;
 
-- 
cgit v0.10.2


From 2dd3e724b4e2237cfaaf155cab72af02c1c420cc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 8 Dec 2015 21:44:05 +0100
Subject: cpufreq: governor: Use lockless timer function

It is possible to get rid of the timer_lock spinlock used by the
governor timer function for synchronization, but a couple of races
need to be avoided.

The first race is between multiple dbs_timer_handler() instances
that may be running in parallel with each other on different
CPUs.  Namely, one of them has to queue up the work item, but it
cannot be queued up more than once.  To achieve that,
atomic_inc_return() can be used on the skip_work field of
struct cpu_common_dbs_info.

The second race is between an already running dbs_timer_handler()
and gov_cancel_work().  In that case the dbs_timer_handler() might
not notice the skip_work incrementation in gov_cancel_work() and
it might queue up its work item after gov_cancel_work() had
returned (and that work item would corrupt skip_work going
forward).  To prevent that from happening, gov_cancel_work()
can be made to wait for the timer function to complete (on all CPUs)
right after skip_work has been incremented.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 2d61eae..4de12fd 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -186,22 +186,24 @@ static inline void gov_cancel_timers(struct cpufreq_policy *policy)
 
 void gov_cancel_work(struct cpu_common_dbs_info *shared)
 {
-	unsigned long flags;
-
+	/* Tell dbs_timer_handler() to skip queuing up work items. */
+	atomic_inc(&shared->skip_work);
 	/*
-	 * No work will be queued from timer handlers after skip_work is
-	 * updated. And so we can safely cancel the work first and then the
-	 * timers.
+	 * If dbs_timer_handler() is already running, it may not notice the
+	 * incremented skip_work, so wait for it to complete to prevent its work
+	 * item from being queued up after the cancel_work_sync() below.
+	 */
+	gov_cancel_timers(shared->policy);
+	/*
+	 * In case dbs_timer_handler() managed to run and spawn a work item
+	 * before the timers have been canceled, wait for that work item to
+	 * complete and then cancel all of the timers set up by it.  If
+	 * dbs_timer_handler() runs again at that point, it will see the
+	 * positive value of skip_work and won't spawn any more work items.
 	 */
-	spin_lock_irqsave(&shared->timer_lock, flags);
-	shared->skip_work++;
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
-
 	cancel_work_sync(&shared->work);
-
 	gov_cancel_timers(shared->policy);
-
-	shared->skip_work = 0;
+	atomic_set(&shared->skip_work, 0);
 }
 EXPORT_SYMBOL_GPL(gov_cancel_work);
 
@@ -230,7 +232,6 @@ static void dbs_work_handler(struct work_struct *work)
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
 	unsigned int sampling_rate, delay;
-	unsigned long flags;
 	bool eval_load;
 
 	policy = shared->policy;
@@ -259,9 +260,7 @@ static void dbs_work_handler(struct work_struct *work)
 	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
 	mutex_unlock(&shared->timer_mutex);
 
-	spin_lock_irqsave(&shared->timer_lock, flags);
-	shared->skip_work--;
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
+	atomic_dec(&shared->skip_work);
 
 	gov_add_timers(policy, delay);
 }
@@ -270,22 +269,18 @@ static void dbs_timer_handler(unsigned long data)
 {
 	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
 	struct cpu_common_dbs_info *shared = cdbs->shared;
-	unsigned long flags;
-
-	spin_lock_irqsave(&shared->timer_lock, flags);
 
 	/*
-	 * Timer handler isn't allowed to queue work at the moment, because:
+	 * Timer handler may not be allowed to queue the work at the moment,
+	 * because:
 	 * - Another timer handler has done that
 	 * - We are stopping the governor
-	 * - Or we are updating the sampling rate of ondemand governor
+	 * - Or we are updating the sampling rate of the ondemand governor
 	 */
-	if (!shared->skip_work) {
-		shared->skip_work++;
+	if (atomic_inc_return(&shared->skip_work) > 1)
+		atomic_dec(&shared->skip_work);
+	else
 		queue_work(system_wq, &shared->work);
-	}
-
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -316,7 +311,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
 	mutex_init(&shared->timer_mutex);
-	spin_lock_init(&shared->timer_lock);
+	atomic_set(&shared->skip_work, 0);
 	INIT_WORK(&shared->work, dbs_work_handler);
 	return 0;
 }
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 7674290..91e767a 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -17,6 +17,7 @@
 #ifndef _CPUFREQ_GOVERNOR_H
 #define _CPUFREQ_GOVERNOR_H
 
+#include <linux/atomic.h>
 #include <linux/cpufreq.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
@@ -137,14 +138,8 @@ struct cpu_common_dbs_info {
 	 */
 	struct mutex timer_mutex;
 
-	/*
-	 * Per policy lock that serializes access to queuing work from timer
-	 * handlers.
-	 */
-	spinlock_t timer_lock;
-
 	ktime_t time_stamp;
-	unsigned int skip_work;
+	atomic_t skip_work;
 	struct work_struct work;
 };
 
-- 
cgit v0.10.2


From 3be3f8f36e7349006f19c8c8f0d686e98462a993 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:21 +0000
Subject: devicetree: bindings: Add optional dynamic-power-coefficient property

The dynamic power consumption of a device is proportional to the
square of voltage (V) and the clock frequency (f). It can be expressed as

Pdyn = dynamic-power-coefficient * V^2 * f.

The coefficient represents the running time dynamic power consumption in
units of mW/MHz/uVolt^2 and can be used in the above formula to
calculate the dynamic power in mW.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt
index 3a07a87..6aca64f 100644
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -242,6 +242,23 @@ nodes to be present and contain the properties described below.
 		Definition: Specifies the syscon node controlling the cpu core
 			    power domains.
 
+	- dynamic-power-coefficient
+		Usage: optional
+		Value type: <prop-encoded-array>
+		Definition: A u32 value that represents the running time dynamic
+			    power coefficient in units of mW/MHz/uVolt^2. The
+			    coefficient can either be calculated from power
+			    measurements or derived by analysis.
+
+			    The dynamic power consumption of the CPU  is
+			    proportional to the square of the Voltage (V) and
+			    the clock frequency (f). The coefficient is used to
+			    calculate the dynamic power as below -
+
+			    Pdyn = dynamic-power-coefficient * V^2 * f
+
+			    where voltage is in uV, frequency is in MHz.
+
 Example 1 (dual-cluster big.LITTLE system 32-bit):
 
 	cpus {
-- 
cgit v0.10.2


From f8fa8ae06b8c2c25d81c99766f9226adc5c3e073 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:22 +0000
Subject: cpufreq-dt: Supply power coefficient when registering cooling devices

Support registering cooling devices with a dynamic power coefficient
where provided by the device tree. This allows the OF-registered cooling
device driver to be used with the power_allocator thermal governor.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 90d6408..1ceece9 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -407,8 +407,13 @@ static void cpufreq_ready(struct cpufreq_policy *policy)
 	 * thermal DT code takes care of matching them.
 	 */
 	if (of_find_property(np, "#cooling-cells", NULL)) {
-		priv->cdev = of_cpufreq_cooling_register(np,
-							 policy->related_cpus);
+		u32 power_coefficient = 0;
+
+		of_property_read_u32(np, "dynamic-power-coefficient",
+				     &power_coefficient);
+
+		priv->cdev = of_cpufreq_power_cooling_register(np,
+				policy->related_cpus, power_coefficient, NULL);
 		if (IS_ERR(priv->cdev)) {
 			dev_err(priv->cpu_dev,
 				"running cpufreq without cooling device: %ld\n",
-- 
cgit v0.10.2


From 2f7e8a175db72bdaf377235962fd85796edb3fbc Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:23 +0000
Subject: cpufreq: arm_big_little: Add support to register a cpufreq cooling
 device

Register passive cooling devices when initialising cpufreq on
big.LITTLE systems. If the device tree provides a dynamic power
coefficient for the CPUs then the bound cooling device will support
the extensions that allow it to be used with all the existing thermal
governors including the power allocator governor.

A cooling device will be created per individual frequency domain and
can be bound to thermal zones via the thermal DT bindings.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 235a1ba..80fbfb3 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -6,6 +6,8 @@
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
 	depends on (ARM_CPU_TOPOLOGY || ARM64) && HAVE_CLK
+	# if CPU_THERMAL is on and THERMAL=m, ARM_BIG_LITTLE_CPUFREQ cannot be =y
+	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
 	  This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c
index c5d256c..c251247 100644
--- a/drivers/cpufreq/arm_big_little.c
+++ b/drivers/cpufreq/arm_big_little.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/cpu_cooling.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -55,6 +56,7 @@ static bool bL_switching_enabled;
 #define ACTUAL_FREQ(cluster, freq)  ((cluster == A7_CLUSTER) ? freq << 1 : freq)
 #define VIRT_FREQ(cluster, freq)    ((cluster == A7_CLUSTER) ? freq >> 1 : freq)
 
+static struct thermal_cooling_device *cdev[MAX_CLUSTERS];
 static struct cpufreq_arm_bL_ops *arm_bL_ops;
 static struct clk *clk[MAX_CLUSTERS];
 static struct cpufreq_frequency_table *freq_table[MAX_CLUSTERS + 1];
@@ -493,6 +495,12 @@ static int bL_cpufreq_init(struct cpufreq_policy *policy)
 static int bL_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct device *cpu_dev;
+	int cur_cluster = cpu_to_cluster(policy->cpu);
+
+	if (cur_cluster < MAX_CLUSTERS) {
+		cpufreq_cooling_unregister(cdev[cur_cluster]);
+		cdev[cur_cluster] = NULL;
+	}
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -507,6 +515,38 @@ static int bL_cpufreq_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static void bL_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	int cur_cluster = cpu_to_cluster(policy->cpu);
+	struct device_node *np;
+
+	/* Do not register a cpu_cooling device if we are in IKS mode */
+	if (cur_cluster >= MAX_CLUSTERS)
+		return;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (WARN_ON(!np))
+		return;
+
+	if (of_find_property(np, "#cooling-cells", NULL)) {
+		u32 power_coefficient = 0;
+
+		of_property_read_u32(np, "dynamic-power-coefficient",
+				     &power_coefficient);
+
+		cdev[cur_cluster] = of_cpufreq_power_cooling_register(np,
+				policy->related_cpus, power_coefficient, NULL);
+		if (IS_ERR(cdev[cur_cluster])) {
+			dev_err(cpu_dev,
+				"running cpufreq without cooling device: %ld\n",
+				PTR_ERR(cdev[cur_cluster]));
+			cdev[cur_cluster] = NULL;
+		}
+	}
+	of_node_put(np);
+}
+
 static struct cpufreq_driver bL_cpufreq_driver = {
 	.name			= "arm-big-little",
 	.flags			= CPUFREQ_STICKY |
@@ -517,6 +557,7 @@ static struct cpufreq_driver bL_cpufreq_driver = {
 	.get			= bL_cpufreq_get_rate,
 	.init			= bL_cpufreq_init,
 	.exit			= bL_cpufreq_exit,
+	.ready			= bL_cpufreq_ready,
 	.attr			= cpufreq_generic_attr,
 };
 
-- 
cgit v0.10.2


From 790d849bf811a8ab5d4cd2cce0f6fda92f6aebf2 Mon Sep 17 00:00:00 2001
From: Jacob Tanenbaum <jtanenba@redhat.com>
Date: Thu, 19 Nov 2015 10:29:01 -0500
Subject: cpufreq: pcc-cpufreq: update default value of
 cpuinfo_transition_latency

The cpufreq documentation specifies

policy->cpuinfo.transition_latency   the time it takes on this CPU to
                                switch between two frequencies in
                                nanoseconds (if appropriate, else
                                specify CPUFREQ_ETERNAL)

Currently pcc-cpufreq does not expose the value and sets it to zero. I
changed the pcc-cpufreq driver and its documentation to conform to the
default value specified in Documentation/cpu-freq/cpu-drivers.txt.

Signed-off-by: Jacob Tanenbaum <jtanenba@redhat.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt
index 9e3c3b3..0a94224 100644
--- a/Documentation/cpu-freq/pcc-cpufreq.txt
+++ b/Documentation/cpu-freq/pcc-cpufreq.txt
@@ -159,8 +159,8 @@ to be strictly associated with a P-state.
 
 2.2 cpuinfo_transition_latency:
 -------------------------------
-The cpuinfo_transition_latency field is 0. The PCC specification does
-not include a field to expose this value currently.
+The cpuinfo_transition_latency field is CPUFREQ_ETERNAL. The PCC specification
+does not include a field to expose this value currently.
 
 2.3 cpuinfo_cur_freq:
 ---------------------
diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
index 2a0d589..808a320 100644
--- a/drivers/cpufreq/pcc-cpufreq.c
+++ b/drivers/cpufreq/pcc-cpufreq.c
@@ -555,6 +555,8 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->min = policy->cpuinfo.min_freq =
 		ioread32(&pcch_hdr->minimum_frequency) * 1000;
 
+	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+
 	pr_debug("init: policy->max is %d, policy->min is %d\n",
 		policy->max, policy->min);
 out:
-- 
cgit v0.10.2


From 8ae1702a0df5e0730607b97fd9fd1f8066870832 Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Thu, 26 Nov 2015 17:21:11 +0800
Subject: cpufreq: qoriq: Register cooling device based on device tree

Register the qoriq cpufreq driver as a cooling device, based on the
thermal device tree framework. When temperature crosses the passive trip
point cpufreq is used to throttle CPUs.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 358f075..b23e525 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -12,6 +12,7 @@
 
 #include <linux/clk.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_cooling.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -33,6 +34,7 @@
 struct cpu_data {
 	struct clk **pclk;
 	struct cpufreq_frequency_table *table;
+	struct thermal_cooling_device *cdev;
 };
 
 /**
@@ -321,6 +323,27 @@ static int qoriq_cpufreq_target(struct cpufreq_policy *policy,
 	return clk_set_parent(policy->clk, parent);
 }
 
+
+static void qoriq_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	struct cpu_data *cpud = policy->driver_data;
+	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
+
+	if (of_find_property(np, "#cooling-cells", NULL)) {
+		cpud->cdev = of_cpufreq_cooling_register(np,
+							 policy->related_cpus);
+
+		if (IS_ERR(cpud->cdev)) {
+			pr_err("Failed to register cooling device cpu%d: %ld\n",
+					policy->cpu, PTR_ERR(cpud->cdev));
+
+			cpud->cdev = NULL;
+		}
+	}
+
+	of_node_put(np);
+}
+
 static struct cpufreq_driver qoriq_cpufreq_driver = {
 	.name		= "qoriq_cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
@@ -329,6 +352,7 @@ static struct cpufreq_driver qoriq_cpufreq_driver = {
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= qoriq_cpufreq_target,
 	.get		= cpufreq_generic_get,
+	.ready		= qoriq_cpufreq_ready,
 	.attr		= cpufreq_generic_attr,
 };
 
-- 
cgit v0.10.2


From 9bb46b87d662ab704bd852db9916f0e51db3e94b Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:35 +0800
Subject: cpufreq: mt8173: add CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag

Add CPUFREQ_HAVE_GOVERNOR_PER_POLICY to have an individual set of
tunables for each cluster of MT8173.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 83001dc..c438109 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -469,7 +469,8 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver mt8173_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		 CPUFREQ_HAVE_GOVERNOR_PER_POLICY,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = mtk_cpufreq_set_target,
 	.get = cpufreq_generic_get,
-- 
cgit v0.10.2


From 93625d52e7a74492416f77fed945ba34e0ae0c18 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:36 +0800
Subject: cpufreq: mt8173: remove redundant regulator_get_voltage() call

Remove the redundant regulator_get_voltage() call to get the Vsram value
since it will be obtained later at the beginning of the voltage tracking
loop.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index c438109..750cda7 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -59,7 +59,6 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 	int old_vproc, old_vsram, new_vsram, vsram, vproc, ret;
 
 	old_vproc = regulator_get_voltage(proc_reg);
-	old_vsram = regulator_get_voltage(sram_reg);
 	/* Vsram should not exceed the maximum allowed voltage of SoC. */
 	new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT);
 
-- 
cgit v0.10.2


From 40be4c3ccbf4078e2f8426a7962879b7a447cde4 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:37 +0800
Subject: cpufreq: mt8173: check return value of regulator_get_voltage() call

Sometimes a regulator_get_voltage() call returns a negative value for
some reason (e.g. an underlying I2C bus timeout). Add checks for the
return values and fail out early.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 750cda7..9d0fe37 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -59,6 +59,10 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 	int old_vproc, old_vsram, new_vsram, vsram, vproc, ret;
 
 	old_vproc = regulator_get_voltage(proc_reg);
+	if (old_vproc < 0) {
+		pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc);
+		return old_vproc;
+	}
 	/* Vsram should not exceed the maximum allowed voltage of SoC. */
 	new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT);
 
@@ -71,7 +75,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 		 */
 		do {
 			old_vsram = regulator_get_voltage(sram_reg);
+			if (old_vsram < 0) {
+				pr_err("%s: invalid Vsram value: %d\n",
+				       __func__, old_vsram);
+				return old_vsram;
+			}
 			old_vproc = regulator_get_voltage(proc_reg);
+			if (old_vproc < 0) {
+				pr_err("%s: invalid Vproc value: %d\n",
+				       __func__, old_vproc);
+				return old_vproc;
+			}
 
 			vsram = min(new_vsram, old_vproc + MAX_VOLT_SHIFT);
 
@@ -116,7 +130,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 		 */
 		do {
 			old_vproc = regulator_get_voltage(proc_reg);
+			if (old_vproc < 0) {
+				pr_err("%s: invalid Vproc value: %d\n",
+				       __func__, old_vproc);
+				return old_vproc;
+			}
 			old_vsram = regulator_get_voltage(sram_reg);
+			if (old_vsram < 0) {
+				pr_err("%s: invalid Vsram value: %d\n",
+				       __func__, old_vsram);
+				return old_vsram;
+			}
 
 			vproc = max(new_vproc, old_vsram - MAX_VOLT_SHIFT);
 			ret = regulator_set_voltage(proc_reg, vproc,
@@ -184,6 +208,10 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
 
 	old_freq_hz = clk_get_rate(cpu_clk);
 	old_vproc = regulator_get_voltage(info->proc_reg);
+	if (old_vproc < 0) {
+		pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc);
+		return old_vproc;
+	}
 
 	freq_hz = freq_table[index].frequency * 1000;
 
-- 
cgit v0.10.2


From 157386b6fc1465f292b66c4133409033650ad335 Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:30 +0100
Subject: cpufreq: intel_pstate: Configurable algorithm to get target pstate

Target systems using different cpus have different power and performance
requirements. They may use different algorithms to get the next P-state
based on their power or performance preference.

For example, power-constrained systems may not want to use
high-performance P-states as aggressively as a full-size desktop or a
server platform. A server platform may want to run close to the max to
achieve better performance, while laptop-like systems may prefer
sacrificing performance for longer battery life.

For the above reasons, modify intel_pstate to allow the target P-state
selection algorithm to depend on the CPU ID.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4d07cbd..ff58029 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -66,6 +66,7 @@ static inline int ceiling_fp(int32_t x)
 
 struct sample {
 	int32_t core_pct_busy;
+	int32_t busy_scaled;
 	u64 aperf;
 	u64 mperf;
 	u64 tsc;
@@ -133,6 +134,7 @@ struct pstate_funcs {
 	int (*get_scaling)(void);
 	void (*set)(struct cpudata*, int pstate);
 	void (*get_vid)(struct cpudata *);
+	int32_t (*get_target_pstate)(struct cpudata *);
 };
 
 struct cpu_defaults {
@@ -140,6 +142,8 @@ struct cpu_defaults {
 	struct pstate_funcs funcs;
 };
 
+static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
+
 static struct pstate_adjust_policy pid_params;
 static struct pstate_funcs pstate_funcs;
 static int hwp_active;
@@ -738,6 +742,7 @@ static struct cpu_defaults core_params = {
 		.get_turbo = core_get_turbo_pstate,
 		.get_scaling = core_get_scaling,
 		.set = core_set_pstate,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -758,6 +763,7 @@ static struct cpu_defaults silvermont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = silvermont_get_scaling,
 		.get_vid = atom_get_vid,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -778,6 +784,7 @@ static struct cpu_defaults airmont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = airmont_get_scaling,
 		.get_vid = atom_get_vid,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -797,6 +804,7 @@ static struct cpu_defaults knl_params = {
 		.get_turbo = knl_get_turbo_pstate,
 		.get_scaling = core_get_scaling,
 		.set = core_set_pstate,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -922,7 +930,7 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 	mod_timer_pinned(&cpu->timer, jiffies + delay);
 }
 
-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
 	s64 duration_us;
@@ -960,30 +968,24 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
 		core_busy = mul_fp(core_busy, sample_ratio);
 	}
 
-	return core_busy;
+	cpu->sample.busy_scaled = core_busy;
+	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
 }
 
 static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
 {
-	int32_t busy_scaled;
-	struct _pid *pid;
-	signed int ctl;
-	int from;
+	int from, target_pstate;
 	struct sample *sample;
 
 	from = cpu->pstate.current_pstate;
 
-	pid = &cpu->pid;
-	busy_scaled = intel_pstate_get_scaled_busy(cpu);
+	target_pstate = pstate_funcs.get_target_pstate(cpu);
 
-	ctl = pid_calc(pid, busy_scaled);
-
-	/* Negative values of ctl increase the pstate and vice versa */
-	intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl, true);
+	intel_pstate_set_pstate(cpu, target_pstate, true);
 
 	sample = &cpu->sample;
 	trace_pstate_sample(fp_toint(sample->core_pct_busy),
-		fp_toint(busy_scaled),
+		fp_toint(sample->busy_scaled),
 		from,
 		cpu->pstate.current_pstate,
 		sample->mperf,
@@ -1237,6 +1239,8 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
 	pstate_funcs.get_scaling = funcs->get_scaling;
 	pstate_funcs.set       = funcs->set;
 	pstate_funcs.get_vid   = funcs->get_vid;
+	pstate_funcs.get_target_pstate = funcs->get_target_pstate;
+
 }
 
 #if IS_ENABLED(CONFIG_ACPI)
-- 
cgit v0.10.2


From e70eed2b64545ab5c9d2f4d43372d79762f1b985 Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:32 +0100
Subject: cpufreq: intel_pstate: Account for non C0 time

The current function to calculate cpu utilization uses the average P-state
ratio (APerf/Mperf) scaled by the ratio of the current P-state to the
max available non-turbo one. This leads to an overestimation of
utilization which causes higher-performance P-states to be selected more
often and that leads to increased energy consumption.

This is a problem for low-power systems, so it is better to use a
different utilization calculation algorithm for them.

Namely, the Percent Busy value (or load) can be estimated as the ratio of the
MPERF counter that runs at a constant rate only during active periods (C0) to
the time stamp counter (TSC) that also runs (at the same rate) during idle.
That is:

Percent Busy = 100 * (delta_mperf / delta_tsc)

Use this algorithm for platforms with SoCs based on the Airmont and Silvermont
Atom cores.

Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Stephane Gasparini <stephane.gasparini@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ff58029..8bfebae 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -143,6 +143,7 @@ struct cpu_defaults {
 };
 
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
+static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 
 static struct pstate_adjust_policy pid_params;
 static struct pstate_funcs pstate_funcs;
@@ -763,7 +764,7 @@ static struct cpu_defaults silvermont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = silvermont_get_scaling,
 		.get_vid = atom_get_vid,
-		.get_target_pstate = get_target_pstate_use_performance,
+		.get_target_pstate = get_target_pstate_use_cpu_load,
 	},
 };
 
@@ -784,7 +785,7 @@ static struct cpu_defaults airmont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = airmont_get_scaling,
 		.get_vid = atom_get_vid,
-		.get_target_pstate = get_target_pstate_use_performance,
+		.get_target_pstate = get_target_pstate_use_cpu_load,
 	},
 };
 
@@ -890,12 +891,11 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
-	if (cpu->prev_mperf == mperf) {
+	tsc = rdtsc();
+	if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) {
 		local_irq_restore(flags);
 		return;
 	}
-
-	tsc = rdtsc();
 	local_irq_restore(flags);
 
 	cpu->last_sample_time = cpu->sample.time;
@@ -930,6 +930,25 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 	mod_timer_pinned(&cpu->timer, jiffies + delay);
 }
 
+static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
+{
+	struct sample *sample = &cpu->sample;
+	int32_t cpu_load;
+
+	/*
+	 * The load can be estimated as the ratio of the mperf counter
+	 * running at a constant frequency during active periods
+	 * (C0) and the time stamp counter running at the same frequency
+	 * also during C-states.
+	 */
+	cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc);
+
+	cpu->sample.busy_scaled = cpu_load;
+
+	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
+}
+
+
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
-- 
cgit v0.10.2


From 63d1d656a5232f2f189b217b50542eadcf9d74ae Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:35 +0100
Subject: cpufreq: intel_pstate: Account for IO wait time

In cases where we have many IOs, the global load becomes low and the
load algorithm will decrease the requested P-State. Because of that,
the IO overhead will increase and hurt IO performance.

To improve IO bound work, we can count the io-wait time as busy time
in calculating CPU busy.

This change uses get_cpu_iowait_time_us() to obtain the IO wait time value
and converts time into number of cycles spent waiting on IO at the TSC
rate. At the moment, this trick is only used for Atom.

Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Stephane Gasparini <stephane.gasparini@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8bfebae..efc5813 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -113,6 +113,7 @@ struct cpudata {
 	u64	prev_aperf;
 	u64	prev_mperf;
 	u64	prev_tsc;
+	u64	prev_cummulative_iowait;
 	struct sample sample;
 };
 
@@ -933,22 +934,39 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 {
 	struct sample *sample = &cpu->sample;
+	u64 cummulative_iowait, delta_iowait_us;
+	u64 delta_iowait_mperf;
+	u64 mperf, now;
 	int32_t cpu_load;
 
+	cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now);
+
+	/*
+	 * Convert iowait time into number of IO cycles spent at max_freq.
+	 * IO is considered as busy only for the cpu_load algorithm. For
+	 * performance this is not needed since we always try to reach the
+	 * maximum P-State, so we are already boosting the IOs.
+	 */
+	delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait;
+	delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling *
+		cpu->pstate.max_pstate, MSEC_PER_SEC);
+
+	mperf = cpu->sample.mperf + delta_iowait_mperf;
+	cpu->prev_cummulative_iowait = cummulative_iowait;
+
+
 	/*
 	 * The load can be estimated as the ratio of the mperf counter
 	 * running at a constant frequency during active periods
 	 * (C0) and the time stamp counter running at the same frequency
 	 * also during C-states.
 	 */
-	cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc);
-
+	cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
 	cpu->sample.busy_scaled = cpu_load;
 
 	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
 }
 
-
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
-- 
cgit v0.10.2


From 7de36b0aa51a5a59e28fb2da768fa3ab07de0674 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 9 Dec 2015 08:01:46 +0530
Subject: PM / OPP: Parse 'opp-supported-hw' binding

OPP bindings allow a platform to enable OPPs based on the version of the
hardware they are used for.

Add support to the OPP-core to parse these bindings, by introducing
dev_pm_opp_{set|put}_supported_hw() APIs.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 6aa172b..55cf1a9 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -559,6 +559,9 @@ static void _remove_device_opp(struct device_opp *dev_opp)
 	if (!list_empty(&dev_opp->opp_list))
 		return;
 
+	if (dev_opp->supported_hw)
+		return;
+
 	list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp,
 				    node);
 
@@ -834,6 +837,145 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
 }
 
 /**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. OPP layer will then enable
+ * OPPs, which are available for those versions, based on its 'opp-supported-hw'
+ * property.
+ *
+ * Locking: The internal device_opp and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+				unsigned int count)
+{
+	struct device_opp *dev_opp;
+	int ret = 0;
+
+	/* Hold our list modification lock here */
+	mutex_lock(&dev_opp_list_lock);
+
+	dev_opp = _add_device_opp(dev);
+	if (!dev_opp) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating dev_opp */
+	WARN_ON(!list_empty(&dev_opp->opp_list));
+
+	/* Do we already have a version hierarchy associated with dev_opp? */
+	if (dev_opp->supported_hw) {
+		dev_err(dev, "%s: Already have supported hardware list\n",
+			__func__);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	dev_opp->supported_hw = kmemdup(versions, count * sizeof(*versions),
+					GFP_KERNEL);
+	if (!dev_opp->supported_hw) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	dev_opp->supported_hw_count = count;
+	mutex_unlock(&dev_opp_list_lock);
+	return 0;
+
+err:
+	_remove_device_opp(dev_opp);
+unlock:
+	mutex_unlock(&dev_opp_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @dev: Device for which supported-hw has to be put.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_supported_hw(). Until this is called, the device_opp structure
+ * will not be freed.
+ *
+ * Locking: The internal device_opp and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_supported_hw(struct device *dev)
+{
+	struct device_opp *dev_opp;
+
+	/* Hold our list modification lock here */
+	mutex_lock(&dev_opp_list_lock);
+
+	/* Check for existing list for 'dev' first */
+	dev_opp = _find_device_opp(dev);
+	if (IS_ERR(dev_opp)) {
+		dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp));
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating dev_opp */
+	WARN_ON(!list_empty(&dev_opp->opp_list));
+
+	if (!dev_opp->supported_hw) {
+		dev_err(dev, "%s: Doesn't have supported hardware list\n",
+			__func__);
+		goto unlock;
+	}
+
+	kfree(dev_opp->supported_hw);
+	dev_opp->supported_hw = NULL;
+	dev_opp->supported_hw_count = 0;
+
+	/* Try freeing device_opp if this was the last blocking resource */
+	_remove_device_opp(dev_opp);
+
+unlock:
+	mutex_unlock(&dev_opp_list_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp,
+			      struct device_node *np)
+{
+	unsigned int count = dev_opp->supported_hw_count;
+	u32 version;
+	int ret;
+
+	if (!dev_opp->supported_hw)
+		return true;
+
+	while (count--) {
+		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+						 &version);
+		if (ret) {
+			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+				 __func__, count, ret);
+			return false;
+		}
+
+		/* Both of these are bitwise masks of the versions */
+		if (!(version & dev_opp->supported_hw[count]))
+			return false;
+	}
+
+	return true;
+}
+
+/**
  * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
  * @dev:	device for which we do this operation
  * @np:		device node
@@ -879,6 +1021,12 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 		goto free_opp;
 	}
 
+	/* Check if the OPP supports hardware's hierarchy of versions or not */
+	if (!_opp_is_supported(dev, dev_opp, np)) {
+		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+		goto free_opp;
+	}
+
 	/*
 	 * Rate is defined as an unsigned long in clk API, and so casting
 	 * explicitly to its type. Must be fixed once rate is 64 bit
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index b8880c7..70f4564 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -129,6 +129,8 @@ struct device_list_opp {
  * @clock_latency_ns_max: Max clock latency in nanoseconds.
  * @shared_opp: OPP is shared between multiple devices.
  * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of version numbers to support.
+ * @supported_hw_count: Number of elements in supported_hw array.
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -153,6 +155,9 @@ struct device_opp {
 	bool shared_opp;
 	struct dev_pm_opp *suspend_opp;
 
+	unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
 	char dentry_name[NAME_MAX];
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9a2e503..3a85110 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -55,6 +55,9 @@ int dev_pm_opp_enable(struct device *dev, unsigned long freq);
 int dev_pm_opp_disable(struct device *dev, unsigned long freq);
 
 struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev);
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+				unsigned int count);
+void dev_pm_opp_put_supported_hw(struct device *dev);
 #else
 static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 {
@@ -129,6 +132,16 @@ static inline struct srcu_notifier_head *dev_pm_opp_get_notifier(
 {
 	return ERR_PTR(-EINVAL);
 }
+
+static inline int dev_pm_opp_set_supported_hw(struct device *dev,
+					      const u32 *versions,
+					      unsigned int count)
+{
+	return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
+
 #endif		/* CONFIG_PM_OPP */
 
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-- 
cgit v0.10.2


From 01fb4d3c39d35b725441e8a9a26b3f3ad67793ed Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 9 Dec 2015 08:01:47 +0530
Subject: PM / OPP: Parse 'opp-<prop>-<name>' bindings

OPP bindings (for few properties) allow a platform to choose a
value/range among a set of available options. The options are present as
opp-<prop>-<name>, where the platform needs to supply the <name> string.

The OPP properties which allow such an option are: opp-microvolt and
opp-microamp.

Add support to the OPP-core to parse these bindings, by introducing
dev_pm_opp_{set|put}_prop_name() APIs.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 55cf1a9..5c01fec 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -562,6 +562,9 @@ static void _remove_device_opp(struct device_opp *dev_opp)
 	if (dev_opp->supported_hw)
 		return;
 
+	if (dev_opp->prop_name)
+		return;
+
 	list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp,
 				    node);
 
@@ -794,35 +797,48 @@ unlock:
 }
 
 /* TODO: Support multiple regulators */
-static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+			      struct device_opp *dev_opp)
 {
 	u32 microvolt[3] = {0};
 	u32 val;
 	int count, ret;
+	struct property *prop = NULL;
+	char name[NAME_MAX];
+
+	/* Search for "opp-microvolt-<name>" */
+	if (dev_opp->prop_name) {
+		sprintf(name, "opp-microvolt-%s", dev_opp->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Fall back to "opp-microvolt"; name may be unset here */
+		sprintf(name, "opp-microvolt");
+		prop = of_find_property(opp->np, name, NULL);
 
-	/* Missing property isn't a problem, but an invalid entry is */
-	if (!of_find_property(opp->np, "opp-microvolt", NULL))
-		return 0;
+		/* Missing property isn't a problem, but an invalid entry is */
+		if (!prop)
+			return 0;
+	}
 
-	count = of_property_count_u32_elems(opp->np, "opp-microvolt");
+	count = of_property_count_u32_elems(opp->np, name);
 	if (count < 0) {
-		dev_err(dev, "%s: Invalid opp-microvolt property (%d)\n",
-			__func__, count);
+		dev_err(dev, "%s: Invalid %s property (%d)\n",
+			__func__, name, count);
 		return count;
 	}
 
 	/* There can be one or three elements here */
 	if (count != 1 && count != 3) {
-		dev_err(dev, "%s: Invalid number of elements in opp-microvolt property (%d)\n",
-			__func__, count);
+		dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
+			__func__, name, count);
 		return -EINVAL;
 	}
 
-	ret = of_property_read_u32_array(opp->np, "opp-microvolt", microvolt,
-					 count);
+	ret = of_property_read_u32_array(opp->np, name, microvolt, count);
 	if (ret) {
-		dev_err(dev, "%s: error parsing opp-microvolt: %d\n", __func__,
-			ret);
+		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
 		return -EINVAL;
 	}
 
@@ -830,7 +846,20 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
 	opp->u_volt_min = microvolt[1];
 	opp->u_volt_max = microvolt[2];
 
-	if (!of_property_read_u32(opp->np, "opp-microamp", &val))
+	/* Search for "opp-microamp-<name>" */
+	prop = NULL;
+	if (dev_opp->prop_name) {
+		sprintf(name, "opp-microamp-%s", dev_opp->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Fall back to "opp-microamp"; don't rely on old contents */
+		sprintf(name, "opp-microamp");
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (prop && !of_property_read_u32(opp->np, name, &val))
 		opp->u_amp = val;
 
 	return 0;
@@ -948,6 +977,112 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
 
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extn to be used for certain property names. The properties to
+ * which the extension will apply are opp-microvolt and opp-microamp. OPP core
+ * should postfix the property name with -<name> while looking for them.
+ *
+ * Locking: The internal device_opp and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct device_opp *dev_opp;
+	int ret = 0;
+
+	/* Hold our list modification lock here */
+	mutex_lock(&dev_opp_list_lock);
+
+	dev_opp = _add_device_opp(dev);
+	if (!dev_opp) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating dev_opp */
+	WARN_ON(!list_empty(&dev_opp->opp_list));
+
+	/* Do we already have a prop-name associated with dev_opp? */
+	if (dev_opp->prop_name) {
+		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+			dev_opp->prop_name);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	dev_opp->prop_name = kstrdup(name, GFP_KERNEL);
+	if (!dev_opp->prop_name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	mutex_unlock(&dev_opp_list_lock);
+	return 0;
+
+err:
+	_remove_device_opp(dev_opp);
+unlock:
+	mutex_unlock(&dev_opp_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @dev: Device for which the prop-name has to be released.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_prop_name(). Until this is called, the device_opp structure
+ * will not be freed.
+ *
+ * Locking: The internal device_opp and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_prop_name(struct device *dev)
+{
+	struct device_opp *dev_opp;
+
+	/* Hold our list modification lock here */
+	mutex_lock(&dev_opp_list_lock);
+
+	/* Check for existing list for 'dev' first */
+	dev_opp = _find_device_opp(dev);
+	if (IS_ERR(dev_opp)) {
+		dev_err(dev, "Failed to find dev_opp: %ld\n", PTR_ERR(dev_opp));
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating dev_opp */
+	WARN_ON(!list_empty(&dev_opp->opp_list));
+
+	if (!dev_opp->prop_name) {
+		dev_err(dev, "%s: Doesn't have a prop-name\n", __func__);
+		goto unlock;
+	}
+
+	kfree(dev_opp->prop_name);
+	dev_opp->prop_name = NULL;
+
+	/* Try freeing device_opp if this was the last blocking resource */
+	_remove_device_opp(dev_opp);
+
+unlock:
+	mutex_unlock(&dev_opp_list_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
 static bool _opp_is_supported(struct device *dev, struct device_opp *dev_opp,
 			      struct device_node *np)
 {
@@ -1042,7 +1177,7 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 	if (!of_property_read_u32(np, "clock-latency-ns", &val))
 		new_opp->clock_latency_ns = val;
 
-	ret = opp_parse_supplies(new_opp, dev);
+	ret = opp_parse_supplies(new_opp, dev, dev_opp);
 	if (ret)
 		goto free_opp;
 
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 70f4564..690638e 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -131,6 +131,7 @@ struct device_list_opp {
  * @suspend_opp: Pointer to OPP to be used during device suspend.
  * @supported_hw: Array of version number to support.
  * @supported_hw_count: Number of elements in supported_hw array.
+ * @prop_name: A name to postfix to many DT properties, while parsing them.
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -157,6 +158,7 @@ struct device_opp {
 
 	unsigned int *supported_hw;
 	unsigned int supported_hw_count;
+	const char *prop_name;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 3a85110..95403d2 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -58,6 +58,8 @@ struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev);
 int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 				unsigned int count);
 void dev_pm_opp_put_supported_hw(struct device *dev);
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
+void dev_pm_opp_put_prop_name(struct device *dev);
 #else
 static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 {
@@ -142,6 +144,13 @@ static inline int dev_pm_opp_set_supported_hw(struct device *dev,
 
 static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
 
+static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
+
 #endif		/* CONFIG_PM_OPP */
 
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-- 
cgit v0.10.2


From 89b56047f6f9b15fa3e9df3e34fa391835972ab7 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Thu, 10 Dec 2015 11:48:13 +0800
Subject: cpufreq: mt8173: Move resources allocation into ->probe()

Since the return value of ->init() of cpufreq driver is not propagated
to the device driver model now, move resources allocation into
->probe() to handle -EPROBE_DEFER properly.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 9d0fe37..fd601b9 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -41,16 +41,35 @@
  * the original PLL becomes stable at target frequency.
  */
 struct mtk_cpu_dvfs_info {
+	struct cpumask cpus;
 	struct device *cpu_dev;
 	struct regulator *proc_reg;
 	struct regulator *sram_reg;
 	struct clk *cpu_clk;
 	struct clk *inter_clk;
 	struct thermal_cooling_device *cdev;
+	struct list_head list_head;
 	int intermediate_voltage;
 	bool need_voltage_tracking;
 };
 
+static LIST_HEAD(dvfs_info_list);
+
+static struct mtk_cpu_dvfs_info *mtk_cpu_dvfs_info_lookup(int cpu)
+{
+	struct mtk_cpu_dvfs_info *info;
+	struct list_head *list;
+
+	list_for_each(list, &dvfs_info_list) {
+		info = list_entry(list, struct mtk_cpu_dvfs_info, list_head);
+
+		if (cpumask_test_cpu(cpu, &info->cpus))
+			return info;
+	}
+
+	return NULL;
+}
+
 static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 					int new_vproc)
 {
@@ -402,6 +421,9 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
 	 */
 	info->need_voltage_tracking = !IS_ERR(sram_reg);
 
+	/* CPUs in the same cluster share a clock and power domain. */
+	cpumask_copy(&info->cpus, &cpu_topology[cpu].core_sibling);
+
 	return 0;
 
 out_free_opp_table:
@@ -440,22 +462,18 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *freq_table;
 	int ret;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	ret = mtk_cpu_dvfs_info_init(info, policy->cpu);
-	if (ret) {
-		pr_err("%s failed to initialize dvfs info for cpu%d\n",
-		       __func__, policy->cpu);
-		goto out_free_dvfs_info;
+	info = mtk_cpu_dvfs_info_lookup(policy->cpu);
+	if (!info) {
+		pr_err("dvfs info for cpu%d is not initialized.\n",
+		       policy->cpu);
+		return -EINVAL;
 	}
 
 	ret = dev_pm_opp_init_cpufreq_table(info->cpu_dev, &freq_table);
 	if (ret) {
 		pr_err("failed to init cpufreq table for cpu%d: %d\n",
 		       policy->cpu, ret);
-		goto out_release_dvfs_info;
+		return ret;
 	}
 
 	ret = cpufreq_table_validate_and_show(policy, freq_table);
@@ -464,8 +482,7 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 		goto out_free_cpufreq_table;
 	}
 
-	/* CPUs in the same cluster share a clock and power domain. */
-	cpumask_copy(policy->cpus, &cpu_topology[policy->cpu].core_sibling);
+	cpumask_copy(policy->cpus, &info->cpus);
 	policy->driver_data = info;
 	policy->clk = info->cpu_clk;
 
@@ -473,13 +490,6 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 
 out_free_cpufreq_table:
 	dev_pm_opp_free_cpufreq_table(info->cpu_dev, &freq_table);
-
-out_release_dvfs_info:
-	mtk_cpu_dvfs_info_release(info);
-
-out_free_dvfs_info:
-	kfree(info);
-
 	return ret;
 }
 
@@ -489,8 +499,6 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 
 	cpufreq_cooling_unregister(info->cdev);
 	dev_pm_opp_free_cpufreq_table(info->cpu_dev, &policy->freq_table);
-	mtk_cpu_dvfs_info_release(info);
-	kfree(info);
 
 	return 0;
 }
@@ -510,11 +518,47 @@ static struct cpufreq_driver mt8173_cpufreq_driver = {
 
 static int mt8173_cpufreq_probe(struct platform_device *pdev)
 {
-	int ret;
+	struct mtk_cpu_dvfs_info *info;
+	struct list_head *list, *tmp;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu) {
+		info = mtk_cpu_dvfs_info_lookup(cpu);
+		if (info)
+			continue;
+
+		info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			ret = -ENOMEM;
+			goto release_dvfs_info_list;
+		}
+
+		ret = mtk_cpu_dvfs_info_init(info, cpu);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"failed to initialize dvfs info for cpu%d\n",
+				cpu);
+			goto release_dvfs_info_list;
+		}
+
+		list_add(&info->list_head, &dvfs_info_list);
+	}
 
 	ret = cpufreq_register_driver(&mt8173_cpufreq_driver);
-	if (ret)
-		pr_err("failed to register mtk cpufreq driver\n");
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register mtk cpufreq driver\n");
+		goto release_dvfs_info_list;
+	}
+
+	return 0;
+
+release_dvfs_info_list:
+	list_for_each_safe(list, tmp, &dvfs_info_list) {
+		info = list_entry(list, struct mtk_cpu_dvfs_info, list_head);
+
+		mtk_cpu_dvfs_info_release(info);
+		list_del(list);
+	}
 
 	return ret;
 }
-- 
cgit v0.10.2


From ab0ea257fc58d8742f73f50fba3797dfe001aa3c Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 10 Dec 2015 09:42:16 +0000
Subject: cpufreq: st: Provide runtime initialised driver for ST's platforms

The bootloader is charged with the responsibility to provide platform
specific Dynamic Voltage and Frequency Scaling (DVFS) information via
Device Tree.  This driver takes the supplied configuration and
registers it with the new generic OPP framework, to then be used with
CPUFreq.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 80fbfb3..ff9be36 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -219,6 +219,16 @@ config ARM_SPEAR_CPUFREQ
 	help
 	  This adds the CPUFreq driver support for SPEAr SOCs.
 
+config ARM_STI_CPUFREQ
+	tristate "STi CPUFreq support"
+	depends on SOC_STIH407
+	help
+	  This driver uses the generic OPP framework to match the running
+	  platform with a predefined set of suitable values.  If not provided
+	  we will fall back to safe values contained in Device Tree.  Enable
+	  this config option if you wish to add CPUFreq support for STi based
+	  SoCs.
+
 config ARM_TEGRA20_CPUFREQ
 	bool "Tegra20 CPUFreq support"
 	depends on ARCH_TEGRA
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index c0af1a1..9e63fb1 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_ARM_SA1100_CPUFREQ)	+= sa1100-cpufreq.o
 obj-$(CONFIG_ARM_SA1110_CPUFREQ)	+= sa1110-cpufreq.o
 obj-$(CONFIG_ARM_SCPI_CPUFREQ)		+= scpi-cpufreq.o
 obj-$(CONFIG_ARM_SPEAR_CPUFREQ)		+= spear-cpufreq.o
+obj-$(CONFIG_ARM_STI_CPUFREQ)		+= sti-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA20_CPUFREQ)	+= tegra20-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA124_CPUFREQ)	+= tegra124-cpufreq.o
 obj-$(CONFIG_ARM_VEXPRESS_SPC_CPUFREQ)	+= vexpress-spc-cpufreq.o
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
new file mode 100644
index 0000000..a9c659f
--- /dev/null
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -0,0 +1,294 @@
+/*
+ * Match running platform with pre-defined OPP values for CPUFreq
+ *
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *         Lee Jones <lee.jones@linaro.org>
+ *
+ * Copyright (C) 2015 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License as
+ * published by the Free Software Foundation
+ */
+
+#include <linux/cpu.h>
+#include <linux/io.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pm_opp.h>
+#include <linux/regmap.h>
+
+#define VERSION_ELEMENTS	3
+#define MAX_PCODE_NAME_LEN	7
+
+#define VERSION_SHIFT		28
+#define HW_INFO_INDEX		1
+#define MAJOR_ID_INDEX		1
+#define MINOR_ID_INDEX		2
+
+/*
+ * Only match on "suitable for ALL versions" entries
+ *
+ * This will be used with the BIT() macro.  It sets the
+ * top bit of a 32bit value and is equal to 0x80000000.
+ */
+#define DEFAULT_VERSION		31
+
+enum {
+	PCODE = 0,
+	SUBSTRATE,
+	DVFS_MAX_REGFIELDS,
+};
+
+/**
+ * ST CPUFreq Driver Data
+ *
+ * @cpu			Device of CPU0; its OF node holds the syscon properties
+ * @syscfg_eng		Engineering Syscon register map
+ * @syscfg		Syscon register map
+ */
+static struct sti_cpufreq_ddata {
+	struct device *cpu;
+	struct regmap *syscfg_eng;
+	struct regmap *syscfg;
+} ddata;
+
+static int sti_cpufreq_fetch_major(void) {
+	struct device_node *np = ddata.cpu->of_node;
+	struct device *dev = ddata.cpu;
+	unsigned int major_offset;
+	unsigned int socid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg",
+					 MAJOR_ID_INDEX, &major_offset);
+	if (ret) {
+		dev_err(dev, "No major number offset provided in %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg, major_offset, &socid);
+	if (ret) {
+		dev_err(dev, "Failed to read major number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return ((socid >> VERSION_SHIFT) & 0xf) + 1;
+}
+
+static int sti_cpufreq_fetch_minor(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	unsigned int minor_offset;
+	unsigned int minid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 MINOR_ID_INDEX, &minor_offset);
+	if (ret) {
+		dev_err(dev,
+			"No minor number offset provided %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg_eng, minor_offset, &minid);
+	if (ret) {
+		dev_err(dev,
+			"Failed to read the minor number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return minid & 0xf;
+}
+
+static int sti_cpufreq_fetch_regmap_field(const struct reg_field *reg_fields,
+					  int hw_info_offset, int field)
+{
+	struct regmap_field *regmap_field;
+	struct reg_field reg_field = reg_fields[field];
+	struct device *dev = ddata.cpu;
+	unsigned int value;
+	int ret;
+
+	reg_field.reg = hw_info_offset;
+	regmap_field = devm_regmap_field_alloc(dev,
+					       ddata.syscfg_eng,
+					       reg_field);
+	if (IS_ERR(regmap_field)) {
+		dev_err(dev, "Failed to allocate reg field\n");
+		return PTR_ERR(regmap_field);
+	}
+
+	ret = regmap_field_read(regmap_field, &value);
+	if (ret) {
+		dev_err(dev, "Failed to read %s code\n",
+			field ? "SUBSTRATE" : "PCODE");
+		return ret;
+	}
+
+	return value;
+}
+
+static const struct reg_field sti_stih407_dvfs_regfields[DVFS_MAX_REGFIELDS] = {
+	[PCODE]		= REG_FIELD(0, 16, 19),
+	[SUBSTRATE]	= REG_FIELD(0, 0, 2),
+};
+
+static const struct reg_field *sti_cpufreq_match(void)
+{
+	if (of_machine_is_compatible("st,stih407") ||
+	    of_machine_is_compatible("st,stih410"))
+		return sti_stih407_dvfs_regfields;
+
+	return NULL;
+}
+
+/* Tell the OPP core which "pcode<N>" props and HW versions match this SoC. */
+static int sti_cpufreq_set_opp_info(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	const struct reg_field *reg_fields;
+	unsigned int hw_info_offset;
+	unsigned int version[VERSION_ELEMENTS];
+	int pcode, substrate, major, minor;
+	int ret;
+	char name[MAX_PCODE_NAME_LEN + 1];	/* room for "pcode15" + NUL */
+
+	reg_fields = sti_cpufreq_match();
+	if (!reg_fields) {
+		dev_err(dev, "This SoC doesn't support voltage scaling\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 HW_INFO_INDEX, &hw_info_offset);
+	if (ret) {
+		dev_warn(dev, "Failed to read HW info offset from DT\n");
+		substrate = DEFAULT_VERSION;
+		pcode = 0;
+		goto use_defaults;
+	}
+
+	pcode = sti_cpufreq_fetch_regmap_field(reg_fields, hw_info_offset,
+					       PCODE);
+	if (pcode < 0) {
+		dev_warn(dev, "Failed to obtain process code\n");
+		/* Use default pcode */
+		pcode = 0;
+	}
+
+	substrate = sti_cpufreq_fetch_regmap_field(reg_fields,
+						   hw_info_offset, SUBSTRATE);
+	if (substrate < 0) {	/* errors are negative; codes are >= 0 */
+		dev_warn(dev, "Failed to obtain substrate code\n");
+		/* Use default substrate */
+		substrate = DEFAULT_VERSION;
+	}
+
+use_defaults:
+	major = sti_cpufreq_fetch_major();
+	if (major < 0) {
+		dev_err(dev, "Failed to obtain major version\n");
+		/* Use default major number */
+		major = DEFAULT_VERSION;
+	}
+
+	minor = sti_cpufreq_fetch_minor();
+	if (minor < 0) {
+		dev_err(dev, "Failed to obtain minor version\n");
+		/* Use default minor number */
+		minor = DEFAULT_VERSION;
+	}
+
+	snprintf(name, sizeof(name), "pcode%d", pcode);
+
+	ret = dev_pm_opp_set_prop_name(dev, name);
+	if (ret) {
+		dev_err(dev, "Failed to set prop name\n");
+		return ret;
+	}
+
+	version[0] = BIT(major);
+	version[1] = BIT(minor);
+	version[2] = BIT(substrate);
+
+	ret = dev_pm_opp_set_supported_hw(dev, version, VERSION_ELEMENTS);
+	if (ret) {
+		dev_err(dev, "Failed to set supported hardware\n");
+		dev_pm_opp_put_prop_name(dev);	/* undo set_prop_name above */
+		return ret;
+	}
+
+	dev_dbg(dev, "pcode: %d major: %d minor: %d substrate: %d\n",
+		pcode, major, minor, substrate);
+	dev_dbg(dev, "version[0]: %x version[1]: %x version[2]: %x\n",
+		version[0], version[1], version[2]);
+
+	return 0;
+}
+
+static int sti_cpufreq_fetch_syscon_regsiters(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+
+	ddata.syscfg = syscon_regmap_lookup_by_phandle(np, "st,syscfg");
+	if (IS_ERR(ddata.syscfg)) {
+		dev_err(dev,  "\"st,syscfg\" not supplied\n");
+		return PTR_ERR(ddata.syscfg);
+	}
+
+	ddata.syscfg_eng = syscon_regmap_lookup_by_phandle(np, "st,syscfg-eng");
+	if (IS_ERR(ddata.syscfg_eng)) {
+		dev_err(dev, "\"st,syscfg-eng\" not supplied\n");
+		return PTR_ERR(ddata.syscfg_eng);
+	}
+
+	return 0;
+}
+
+static int sti_cpufreq_init(void)
+{
+	int ret;
+
+	ddata.cpu = get_cpu_device(0);
+	if (!ddata.cpu) {
+		dev_err(ddata.cpu, "Failed to get device for CPU0\n");
+		goto skip_voltage_scaling;
+	}
+
+	if (!of_get_property(ddata.cpu->of_node, "operating-points-v2", NULL)) {
+		dev_err(ddata.cpu, "OPP-v2 not supported\n");
+		goto skip_voltage_scaling;
+	}
+
+	ret = sti_cpufreq_fetch_syscon_regsiters();
+	if (ret)
+		goto skip_voltage_scaling;
+
+	ret = sti_cpufreq_set_opp_info();
+	if (!ret)
+		goto register_cpufreq_dt;
+
+skip_voltage_scaling:
+	dev_err(ddata.cpu, "Not doing voltage scaling\n");
+
+register_cpufreq_dt:
+	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
+
+	return 0;
+}
+module_init(sti_cpufreq_init);
+
+MODULE_DESCRIPTION("STMicroelectronics CPUFreq/OPP driver");
+MODULE_AUTHOR("Ajitpal Singh <ajitpal.singh@st.com>");
+MODULE_AUTHOR("Lee Jones <lee.jones@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
cgit v0.10.2


From b122bcd94743239cc26a5732fef87b28d7f5c22a Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 10 Dec 2015 09:42:17 +0000
Subject: dt: cpufreq: st: Provide bindings for ST's CPUFreq implementation

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt
new file mode 100644
index 0000000..d91a02a
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt
@@ -0,0 +1,91 @@
+Binding for ST's CPUFreq driver
+===============================
+
+ST's CPUFreq driver attempts to read 'process' and 'version' attributes
+from the SoC, then supplies the OPP framework with 'prop' and 'supported
+hardware' information respectively.  The framework is then able to read
+the DT and operate in the usual way.
+
+For more information about the expected DT format [See: ../opp/opp.txt].
+
+Frequency Scaling only
+----------------------
+
+No vendor specific driver required for this.
+
+Located in CPU's node:
+
+- operating-points		: [See: ../power/opp.txt]
+
+Example [safe]
+--------------
+
+cpus {
+	cpu@0 {
+				 /* kHz     uV   */
+		operating-points = <1500000 0
+				    1200000 0
+				    800000  0
+				    500000  0>;
+	};
+};
+
+Dynamic Voltage and Frequency Scaling (DVFS)
+--------------------------------------------
+
+This requires the ST CPUFreq driver to supply 'process' and 'version' info.
+
+Located in CPU's node:
+
+- operating-points-v2		: [See ../power/opp.txt]
+
+Example [unsafe]
+----------------
+
+cpus {
+	cpu@0 {
+		operating-points-v2	= <&cpu0_opp_table>;
+	};
+};
+
+cpu0_opp_table: opp_table {
+	compatible = "operating-points-v2";
+
+	/* ############################################################### */
+	/* # WARNING: Do not attempt to copy/replicate these nodes,      # */
+	/* #          they are only to be supplied by the bootloader !!! # */
+	/* ############################################################### */
+	opp0 {
+		/*			   Major       Minor       Substrate */
+		/*			   2           all         all       */
+		opp-supported-hw	= <0x00000004  0xffffffff  0xffffffff>;
+		opp-hz			= /bits/ 64 <1500000000>;
+		clock-latency-ns	= <10000000>;
+
+		opp-microvolt-pcode0	= <1200000>;
+		opp-microvolt-pcode1	= <1200000>;
+		opp-microvolt-pcode2	= <1200000>;
+		opp-microvolt-pcode3	= <1200000>;
+		opp-microvolt-pcode4	= <1170000>;
+		opp-microvolt-pcode5	= <1140000>;
+		opp-microvolt-pcode6	= <1100000>;
+		opp-microvolt-pcode7	= <1070000>;
+	};
+
+	opp1 {
+		/*			   Major       Minor       Substrate */
+		/*			   all         all         all       */
+		opp-supported-hw	= <0xffffffff  0xffffffff  0xffffffff>;
+		opp-hz			= /bits/ 64 <1200000000>;
+		clock-latency-ns	= <10000000>;
+
+		opp-microvolt-pcode0	= <1110000>;
+		opp-microvolt-pcode1	= <1150000>;
+		opp-microvolt-pcode2	= <1100000>;
+		opp-microvolt-pcode3	= <1080000>;
+		opp-microvolt-pcode4	= <1040000>;
+		opp-microvolt-pcode5	= <1020000>;
+		opp-microvolt-pcode6	= <980000>;
+		opp-microvolt-pcode7	= <930000>;
+	};
+};
-- 
cgit v0.10.2