From a4714a898e85205e1118ec923cde43d88eb105f6 Mon Sep 17 00:00:00 2001 From: Randy Wright Date: Wed, 4 Jun 2014 08:55:59 -0700 Subject: ACPI: Fix bug when ACPI reset register is implemented in system memory Use acpi_os_map_generic_address to pre-map the reset register if it is memory mapped, thereby preventing the BUG_ON() in line 1319 of mm/vmalloc.c from triggering during panic-triggered reboots. Link: https://bugzilla.kernel.org/show_bug.cgi?id=77131 Signed-off-by: Randy Wright Signed-off-by: David E. Box [rjw: Changelog, simplified code] Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 6776c59..88cddf8 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1798,6 +1798,16 @@ acpi_status __init acpi_os_initialize(void) acpi_os_map_generic_address(&acpi_gbl_FADT.xpm1b_event_block); acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe0_block); acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe1_block); + if (acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) { + /* + * Use acpi_os_map_generic_address to pre-map the reset + * register if it's in system memory. + */ + int rv; + + rv = acpi_os_map_generic_address(&acpi_gbl_FADT.reset_register); + pr_debug(PREFIX "%s: map reset_reg status %d\n", __func__, rv); + } return AE_OK; } @@ -1826,6 +1836,8 @@ acpi_status acpi_os_terminate(void) acpi_os_unmap_generic_address(&acpi_gbl_FADT.xgpe0_block); acpi_os_unmap_generic_address(&acpi_gbl_FADT.xpm1b_event_block); acpi_os_unmap_generic_address(&acpi_gbl_FADT.xpm1a_event_block); + if (acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) + acpi_os_unmap_generic_address(&acpi_gbl_FADT.reset_register); destroy_workqueue(kacpid_wq); destroy_workqueue(kacpi_notify_wq); -- cgit v0.10.2 From 751109aad5834375ca9ed0dcfcd85a00cbf872b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2014 22:47:35 +0200 Subject: ACPI / video: Change the default for video.use_native_backlight to 1 Now that we're hoping to have resolved all of the problems with video.use_native_backlight=1, make that the default at last. Link: http://marc.info/?l=linux-acpi&m=139716088401106&w=2 Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 101fb09..fb9ffe9 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -82,7 +82,7 @@ module_param(allow_duplicates, bool, 0644); * For Windows 8 systems: used to decide if video module * should skip registering backlight interface of its own. */ -static int use_native_backlight_param = -1; +static int use_native_backlight_param = 1; module_param_named(use_native_backlight, use_native_backlight_param, int, 0444); static bool use_native_backlight_dmi = false; -- cgit v0.10.2 From 1c03a2d04d7ab6d27c1fef8614f08187d974bd21 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 2 Jun 2014 22:49:28 +0530 Subject: cpufreq: add support for intermediate (stable) frequencies Douglas Anderson, recently pointed out an interesting problem due to which udelay() was expiring earlier than it should. While transitioning between frequencies few platforms may temporarily switch to a stable frequency, waiting for the main PLL to stabilize. For example: When we transition between very low frequencies on exynos, like between 200MHz and 300MHz, we may temporarily switch to a PLL running at 800MHz. No CPUFREQ notification is sent for that. That means there's a period of time when we're running at 800MHz but loops_per_jiffy is calibrated at between 200MHz and 300MHz. And so udelay behaves badly. To get this fixed in a generic way, introduce another set of callbacks get_intermediate() and target_intermediate(), only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. get_intermediate() should return a stable intermediate frequency platform wants to switch to, and target_intermediate() should set CPU to that frequency, before jumping to the frequency corresponding to 'index'. Core will take care of sending notifications and driver doesn't have to handle them in target_intermediate() or target_index(). NOTE: ->target_index() should restore to policy->restore_freq in case of failures as core would send notifications for that. Tested-by: Stephen Warren Signed-off-by: Viresh Kumar Reviewed-by: Doug Anderson Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt index b045fe5..14f4e63 100644 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ b/Documentation/cpu-freq/cpu-drivers.txt @@ -26,6 +26,7 @@ Contents: 1.4 target/target_index or setpolicy? 1.5 target/target_index 1.6 setpolicy +1.7 get_intermediate and target_intermediate 2. Frequency Table Helpers @@ -79,6 +80,10 @@ cpufreq_driver.attr - A pointer to a NULL-terminated list of "struct freq_attr" which allow to export values to sysfs. +cpufreq_driver.get_intermediate +and target_intermediate Used to switch to stable frequency while + changing CPU frequency. + 1.2 Per-CPU Initialization -------------------------- @@ -151,7 +156,7 @@ Some cpufreq-capable processors switch the frequency between certain limits on their own. These shall use the ->setpolicy call -1.4. target/target_index +1.5. target/target_index ------------- The target_index call has two arguments: struct cpufreq_policy *policy, @@ -160,6 +165,9 @@ and unsigned int index (into the exposed frequency table). The CPUfreq driver must set the new frequency when called here. The actual frequency must be determined by freq_table[index].frequency. +It should always restore to earlier frequency (i.e. policy->restore_freq) in +case of errors, even if we switched to intermediate frequency earlier. + Deprecated: ---------- The target call has three arguments: struct cpufreq_policy *policy, @@ -179,7 +187,7 @@ Here again the frequency table helper might assist you - see section 2 for details. -1.5 setpolicy +1.6 setpolicy --------------- The setpolicy call only takes a struct cpufreq_policy *policy as @@ -190,6 +198,23 @@ setting when policy->policy is CPUFREQ_POLICY_PERFORMANCE, and a powersaving-oriented setting when CPUFREQ_POLICY_POWERSAVE. Also check the reference implementation in drivers/cpufreq/longrun.c +1.7 get_intermediate and target_intermediate +-------------------------------------------- + +Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. + +get_intermediate should return a stable intermediate frequency platform wants to +switch to, and target_intermediate() should set CPU to to that frequency, before +jumping to the frequency corresponding to 'index'. Core will take care of +sending notifications and driver doesn't have to handle them in +target_intermediate() or target_index(). + +Drivers can return '0' from get_intermediate() in case they don't wish to switch +to intermediate frequency for some target frequency. In that case core will +directly call ->target_index(). + +NOTE: ->target_index() should restore to policy->restore_freq in case of +failures as core would send notifications for that. 2. Frequency Table Helpers diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ae11dd5..aed2b0c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1816,20 +1816,55 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); * GOVERNORS * *********************************************************************/ +/* Must set freqs->new to intermediate frequency */ +static int __target_intermediate(struct cpufreq_policy *policy, + struct cpufreq_freqs *freqs, int index) +{ + int ret; + + freqs->new = cpufreq_driver->get_intermediate(policy, index); + + /* We don't need to switch to intermediate freq */ + if (!freqs->new) + return 0; + + pr_debug("%s: cpu: %d, switching to intermediate freq: oldfreq: %u, intermediate freq: %u\n", + __func__, policy->cpu, freqs->old, freqs->new); + + cpufreq_freq_transition_begin(policy, freqs); + ret = cpufreq_driver->target_intermediate(policy, index); + cpufreq_freq_transition_end(policy, freqs, ret); + + if (ret) + pr_err("%s: Failed to change to intermediate frequency: %d\n", + __func__, ret); + + return ret; +} + static int __target_index(struct cpufreq_policy *policy, struct cpufreq_frequency_table *freq_table, int index) { - struct cpufreq_freqs freqs; + struct cpufreq_freqs freqs = {.old = policy->cur, .flags = 0}; + unsigned int intermediate_freq = 0; int retval = -EINVAL; bool notify; notify = !(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION); - if (notify) { - freqs.old = policy->cur; - freqs.new = freq_table[index].frequency; - freqs.flags = 0; + /* Handle switching to intermediate frequency */ + if (cpufreq_driver->get_intermediate) { + retval = __target_intermediate(policy, &freqs, index); + if (retval) + return retval; + + intermediate_freq = freqs.new; + /* Set old freq to intermediate */ + if (intermediate_freq) + freqs.old = freqs.new; + } + freqs.new = freq_table[index].frequency; pr_debug("%s: cpu: %d, oldfreq: %u, new freq: %u\n", __func__, policy->cpu, freqs.old, freqs.new); @@ -1841,9 +1876,23 @@ static int __target_index(struct cpufreq_policy *policy, pr_err("%s: Failed to change cpu frequency: %d\n", __func__, retval); - if (notify) + if (notify) { cpufreq_freq_transition_end(policy, &freqs, retval); + /* + * Failed after setting to intermediate freq? Driver should have + * reverted back to initial frequency and so should we. Check + * here for intermediate_freq instead of get_intermediate, in + * case we have't switched to intermediate freq at all. + */ + if (unlikely(retval && intermediate_freq)) { + freqs.old = intermediate_freq; + freqs.new = policy->restore_freq; + cpufreq_freq_transition_begin(policy, &freqs); + cpufreq_freq_transition_end(policy, &freqs, 0); + } + } + return retval; } @@ -1875,6 +1924,9 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, if (target_freq == policy->cur) return 0; + /* Save last value to restore later on errors */ + policy->restore_freq = policy->cur; + if (cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); else if (cpufreq_driver->target_index) { @@ -2361,7 +2413,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) !(driver_data->setpolicy || driver_data->target_index || driver_data->target) || (driver_data->setpolicy && (driver_data->target_index || - driver_data->target))) + driver_data->target)) || + (!!driver_data->get_intermediate != !!driver_data->target_intermediate)) return -EINVAL; pr_debug("trying to register driver %s\n", driver_data->name); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 3f45889..ec4112d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -75,6 +75,7 @@ struct cpufreq_policy { unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ + unsigned int restore_freq; /* = policy->cur before transition */ unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ @@ -221,11 +222,35 @@ struct cpufreq_driver { /* define one out of two */ int (*setpolicy) (struct cpufreq_policy *policy); + + /* + * On failure, should always restore frequency to policy->restore_freq + * (i.e. old freq). + */ int (*target) (struct cpufreq_policy *policy, /* Deprecated */ unsigned int target_freq, unsigned int relation); int (*target_index) (struct cpufreq_policy *policy, unsigned int index); + /* + * Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION + * unset. + * + * get_intermediate should return a stable intermediate frequency + * platform wants to switch to and target_intermediate() should set CPU + * to to that frequency, before jumping to the frequency corresponding + * to 'index'. Core will take care of sending notifications and driver + * doesn't have to handle them in target_intermediate() or + * target_index(). + * + * Drivers can return '0' from get_intermediate() in case they don't + * wish to switch to intermediate frequency for some target frequency. + * In that case core will directly call ->target_index(). + */ + unsigned int (*get_intermediate)(struct cpufreq_policy *policy, + unsigned int index); + int (*target_intermediate)(struct cpufreq_policy *policy, + unsigned int index); /* should be defined, if possible */ unsigned int (*get) (unsigned int cpu); -- cgit v0.10.2 From 00917ddc7c2fa7d2790241ed3570e8e4a0466923 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 2 Jun 2014 22:49:29 +0530 Subject: cpufreq: Tegra: implement intermediate frequency callbacks Tegra has been switching to intermediate frequency (pll_p_clk) forever. CPUFreq core has better support for handling notifications for these frequencies and so we can adapt Tegra's driver to it. Also do a WARN() if clk_set_parent() fails while moving back to pll_x as we should have atleast restored to earlier frequency on error. Tested-by: Stephen Warren Signed-off-by: Viresh Kumar Reviewed-by: Doug Anderson Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/tegra-cpufreq.c b/drivers/cpufreq/tegra-cpufreq.c index 6e774c6..a5fbc0a 100644 --- a/drivers/cpufreq/tegra-cpufreq.c +++ b/drivers/cpufreq/tegra-cpufreq.c @@ -45,46 +45,51 @@ static struct clk *cpu_clk; static struct clk *pll_x_clk; static struct clk *pll_p_clk; static struct clk *emc_clk; +static bool pll_x_prepared; -static int tegra_cpu_clk_set_rate(unsigned long rate) +static unsigned int tegra_get_intermediate(struct cpufreq_policy *policy, + unsigned int index) +{ + unsigned int ifreq = clk_get_rate(pll_p_clk) / 1000; + + /* + * Don't switch to intermediate freq if: + * - we are already at it, i.e. policy->cur == ifreq + * - index corresponds to ifreq + */ + if ((freq_table[index].frequency == ifreq) || (policy->cur == ifreq)) + return 0; + + return ifreq; +} + +static int tegra_target_intermediate(struct cpufreq_policy *policy, + unsigned int index) { int ret; /* * Take an extra reference to the main pll so it doesn't turn - * off when we move the cpu off of it + * off when we move the cpu off of it as enabling it again while we + * switch to it from tegra_target() would take additional time. Though + * when target-freq is intermediate freq, we don't need to take this + * reference. */ clk_prepare_enable(pll_x_clk); ret = clk_set_parent(cpu_clk, pll_p_clk); - if (ret) { - pr_err("Failed to switch cpu to clock pll_p\n"); - goto out; - } - - if (rate == clk_get_rate(pll_p_clk)) - goto out; - - ret = clk_set_rate(pll_x_clk, rate); - if (ret) { - pr_err("Failed to change pll_x to %lu\n", rate); - goto out; - } - - ret = clk_set_parent(cpu_clk, pll_x_clk); - if (ret) { - pr_err("Failed to switch cpu to clock pll_x\n"); - goto out; - } + if (ret) + clk_disable_unprepare(pll_x_clk); + else + pll_x_prepared = true; -out: - clk_disable_unprepare(pll_x_clk); return ret; } static int tegra_target(struct cpufreq_policy *policy, unsigned int index) { unsigned long rate = freq_table[index].frequency; + unsigned int ifreq = clk_get_rate(pll_p_clk) / 1000; int ret = 0; /* @@ -98,10 +103,30 @@ static int tegra_target(struct cpufreq_policy *policy, unsigned int index) else clk_set_rate(emc_clk, 100000000); /* emc 50Mhz */ - ret = tegra_cpu_clk_set_rate(rate * 1000); + /* + * target freq == pll_p, don't need to take extra reference to pll_x_clk + * as it isn't used anymore. + */ + if (rate == ifreq) + return clk_set_parent(cpu_clk, pll_p_clk); + + ret = clk_set_rate(pll_x_clk, rate * 1000); + /* Restore to earlier frequency on error, i.e. pll_x */ if (ret) - pr_err("cpu-tegra: Failed to set cpu frequency to %lu kHz\n", - rate); + pr_err("Failed to change pll_x to %lu\n", rate); + + ret = clk_set_parent(cpu_clk, pll_x_clk); + /* This shouldn't fail while changing or restoring */ + WARN_ON(ret); + + /* + * Drop count to pll_x clock only if we switched to intermediate freq + * earlier while transitioning to a target frequency. + */ + if (pll_x_prepared) { + clk_disable_unprepare(pll_x_clk); + pll_x_prepared = false; + } return ret; } @@ -137,16 +162,18 @@ static int tegra_cpu_exit(struct cpufreq_policy *policy) } static struct cpufreq_driver tegra_cpufreq_driver = { - .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, - .verify = cpufreq_generic_frequency_table_verify, - .target_index = tegra_target, - .get = cpufreq_generic_get, - .init = tegra_cpu_init, - .exit = tegra_cpu_exit, - .name = "tegra", - .attr = cpufreq_generic_attr, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .verify = cpufreq_generic_frequency_table_verify, + .get_intermediate = tegra_get_intermediate, + .target_intermediate = tegra_target_intermediate, + .target_index = tegra_target, + .get = cpufreq_generic_get, + .init = tegra_cpu_init, + .exit = tegra_cpu_exit, + .name = "tegra", + .attr = cpufreq_generic_attr, #ifdef CONFIG_PM - .suspend = cpufreq_generic_suspend, + .suspend = cpufreq_generic_suspend, #endif }; -- cgit v0.10.2 From 57aa5ea0eea267f90be5e69def4e877ce2c81677 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2014 23:50:10 +0200 Subject: Revert "cpufreq: Enable big.LITTLE cpufreq driver on arm64" This reverts commit 4920ab84979d (cpufreq: Enable big.LITTLE cpufreq driver on arm64) that breaks build on arm64. Fixes: 4920ab84979d (cpufreq: Enable big.LITTLE cpufreq driver on arm64) Reported-by: Catalin Marinas Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 6a7dd3e..6e05a1e 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -5,8 +5,7 @@ # big LITTLE core layer and glue drivers config ARM_BIG_LITTLE_CPUFREQ tristate "Generic ARM big LITTLE CPUfreq driver" - depends on (BIG_LITTLE && ARM_CPU_TOPOLOGY) || (ARM64 && SMP) - depends on HAVE_CLK + depends on ARM && BIG_LITTLE && ARM_CPU_TOPOLOGY && HAVE_CLK select PM_OPP help This enables the Generic CPUfreq driver for ARM big.LITTLE platforms. -- cgit v0.10.2 From 906fe033145aab7d65a428bfda2cf19c75720894 Mon Sep 17 00:00:00 2001 From: Ed Swarthout Date: Thu, 5 Jun 2014 18:56:17 -0500 Subject: cpufreq: ppc-corenet-cpu-freq: do_div use quotient Commit 6712d2931933 (cpufreq: ppc-corenet-cpufreq: Fix __udivdi3 modpost error) used the remainder from do_div instead of the quotient. Fix that and add one to ensure minimum is met. Fixes: 6712d2931933 (cpufreq: ppc-corenet-cpufreq: Fix __udivdi3 modpost error) Signed-off-by: Ed Swarthout Cc: 3.15+ # 3.15+ Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/ppc-corenet-cpufreq.c b/drivers/cpufreq/ppc-corenet-cpufreq.c index 0af618a..3607070 100644 --- a/drivers/cpufreq/ppc-corenet-cpufreq.c +++ b/drivers/cpufreq/ppc-corenet-cpufreq.c @@ -138,7 +138,7 @@ static int corenet_cpufreq_cpu_init(struct cpufreq_policy *policy) struct cpufreq_frequency_table *table; struct cpu_data *data; unsigned int cpu = policy->cpu; - u64 transition_latency_hz; + u64 u64temp; np = of_get_cpu_node(cpu, NULL); if (!np) @@ -206,9 +206,10 @@ static int corenet_cpufreq_cpu_init(struct cpufreq_policy *policy) for_each_cpu(i, per_cpu(cpu_mask, cpu)) per_cpu(cpu_data, i) = data; - transition_latency_hz = 12ULL * NSEC_PER_SEC; - policy->cpuinfo.transition_latency = - do_div(transition_latency_hz, fsl_get_sys_freq()); + /* Minimum transition latency is 12 platform clocks */ + u64temp = 12ULL * NSEC_PER_SEC; + do_div(u64temp, fsl_get_sys_freq()); + policy->cpuinfo.transition_latency = u64temp + 1; of_node_put(np); -- cgit v0.10.2 From bb3632c6101b2fad07e6246721466b984b1e0e9d Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 6 Jun 2014 05:40:17 -0700 Subject: PM / sleep: trace events for suspend/resume Adds trace events that give finer resolution into suspend/resume. These events are graphed in the timelines generated by the analyze_suspend.py script. They represent large areas of time consumed that are typical to suspend and resume. The event is triggered by calling the function "trace_suspend_resume" with three arguments: a string (the name of the event to be displayed in the timeline), an integer (case specific number, such as the power state or cpu number), and a boolean (where true is used to denote the start of the timeline event, and false to denote the end). The suspend_resume trace event reproduces the data that the machine_suspend trace event did, so the latter has been removed. Signed-off-by: Todd Brandt Acked-by: Steven Rostedt Signed-off-by: Rafael J. Wysocki diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c11e379..b3e3cc7 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "sleep.h" @@ -501,6 +502,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) ACPI_FLUSH_CPU_CACHE(); + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, true); switch (acpi_state) { case ACPI_STATE_S1: barrier(); @@ -516,6 +518,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) pr_info(PREFIX "Low-level resume complete\n"); break; } + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, false); /* This violates the spec but is required for bug compatibility. */ acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 343ffad..de3fe4fe 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -545,6 +545,7 @@ static void dpm_resume_noirq(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -587,6 +588,7 @@ static void dpm_resume_noirq(pm_message_t state) dpm_show_time(starttime, state, "noirq"); resume_device_irqs(); cpuidle_resume(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** @@ -664,6 +666,7 @@ static void dpm_resume_early(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -703,6 +706,7 @@ static void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, "early"); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** @@ -834,6 +838,7 @@ void dpm_resume(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -875,6 +880,7 @@ void dpm_resume(pm_message_t state) dpm_show_time(starttime, state, NULL); cpufreq_resume(); + trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** @@ -932,6 +938,7 @@ void dpm_complete(pm_message_t state) { struct list_head list; + trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); @@ -951,6 +958,7 @@ void dpm_complete(pm_message_t state) } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** @@ -1086,6 +1094,7 @@ static int dpm_suspend_noirq(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); cpuidle_pause(); suspend_device_irqs(); mutex_lock(&dpm_list_mtx); @@ -1126,6 +1135,7 @@ static int dpm_suspend_noirq(pm_message_t state) } else { dpm_show_time(starttime, state, "noirq"); } + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } @@ -1222,6 +1232,7 @@ static int dpm_suspend_late(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -1257,6 +1268,7 @@ static int dpm_suspend_late(pm_message_t state) } else { dpm_show_time(starttime, state, "late"); } + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } @@ -1461,6 +1473,7 @@ int dpm_suspend(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); cpufreq_suspend(); @@ -1498,6 +1511,7 @@ int dpm_suspend(pm_message_t state) dpm_save_failed_step(SUSPEND_SUSPEND); } else dpm_show_time(starttime, state, NULL); + trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } @@ -1582,6 +1596,7 @@ int dpm_prepare(pm_message_t state) { int error = 0; + trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -1612,6 +1627,7 @@ int dpm_prepare(pm_message_t state) put_device(dev); } mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index e8d11b6..dbb8350 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -10,6 +10,7 @@ #include #include #include +#include static LIST_HEAD(syscore_ops_list); static DEFINE_MUTEX(syscore_ops_lock); @@ -49,6 +50,7 @@ int syscore_suspend(void) struct syscore_ops *ops; int ret = 0; + trace_suspend_resume(TPS("syscore_suspend"), 0, true); pr_debug("Checking wakeup interrupts\n"); /* Return error code if there are any wakeup interrupts pending. */ @@ -70,6 +72,7 @@ int syscore_suspend(void) "Interrupts enabled after %pF\n", ops->suspend); } + trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: @@ -92,6 +95,7 @@ void syscore_resume(void) { struct syscore_ops *ops; + trace_suspend_resume(TPS("syscore_resume"), 0, true); WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core resume.\n"); @@ -103,6 +107,7 @@ void syscore_resume(void) WARN_ONCE(!irqs_disabled(), "Interrupts enabled after %pF\n", ops->resume); } + trace_suspend_resume(TPS("syscore_resume"), 0, false); } EXPORT_SYMBOL_GPL(syscore_resume); #endif /* CONFIG_PM_SLEEP */ diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9a7e08d..f88c857 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -7,6 +7,9 @@ #include #include #include +#include + +#define TPS(x) tracepoint_string(x) DECLARE_EVENT_CLASS(cpu, @@ -97,23 +100,6 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); -TRACE_EVENT(machine_suspend, - - TP_PROTO(unsigned int state), - - TP_ARGS(state), - - TP_STRUCT__entry( - __field( u32, state ) - ), - - TP_fast_assign( - __entry->state = state; - ), - - TP_printk("state=%lu", (unsigned long)__entry->state) -); - TRACE_EVENT(device_pm_report_time, TP_PROTO(struct device *dev, const char *pm_ops, s64 ops_time, @@ -151,6 +137,28 @@ TRACE_EVENT(device_pm_report_time, __entry->ops_time, __entry->error) ); +TRACE_EVENT(suspend_resume, + + TP_PROTO(const char *action, int val, bool start), + + TP_ARGS(action, val, start), + + TP_STRUCT__entry( + __field(const char *, action) + __field(int, val) + __field(bool, start) + ), + + TP_fast_assign( + __entry->action = action; + __entry->val = val; + __entry->start = start; + ), + + TP_printk("%s[%u] %s", __entry->action, (unsigned int)__entry->val, + (__entry->start)?"begin":"end") +); + DECLARE_EVENT_CLASS(wakeup_source, TP_PROTO(const char *name, unsigned int state), diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710e..76deba0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "smpboot.h" @@ -522,7 +523,9 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1); + trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { @@ -566,7 +569,9 @@ void __ref enable_nonboot_cpus(void) arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { + trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1); + trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { printk(KERN_INFO "CPU%d is up\n", cpu); continue; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index df88d55..49e0a20 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -292,7 +293,9 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869..0ca8d83 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void) struct task_struct *g, *p; struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; @@ -201,6 +203,7 @@ void thaw_processes(void) schedule(); printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 963e6d0..4dd8822 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Finish; + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; @@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); goto Platform_wake; } @@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); events_check_enabled = false; } syscore_resume(); @@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state) if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; - trace_machine_suspend(state); if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state) else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) freeze_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: @@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state) { int error; + trace_suspend_resume(TPS("suspend_enter"), state, true); if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { @@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); + trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); @@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; + trace_suspend_resume(TPS("suspend_enter"), state, false); pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); -- cgit v0.10.2 From 18b46abd0009516c1973a57ccf4d01b9eaa3422a Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sun, 8 Jun 2014 02:11:43 +0530 Subject: cpufreq: governor: Be friendly towards latency-sensitive bursty workloads Cpufreq governors like the ondemand governor calculate the load on the CPU periodically by employing deferrable timers. A deferrable timer won't fire if the CPU is completely idle (and there are no other timers to be run), in order to avoid unnecessary wakeups and thus save CPU power. However, the load calculation logic is agnostic to all this, and this can lead to the problem described below. Time (ms) CPU 1 100 Task-A running 110 Governor's timer fires, finds load as 100% in the last 10ms interval and increases the CPU frequency. 110.5 Task-A running 120 Governor's timer fires, finds load as 100% in the last 10ms interval and increases the CPU frequency. 125 Task-A went to sleep. With nothing else to do, CPU 1 went completely idle. 200 Task-A woke up and started running again. 200.5 Governor's deferred timer (which was originally programmed to fire at time 130) fires now. It calculates load for the time period 120 to 200.5, and finds the load is almost zero. Hence it decreases the CPU frequency to the minimum. 210 Governor's timer fires, finds load as 100% in the last 10ms interval and increases the CPU frequency. So, after the workload woke up and started running, the frequency was suddenly dropped to absolute minimum, and after that, there was an unnecessary delay of 10ms (sampling period) to increase the CPU frequency back to a reasonable value. And this pattern repeats for every wake-up-from-cpu-idle for that workload. This can be quite undesirable for latency- or response-time sensitive bursty workloads. So we need to fix the governor's logic to detect such wake-up-from- cpu-idle scenarios and start the workload at a reasonably high CPU frequency. One extreme solution would be to fake a load of 100% in such scenarios. But that might lead to undesirable side-effects such as frequency spikes (which might also need voltage changes) especially if the previous frequency happened to be very low. We just want to avoid the stupidity of dropping down the frequency to a minimum and then enduring a needless (and long) delay before ramping it up back again. So, let us simply carry forward the previous load - that is, let us just pretend that the 'load' for the current time-window is the same as the load for the previous window. That way, the frequency and voltage will continue to be set to whatever values they were set at previously. This means that bursty workloads will get a chance to influence the CPU frequency at which they wake up from cpu-idle, based on their past execution history. Thus, they might be able to avoid suffering from slow wakeups and long response-times. However, we should take care not to over-do this. For example, such a "copy previous load" logic will benefit cases like this: (where # represents busy and . represents idle) ##########.........#########.........###########...........##########........ but it will be detrimental in cases like the one shown below, because it will retain the high frequency (copied from the previous interval) even in a mostly idle system: ##########.........#.................#.....................#............... (i.e., the workload finished and the remaining tasks are such that their busy periods are smaller than the sampling interval, which causes the timer to always get deferred. So, this will make the copy-previous-load logic copy the initial high load to subsequent idle periods over and over again, thus keeping the frequency high unnecessarily). So, we modify this copy-previous-load logic such that it is used only once upon every wakeup-from-idle. Thus if we have 2 consecutive idle periods, the previous load won't get blindly copied over; cpufreq will freshly evaluate the load in the second idle interval, thus ensuring that the system comes back to its normal state. [ The right way to solve this whole problem is to teach the CPU frequency governors to also track load on a per-task basis, not just a per-CPU basis, and then use both the data sources intelligently to set the appropriate frequency on the CPUs. But that involves redesigning the cpufreq subsystem, so this patch should make the situation bearable until then. ] Experimental results: +-------------------+ I ran a modified version of ebizzy (called 'sleeping-ebizzy') that sleeps in between its execution such that its total utilization can be a user-defined value, say 10% or 20% (higher the utilization specified, lesser the amount of sleeps injected). This ebizzy was run with a single-thread, tied to CPU 8. Behavior observed with tracing (sample taken from 40% utilization runs): ------------------------------------------------------------------------ Without patch: ~~~~~~~~~~~~~~ kworker/8:2-12137 416.335742: cpu_frequency: state=2061000 cpu_id=8 kworker/8:2-12137 416.335744: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40753 416.345741: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-12137 416.345744: cpu_frequency: state=4123000 cpu_id=8 kworker/8:2-12137 416.345746: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40753 416.355738: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 --------------------------------------------------------------------- <...>-40753 416.402202: sched_switch: prev_comm=ebizzy ==> next_comm=swapper/8 -0 416.502130: sched_switch: prev_comm=swapper/8 ==> next_comm=ebizzy <...>-40753 416.505738: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-12137 416.505739: cpu_frequency: state=2061000 cpu_id=8 kworker/8:2-12137 416.505741: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40753 416.515739: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-12137 416.515742: cpu_frequency: state=4123000 cpu_id=8 kworker/8:2-12137 416.515744: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy Observation: Ebizzy went idle at 416.402202, and started running again at 416.502130. But cpufreq noticed the long idle period, and dropped the frequency at 416.505739, only to increase it back again at 416.515742, realizing that the workload is in-fact CPU bound. Thus ebizzy needlessly ran at the lowest frequency for almost 13 milliseconds (almost 1 full sample period), and this pattern repeats on every sleep-wakeup. This could hurt latency-sensitive workloads quite a lot. With patch: ~~~~~~~~~~~ kworker/8:2-29802 464.832535: cpu_frequency: state=2061000 cpu_id=8 --------------------------------------------------------------------- kworker/8:2-29802 464.962538: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40738 464.972533: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-29802 464.972536: cpu_frequency: state=4123000 cpu_id=8 kworker/8:2-29802 464.972538: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40738 464.982531: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 --------------------------------------------------------------------- kworker/8:2-29802 465.022533: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40738 465.032531: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-29802 465.032532: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40738 465.035797: sched_switch: prev_comm=ebizzy ==> next_comm=swapper/8 -0 465.240178: sched_switch: prev_comm=swapper/8 ==> next_comm=ebizzy <...>-40738 465.242533: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 kworker/8:2-29802 465.242535: sched_switch: prev_comm=kworker/8:2 ==> next_comm=ebizzy <...>-40738 465.252531: sched_switch: prev_comm=ebizzy ==> next_comm=kworker/8:2 Observation: Ebizzy went idle at 465.035797, and started running again at 465.240178. Since ebizzy was the only real workload running on this CPU, cpufreq retained the frequency at 4.1Ghz throughout the run of ebizzy, no matter how many times ebizzy slept and woke-up in-between. Thus, ebizzy got the 10ms worth of 4.1 Ghz benefit during every sleep-wakeup (as compared to the run without the patch) and this boost gave a modest improvement in total throughput, as shown below. Sleeping-ebizzy records-per-second: ----------------------------------- Utilization Without patch With patch Difference (Absolute and % values) 10% 274767 277046 + 2279 (+0.829%) 20% 543429 553484 + 10055 (+1.850%) 40% 1090744 1107959 + 17215 (+1.578%) 60% 1634908 1662018 + 27110 (+1.658%) A rudimentary and somewhat approximately latency-sensitive workload such as sleeping-ebizzy itself showed a consistent, noticeable performance improvement with this patch. Hence, workloads that are truly latency-sensitive will benefit quite a bit from this change. Moreover, this is an overall win-win since this patch does not hurt power-savings at all (because, this patch does not reduce the idle time or idle residency; and the high frequency of the CPU when it goes to cpu-idle does not affect/hurt the power-savings of deep idle states). Signed-off-by: Srivatsa S. Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index e1c6433..9004450 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -36,14 +36,29 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; struct cpufreq_policy *policy; + unsigned int sampling_rate; unsigned int max_load = 0; unsigned int ignore_nice; unsigned int j; - if (dbs_data->cdata->governor == GOV_ONDEMAND) + if (dbs_data->cdata->governor == GOV_ONDEMAND) { + struct od_cpu_dbs_info_s *od_dbs_info = + dbs_data->cdata->get_cpu_dbs_info_s(cpu); + + /* + * Sometimes, the ondemand governor uses an additional + * multiplier to give long delays. So apply this multiplier to + * the 'sampling_rate', so as to keep the wake-up-from-idle + * detection logic a bit conservative. + */ + sampling_rate = od_tuners->sampling_rate; + sampling_rate *= od_dbs_info->rate_mult; + ignore_nice = od_tuners->ignore_nice_load; - else + } else { + sampling_rate = cs_tuners->sampling_rate; ignore_nice = cs_tuners->ignore_nice_load; + } policy = cdbs->cur_policy; @@ -96,7 +111,36 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) if (unlikely(!wall_time || wall_time < idle_time)) continue; - load = 100 * (wall_time - idle_time) / wall_time; + /* + * If the CPU had gone completely idle, and a task just woke up + * on this CPU now, it would be unfair to calculate 'load' the + * usual way for this elapsed time-window, because it will show + * near-zero load, irrespective of how CPU intensive that task + * actually is. This is undesirable for latency-sensitive bursty + * workloads. + * + * To avoid this, we reuse the 'load' from the previous + * time-window and give this task a chance to start with a + * reasonably high CPU frequency. (However, we shouldn't over-do + * this copy, lest we get stuck at a high load (high frequency) + * for too long, even when the current system load has actually + * dropped down. So we perform the copy only once, upon the + * first wake-up from idle.) + * + * Detecting this situation is easy: the governor's deferrable + * timer would not have fired during CPU-idle periods. Hence + * an unusually large 'wall_time' (as compared to the sampling + * rate) indicates this scenario. + */ + if (unlikely(wall_time > (2 * sampling_rate)) && + j_cdbs->copy_prev_load) { + load = j_cdbs->prev_load; + j_cdbs->copy_prev_load = false; + } else { + load = 100 * (wall_time - idle_time) / wall_time; + j_cdbs->prev_load = load; + j_cdbs->copy_prev_load = true; + } if (load > max_load) max_load = load; @@ -318,11 +362,19 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs = dbs_data->cdata->get_cpu_cdbs(j); + unsigned int prev_load; j_cdbs->cpu = j; j_cdbs->cur_policy = policy; j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy); + + prev_load = (unsigned int) + (j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle); + j_cdbs->prev_load = 100 * prev_load / + (unsigned int) j_cdbs->prev_cpu_wall; + j_cdbs->copy_prev_load = true; + if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index bfb9ae1..c2a5b7e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -134,6 +134,12 @@ struct cpu_dbs_common_info { u64 prev_cpu_idle; u64 prev_cpu_wall; u64 prev_cpu_nice; + unsigned int prev_load; + /* + * Flag to ensure that we copy the previous load only once, upon the + * first wake-up from idle. + */ + bool copy_prev_load; struct cpufreq_policy *cur_policy; struct delayed_work work; /* -- cgit v0.10.2 From c8ae481b9a12f5cea080651ea87736104b111f8e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 9 Jun 2014 14:21:24 +0530 Subject: cpufreq: governor: remove copy_prev_load from 'struct cpu_dbs_common_info' 'copy_prev_load' was recently added by commit: 18b46ab (cpufreq: governor: Be friendly towards latency-sensitive bursty workloads). It actually is a bit redundant as we also have 'prev_load' which can store any integer value and can be used instead of 'copy_prev_load' by setting it zero. True load can also turn out to be zero during long idle intervals (and hence the actual value of 'prev_load' and the overloaded value can clash). However this is not a problem because, if the true load was really zero in the previous interval, it makes sense to evaluate the load afresh for the current interval rather than copying the previous load. So, drop 'copy_prev_load' and use 'prev_load' instead. Update comments as well to make it more clear. There is another change here which was probably missed by Srivatsa during the last version of updates he made. The unlikely in the 'if' statement was covering only half of the condition and the whole line should actually come under it. Also checkpatch is made more silent as it was reporting this (--strict option): CHECK: Alignment should match open parenthesis + if (unlikely(wall_time > (2 * sampling_rate) && + j_cdbs->prev_load)) { Signed-off-by: Viresh Kumar Reviewed-by: Srivatsa S. Bhat Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 9004450..1b44496 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -131,15 +131,25 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) * timer would not have fired during CPU-idle periods. Hence * an unusually large 'wall_time' (as compared to the sampling * rate) indicates this scenario. + * + * prev_load can be zero in two cases and we must recalculate it + * for both cases: + * - during long idle intervals + * - explicitly set to zero */ - if (unlikely(wall_time > (2 * sampling_rate)) && - j_cdbs->copy_prev_load) { + if (unlikely(wall_time > (2 * sampling_rate) && + j_cdbs->prev_load)) { load = j_cdbs->prev_load; - j_cdbs->copy_prev_load = false; + + /* + * Perform a destructive copy, to ensure that we copy + * the previous load only once, upon the first wake-up + * from idle. + */ + j_cdbs->prev_load = 0; } else { load = 100 * (wall_time - idle_time) / wall_time; j_cdbs->prev_load = load; - j_cdbs->copy_prev_load = true; } if (load > max_load) @@ -373,7 +383,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, (j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle); j_cdbs->prev_load = 100 * prev_load / (unsigned int) j_cdbs->prev_cpu_wall; - j_cdbs->copy_prev_load = true; if (ignore_nice) j_cdbs->prev_cpu_nice = diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c2a5b7e..cc401d1 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -134,12 +134,13 @@ struct cpu_dbs_common_info { u64 prev_cpu_idle; u64 prev_cpu_wall; u64 prev_cpu_nice; - unsigned int prev_load; /* - * Flag to ensure that we copy the previous load only once, upon the - * first wake-up from idle. + * Used to keep track of load in the previous interval. However, when + * explicitly set to zero, it is used as a flag to ensure that we copy + * the previous load to the current interval only once, upon the first + * wake-up from idle. */ - bool copy_prev_load; + unsigned int prev_load; struct cpufreq_policy *cur_policy; struct delayed_work work; /* -- cgit v0.10.2 From 9b758d4e293deb282dd4870503e4d2778cf69df0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 9 Jun 2014 18:01:42 +0530 Subject: PM / Documentation: Update copyright in suspend-and-cpuhotplug.txt Extend the year to 2014 in the copyright. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt index e13dafc..2850df3 100644 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ b/Documentation/power/suspend-and-cpuhotplug.txt @@ -1,6 +1,6 @@ Interaction of Suspend code (S3) with the CPU hotplug infrastructure - (C) 2011 Srivatsa S. Bhat + (C) 2011 - 2014 Srivatsa S. Bhat I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM -- cgit v0.10.2 From 93575b757831fd6ae26eabc3e2f4b3cb2f42b9e5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 9 Jun 2014 19:06:17 +0530 Subject: cpufreq: Mark CPU0 driver with CPUFREQ_NEED_INITIAL_FREQ_CHECK flag Sometimes boot loaders set CPU frequency to a value outside of frequency table present with cpufreq core. In such cases CPU might be unstable if it has to run on that frequency for long duration of time and so its better to set it to a frequency which is specified in frequency table. Sachin recently found this problem with cpufreq-cpu0 driver when he was testing it for Exynos. Set this flag for cpufreq-cpu0 driver. Reported-and-tested-by: Sachin Kamat Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c index 09b9129..ee1ae30 100644 --- a/drivers/cpufreq/cpufreq-cpu0.c +++ b/drivers/cpufreq/cpufreq-cpu0.c @@ -104,7 +104,7 @@ static int cpu0_cpufreq_init(struct cpufreq_policy *policy) } static struct cpufreq_driver cpu0_cpufreq_driver = { - .flags = CPUFREQ_STICKY, + .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = cpu0_set_target, .get = cpufreq_generic_get, -- cgit v0.10.2 From 830bcac4e483d347087ed51eed564d1467e1e363 Mon Sep 17 00:00:00 2001 From: Stratos Karafotis Date: Tue, 10 Jun 2014 00:00:09 +0300 Subject: cpufreq: intel_pstate: Remove duplicate CPU ID check We check the CPU ID during driver init. There is no need to do it again per logical CPU initialization. So, remove the duplicate check. Signed-off-by: Stratos Karafotis Acked-by: Viresh Kumar Acked-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index aebd457..4e7f492 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -691,14 +691,8 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); static int intel_pstate_init_cpu(unsigned int cpunum) { - - const struct x86_cpu_id *id; struct cpudata *cpu; - id = x86_match_cpu(intel_pstate_cpu_ids); - if (!id) - return -ENODEV; - all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL); if (!all_cpu_data[cpunum]) return -ENOMEM; -- cgit v0.10.2 From 40cc5490132075d7be85a4483caa0a7264f45ed4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Jun 2014 10:27:12 +0530 Subject: cpufreq: tegra: update comment for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tegra's driver got updated a bit (00917dd cpufreq: Tegra: implement intermediate frequency callbacks) and implements new 'intermediate freq' infrastructure of core. Above commit updated comments about when to call clk_prepare_enable(pll_x_clk) and Doug wasn't satisfied with those comments and said this: > The "Though when target-freq is intermediate freq, we don't need to > take this reference." makes me think that this function is actually > called when target-freq is intermediate freq.  I don't think it is, > right? For better clarity just make that comment more explicit about when we call tegra_target_intermediate(). Reviewed-by: Stephen Warren Reported-and-reviewed-by: Doug Anderson Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/tegra-cpufreq.c b/drivers/cpufreq/tegra-cpufreq.c index a5fbc0a..8084c7f 100644 --- a/drivers/cpufreq/tegra-cpufreq.c +++ b/drivers/cpufreq/tegra-cpufreq.c @@ -71,9 +71,12 @@ static int tegra_target_intermediate(struct cpufreq_policy *policy, /* * Take an extra reference to the main pll so it doesn't turn * off when we move the cpu off of it as enabling it again while we - * switch to it from tegra_target() would take additional time. Though - * when target-freq is intermediate freq, we don't need to take this - * reference. + * switch to it from tegra_target() would take additional time. + * + * When target-freq is equal to intermediate freq we don't need to + * switch to an intermediate freq and so this routine isn't called. + * Also, we wouldn't be using pll_x anymore and must not take extra + * reference to it, as it can be disabled now to save some power. */ clk_prepare_enable(pll_x_clk); -- cgit v0.10.2 From 5fbfbcd3e842ddfe9dbbe8865feba909963a87ec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Jun 2014 10:39:46 +0530 Subject: cpufreq: cpufreq-cpu0: remove dependency on THERMAL and REGULATOR cpufreq-cpu0 uses thermal framework to register a cooling device, but doesn't depend on it as there are dummy calls provided by thermal layer when CONFIG_THERMAL=n. And when these calls fail, the driver is still usable. Similar explanation is valid for regulators as well. We do have dummy calls available for regulator APIs and the driver can work even when those calls fail. So, we don't really need to mention thermal and regulators as a dependency for cpufreq-cpu0 in Kconfig as platforms without support for thermal/regulator can also use this driver. Remove this dependency. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 1fbe11f..e473d65 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -185,7 +185,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config GENERIC_CPUFREQ_CPU0 tristate "Generic CPU0 cpufreq driver" - depends on HAVE_CLK && REGULATOR && OF && THERMAL && CPU_THERMAL + depends on HAVE_CLK && OF select PM_OPP help This adds a generic cpufreq driver for CPU0 frequency management. -- cgit v0.10.2 From e8bca479c3f269ebb3a3acea5ef63314bb677060 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Tue, 10 Jun 2014 07:31:22 -0700 Subject: PM / sleep: trace events for device PM callbacks Adds two trace events which supply the same info that initcall_debug provides, but via ftrace instead of dmesg. The existing initcall_debug calls require the pm_print_times_enabled var to be set (either via sysfs or via the kernel cmd line). The new trace events provide all the same info as the initcall_debug prints but with less overhead, and also with coverage of device prepare and complete device callbacks. These events replace the device_pm_report_time event (which has been removed). device_pm_callback_start is called first and provides the device and callback info. device_pm_callback_end is called after with the device name and error info. The time and pid are gathered from the trace data headers. Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index de3fe4fe..bf41296 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -214,9 +214,6 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, pr_info("call %s+ returned %d after %Ld usecs\n", dev_name(dev), error, (unsigned long long)nsecs >> 10); } - - trace_device_pm_report_time(dev, info, nsecs, pm_verb(state.event), - error); } /** @@ -387,7 +384,9 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, calltime = initcall_debug_start(dev); pm_dev_dbg(dev, state, info); + trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); + trace_device_pm_callback_end(dev, error); suspend_report_result(cb, error); initcall_debug_report(dev, calltime, error, state, info); @@ -919,7 +918,9 @@ static void device_complete(struct device *dev, pm_message_t state) if (callback) { pm_dev_dbg(dev, state, info); + trace_device_pm_callback_start(dev, info, state.event); callback(dev); + trace_device_pm_callback_end(dev, 0); } device_unlock(dev); @@ -1307,7 +1308,9 @@ static int legacy_suspend(struct device *dev, pm_message_t state, calltime = initcall_debug_start(dev); + trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); + trace_device_pm_callback_end(dev, error); suspend_report_result(cb, error); initcall_debug_report(dev, calltime, error, state, info); @@ -1563,8 +1566,11 @@ static int device_prepare(struct device *dev, pm_message_t state) callback = dev->driver->pm->prepare; } - if (callback) + if (callback) { + trace_device_pm_callback_start(dev, info, state.event); ret = callback(dev); + trace_device_pm_callback_end(dev, ret); + } device_unlock(dev); diff --git a/include/trace/events/power.h b/include/trace/events/power.h index f88c857..d19840b 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -93,6 +93,17 @@ TRACE_EVENT(pstate_sample, #define PWR_EVENT_EXIT -1 #endif +#define pm_verb_symbolic(event) \ + __print_symbolic(event, \ + { PM_EVENT_SUSPEND, "suspend" }, \ + { PM_EVENT_RESUME, "resume" }, \ + { PM_EVENT_FREEZE, "freeze" }, \ + { PM_EVENT_QUIESCE, "quiesce" }, \ + { PM_EVENT_HIBERNATE, "hibernate" }, \ + { PM_EVENT_THAW, "thaw" }, \ + { PM_EVENT_RESTORE, "restore" }, \ + { PM_EVENT_RECOVER, "recover" }) + DEFINE_EVENT(cpu, cpu_frequency, TP_PROTO(unsigned int frequency, unsigned int cpu_id), @@ -100,41 +111,54 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); -TRACE_EVENT(device_pm_report_time, +TRACE_EVENT(device_pm_callback_start, - TP_PROTO(struct device *dev, const char *pm_ops, s64 ops_time, - char *pm_event_str, int error), + TP_PROTO(struct device *dev, const char *pm_ops, int event), - TP_ARGS(dev, pm_ops, ops_time, pm_event_str, error), + TP_ARGS(dev, pm_ops, event), TP_STRUCT__entry( __string(device, dev_name(dev)) __string(driver, dev_driver_string(dev)) __string(parent, dev->parent ? dev_name(dev->parent) : "none") __string(pm_ops, pm_ops ? pm_ops : "none ") - __string(pm_event_str, pm_event_str) - __field(s64, ops_time) - __field(int, error) + __field(int, event) ), TP_fast_assign( - const char *tmp = dev->parent ? dev_name(dev->parent) : "none"; - const char *tmp_i = pm_ops ? pm_ops : "none "; + __assign_str(device, dev_name(dev)); + __assign_str(driver, dev_driver_string(dev)); + __assign_str(parent, + dev->parent ? dev_name(dev->parent) : "none"); + __assign_str(pm_ops, pm_ops ? pm_ops : "none "); + __entry->event = event; + ), + + TP_printk("%s %s, parent: %s, %s[%s]", __get_str(driver), + __get_str(device), __get_str(parent), __get_str(pm_ops), + pm_verb_symbolic(__entry->event)) +); + +TRACE_EVENT(device_pm_callback_end, + + TP_PROTO(struct device *dev, int error), + TP_ARGS(dev, error), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(driver, dev_driver_string(dev)) + __field(int, error) + ), + + TP_fast_assign( __assign_str(device, dev_name(dev)); __assign_str(driver, dev_driver_string(dev)); - __assign_str(parent, tmp); - __assign_str(pm_ops, tmp_i); - __assign_str(pm_event_str, pm_event_str); - __entry->ops_time = ops_time; __entry->error = error; ), - /* ops_str has an extra space at the end */ - TP_printk("%s %s parent=%s state=%s ops=%snsecs=%lld err=%d", - __get_str(driver), __get_str(device), __get_str(parent), - __get_str(pm_event_str), __get_str(pm_ops), - __entry->ops_time, __entry->error) + TP_printk("%s %s, err=%d", + __get_str(driver), __get_str(device), __entry->error) ); TRACE_EVENT(suspend_resume, -- cgit v0.10.2 From 882d18a702c66404fcb62b84748f719f9b47441c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Jun 2014 22:46:35 +0200 Subject: ACPI / hotplug / PCI: Add hotplug contexts to PCI host bridges After relatively recent changes in the ACPI-based PCI hotplug (ACPIPHP) code, the acpiphp_check_host_bridge() executed for PCI host bridges via acpi_pci_root_scan_dependent() doesn't do anything useful, because those bridges do not have hotplug contexts. That happens by mistake, so fix it by making acpiphp_enumerate_slots() add hotplug contexts to PCI host bridges too and modify acpiphp_remove_slots() to drop those contexts for host bridges as appropriate. Link: https://bugzilla.kernel.org/show_bug.cgi?id=76901 Fixes: 2d8b1d566a5f (ACPI / hotplug / PCI: Get rid of check_sub_bridges()) Reported-and-tested-by: Gavin Guo Acked-by: Bjorn Helgaas Reviewed-by: Mika Westerberg Cc: 3.15+ # 3.15+ Signed-off-by: Rafael J. Wysocki diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index 2b85924..b0e61bf 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -142,6 +142,16 @@ static inline acpi_handle func_to_handle(struct acpiphp_func *func) return func_to_acpi_device(func)->handle; } +struct acpiphp_root_context { + struct acpi_hotplug_context hp; + struct acpiphp_bridge *root_bridge; +}; + +static inline struct acpiphp_root_context *to_acpiphp_root_context(struct acpi_hotplug_context *hp) +{ + return container_of(hp, struct acpiphp_root_context, hp); +} + /* * struct acpiphp_attention_info - device specific attention registration * diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 75e1783..91aa3d7 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -373,17 +373,13 @@ static acpi_status acpiphp_add_context(acpi_handle handle, u32 lvl, void *data, static struct acpiphp_bridge *acpiphp_dev_to_bridge(struct acpi_device *adev) { - struct acpiphp_context *context; struct acpiphp_bridge *bridge = NULL; acpi_lock_hp_context(); - context = acpiphp_get_context(adev); - if (context) { - bridge = context->bridge; + if (adev->hp) { + bridge = to_acpiphp_root_context(adev->hp)->root_bridge; if (bridge) get_bridge(bridge); - - acpiphp_put_context(context); } acpi_unlock_hp_context(); return bridge; @@ -881,7 +877,17 @@ void acpiphp_enumerate_slots(struct pci_bus *bus) */ get_device(&bus->dev); - if (!pci_is_root_bus(bridge->pci_bus)) { + acpi_lock_hp_context(); + if (pci_is_root_bus(bridge->pci_bus)) { + struct acpiphp_root_context *root_context; + + root_context = kzalloc(sizeof(*root_context), GFP_KERNEL); + if (!root_context) + goto err; + + root_context->root_bridge = bridge; + acpi_set_hp_context(adev, &root_context->hp, NULL, NULL, NULL); + } else { struct acpiphp_context *context; /* @@ -890,21 +896,16 @@ void acpiphp_enumerate_slots(struct pci_bus *bus) * parent is going to be handled by pciehp, in which case this * bridge is not interesting to us either. */ - acpi_lock_hp_context(); context = acpiphp_get_context(adev); - if (!context) { - acpi_unlock_hp_context(); - put_device(&bus->dev); - pci_dev_put(bridge->pci_dev); - kfree(bridge); - return; - } + if (!context) + goto err; + bridge->context = context; context->bridge = bridge; /* Get a reference to the parent bridge. */ get_bridge(context->func.parent); - acpi_unlock_hp_context(); } + acpi_unlock_hp_context(); /* Must be added to the list prior to calling acpiphp_add_context(). */ mutex_lock(&bridge_mutex); @@ -919,6 +920,30 @@ void acpiphp_enumerate_slots(struct pci_bus *bus) cleanup_bridge(bridge); put_bridge(bridge); } + return; + + err: + acpi_unlock_hp_context(); + put_device(&bus->dev); + pci_dev_put(bridge->pci_dev); + kfree(bridge); +} + +void acpiphp_drop_bridge(struct acpiphp_bridge *bridge) +{ + if (pci_is_root_bus(bridge->pci_bus)) { + struct acpiphp_root_context *root_context; + struct acpi_device *adev; + + acpi_lock_hp_context(); + adev = ACPI_COMPANION(bridge->pci_bus->bridge); + root_context = to_acpiphp_root_context(adev->hp); + adev->hp = NULL; + acpi_unlock_hp_context(); + kfree(root_context); + } + cleanup_bridge(bridge); + put_bridge(bridge); } /** @@ -936,8 +961,7 @@ void acpiphp_remove_slots(struct pci_bus *bus) list_for_each_entry(bridge, &bridge_list, list) if (bridge->pci_bus == bus) { mutex_unlock(&bridge_mutex); - cleanup_bridge(bridge); - put_bridge(bridge); + acpiphp_drop_bridge(bridge); return; } -- cgit v0.10.2