From 7467571f4480b273007517b26297c07154c73924 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 24 Feb 2011 17:19:23 +0200 Subject: cpuidle: menu: fixed wrapping timers at 4.294 seconds Cpuidle menu governor is using u32 as a temporary datatype for storing nanosecond values which wrap around at 4.294 seconds. This causes errors in predicted sleep times resulting in higher than should be C state selection and increased power consumption. This also breaks cpuidle state residency statistics. cc: stable@kernel.org # .32.x through .39.x Signed-off-by: Tero Kristo Signed-off-by: Len Brown diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index f508690..c47f3d0 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -237,6 +237,7 @@ static int menu_select(struct cpuidle_device *dev) unsigned int power_usage = -1; int i; int multiplier; + struct timespec t; if (data->needs_update) { menu_update(dev); @@ -251,8 +252,9 @@ static int menu_select(struct cpuidle_device *dev) return 0; /* determine the expected residency time, round up */ + t = ktime_to_timespec(tick_nohz_get_sleep_length()); data->expected_us = - DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000); + t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC; data->bucket = which_bucket(data->expected_us); -- cgit v0.10.2 From 333c5ae9948194428fe6c5ef5c088304fc98263b Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 11 Feb 2011 12:49:04 -0800 Subject: idle governor: Avoid lock acquisition to read pm_qos before entering idle Thanks to the reviews and comments by Rafael, James, Mark and Andi. Here's version 2 of the patch incorporating your comments and also some update to my previous patch comments. I noticed that before entering idle state, the menu idle governor will look up the current pm_qos target value according to the list of qos requests received. This look up currently needs the acquisition of a lock to access the list of qos requests to find the qos target value, slowing down the entrance into idle state due to contention by multiple cpus to access this list. The contention is severe when there are a lot of cpus waking and going into idle. For example, for a simple workload that has 32 pair of processes ping ponging messages to each other, where 64 cpu cores are active in test system, I see the following profile with 37.82% of cpu cycles spent in contention of pm_qos_lock: - 37.82% swapper [kernel.kallsyms] [k] _raw_spin_lock_irqsave - _raw_spin_lock_irqsave - 95.65% pm_qos_request menu_select cpuidle_idle_call - cpu_idle 99.98% start_secondary A better approach will be to cache the updated pm_qos target value so reading it does not require lock acquisition as in the patch below. With this patch the contention for pm_qos_lock is removed and I saw a 2.2X increase in throughput for my message passing workload. cc: stable@kernel.org Signed-off-by: Tim Chen Acked-by: Andi Kleen Acked-by: James Bottomley Acked-by: mark gross Signed-off-by: Len Brown diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h index 77cbddb..a7d87f9 100644 --- a/include/linux/pm_qos_params.h +++ b/include/linux/pm_qos_params.h @@ -16,6 +16,10 @@ #define PM_QOS_NUM_CLASSES 4 #define PM_QOS_DEFAULT_VALUE -1 +#define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) +#define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) +#define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 + struct pm_qos_request_list { struct plist_node list; int pm_qos_class; diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f8..6a8fad8 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -53,11 +53,17 @@ enum pm_qos_type { PM_QOS_MIN /* return the smallest value */ }; +/* + * Note: The lockless read path depends on the CPU accessing + * target_value atomically. Atomic access is only guaranteed on all CPU + * types linux supports for 32 bit quantites + */ struct pm_qos_object { struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; + s32 target_value; /* Do not change to 64 bit */ s32 default_value; enum pm_qos_type type; }; @@ -70,7 +76,8 @@ static struct pm_qos_object cpu_dma_pm_qos = { .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", - .default_value = 2000 * USEC_PER_SEC, + .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, }; @@ -79,7 +86,8 @@ static struct pm_qos_object network_lat_pm_qos = { .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", - .default_value = 2000 * USEC_PER_SEC, + .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN }; @@ -89,7 +97,8 @@ static struct pm_qos_object network_throughput_pm_qos = { .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", - .default_value = 0, + .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, }; @@ -132,6 +141,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) } } +static inline s32 pm_qos_read_value(struct pm_qos_object *o) +{ + return o->target_value; +} + +static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +{ + o->target_value = value; +} + static void update_target(struct pm_qos_object *o, struct plist_node *node, int del, int value) { @@ -156,6 +175,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node, plist_add(node, &o->requests); } curr_value = pm_qos_get_value(o); + pm_qos_set_value(o, curr_value); spin_unlock_irqrestore(&pm_qos_lock, flags); if (prev_value != curr_value) @@ -190,18 +210,11 @@ static int find_pm_qos_object_by_minor(int minor) * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * - * This function returns the current target value in an atomic manner. + * This function returns the current target value. */ int pm_qos_request(int pm_qos_class) { - unsigned long flags; - int value; - - spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[pm_qos_class]); - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return value; + return pm_qos_read_value(pm_qos_array[pm_qos_class]); } EXPORT_SYMBOL_GPL(pm_qos_request); -- cgit v0.10.2 From 02c68a02018669d1817c43c42de800975cbec467 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 16:59:53 -0400 Subject: x86 idle: clarify AMD erratum 400 workaround The workaround for AMD erratum 400 uses the term "c1e" falsely suggesting: 1. Intel C1E is somehow involved 2. All AMD processors with C1E are involved Use the string "amd_c1e" instead of simply "c1e" to clarify that this workaround is specific to AMD's version of C1E. Use the string "e400" to clarify that the workaround is specific to AMD processors with Erratum 400. This patch is text-substitution only, with no functional change. cc: x86@kernel.org Acked-by: Borislav Petkov Signed-off-by: Len Brown diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4ea15ca..52fd57f 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -138,7 +138,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (c1e_detected) + else if (amd_e400_c1e_detected) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 38d8737..f49253d7 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -16,6 +16,6 @@ static inline void enter_idle(void) { } static inline void exit_idle(void) { } #endif /* CONFIG_X86_64 */ -void c1e_remove_cpu(int cpu); +void amd_e400_remove_cpu(int cpu); #endif /* _ASM_X86_IDLE_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 45636ce..b9c03fb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -758,10 +758,10 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); extern void select_idle_routine(const struct cpuinfo_x86 *c); -extern void init_c1e_mask(void); +extern void init_amd_e400_c1e_mask(void); extern unsigned long boot_option_idle_override; -extern bool c1e_detected; +extern bool amd_e400_c1e_detected; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL, IDLE_FORCE_MWAIT}; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834..745a602 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -887,7 +887,7 @@ static void vgetcpu_set_mode(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_c1e_mask(); + init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff45541..2efbfb7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -538,45 +538,45 @@ int mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -bool c1e_detected; -EXPORT_SYMBOL(c1e_detected); +bool amd_e400_c1e_detected; +EXPORT_SYMBOL(amd_e400_c1e_detected); -static cpumask_var_t c1e_mask; +static cpumask_var_t amd_e400_c1e_mask; -void c1e_remove_cpu(int cpu) +void amd_e400_remove_cpu(int cpu) { - if (c1e_mask != NULL) - cpumask_clear_cpu(cpu, c1e_mask); + if (amd_e400_c1e_mask != NULL) + cpumask_clear_cpu(cpu, amd_e400_c1e_mask); } /* - * C1E aware idle routine. We check for C1E active in the interrupt + * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same * way as C3 power states (local apic timer and TSC stop) */ -static void c1e_idle(void) +static void amd_e400_idle(void) { if (need_resched()) return; - if (!c1e_detected) { + if (!amd_e400_c1e_detected) { u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = true; + amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); } } - if (c1e_detected) { + if (amd_e400_c1e_detected) { int cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, c1e_mask)) { - cpumask_set_cpu(cpu, c1e_mask); + if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { + cpumask_set_cpu(cpu, amd_e400_c1e_mask); /* * Force broadcast so ACPI can not interfere. */ @@ -619,17 +619,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using C1E aware idle routine\n"); - pm_idle = c1e_idle; + printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pm_idle = amd_e400_idle; } else pm_idle = default_idle; } -void __init init_c1e_mask(void) +void __init init_amd_e400_c1e_mask(void) { - /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) - zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); + /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ + if (pm_idle == amd_e400_idle) + zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 08776a9..2c33633 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1379,7 +1379,7 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - c1e_remove_cpu(raw_smp_processor_id()); + amd_e400_remove_cpu(raw_smp_processor_id()); mb(); /* Ack it */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d615b7d..431ab11 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -161,7 +161,7 @@ static void lapic_timer_check_state(int state, struct acpi_processor *pr, if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT)) return; - if (c1e_detected) + if (amd_e400_c1e_detected) type = ACPI_STATE_C1; /* -- cgit v0.10.2 From 06ae40ce073daf233607a3c54a489f2c1e44683e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 15:28:09 -0400 Subject: x86 idle: EXPORT_SYMBOL(default_idle, pm_idle) only when APM demands it In the long run, we don't want default_idle() or (pm_idle)() to be exported outside of process.c. Start by not exporting them to modules, unless the APM build demands it. cc: x86@kernel.org cc: Jiri Kosina Signed-off-by: Len Brown diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2efbfb7..84f3cda 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -340,7 +340,9 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. */ void (*pm_idle)(void); +#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE) EXPORT_SYMBOL(pm_idle); +#endif #ifdef CONFIG_X86_32 /* @@ -400,7 +402,7 @@ void default_idle(void) cpu_relax(); } } -#ifdef CONFIG_APM_MODULE +#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE) EXPORT_SYMBOL(default_idle); #endif -- cgit v0.10.2 From 3b70b2e5fcf6315eb833a1bcc2b810bdc75484ff Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 15:08:48 -0400 Subject: x86 idle floppy: deprecate disable_hlt() Plan to remove floppy_disable_hlt in 2012, an ancient workaround with comments that it should be removed. This allows us to remove clutter and a run-time branch from the idle code. WARN_ONCE() on invocation until it is removed. cc: x86@kernel.org cc: stable@kernel.org # .39.x Signed-off-by: Len Brown diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b3f35e5..5540615 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,6 +6,14 @@ be removed from this file. --------------------------- +What: x86 floppy disable_hlt +When: 2012 +Why: ancient workaround of dubious utility clutters the + code used by everybody else. +Who: Len Brown + +--------------------------- + What: PRISM54 When: 2.6.34 diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 77fc76f..20aea9b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1038,6 +1038,7 @@ static void floppy_disable_hlt(void) { unsigned long flags; + WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); spin_lock_irqsave(&floppy_hlt_lock, flags); if (!hlt_disabled) { hlt_disabled = 1; -- cgit v0.10.2 From 99c63221435963e0cee2402686ba99293c2ffa9e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 15:19:23 -0400 Subject: x86 idle APM: deprecate CONFIG_APM_CPU_IDLE We don't want to export the pm_idle function pointer to modules. Currently CONFIG_APM_CPU_IDLE w/ CONFIG_APM_MODULE forces us to. CONFIG_APM_CPU_IDLE is of dubious value, it runs only on 32-bit uniprocessor laptops that are over 10 years old. It calls into the BIOS during idle, and is known to cause a number of machines to fail. Removing CONFIG_APM_CPU_IDLE and will allow us to stop exporting pm_idle. Any systems that were calling into the APM BIOS at run-time will simply use HLT instead. cc: x86@kernel.org cc: Jiri Kosina cc: stable@kernel.org # .39.x Signed-off-by: Len Brown diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5540615..fc505c1 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -14,6 +14,16 @@ Who: Len Brown --------------------------- +What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle +When: 2012 +Why: This optional sub-feature of APM is of dubious reliability, + and ancient APM laptops are likely better served by calling HLT. + Deleting CONFIG_APM_CPU_IDLE allows x86 to stop exporting + the pm_idle function pointer to modules. +Who: Len Brown + +---------------------------- + What: PRISM54 When: 2.6.34 diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c..4c4ac32 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -359,6 +359,7 @@ struct apm_user { * idle percentage above which bios idle calls are done */ #ifdef CONFIG_APM_CPU_IDLE +#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012 #define DEFAULT_IDLE_THRESHOLD 95 #else #define DEFAULT_IDLE_THRESHOLD 100 @@ -902,6 +903,7 @@ static void apm_cpu_idle(void) unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; + WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012"); recalc: if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; -- cgit v0.10.2 From cdaab4a0d330f70c0e5ad8c3f7c65c2e375ea180 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 15:41:17 -0400 Subject: x86 idle: deprecate "no-hlt" cmdline param We'd rather that modern machines not check if HLT works on every entry into idle, for the benefit of machines that had marginal electricals 15-years ago. If those machines are still running the upstream kernel, they can use "idle=poll". The only difference will be that they'll now invoke HLT in machine_hlt(). cc: x86@kernel.org # .39.x Signed-off-by: Len Brown diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index fc505c1..f1b0eb0 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -24,6 +24,17 @@ Who: Len Brown ---------------------------- +What: x86_32 "no-hlt" cmdline param +When: 2012 +Why: remove a branch from idle path, simplify code used by everybody. + This option disabled the use of HLT in idle and machine_halt() + for hardware that was flakey 15-years ago. Today we have + "idle=poll" that removed HLT from idle, and so if such a machine + is still running the upstream kernel, "idle=poll" is likely sufficient. +Who: Len Brown + +---------------------------- + What: PRISM54 When: 2.6.34 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c39576c..525514c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -19,6 +19,7 @@ static int __init no_halt(char *s) { + WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n"); boot_cpu_data.hlt_works_ok = 0; return 1; } -- cgit v0.10.2 From 5d4c47e0195b989f284907358bd5c268a44b91c7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 15:46:09 -0400 Subject: x86 idle: deprecate mwait_idle() and "idle=mwait" cmdline param mwait_idle() is a C1-only idle loop intended to be more efficient than HLT on SMP hardware that supports it. But mwait_idle() has been replaced by the more general mwait_idle_with_hints(), which handles both C1 and deeper C-states. ACPI uses only mwait_idle_with_hints(), and never uses mwait_idle(). Deprecate mwait_idle() and the "idle=mwait" cmdline param to simplify the x86 idle code. After this change, kernels configured with (!CONFIG_ACPI=n && !CONFIG_INTEL_IDLE=n) when run on hardware that support MWAIT will simply use HLT. If MWAIT is desired on those systems, cpuidle and the cpuidle drivers above can be used. cc: x86@kernel.org cc: stable@kernel.org # .39.x Signed-off-by: Len Brown diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f1b0eb0..13f0bb3 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -35,6 +35,13 @@ Who: Len Brown ---------------------------- +What: x86 "idle=mwait" cmdline param +When: 2012 +Why: simplify x86 idle code +Who: Len Brown + +---------------------------- + What: PRISM54 When: 2.6.34 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 84f3cda..8fb1829 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -645,6 +645,7 @@ static int __init idle_setup(char *str) boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { boot_option_idle_override = IDLE_FORCE_MWAIT; + WARN_ONCE(1, "\idle=mwait\" will be removed in 2012\"\n"); } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is -- cgit v0.10.2