From 43b68879de27b1993518687fbc6013da80cdcbfe Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Thu, 26 Feb 2015 18:20:48 +0100 Subject: cpuidle: mvebu: Fix the CPU PM notifier usage As stated in kernel/cpu_pm.c, "Platform is responsible for ensuring that cpu_pm_enter is not called twice on the same CPU before cpu_pm_exit is called.". In the current code in case of failure when calling mvebu_v7_cpu_suspend, the function cpu_pm_exit() is never called whereas cpu_pm_enter() was called just before. This patch moves the cpu_pm_exit() in order to balance the cpu_pm_enter() calls. Cc: stable@vger.kernel.org Reported-by: Fulvio Benini Signed-off-by: Gregory CLEMENT Signed-off-by: Daniel Lezcano diff --git a/drivers/cpuidle/cpuidle-mvebu-v7.c b/drivers/cpuidle/cpuidle-mvebu-v7.c index 38e6861..cefa074 100644 --- a/drivers/cpuidle/cpuidle-mvebu-v7.c +++ b/drivers/cpuidle/cpuidle-mvebu-v7.c @@ -37,11 +37,11 @@ static int mvebu_v7_enter_idle(struct cpuidle_device *dev, deepidle = true; ret = mvebu_v7_cpu_suspend(deepidle); + cpu_pm_exit(); + if (ret) return ret; - cpu_pm_exit(); - return index; } -- cgit v0.10.2 From ce6031c89a35cffd5a5992b08377b77f49a004b9 Mon Sep 17 00:00:00 2001 From: Sebastien Rannou Date: Fri, 13 Feb 2015 15:55:03 +0100 Subject: cpuidle: mvebu: Update cpuidle thresholds for Armada XP SOCs Originally, the thresholds used in the cpuidle driver for Armada SOCs were temporarily chosen, leaving room for improvements. This commit updates the thresholds for the Armada XP SOCs with values that positively impact performances: without patch with patch vendor kernel - iperf localhost (gbit/sec) ~3.7 ~6.4 ~5.4 - ioping tmpfs (iops) ~163k ~206k ~179k - ioping tmpfs (mib/s) ~636 ~805 ~699 The idle power consumption is negatively impacted (proportionally less than the performance gain), and we are still performing better than the vendor kernel here: without patch with patch vendor kernel - power consumption idle (W) ~2.4 ~3.2 ~4.4 - power consumption busy (W) ~8.6 ~8.3 ~8.6 There is still room for improvement regarding the value of these thresholds, they were chosen to mimic the vendor kernel. This patch only impacts Armada XP SOCs and was tested on Online Labs C1 boards. A similar approach can be taken to improve the performances of the Armada 370 and Armada 38x SOCs. Thanks a lot to Thomas Petazzoni, Gregory Clement and Willy Tarreau for the discussions and tips around this topic. Signed-off-by: Sebastien Rannou Signed-off-by: Daniel Lezcano Acked-by: Gregory CLEMENT diff --git a/drivers/cpuidle/cpuidle-mvebu-v7.c b/drivers/cpuidle/cpuidle-mvebu-v7.c index cefa074..980151f 100644 --- a/drivers/cpuidle/cpuidle-mvebu-v7.c +++ b/drivers/cpuidle/cpuidle-mvebu-v7.c @@ -50,17 +50,17 @@ static struct cpuidle_driver armadaxp_idle_driver = { .states[0] = ARM_CPUIDLE_WFI_STATE, .states[1] = { .enter = mvebu_v7_enter_idle, - .exit_latency = 10, + .exit_latency = 100, .power_usage = 50, - .target_residency = 100, + .target_residency = 1000, .name = "MV CPU IDLE", .desc = "CPU power down", }, .states[2] = { .enter = mvebu_v7_enter_idle, - .exit_latency = 100, + .exit_latency = 1000, .power_usage = 5, - .target_residency = 1000, + .target_residency = 10000, .flags = MVEBU_V7_FLAG_DEEP_IDLE, .name = "MV CPU DEEP IDLE", .desc = "CPU and L2 Fabric power down", -- cgit v0.10.2 From d474a4d365aaa5c7aabcf11a74ea43aa23f6f2e9 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 13 Mar 2015 03:48:56 -0700 Subject: powercap / RAPL: handle domains with different energy units The current driver assumes all RAPL domains within a CPU package have the same energy unit. This is no longer true for HSW server CPUs since DRAM domain has is own fixed energy unit which can be different than the package energy unit enumerated by package power MSR. In fact, the default HSW EP package power unit is 61uJ whereas DRAM domain unit is 15.3uJ. The result is that DRAM power consumption is counted 4x more than real power reported by energy counters, similarly for max_energy_range_uj of DRAM domain. This patch adds domain specific energy unit per cpu type, it allows domain energy unit to override package energy unit if non zero. Please see this document for details. "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, Volume 2 of 2. Datasheet, September 2014, Reference Number: 330784-001 " Signed-off-by: Jacob Pan Cc: 3.10+ # 3.10+ Signed-off-by: Rafael J. Wysocki diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 97b5e4e..63d4033 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -73,7 +73,7 @@ #define TIME_WINDOW_MAX_MSEC 40000 #define TIME_WINDOW_MIN_MSEC 250 - +#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ enum unit_type { ARBITRARY_UNIT, /* no translation */ POWER_UNIT, @@ -158,6 +158,7 @@ struct rapl_domain { struct rapl_power_limit rpl[NR_POWER_LIMITS]; u64 attr_map; /* track capabilities */ unsigned int state; + unsigned int domain_energy_unit; int package_id; }; #define power_zone_to_rapl_domain(_zone) \ @@ -190,6 +191,7 @@ struct rapl_defaults { void (*set_floor_freq)(struct rapl_domain *rd, bool mode); u64 (*compute_time_window)(struct rapl_package *rp, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; }; static struct rapl_defaults *rapl_defaults; @@ -227,7 +229,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd, static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); -static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, +static u64 rapl_unit_xlate(struct rapl_domain *rd, int package, + enum unit_type type, u64 value, int to_raw); static void package_power_limit_irq_save(int package_id); @@ -305,7 +308,9 @@ static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) { - *energy = rapl_unit_xlate(0, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); + struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); + + *energy = rapl_unit_xlate(rd, 0, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); return 0; } @@ -639,6 +644,11 @@ static void rapl_init_domains(struct rapl_package *rp) rd->msrs[4] = MSR_DRAM_POWER_INFO; rd->rpl[0].prim_id = PL1_ENABLE; rd->rpl[0].name = pl1_name; + rd->domain_energy_unit = + rapl_defaults->dram_domain_energy_unit; + if (rd->domain_energy_unit) + pr_info("DRAM domain energy unit %dpj\n", + rd->domain_energy_unit); break; } if (mask) { @@ -648,11 +658,13 @@ static void rapl_init_domains(struct rapl_package *rp) } } -static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, +static u64 rapl_unit_xlate(struct rapl_domain *rd, int package, + enum unit_type type, u64 value, int to_raw) { u64 units = 1; struct rapl_package *rp; + u64 scale = 1; rp = find_package_by_id(package); if (!rp) @@ -663,7 +675,12 @@ static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, units = rp->power_unit; break; case ENERGY_UNIT: - units = rp->energy_unit; + scale = ENERGY_UNIT_SCALE; + /* per domain unit takes precedence */ + if (rd && rd->domain_energy_unit) + units = rd->domain_energy_unit; + else + units = rp->energy_unit; break; case TIME_UNIT: return rapl_defaults->compute_time_window(rp, value, to_raw); @@ -673,11 +690,11 @@ static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, }; if (to_raw) - return div64_u64(value, units); + return div64_u64(value, units) * scale; value *= units; - return value; + return div64_u64(value, scale); } /* in the order of enum rapl_primitives */ @@ -773,7 +790,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, final = value & rp->mask; final = final >> rp->shift; if (xlate) - *data = rapl_unit_xlate(rd->package_id, rp->unit, final, 0); + *data = rapl_unit_xlate(rd, rd->package_id, rp->unit, final, 0); else *data = final; @@ -799,7 +816,7 @@ static int rapl_write_data_raw(struct rapl_domain *rd, "failed to read msr 0x%x on cpu %d\n", msr, cpu); return -EIO; } - value = rapl_unit_xlate(rd->package_id, rp->unit, value, 1); + value = rapl_unit_xlate(rd, rd->package_id, rp->unit, value, 1); msr_val &= ~rp->mask; msr_val |= value << rp->shift; if (wrmsrl_safe_on_cpu(cpu, msr, msr_val)) { @@ -818,7 +835,7 @@ static int rapl_write_data_raw(struct rapl_domain *rd, * calculate units differ on different CPUs. * We convert the units to below format based on CPUs. * i.e. - * energy unit: microJoules : Represented in microJoules by default + * energy unit: picoJoules : Represented in picoJoules by default * power unit : microWatts : Represented in milliWatts by default * time unit : microseconds: Represented in seconds by default */ @@ -834,7 +851,7 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu) } value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rp->energy_unit = 1000000 / (1 << value); + rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; rp->power_unit = 1000000 / (1 << value); @@ -842,7 +859,7 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu) value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; rp->time_unit = 1000000 / (1 << value); - pr_debug("Core CPU package %d energy=%duJ, time=%dus, power=%duW\n", + pr_debug("Core CPU package %d energy=%dpJ, time=%dus, power=%duW\n", rp->id, rp->energy_unit, rp->time_unit, rp->power_unit); return 0; @@ -859,7 +876,7 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) return -ENODEV; } value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rp->energy_unit = 1 << value; + rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value; value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; rp->power_unit = (1 << value) * 1000; @@ -867,7 +884,7 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; rp->time_unit = 1000000 / (1 << value); - pr_debug("Atom package %d energy=%duJ, time=%dus, power=%duW\n", + pr_debug("Atom package %d energy=%dpJ, time=%dus, power=%duW\n", rp->id, rp->energy_unit, rp->time_unit, rp->power_unit); return 0; @@ -1017,6 +1034,13 @@ static const struct rapl_defaults rapl_defaults_core = { .compute_time_window = rapl_compute_time_window_core, }; +static const struct rapl_defaults rapl_defaults_hsw_server = { + .check_unit = rapl_check_unit_core, + .set_floor_freq = set_floor_freq_default, + .compute_time_window = rapl_compute_time_window_core, + .dram_domain_energy_unit = 15300, +}; + static const struct rapl_defaults rapl_defaults_atom = { .check_unit = rapl_check_unit_atom, .set_floor_freq = set_floor_freq_atom, @@ -1037,7 +1061,7 @@ static const struct x86_cpu_id rapl_ids[] = { RAPL_CPU(0x3a, rapl_defaults_core),/* Ivy Bridge */ RAPL_CPU(0x3c, rapl_defaults_core),/* Haswell */ RAPL_CPU(0x3d, rapl_defaults_core),/* Broadwell */ - RAPL_CPU(0x3f, rapl_defaults_core),/* Haswell */ + RAPL_CPU(0x3f, rapl_defaults_hsw_server),/* Haswell servers */ RAPL_CPU(0x45, rapl_defaults_core),/* Haswell ULT */ RAPL_CPU(0x4C, rapl_defaults_atom),/* Braswell */ RAPL_CPU(0x4A, rapl_defaults_atom),/* Tangier */ -- cgit v0.10.2 From 886016835a87757839baced91a7ff09e9636b6b1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 17 Mar 2015 16:38:10 +0100 Subject: rtc: at91rm9200: double locking bug in at91_rtc_interrupt() There is a typo here so we deadlock. Fixes: dd1f1f391dd7 ('rtc: at91rm9200: rework wakeup and interrupt handling') Signed-off-by: Dan Carpenter Acked-by: Boris Brezillon Reported-by: David Dueck Signed-off-by: Nicolas Ferre Signed-off-by: Rafael J. Wysocki diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index b4f7744..b283a1a 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -324,7 +324,7 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } - spin_lock(&suspended_lock); + spin_unlock(&suspended_lock); return ret; } -- cgit v0.10.2 From 9e8ce4b96b781b003e3174fbbc62e1d4388c8b8f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 20 Mar 2015 14:56:19 +0100 Subject: Revert "x86/PCI: Refine the way to release PCI IRQ resources" Commit b4b55cda5874 (Refine the way to release PCI IRQ resources) introduced a regression in the PCI IRQ resource management by causing the IRQ resource of a device, established when pci_enabled_device() is called on a fully disabled device, to be released when the driver is unbound from the device, regardless of the enable_cnt. This leads to the situation that an ill-behaved driver can now make a device unusable to subsequent drivers by an imbalance in their use of pci_enable/disable_device(). That is a serious problem for secondary drivers like vfio-pci, which are innocent of the transgressions of the previous driver. Since the solution of this problem is not immediate and requires further discussion, revert commit b4b55cda5874 and the issue it was supposed to address (a bug related to xen-pciback) will be taken care of in a different way going forward. Reported-by: Alex Williamson Signed-off-by: Rafael J. Wysocki diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195d..164e3f8 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3d2612b..2fb3847 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -513,31 +513,6 @@ void __init pcibios_set_cache_line_size(void) } } -/* - * Some device drivers assume dev->irq won't change after calling - * pci_disable_device(). So delay releasing of IRQ resource to driver - * unbinding time. Otherwise it will break PM subsystem and drivers - * like xen-pciback etc. - */ -static int pci_irq_notifier(struct notifier_block *nb, unsigned long action, - void *data) -{ - struct pci_dev *dev = to_pci_dev(data); - - if (action != BUS_NOTIFY_UNBOUND_DRIVER) - return NOTIFY_DONE; - - if (pcibios_disable_irq) - pcibios_disable_irq(dev); - - return NOTIFY_OK; -} - -static struct notifier_block pci_irq_nb = { - .notifier_call = pci_irq_notifier, - .priority = INT_MIN, -}; - int __init pcibios_init(void) { if (!raw_pci_ops) { @@ -550,9 +525,6 @@ int __init pcibios_init(void) if (pci_bf_sort >= pci_force_bf) pci_sort_breadthfirst(); - - bus_register_notifier(&pci_bus_type, &pci_irq_nb); - return 0; } @@ -711,6 +683,12 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } +void pcibios_disable_device (struct pci_dev *dev) +{ + if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + pcibios_disable_irq(dev); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index efb8493..852aa4c 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (dev->irq_managed && dev->irq > 0) { + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; - dev->irq = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index e71b3db..5dc6ca5 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1256,9 +1256,22 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index e7f718d..b1def41 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -485,6 +485,14 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (!pin || !dev->irq_managed || dev->irq <= 0) return; + /* Keep IOAPIC pin configuration when suspending */ + if (dev->dev.power.is_prepared) + return; +#ifdef CONFIG_PM + if (dev->dev.power.runtime_status == RPM_SUSPENDING) + return; +#endif + entry = acpi_pci_irq_lookup(dev, pin); if (!entry) return; @@ -505,6 +513,5 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (gsi >= 0) { acpi_unregister_gsi(gsi); dev->irq_managed = 0; - dev->irq = 0; } } -- cgit v0.10.2