From 79c5fcebfe4021f326a6715009f0b6b622d5df92 Mon Sep 17 00:00:00 2001 From: Jesse Larrew Date: Thu, 7 Jun 2012 16:04:34 -0500 Subject: powerpc/vphn: Fix arch_update_cpu_topology() return value arch_update_cpu_topology() should only return 1 when the topology has actually changed, and should return 0 otherwise. This patch fixes a potential bug where rebuild_sched_domains() would reinitialize the sched domains even when the topology hasn't changed. Signed-off-by: Jesse Larrew Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 39b1597..59213cf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1436,11 +1436,11 @@ static long vphn_get_associativity(unsigned long cpu, /* * Update the node maps and sysfs entries for each cpu whose home node - * has changed. + * has changed. Returns 1 when the topology has changed, and 0 otherwise. */ int arch_update_cpu_topology(void) { - int cpu, nid, old_nid; + int cpu, nid, old_nid, changed = 0; unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0}; struct device *dev; @@ -1466,9 +1466,10 @@ int arch_update_cpu_topology(void) dev = get_cpu_device(cpu); if (dev) kobject_uevent(&dev->kobj, KOBJ_CHANGE); + changed = 1; } - return 1; + return changed; } static void topology_work_fn(struct work_struct *work) -- cgit v0.10.2 From dabe859ec6360a12e71f39bf695d174e19ff2688 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2012 13:56:11 +0000 Subject: powerpc: Give hypervisor decrementer interrupts their own handler At the moment the handler for hypervisor decrementer interrupts is the same as for decrementer interrupts, i.e. timer_interrupt(). This is bogus; if we ever do get a hypervisor decrementer interrupt it won't have anything to do with the next timer event. In fact the only time we get hypervisor decrementer interrupts is when one is left pending on exit from a KVM guest. When we get a hypervisor decrementer interrupt we don't need to do anything special to clear it, since they are edge-triggered on the transition of HDEC from 0 to -1. Thus this adds an empty handler function for them. We don't need to have them masked when interrupts are soft-disabled, so we use STD_EXCEPTION_HV instead of MASKABLE_EXCEPTION_HV. Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e894515..39aa97d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -186,7 +186,7 @@ hardware_interrupt_hv: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800) MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) - MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer) + STD_EXCEPTION_HV(0x980, 0x982, hdecrementer) STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00) @@ -486,6 +486,7 @@ machine_check_common: STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ) STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt) + STD_EXCEPTION_COMMON(0x980, hdecrementer, .hdec_interrupt) STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be171ee..e49e931 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -535,6 +535,15 @@ void timer_interrupt(struct pt_regs * regs) trace_timer_interrupt_exit(regs); } +/* + * Hypervisor decrementer interrupts shouldn't occur but are sometimes + * left pending on exit from a KVM guest. We don't need to do anything + * to clear them, as they are edge-triggered. + */ +void hdec_interrupt(struct pt_regs *regs) +{ +} + #ifdef CONFIG_SUSPEND static void generic_suspend_disable_irqs(void) { -- cgit v0.10.2 From 375f561a4131a0f501c8845a2a20f2ca1abc8f7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2012 18:51:09 +0000 Subject: powerpc/powernv: Always go into nap mode when CPU is offline The CPU hotplug code for the powernv platform currently only puts offline CPUs into nap mode if the powersave_nap variable is set. However, HV-style KVM on this platform requires secondary CPU threads to be offline and in nap mode. Since we know nap mode works just fine on all POWER7 machines, and the only machines that support the powernv platform are POWER7 machines, this changes the code to always put offline CPUs into nap mode, regardless of powersave_nap. Powersave_nap still controls whether or not CPUs go into nap mode when idle, as before. Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 53b6dfa..54b73a2 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -386,6 +386,7 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ +extern void power7_nap(void); #ifdef CONFIG_PSERIES_IDLE extern void update_smt_snooze_delay(int snooze); diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 7140d83..e11863f 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -28,7 +28,9 @@ _GLOBAL(power7_idle) lwz r4,ADDROFF(powersave_nap)(r3) cmpwi 0,r4,0 beqlr + /* fall through */ +_GLOBAL(power7_nap) /* NAP is a state loss, we create a regs frame on the * stack, fill it up with the state we care about and * stick a pointer to it in PACAR1. We really only diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 3ef4625..7698b6e 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -106,14 +106,6 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; - /* If powersave_nap is enabled, use NAP mode, else just - * spin aimlessly - */ - if (!powersave_nap) { - generic_mach_cpu_die(); - return; - } - /* Standard hot unplug procedure */ local_irq_disable(); idle_task_exit(); @@ -128,7 +120,7 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { - power7_idle(); + power7_nap(); if (!generic_check_cpu_restart(cpu)) { DBG("CPU%d Unexpected exit while offline !\n", cpu); /* We may be getting an IPI, so we re-enable -- cgit v0.10.2 From 1b6ca2a6fe56e7697d57348646e07df08f43b1bb Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:47:56 +0000 Subject: powerpc: Update DSCR on all CPUs when writing sysfs dscr_default Writing to dscr_default in sysfs doesn't actually change the DSCR - we rely on a context switch on each CPU to do the work. There is no guarantee we will get a context switch in a reasonable amount of time so fire off an IPI to force an immediate change. This issue was found with the following test case: http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 3529446..d4cbbd1 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -194,6 +194,12 @@ static ssize_t show_dscr_default(struct device *dev, return sprintf(buf, "%lx\n", dscr_default); } +static void update_dscr(void *dummy) +{ + if (!current->thread.dscr_inherit) + mtspr(SPRN_DSCR, dscr_default); +} + static ssize_t __used store_dscr_default(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -206,6 +212,8 @@ static ssize_t __used store_dscr_default(struct device *dev, return -EINVAL; dscr_default = val; + on_each_cpu(update_dscr, NULL, 1); + return count; } -- cgit v0.10.2 From 00ca0de02f80924dfff6b4f630e1dff3db005e35 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:48:46 +0000 Subject: powerpc: Keep thread.dscr and thread.dscr_inherit in sync When we update the DSCR either via emulation of mtspr(DSCR) or via a change to dscr_default in sysfs we don't update thread.dscr. We will eventually update it at context switch time but there is a period where thread.dscr is incorrect. If we fork at this point we will copy the old value of thread.dscr into the child. To avoid this, always keep thread.dscr in sync with reality. This issue was found with the following testcase: http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index d4cbbd1..8302af6 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -196,8 +196,10 @@ static ssize_t show_dscr_default(struct device *dev, static void update_dscr(void *dummy) { - if (!current->thread.dscr_inherit) + if (!current->thread.dscr_inherit) { + current->thread.dscr = dscr_default; mtspr(SPRN_DSCR, dscr_default); + } } static ssize_t __used store_dscr_default(struct device *dev, diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1589723..ae0843f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -972,8 +972,9 @@ static int emulate_instruction(struct pt_regs *regs) cpu_has_feature(CPU_FTR_DSCR)) { PPC_WARN_EMULATED(mtdscr, regs); rd = (instword >> 21) & 0x1f; - mtspr(SPRN_DSCR, regs->gpr[rd]); + current->thread.dscr = regs->gpr[rd]; current->thread.dscr_inherit = 1; + mtspr(SPRN_DSCR, current->thread.dscr); return 0; } #endif -- cgit v0.10.2 From 1021cb268b3025573c4811f1dee4a11260c4507b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:49:47 +0000 Subject: powerpc: Fix DSCR inheritance in copy_thread() If the default DSCR is non zero we set thread.dscr_inherit in copy_thread() meaning the new thread and all its children will ignore future updates to the default DSCR. This is not intended and is a change in behaviour that a number of our users have hit. We just need to inherit thread.dscr and thread.dscr_inherit from the parent which ends up being much simpler. This was found with the following test case: http://ozlabs.org/~anton/junkcode/dscr_default_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 710f400..1a1f2dd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -802,16 +802,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, #endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_DSCR)) { - if (current->thread.dscr_inherit) { - p->thread.dscr_inherit = 1; - p->thread.dscr = current->thread.dscr; - } else if (0 != dscr_default) { - p->thread.dscr_inherit = 1; - p->thread.dscr = dscr_default; - } else { - p->thread.dscr_inherit = 0; - p->thread.dscr = 0; - } + p->thread.dscr_inherit = current->thread.dscr_inherit; + p->thread.dscr = current->thread.dscr; } #endif -- cgit v0.10.2 From 714332858bfd40dcf8f741498336d93875c23aa7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:51:10 +0000 Subject: powerpc: Restore correct DSCR in context switch During a context switch we always restore the per thread DSCR value. If we aren't doing explicit DSCR management (ie thread.dscr_inherit == 0) and the default DSCR changed while the process has been sleeping we end up with the wrong value. Check thread.dscr_inherit and select the default DSCR or per thread DSCR as required. This was found with the following test case, when running with more threads than CPUs (ie forcing context switching): http://ozlabs.org/~anton/junkcode/dscr_default_test.c With the four patches applied I can run a combination of all test cases successfully at the same time: http://ozlabs.org/~anton/junkcode/dscr_default_test.c http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 85b05c4..e899572 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -76,6 +76,7 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); + DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4b01a25..b40e0b4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -370,6 +370,12 @@ _GLOBAL(ret_from_fork) li r3,0 b syscall_exit + .section ".toc","aw" +DSCR_DEFAULT: + .tc dscr_default[TC],dscr_default + + .section ".text" + /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state @@ -509,9 +515,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) - ld r6,_CCR(r1) - mtcrf 0xFF,r6 - #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION ld r0,THREAD_VRSAVE(r4) @@ -520,14 +523,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_PPC64 BEGIN_FTR_SECTION + lwz r6,THREAD_DSCR_INHERIT(r4) + ld r7,DSCR_DEFAULT@toc(2) ld r0,THREAD_DSCR(r4) - cmpd r0,r25 - beq 1f + cmpwi r6,0 + bne 1f + ld r0,0(r7) +1: cmpd r0,r25 + beq 2f mtspr SPRN_DSCR,r0 -1: +2: END_FTR_SECTION_IFSET(CPU_FTR_DSCR) #endif + ld r6,_CCR(r1) + mtcrf 0xFF,r6 + /* r3-r13 are destroyed -- Cort */ REST_8GPRS(14, r1) REST_10GPRS(22, r1) -- cgit v0.10.2 From 9fb1b36ca1234e64a5d1cc573175303395e3354d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 4 Sep 2012 18:33:08 +0000 Subject: powerpc: Make sure IPI handlers see data written by IPI senders We have been observing hangs, both of KVM guest vcpu tasks and more generally, where a process that is woken doesn't properly wake up and continue to run, but instead sticks in TASK_WAKING state. This happens because the update of rq->wake_list in ttwu_queue_remote() is not ordered with the update of ipi_message in smp_muxed_ipi_message_pass(), and the reading of rq->wake_list in scheduler_ipi() is not ordered with the reading of ipi_message in smp_ipi_demux(). Thus it is possible for the IPI receiver not to see the updated rq->wake_list and therefore conclude that there is nothing for it to do. In order to make sure that anything done before smp_send_reschedule() is ordered before anything done in the resulting call to scheduler_ipi(), this adds barriers in smp_muxed_message_pass() and smp_ipi_demux(). The barrier in smp_muxed_message_pass() is a full barrier to ensure that there is a full ordering between the smp_send_reschedule() caller and scheduler_ipi(). In smp_ipi_demux(), we use xchg() rather than xchg_local() because xchg() includes release and acquire barriers. Using xchg() rather than xchg_local() makes sense given that ipi_message is not just accessed locally. This moves the barrier between setting the message and calling the cause_ipi() function into the individual cause_ipi implementations. Most of them -- those that used outb, out_8 or similar -- already had a full barrier because out_8 etc. include a sync before the MMIO store. This adds an explicit barrier in the two remaining cases. These changes made no measurable difference to the speed of IPIs as measured using a simple ping-pong latency test across two CPUs on different cores of a POWER7 machine. The analysis of the reason why processes were not waking up properly is due to Milton Miller. Cc: stable@vger.kernel.org # v3.0+ Reported-by: Milton Miller Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 5b25c80..a892680 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -28,6 +28,8 @@ void doorbell_setup_this_cpu(void) void doorbell_cause_ipi(int cpu, unsigned long data) { + /* Order previous accesses vs. msgsnd, which is treated as a store */ + mb(); ppc_msgsnd(PPC_DBELL, 0, data); } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 0321007..8d4214a 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -198,8 +198,15 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) struct cpu_messages *info = &per_cpu(ipi_message, cpu); char *message = (char *)&info->messages; + /* + * Order previous accesses before accesses in the IPI handler. + */ + smp_mb(); message[msg] = 1; - mb(); + /* + * cause_ipi functions are required to include a full barrier + * before doing whatever causes the IPI. + */ smp_ops->cause_ipi(cpu, info->data); } @@ -211,7 +218,7 @@ irqreturn_t smp_ipi_demux(void) mb(); /* order any irq clear */ do { - all = xchg_local(&info->messages, 0); + all = xchg(&info->messages, 0); #ifdef __BIG_ENDIAN if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION))) diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index 14469cf..df0fc58 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -65,7 +65,11 @@ static inline void icp_hv_set_xirr(unsigned int value) static inline void icp_hv_set_qirr(int n_cpu , u8 value) { int hw_cpu = get_hard_smp_processor_id(n_cpu); - long rc = plpar_hcall_norets(H_IPI, hw_cpu, value); + long rc; + + /* Make sure all previous accesses are ordered before IPI sending */ + mb(); + rc = plpar_hcall_norets(H_IPI, hw_cpu, value); if (rc != H_SUCCESS) { pr_err("%s: bad return code qirr cpu=%d hw_cpu=%d mfrr=0x%x " "returned %ld\n", __func__, n_cpu, hw_cpu, value, rc); -- cgit v0.10.2 From 636802ef96eebe279b22ad9f9dacfe29291e45c7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 4 Sep 2012 15:08:28 +0000 Subject: powerpc: Don't use __put_user() in patch_instruction patch_instruction() can be called very early on ppc32, when the kernel isn't yet running at it's linked address. That can cause the ! is_kernel_addr() test in __put_user() to trip and call might_sleep() which is very bad at that point during boot. Use a lower level function instead for now, at least until we get to rework ppc32 boot process to do the code patching later, like ppc64 does. Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index dd223b3..17e5b23 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -20,7 +20,7 @@ int patch_instruction(unsigned int *addr, unsigned int instr) { int err; - err = __put_user(instr, addr); + __put_user_size(instr, addr, 4, err); if (err) return err; asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); -- cgit v0.10.2