From af726f21ed8af2cdaa4e93098dc211521218ae65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Nov 2014 18:00:31 -0800 Subject: x86_64, traps: Fix the espfix64 #DF fixup and rewrite it in C There's nothing special enough about the espfix64 double fault fixup to justify writing it in assembly. Move it to C. This also fixes a bug: if the double fault came from an IST stack, the old asm code would return to a partially uninitialized stack frame. Fixes: 3891a04aafd668686239349ea58f3314ea2af86b Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index df088bb..a4dc8de 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -828,6 +828,7 @@ ENTRY(native_iret) jnz native_irq_return_ldt #endif +.global native_irq_return_iret native_irq_return_iret: iretq _ASM_EXTABLE(native_irq_return_iret, bad_iret) @@ -922,37 +923,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) - /* - * If IRET takes a fault on the espfix stack, then we - * end up promoting it to a doublefault. In that case, - * modify the stack to make it look like we just entered - * the #GP handler from user space, similar to bad_iret. - */ -#ifdef CONFIG_X86_ESPFIX64 - ALIGN -__do_double_fault: - XCPT_FRAME 1 RDI+8 - movq RSP(%rdi),%rax /* Trap on the espfix stack? */ - sarq $PGDIR_SHIFT,%rax - cmpl $ESPFIX_PGD_ENTRY,%eax - jne do_double_fault /* No, just deliver the fault */ - cmpl $__KERNEL_CS,CS(%rdi) - jne do_double_fault - movq RIP(%rdi),%rax - cmpq $native_irq_return_iret,%rax - jne do_double_fault /* This shouldn't happen... */ - movq PER_CPU_VAR(kernel_stack),%rax - subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ - movq %rax,RSP(%rdi) - movq $0,(%rax) /* Missing (lost) #GP error code */ - movq $general_protection,RIP(%rdi) - retq - CFI_ENDPROC -END(__do_double_fault) -#else -# define __do_double_fault do_double_fault -#endif - /* * APIC interrupts. */ @@ -1124,7 +1094,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0d0e922..8196627 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -259,6 +259,30 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); -- cgit v0.10.2 From 6f442be2fb22be02cafa606f1769fa1e6f894441 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Nov 2014 18:00:32 -0800 Subject: x86_64, traps: Stop using IST for #SS On a 32-bit kernel, this has no effect, since there are no IST stacks. On a 64-bit kernel, #SS can only happen in user code, on a failed iret to user space, a canonical violation on access via RSP or RBP, or a genuine stack segment violation in 32-bit kernel code. The first two cases don't need IST, and the latter two cases are unlikely fatal bugs, and promoting them to double faults would be fine. This fixes a bug in which the espfix64 code mishandles a stack segment violation. This saves 4k of memory per CPU and a tiny bit of code. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17d..3a52ee0 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6782051..75450b2 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bc8352e..707adc6 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,6 +39,7 @@ asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING asmlinkage void trace_page_fault(void); +#define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds #define trace_invalid_op invalid_op diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1abcb50..ff86f19 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ STACKFAULT_STACK-1 ] = "#SS", [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a4dc8de..49a0c17 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1259,7 +1259,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 +idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN idtentry xen_debug do_debug has_error_code=0 idtentry xen_int3 do_int3 has_error_code=0 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8196627..48035e9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -233,27 +233,11 @@ DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - enum ctx_state prev_state; - - prev_state = exception_enter(); - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); - } - exception_exit(prev_state); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; @@ -802,7 +786,7 @@ void __init trap_init(void) set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); set_intr_gate(X86_TRAP_TS, invalid_TSS); set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_SS, stack_segment); set_intr_gate(X86_TRAP_GP, general_protection); set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); set_intr_gate(X86_TRAP_MF, coprocessor_error); -- cgit v0.10.2 From b645af2d5905c4e32399005b867987919cbfc3ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Nov 2014 18:00:33 -0800 Subject: x86_64, traps: Rework bad_iret It's possible for iretq to userspace to fail. This can happen because of a bad CS, SS, or RIP. Historically, we've handled it by fixing up an exception from iretq to land at bad_iret, which pretends that the failed iret frame was really the hardware part of #GP(0) from userspace. To make this work, there's an extra fixup to fudge the gs base into a usable state. This is suboptimal because it loses the original exception. It's also buggy because there's no guarantee that we were on the kernel stack to begin with. For example, if the failing iret happened on return from an NMI, then we'll end up executing general_protection on the NMI stack. This is bad for several reasons, the most immediate of which is that general_protection, as a non-paranoid idtentry, will try to deliver signals and/or schedule from the wrong stack. This patch throws out bad_iret entirely. As a replacement, it augments the existing swapgs fudge into a full-blown iret fixup, mostly written in C. It's should be clearer and more correct. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 49a0c17..c0226ab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -830,8 +830,13 @@ ENTRY(native_iret) .global native_irq_return_iret native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ iretq - _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: @@ -859,25 +864,6 @@ native_irq_return_ldt: jmp native_irq_return_iret #endif - .section .fixup,"ax" -bad_iret: - /* - * The iret traps when the %cs or %ss being restored is bogus. - * We've lost the original trap vector and error code. - * #GPF is the most likely one to get for an invalid selector. - * So pretend we completed the iret and took the #GPF in user mode. - * - * We are now running with the kernel GS after exception recovery. - * But error_entry expects us to have user GS to match the user %cs, - * so swap back. - */ - pushq $0 - - SWAPGS - jmp general_protection - - .previous - /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE @@ -1369,17 +1355,16 @@ error_sti: /* * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) - je error_swapgs + je error_bad_iret movl %ecx,%eax /* zero extend */ cmpq %rax,RIP+8(%rsp) je bstep_iret @@ -1390,7 +1375,15 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - jmp error_swapgs + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti CFI_ENDPROC END(error_entry) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 48035e9..de801f2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -407,6 +407,35 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } NOKPROBE_SYMBOL(sync_regs); + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage __visible +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} #endif /* -- cgit v0.10.2