From 96ed4cd0aa86ac1a753baeef26c911a9449493e9 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 9 Dec 2014 14:54:44 -0800 Subject: x86/doc: Update documentation after file shuffling While at it, also refer to the 32 bit entry file. Signed-off-by: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Borislav Petkov Cc: linux-doc@vger.kernel.org Cc: bpoirier@suse.de Link: http://lkml.kernel.org/r/1418165684-6226-1-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt index bc7226e..4a1c5c2 100644 --- a/Documentation/x86/entry_64.txt +++ b/Documentation/x86/entry_64.txt @@ -7,9 +7,12 @@ http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu> The x86 architecture has quite a few different ways to jump into kernel code. Most of these entry points are registered in arch/x86/kernel/traps.c and implemented in arch/x86/kernel/entry_64.S -and arch/x86/ia32/ia32entry.S. +for 64-bit, arch/x86/kernel/entry_32.S for 32-bit and finally +arch/x86/ia32/ia32entry.S which implements the 32-bit compatibility +syscall entry points and thus provides for 32-bit processes the +ability to execute syscalls when running on 64-bit kernels. -The IDT vector assignments are listed in arch/x86/include/irq_vectors.h. +The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h. Some of these entries are: -- cgit v0.10.2 From c072b90c8dfe135072f646cc50b826e30c5aa558 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 10 Dec 2014 10:09:01 +0800 Subject: x86/mm: Fix zone ranges boot printout This is the usual physical memory layout boot printout: ... [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x00001000-0x00ffffff] [ 0.000000] DMA32 [mem 0x01000000-0xffffffff] [ 0.000000] Normal [mem 0x100000000-0xc3fffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x00001000-0x00099fff] [ 0.000000] node 0: [mem 0x00100000-0xbf78ffff] [ 0.000000] node 0: [mem 0x100000000-0x63fffffff] [ 0.000000] node 1: [mem 0x640000000-0xc3fffffff] ... This is the log when we set "mem=2G" on the boot cmdline: ... [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x00001000-0x00ffffff] [ 0.000000] DMA32 [mem 0x01000000-0xffffffff] // should be 0x7fffffff, right? [ 0.000000] Normal empty [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x00001000-0x00099fff] [ 0.000000] node 0: [mem 0x00100000-0x7fffffff] ... This patch fixes the printout, the following log shows the right ranges: ... [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x00001000-0x00ffffff] [ 0.000000] DMA32 [mem 0x01000000-0x7fffffff] [ 0.000000] Normal empty [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x00001000-0x00099fff] [ 0.000000] node 0: [mem 0x00100000-0x7fffffff] ... Suggested-by: Andrew Morton Signed-off-by: Xishi Qiu Cc: Linux MM Cc: Cc: Rik van Riel Link: http://lkml.kernel.org/r/5487AB3D.6070306@huawei.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 0bdb0c5..fe884e1 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -70,7 +70,7 @@ #define MAX_DMA_CHANNELS 8 /* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36..07244aa 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -674,10 +674,10 @@ void __init zone_sizes_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM -- cgit v0.10.2 From 29258cf49eb794f00989fc47da8700759a42778b Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 10 Dec 2014 10:09:03 +0800 Subject: x86/mm: Use min() instead of min_t() in the e820 printout code The type of "MAX_DMA_PFN" and "xXx_pfn" are both unsigned long now, so use min() instead of min_t(). Suggested-by: Andrew Morton Signed-off-by: Xishi Qiu Cc: Linux MM Cc: Cc: Rik van Riel Link: http://lkml.kernel.org/r/5487AB3F.7050807@huawei.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 49f8864..dd2f07a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void) * at first, and assume boot_mem will not take below MAX_DMA_PFN */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } -- cgit v0.10.2 From f647d7c155f069c1a068030255c300663516420e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 8 Dec 2014 13:55:20 -0800 Subject: x86_64, switch_to(): Load TLS descriptors before switching DS and ES Otherwise, if buggy user code points DS or ES into the TLS array, they would be corrupted after a context switch. This also significantly improves the comments and documents some gotchas in the code. Before this patch, the both tests below failed. With this patch, the es test passes, although the gsbase test still fails. ----- begin es test ----- /* * Copyright (c) 2014 Andy Lutomirski * GPL v2 */ static unsigned short GDT3(int idx) { return (idx << 3) | 3; } static int create_tls(int idx, unsigned int base) { struct user_desc desc = { .entry_number = idx, .base_addr = base, .limit = 0xfffff, .seg_32bit = 1, .contents = 0, /* Data, grow-up */ .read_exec_only = 0, .limit_in_pages = 1, .seg_not_present = 0, .useable = 0, }; if (syscall(SYS_set_thread_area, &desc) != 0) err(1, "set_thread_area"); return desc.entry_number; } int main() { int idx = create_tls(-1, 0); printf("Allocated GDT index %d\n", idx); unsigned short orig_es; asm volatile ("mov %%es,%0" : "=rm" (orig_es)); int errors = 0; int total = 1000; for (int i = 0; i < total; i++) { asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx))); usleep(100); unsigned short es; asm volatile ("mov %%es,%0" : "=rm" (es)); asm volatile ("mov %0,%%es" : : "rm" (orig_es)); if (es != GDT3(idx)) { if (errors == 0) printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n", GDT3(idx), es); errors++; } } if (errors) { printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total); return 1; } else { printf("[OK]\tES was preserved\n"); return 0; } } ----- end es test ----- ----- begin gsbase test ----- /* * gsbase.c, a gsbase test * Copyright (c) 2014 Andy Lutomirski * GPL v2 */ static unsigned char *testptr, *testptr2; static unsigned char read_gs_testvals(void) { unsigned char ret; asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr)); return ret; } int main() { int errors = 0; testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); if (testptr == MAP_FAILED) err(1, "mmap"); testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); if (testptr2 == MAP_FAILED) err(1, "mmap"); *testptr = 0; *testptr2 = 1; if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)testptr2 - (unsigned long)testptr) != 0) err(1, "ARCH_SET_GS"); usleep(100); if (read_gs_testvals() == 1) { printf("[OK]\tARCH_SET_GS worked\n"); } else { printf("[FAIL]\tARCH_SET_GS failed\n"); errors++; } asm volatile ("mov %0,%%gs" : : "r" (0)); if (read_gs_testvals() == 0) { printf("[OK]\tWriting 0 to gs worked\n"); } else { printf("[FAIL]\tWriting 0 to gs failed\n"); errors++; } usleep(100); if (read_gs_testvals() == 0) { printf("[OK]\tgsbase is still zero\n"); } else { printf("[FAIL]\tgsbase was corrupted\n"); errors++; } return errors == 0 ? 0 : 1; } ----- end gsbase test ----- Signed-off-by: Andy Lutomirski Cc: Cc: Andi Kleen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68..5a2c029 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } -- cgit v0.10.2 From 5de2b61a63f0982641eb00b9a6a9650f23487eaa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 9 Dec 2014 16:45:17 +0100 Subject: x86/asm: Guard against building the 32/64-bit versions of the asm-offsets*.c file directly Sometimes it is helpful to build a kernel compilation unit directly, i.e.: make .../.i in order to look at compiler output. Since asm-offsets_{32,64}.c are included by asm-offsets.c and building them directly doesn't evaluate the macros used (thus making the preprocessor output not very useful), error out when an attempt is made to build them. Issue a hint for the user to build asm-offsets.c instead. Suggested-by: Michael Matz Signed-off-by: Borislav Petkov Cc: Michal Marek Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418139917-12722-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index d67c4be..3b3b9d3 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include #include diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e7c798b..4c0c596 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -- cgit v0.10.2 From be9d1738b17f066b9d2f9e63eb6e2abda997fa1b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 9 Dec 2014 13:25:59 +0100 Subject: x86/asm: Unify segment selector defines Those are identical on 32- and 64-bit, unify them. No functional change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418127959-29902-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6f1c3a8..db257a5 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -23,6 +23,15 @@ #define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +#define SEGMENT_RPL_MASK 0x3 /* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ +#define USER_RPL 0x3 /* User mode is privilege level 3 */ +#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ +#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ + #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: @@ -125,16 +134,6 @@ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 /* * Matching rules for certain types of segments. @@ -192,17 +191,6 @@ #define get_kernel_rpl() 0 #endif -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 - -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack */ -- cgit v0.10.2 From f0905c5a32ce6e9b743b4d9c70e53d1ce447852d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 12 Dec 2014 16:25:47 -0800 Subject: MAINTAINERS: Add me as x86 VDSO submaintainer Here goes... :) Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1042001e502f8e0deb0edfeeac209b68378650cf.1418430292.git.luto@amacapital.net Signed-off-by: Ingo Molnar diff --git a/MAINTAINERS b/MAINTAINERS index c444907..7a54fa8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10295,6 +10295,13 @@ L: linux-edac@vger.kernel.org S: Maintained F: arch/x86/kernel/cpu/mcheck/* +X86 VDSO +M: Andy Lutomirski +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso +S: Maintained +F: arch/x86/vdso/ + XC2028/3028 TUNER DRIVER M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org -- cgit v0.10.2 From 41bdc78544b8a93a9c6814b8bbbfef966272abbe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Dec 2014 16:48:16 -0800 Subject: x86/tls: Validate TLS entries to protect espfix Installing a 16-bit RW data segment into the GDT defeats espfix. AFAICT this will not affect glibc, Wine, or dosemu at all. Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: stable@vger.kernel.org Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: security@kernel.org Cc: Willy Tarreau Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09..e7650bd 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,21 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +81,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +210,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +224,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); -- cgit v0.10.2 From 0e58af4e1d2166e9e33375a0f121e4867010d4f8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Dec 2014 16:48:17 -0800 Subject: x86/tls: Disallow unusual TLS segments Users have no business installing custom code segments into the GDT, and segments that are not present but are otherwise valid are a historical source of interesting attacks. For completeness, block attempts to set the L bit. (Prior to this patch, the L bit would have been silently dropped.) This is an ABI break. I've checked glibc, musl, and Wine, and none of them look like they'll have any trouble. Note to stable maintainers: this is a hardening patch that fixes no known bugs. Given the possibility of ABI issues, this probably shouldn't be backported quickly. Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: stable@vger.kernel.org # optional Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: security@kernel.org Cc: Willy Tarreau Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index e7650bd..3e551ee 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -39,6 +39,28 @@ static bool tls_desc_okay(const struct user_desc *info) if (!info->seg_32bit) return false; + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + +#ifdef CONFIG_X86_64 + /* The L bit makes no sense for data. */ + if (info->lm) + return false; +#endif + return true; } -- cgit v0.10.2