From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:19 -0800 Subject: mm: Add a vm_special_mapping.fault() method Requiring special mappings to give a list of struct pages is inflexible: it prevents sane use of IO memory in a special mapping, it's inefficient (it requires arch code to initialize a list of struct pages, and it requires the mm core to walk the entire list just to figure out how long it is), and it prevents arch code from doing anything fancy when a special mapping fault occurs. Add a .fault method as an alternative to filling in a .pages array. Looks-OK-to: Andrew Morton Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492..c88e48a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -568,10 +568,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif -struct vm_special_mapping -{ - const char *name; +struct vm_fault; + +struct vm_special_mapping { + const char *name; /* The name, e.g. "[vdso]". */ + + /* + * If .fault is not provided, this points to a + * NULL-terminated array of pages that back the special mapping. + * + * This must not be NULL unless .fault is provided. + */ struct page **pages; + + /* + * If non-NULL, then this is called to resolve page faults + * on the special mapping. If used, .pages is not checked. + */ + int (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); }; enum tlb_flush_reason { diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a6..f717453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3030,11 +3030,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; -- cgit v0.10.2 From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: mm: Add vm_insert_pfn_prot() The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad77..87ef1d7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2080,6 +2080,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/mm/memory.c b/mm/memory.c index c387430..a29f0b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1564,8 +1564,29 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1587,7 +1608,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) -- cgit v0.10.2 From 352b78c62f27b356b182008acd3117f3ee03ffd2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:21 -0800 Subject: x86/vdso: Track each mm's loaded vDSO image as well as its base As we start to do more intelligent things with the vDSO at runtime (as opposed to just at mm initialization time), we'll need to know which vDSO is in use. In principle, we could guess based on the mm type, but that's over-complicated and error-prone. Instead, just track it in the mmu context. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c99ac48681bad709ca7ad5ee899d9042a3af6b00.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e2..80b0210 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -121,6 +121,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) text_start = addr - image->sym_vvar_start; current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5..1ea0bae 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,7 +19,8 @@ typedef struct { #endif struct mutex lock; - void __user *vdso; + void __user *vdso; /* vdso base address */ + const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -- cgit v0.10.2 From 05ef76b20fc4297b0d3f8a956f1c809a8a1b3f1d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:22 -0800 Subject: x86/vdso: Use .fault for the vDSO text mapping The old scheme for mapping the vDSO text is rather complicated. vdso2c generates a struct vm_special_mapping and a blank .pages array of the correct size for each vdso image. Init code in vdso/vma.c populates the .pages array for each vDSO image, and the mapping code selects the appropriate struct vm_special_mapping. With .fault, we can use a less roundabout approach: vdso_fault() just returns the appropriate page for the selected vDSO image. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f886954c186bafd74e1b967c8931d852ae199aa2.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987..abe961c 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_offset)); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 80b0210..eb50d7c 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -27,13 +27,7 @@ unsigned int __read_mostly vdso64_enabled = 1; void __init init_vdso_image(const struct vdso_image *image) { - int i; - int npages = (image->size) / PAGE_SIZE; - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -90,6 +84,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #endif } +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping text_mapping = { + .name = "[vdso]", + .fault = vdso_fault, +}; + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; @@ -131,7 +143,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); + &text_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index deabaf9..43dc55b 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -13,9 +13,6 @@ struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - /* text_mapping.pages is big enough for data/size page pointers */ - struct vm_special_mapping text_mapping; - unsigned long alt, alt_len; long sym_vvar_start; /* Negative offset to the vvar area */ -- cgit v0.10.2 From a48a7042613eb1524d18b7b1ed7d3a6b611fd21f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:23 -0800 Subject: x86/vdso: Use ->fault() instead of remap_pfn_range() for the vvar mapping This is IMO much less ugly, and it also opens the door to disallowing unprivileged userspace HPET access on systems with usable TSCs. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c19c2909e5ee3c3d8742f916586676bb7c40345f.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index eb50d7c..4b5461b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -102,18 +102,69 @@ static const struct vm_special_mapping text_mapping = { .fault = vdso_fault, }; +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. + */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_hpet_page) { +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = vm_insert_pfn_prot( + vma, + (unsigned long)vmf->virtual_address, + hpet_address >> PAGE_SHIFT, + pgprot_noncached(PAGE_READONLY)); + } +#endif + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_pvti_cpu0_va(); + if (pvti) { + ret = vm_insert_pfn( + vma, + (unsigned long)vmf->virtual_address, + __pa(pvti) >> PAGE_SHIFT); + } + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { + static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", - .pages = no_pages, + .fault = vvar_fault, }; - struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -153,7 +204,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) vma = _install_special_mapping(mm, addr, -image->sym_vvar_start, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); if (IS_ERR(vma)) { @@ -161,41 +213,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) goto up_fail; } - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - pvti = pvclock_pvti_cpu0_va(); - if (pvti && image->sym_pvclock_page) { - ret = remap_pfn_range(vma, - text_start + image->sym_pvclock_page, - __pa(pvti) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - } - up_fail: if (ret) current->mm->context.vdso = NULL; -- cgit v0.10.2 From bd902c536298830e4d126dcf6491b46d3f1bf96e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:24 -0800 Subject: x86/vdso: Disallow vvar access to vclock IO for never-used vclocks It makes me uncomfortable that even modern systems grant every process direct read access to the HPET. While fixing this for real without regressing anything is a mess (unmapping the HPET is tricky because we don't adequately track all the mappings), we can do almost as well by tracking which vclocks have ever been used and only allowing pages associated with used vclocks to be faulted in. This will cause rogue programs that try to peek at the HPET to get SIGBUS instead on most systems. We can't restrict faults to vclock pages that are associated with the currently selected vclock due to a race: a process could start to access the HPET for the first time and race against a switch away from the HPET as the current clocksource. We can't segfault the process trying to peek at the HPET in this case, even though the process isn't going to do anything useful with the data. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e79d06295625c02512277737ab55085a498ac5d8.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4b5461b..7c912fe 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -130,7 +130,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_hpet_page) { #ifdef CONFIG_HPET_TIMER - if (hpet_address) { + if (hpet_address && vclock_was_used(VCLOCK_HPET)) { ret = vm_insert_pfn_prot( vma, (unsigned long)vmf->virtual_address, @@ -141,7 +141,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); - if (pvti) { + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, (unsigned long)vmf->virtual_address, diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e3304..0fb3a10 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -16,6 +16,8 @@ #include #include +int vclocks_used __read_mostly; + DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) @@ -26,12 +28,17 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + /* Mark the new vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult; diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eda81dc..d194266 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,10 +3,11 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c48..e728699 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -37,6 +37,12 @@ struct vsyscall_gtod_data { }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +extern int vclocks_used; +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1 << vclock); +} + static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) { unsigned ret; -- cgit v0.10.2 From 2024315124b43bb8b25a119ec6c614f0647dcc6d Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 18 Jan 2016 20:13:14 +0600 Subject: x86/asm/entry: Remove unused SAVE_ALL/RESTORE_ALL macros for !CONFIG_x86_64 SAVE_ALL and RESTORE_ALL macros for !CONFIG_X86_64 were introduced in commit: 1a338ac32 commit ('sched, x86: Optimize the preempt_schedule() call') ... and were used in the ___preempt_schedule() and ___preempt_schedule_context() functions from the arch/x86/kernel/preempt.S. But the arch/x86/kernel/preempt.S file was removed in the following commit: 0ad6e3c5 commit ('x86: Speed up ___preempt_schedule*() by using THUNK helpers') The ___preempt_schedule()/___preempt_schedule_context() functions were reimplemeted and do not use SAVE_ALL/RESTORE_ALL anymore. These macros have no users anymore, so we can remove them. Signed-off-by: Alexander Kuleshov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453126394-13717-1-git-send-email-kuleshovmail@gmail.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index e32206e..9a9e588 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - #endif /* CONFIG_X86_64 */ /* -- cgit v0.10.2 From a1ff5726081858a9ad98934eff7af6616c576875 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 Jan 2016 10:58:12 +0100 Subject: x86/cpufeature: Add AMD AVIC bit CPUID Fn8000_000A_EDX[13] denotes support for AMD's Virtual Interrupt controller, i.e., APIC virtualization. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Kaplan Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/1452938292-12327-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c94..bbf166e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -268,6 +268,7 @@ #define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* * BUG word(s) -- cgit v0.10.2 From 43c75f933be26422f166d6d869a19997312f4732 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 30 Nov 2015 10:47:43 -0600 Subject: x86/mm: Streamline and restore probe_memory_block_size() The cumulative effect of the following two commits: bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems") 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") ... is some pretty convoluted code. The first commit also removed code for the UV case without stated reason, which might lead to unexpected change in behavior. This commit has no other (intended) functional change; just seeks to simplify and make the code more understandable, beyond restoring the UV behavior. The whole section with the "tail size" doesn't seem to be reachable, since both the >= 64GB and < 64GB case return, so it was removed. Signed-off-by: Seth Jennings Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1448902063-18885-1-git-send-email-sjennings@variantweb.net [ Rewrote the title and changelog. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482..8f18fec 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -1193,26 +1194,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } - - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; - - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } -- cgit v0.10.2 From 95d97adb2bb85d964bae4538e0574e742e522dda Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 17 Dec 2015 23:56:52 +0000 Subject: x86/signal: Cleanup get_nr_restart_syscall() Check for TS_COMPAT instead of TIF_IA32 to distinguish ia32 tasks from 64-bit tasks. Check for __X32_SYSCALL_BIT iff CONFIG_X86_X32_ABI is defined. Suggested-by: Andy Lutomirski Signed-off-by: Dmitry V. Levin Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elvira Khabirova Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160111145515.GB29007@altlinux.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c..c07ff5d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,12 +692,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* -- cgit v0.10.2 From 997963edd912a6d77d68b2bbc19f40ce8facabd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:18 -0600 Subject: x86/asm: Clean up frame pointer macros The asm macros for setting up and restoring the frame pointer aren't currently being used. However, they will be needed soon to help asm functions to comply with stacktool. Rename FRAME/ENDFRAME to FRAME_BEGIN/FRAME_END for more symmetry. Also make the code more readable and improve the comments. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f488a8e3bfc8ac7d4d3d350953e664e7182b044.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 793179c..cec213b 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,23 +1,34 @@ +#ifndef _ASM_X86_FRAME_H +#define _ASM_X86_FRAME_H + #ifdef __ASSEMBLY__ #include -/* The annotation hides the frame from the unwinder and makes it look - like a ordinary ebp save/restore. This avoids some special cases for - frame pointer later */ +/* + * These are stack frame creation macros. They should be used by every + * callable non-leaf asm function to make kernel stack traces more reliable. + */ #ifdef CONFIG_FRAME_POINTER - .macro FRAME - __ASM_SIZE(push,) %__ASM_REG(bp) - __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) - .endm - .macro ENDFRAME - __ASM_SIZE(pop,) %__ASM_REG(bp) - .endm -#else - .macro FRAME - .endm - .macro ENDFRAME - .endm -#endif + +.macro FRAME_BEGIN + push %_ASM_BP + _ASM_MOV %_ASM_SP, %_ASM_BP +.endm + +.macro FRAME_END + pop %_ASM_BP +.endm + +#define FRAME_OFFSET __ASM_SEL(4, 8) + +#else /* !CONFIG_FRAME_POINTER */ + +#define FRAME_BEGIN +#define FRAME_END +#define FRAME_OFFSET 0 + +#endif /* CONFIG_FRAME_POINTER */ #endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_FRAME_H */ -- cgit v0.10.2 From ec5186557abbe711dfd34e1863735dfecb0602cc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:19 -0600 Subject: x86/asm: Add C versions of frame pointer macros Add C versions of the frame pointer macros which can be used to create a stack frame in inline assembly. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6786a282bf232ede3e2866414eae3cf02c7d662.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index cec213b..6e4d170 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,16 +1,17 @@ #ifndef _ASM_X86_FRAME_H #define _ASM_X86_FRAME_H -#ifdef __ASSEMBLY__ - #include /* * These are stack frame creation macros. They should be used by every * callable non-leaf asm function to make kernel stack traces more reliable. */ + #ifdef CONFIG_FRAME_POINTER +#ifdef __ASSEMBLY__ + .macro FRAME_BEGIN push %_ASM_BP _ASM_MOV %_ASM_SP, %_ASM_BP @@ -20,6 +21,16 @@ pop %_ASM_BP .endm +#else /* !__ASSEMBLY__ */ + +#define FRAME_BEGIN \ + "push %" _ASM_BP "\n" \ + _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n" + +#define FRAME_END "pop %" _ASM_BP "\n" + +#endif /* __ASSEMBLY__ */ + #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ @@ -30,5 +41,4 @@ #endif /* CONFIG_FRAME_POINTER */ -#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_FRAME_H */ -- cgit v0.10.2 From 320d25b6a05f8b73c23fc21025d2906ecdd2d4fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:58 -0800 Subject: x86/mm/32: Set NX in __supported_pte_mask before enabling paging There's a short window in which very early mappings can end up with NX clear because they are created before we've noticed that we have NX. It turns out that we detect NX very early, so there's no need to defer __supported_pte_mask setup. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2b544627345f7110160545a3f47031eb45c3ad4f.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae2..57fc3f8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eac..78f5d59 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,9 +31,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } -- cgit v0.10.2 From 7c360572b430a0e9757bafc0c20f26c920f2a07f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:59 -0800 Subject: x86/mm: Make kmap_prot into a #define The value (once we initialize it) is a foregone conclusion. Make it a #define to save a tiny amount of text and data size and to make it more comprehensible. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0850eb0213de9da88544ff7fae72dc6d06d2b441.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6d7d0e5..8554f96 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; +#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3d..a4bb1c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM -- cgit v0.10.2 From 14365449b6ce34cf6a3040ff8ebbb39d89d67159 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 18:21:21 +0600 Subject: x86/asm: Remove unused L3_PAGE_OFFSET L3_PAGE_OFFSET was introduced in commit a6523748bd (paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for hypervisor), but has no users. Signed-off-by: Alexander Kuleshov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Andrey Ryabinin Link: http://lkml.kernel.org/r/1453810881-30622-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e8..2e97468 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) -- cgit v0.10.2 From c31b34255b48d1a169693c9c70c49ad6418cfd20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:19 -0800 Subject: selftests/x86: Extend Makefile to allow 64-bit-only tests Previously the Makefile supported 32-bit-only tests and tests that were 32-bit and 64-bit. This adds the support for tests that are only built as 64-bit binaries. There aren't any yet, but there might be a few some day. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/99789bfe65706e6df32cc7e13f656e8c9fa92031.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d0c473f..9c81f26 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -11,8 +11,9 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall @@ -40,7 +41,7 @@ clean: $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl # x86_64 users should be encouraged to install 32-bit libraries -- cgit v0.10.2 From e21d50f3864e2a8995f5d2a41dea3f0fa07758b4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:20 -0800 Subject: selftests/x86: Add check_initial_reg_state() This checks that ELF binaries are started with an appropriately blank register state. ( There's currently a nasty special case in the entry asm to arrange for this. I'm planning on removing the special case, and this will help make sure I don't break it. ) Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ef54f8d066b30a3eb36bbf26300eebb242185700.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 9c81f26..df4f767 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,8 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ + check_initial_reg_state TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ ldt_gdt \ @@ -66,3 +67,9 @@ endif sysret_ss_attrs_64: thunks.S ptrace_syscall_32: raw_syscall_helper_32.S test_syscall_vdso_32: thunks_32.S + +# check_initial_reg_state is special: it needs a custom entry, and it +# needs to be static so that its interpreter doesn't destroy its initial +# state. +check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c new file mode 100644 index 0000000..6aaed9b --- /dev/null +++ b/tools/testing/selftests/x86/check_initial_reg_state.c @@ -0,0 +1,109 @@ +/* + * check_initial_reg_state.c - check that execve sets the correct state + * Copyright (c) 2014-2016 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define _GNU_SOURCE + +#include + +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags; +unsigned long r8, r9, r10, r11, r12, r13, r14, r15; + +asm ( + ".pushsection .text\n\t" + ".type real_start, @function\n\t" + ".global real_start\n\t" + "real_start:\n\t" +#ifdef __x86_64__ + "mov %rax, ax\n\t" + "mov %rbx, bx\n\t" + "mov %rcx, cx\n\t" + "mov %rdx, dx\n\t" + "mov %rsi, si\n\t" + "mov %rdi, di\n\t" + "mov %rbp, bp\n\t" + "mov %rsp, sp\n\t" + "mov %r8, r8\n\t" + "mov %r9, r9\n\t" + "mov %r10, r10\n\t" + "mov %r11, r11\n\t" + "mov %r12, r12\n\t" + "mov %r13, r13\n\t" + "mov %r14, r14\n\t" + "mov %r15, r15\n\t" + "pushfq\n\t" + "popq flags\n\t" +#else + "mov %eax, ax\n\t" + "mov %ebx, bx\n\t" + "mov %ecx, cx\n\t" + "mov %edx, dx\n\t" + "mov %esi, si\n\t" + "mov %edi, di\n\t" + "mov %ebp, bp\n\t" + "mov %esp, sp\n\t" + "pushfl\n\t" + "popl flags\n\t" +#endif + "jmp _start\n\t" + ".size real_start, . - real_start\n\t" + ".popsection"); + +int main() +{ + int nerrs = 0; + + if (sp == 0) { + printf("[FAIL]\tTest was built incorrectly\n"); + return 1; + } + + if (ax || bx || cx || dx || si || di || bp +#ifdef __x86_64__ + || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15 +#endif + ) { + printf("[FAIL]\tAll GPRs except SP should be 0\n"); +#define SHOW(x) printf("\t" #x " = 0x%lx\n", x); + SHOW(ax); + SHOW(bx); + SHOW(cx); + SHOW(dx); + SHOW(si); + SHOW(di); + SHOW(bp); + SHOW(sp); +#ifdef __x86_64__ + SHOW(r8); + SHOW(r9); + SHOW(r10); + SHOW(r11); + SHOW(r12); + SHOW(r13); + SHOW(r14); + SHOW(r15); +#endif + nerrs++; + } else { + printf("[OK]\tAll GPRs except SP are 0\n"); + } + + if (flags != 0x202) { + printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags); + nerrs++; + } else { + printf("[OK]\tFLAGS is 0x202\n"); + } + + return nerrs ? 1 : 0; +} -- cgit v0.10.2 From fba324744bfd2a7948a7710d7a021d76dafb9b67 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:21 -0800 Subject: x86/syscalls: Refactor syscalltbl.sh This splits out the code to emit a syscall line. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1bfcbba991f5cfaa9291ff950a593daa972a205f.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec..167965e 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,13 +3,21 @@ in="$1" out="$2" +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi +} + grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" - fi + emit "$abi" "$nr" "$entry" "$compat" done ) > "$out" -- cgit v0.10.2 From 32324ce15ea8cb4c8acc28acb2fd36fabf73e9db Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:22 -0800 Subject: x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32 The common/64/x32 distinction has no effect other than determining which kernels actually support the syscall. Move the logic into syscalltbl.sh. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/58d4a95f40e43b894f93288b4a3633963d0ee22e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d2..974fd89 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,14 +6,6 @@ #include #include -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 167965e..5ebeaf1 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -18,6 +18,21 @@ emit() { grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" + if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" == "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. + echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" == "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 + fi done ) > "$out" diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb..29db3b3 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,12 +5,6 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index b74ea6c..71a497c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,9 +35,6 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ce7e360..5edf4f4 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,8 +15,6 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit v0.10.2 From 3e65654e3db6df6aba9c5b895f8b8e6a8d8eb508 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:23 -0800 Subject: x86/syscalls: Move compat syscall entry handling into syscalltbl.sh Rather than duplicating the compat entry handling in all consumers of syscalls_BITS.h, handle it directly in syscalltbl.sh. Now we generate entries in syscalls_32.h like: __SYSCALL_I386(5, sys_open) __SYSCALL_I386(5, compat_sys_open) and all of its consumers implicitly get the right entry point. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7c2b501dc0e6e43050e916b95807c3e2e16e9bb.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a66498..3e28297 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,17 +6,11 @@ #include #include -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), +#define __SYSCALL_I386(nr, sym) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 974fd89..3781989 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, +#define __SYSCALL_64(nr, sym) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 5ebeaf1..b81479c 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -8,10 +8,24 @@ emit() { nr="$2" entry="$3" compat="$4" - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" + + if [ "$abi" == "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + echo "#else" + echo "__SYSCALL_${abi}($nr, $compat)" + echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce3902..abec4c9 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 29db3b3..9677bf9 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 439c099..d4669a6 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 71a497c..6ee5268 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_64(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4..6c9a9c1 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls[] = { #include }; -- cgit v0.10.2 From cfcbadb49dabb05efa23e1a0f95f3391c0a815bc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:24 -0800 Subject: x86/syscalls: Add syscall entry qualifiers This will let us specify something like 'sys_xyz/foo' instead of 'sys_xyz' in the syscall table, where the 'foo' qualifier conveys some extra information to the C code. The intent is to allow things like sys_execve/ptregs to indicate that sys_execve() touches pt_regs. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2de06e33dce62556b3ec662006fcb295504e296e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3e28297..8f895ee 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [nr] = sym, +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 3781989..a1d4087 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index b81479c..cd3d301 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,6 +3,19 @@ in="$1" out="$2" +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + qualifier="${entry:${#real_entry}}" # Strip the function name + qualifier="${qualifier:1}" # Strip the slash, if any + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + emit() { abi="$1" nr="$2" @@ -16,15 +29,15 @@ emit() { if [ -z "$compat" ]; then if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi else echo "#ifdef CONFIG_X86_32" if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi echo "#else" - echo "__SYSCALL_${abi}($nr, $compat)" + syscall_macro "$abi" "$nr" "$compat" echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index abec4c9..fdeb0ce 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9677bf9..d875f97 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d4669a6..bfce503 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6ee5268..f306413 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 6c9a9c1..470564b 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; -- cgit v0.10.2 From 302f5b260c322696cbeb962a263a4d2d99864aed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:25 -0800 Subject: x86/entry/64: Always run ptregs-using syscalls on the slow path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64-bit syscalls currently have an optimization in which they are called with partial pt_regs. A small handful require full pt_regs. In the 32-bit and compat cases, I cleaned this up by forcing full pt_regs for all syscalls. The performance hit doesn't really matter as the affected system calls are fundamentally heavy and this is the 32-bit compat case. I want to clean up the 64-bit case as well, but I don't want to hurt fast path performance. To do that, I want to force the syscalls that use pt_regs onto the slow path. This will enable us to make slow path syscalls be real ABI-compliant C functions. Use the new syscall entry qualification machinery for this. 'stub_clone' is now 'stub_clone/ptregs'. The next patch will eliminate the stubs, and we'll just have 'sys_clone/ptregs'. As of this patch, two-phase entry tracing is no longer used. It has served its purpose (namely a huge speedup on some workloads prior to more general opportunistic SYSRET support), and once the dust settles I'll send patches to back it out. The implementation is heavily based on a patch from Brian Gerst: http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com Originally-From: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Linux Kernel Mailing List Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9d34d3c..f1c8f15 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -182,7 +182,15 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx + + /* + * This call instruction is handled specially in stub_ptregs_64. + * It might end up jumping to the slow path. If it jumps, RAX is + * clobbered. + */ call *sys_call_table(, %rax, 8) +.Lentry_SYSCALL_64_after_fastpath_call: + movq %rax, RAX(%rsp) 1: /* @@ -235,25 +243,13 @@ GLOBAL(int_ret_from_sys_call_irqs_off) /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ - -tracesys_phase2: SAVE_EXTRA_REGS movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax, %rdx - call syscall_trace_enter_phase2 + call syscall_trace_enter /* * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned + * We don't reload %rax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX @@ -355,6 +351,38 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) +ENTRY(stub_ptregs_64) + /* + * Syscalls marked as needing ptregs land here. + * If we are on the fast path, we need to save the extra regs. + * If we are on the slow path, the extra regs are already saved. + * + * RAX stores a pointer to the C function implementing the syscall. + */ + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) + jne 1f + + /* Called from fast path -- pop return address and jump to slow path */ + popq %rax + jmp tracesys /* called from fast path */ + +1: + /* Called from C */ + jmp *%rax /* called from C */ +END(stub_ptregs_64) + +.macro ptregs_stub func +ENTRY(ptregs_\func) + leaq \func(%rip), %rax + jmp stub_ptregs_64 +END(ptregs_\func) +.endm + +/* Instantiate ptregs_stub for each ptregs-using syscall */ +#define __SYSCALL_64_QUAL_(sym) +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +#include .macro FORK_LIKE func ENTRY(stub_\func) diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index a1d4087..9dbc5ab 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,14 @@ #include #include -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64_QUAL_(sym) sym +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym + +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a..5de342a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn +15 64 rt_sigreturn stub_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve +56 common clone stub_clone/ptregs +57 common fork stub_fork/ptregs +58 common vfork stub_vfork/ptregs +59 64 execve stub_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat +322 64 execveat stub_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -346,7 +346,7 @@ 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve +520 x32 execve stub_x32_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat +545 x32 execveat stub_x32_execveat/ptregs -- cgit v0.10.2 From 46eabf06c04a6847a694a0c1413d4ac57e5b058a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:26 -0800 Subject: x86/entry/64: Call all native slow-path syscalls with full pt-regs This removes all of the remaining asm syscall stubs except for stub_ptregs_64. Entries in the main syscall table are now all callable from C. The resulting asm is every bit as ridiculous as it looks. The next few patches will clean it up. This patch is here to let reviewers rest their brains and for bisection. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6b3801be0d505d50aefabda02d3b93efbfc9c73.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f1c8f15..f7050a5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -253,7 +253,6 @@ tracesys: * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -264,6 +263,7 @@ tracesys: movq %r10, %rcx /* fixup for C */ call *sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) + RESTORE_EXTRA_REGS 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -384,83 +384,6 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) - .align 8 -GLOBAL(stub_x32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. - */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - /* * A newly forked process directly context switches into this address. * diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5de342a..dcf107c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn/ptregs +15 64 rt_sigreturn sys_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone/ptregs -57 common fork stub_fork/ptregs -58 common vfork stub_vfork/ptregs -59 64 execve stub_execve/ptregs +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat/ptregs +322 64 execveat sys_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -339,14 +339,14 @@ # for native 64-bit operation. # 512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn +513 x32 rt_sigreturn sys32_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv 516 x32 writev compat_sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve/ptregs +520 x32 execve compat_sys_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat/ptregs +545 x32 execveat compat_sys_execveat/ptregs -- cgit v0.10.2 From 24d978b76ffd20ecff8a8d1c21b16fe740f8b119 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:27 -0800 Subject: x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork ret_from_fork is now open-coded and is no longer tangled up with the syscall code. This isn't so bad -- this adds very little code, and IMO the result is much easier to understand. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0747e2a5e47084655a1e96351c545b755c41fa7.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7050a5..cb5d940 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -390,7 +390,6 @@ END(ptregs_\func) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) pushq $0x0002 @@ -398,28 +397,32 @@ ENTRY(ret_from_fork) call schedule_tail /* rdi: 'prev' task parameter */ - RESTORE_EXTRA_REGS - testb $3, CS(%rsp) /* from kernel_thread? */ + jnz 1f /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the 32-bit compat paths. - * Use IRET code path to return, since it can safely handle - * all of the above. + * We came from kernel_thread. This code path is quite twisted, and + * someone should clean it up. + * + * copy_thread_tls stashes the function pointer in RBX and the + * parameter to be passed in RBP. The called function is permitted + * to call do_execve and thereby jump to user mode. */ - jnz int_ret_from_sys_call + movq RBP(%rsp), %rdi + call *RBX(%rsp) + movl $0, RAX(%rsp) /* - * We came from kernel_thread - * nb: we depend on RESTORE_EXTRA_REGS above + * Fall through as though we're exiting a syscall. This makes a + * twisted sort of sense if we just called do_execve. */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + +1: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS + jmp restore_regs_and_iret END(ret_from_fork) /* -- cgit v0.10.2 From 1e423bff959e48166f5b7efca01fdb0dbdf05846 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:28 -0800 Subject: x86/entry/64: Migrate the 64-bit syscall slow path to C This is more complicated than the 32-bit and compat cases because it preserves an asm fast path for the case where the callee-saved regs aren't needed in pt_regs and no entry or exit work needs to be done. This appears to slow down fastpath syscalls by no more than one cycle on my Skylake laptop. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0366374..75175f9 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) prepare_exit_to_usermode(regs); } +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long nr = regs->orig_ax; + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { + regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } + + syscall_return_slowpath(regs); +} +#endif + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. Called with IRQs on and does all entry and diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cb5d940..567aa52 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + TRACE_IRQS_OFF + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ @@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. + */ + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz entry_SYSCALL64_slow_path + entry_SYSCALL_64_fastpath: + /* + * Easy case: enable interrupts and issue the syscall. If the syscall + * needs pt_regs, we'll call a stub that disables interrupts again + * and jumps to the slow path. + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -193,88 +199,43 @@ entry_SYSCALL_64_fastpath: movq %rax, RAX(%rsp) 1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. + * If we get here, then we know that pt_regs is clean for SYSRET64. + * If we see that no exit work is required (which we are required + * to check with IRQs off), then we can go straight to SYSRET64. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + jnz 1f - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + RESTORE_C_REGS movq RSP(%rsp), %rsp - /* - * 64-bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ USERGS_SYSRET64 -GLOBAL(int_ret_from_sys_call_irqs_off) +1: + /* + * The fast path looked good when we started, but something changed + * along the way and we need to switch to the slow path. Calling + * raise(3) will trigger this, for example. IRQs are off. + */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - jmp int_ret_from_sys_call - - /* Do syscall entry tracing */ -tracesys: SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_trace_enter - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx /* fixup for C */ - call *sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) - RESTORE_EXTRA_REGS -1: - /* Use IRET because user could have changed pt_regs->foo */ + call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) +entry_SYSCALL64_slow_path: + /* IRQs are off. */ SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call do_syscall_64 /* returns with IRQs disabled */ + +return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -364,7 +325,7 @@ ENTRY(stub_ptregs_64) /* Called from fast path -- pop return address and jump to slow path */ popq %rax - jmp tracesys /* called from fast path */ + jmp entry_SYSCALL64_slow_path /* called from fast path */ 1: /* Called from C */ -- cgit v0.10.2 From cd4d09ec6f6c12a2cc3db5b7d8876a325a53545b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:04 +0100 Subject: x86/cpufeature: Carve out X86_FEATURE_* Move them to a separate header and have the following dependency: x86/cpufeatures.h <- x86/processor.h <- x86/cpufeature.h This makes it easier to use the header in asm code and not include the whole cpufeature.h and add guards for asm. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 87d40a7..c0c6253 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel. See - arch/x86/include/asm/cpufeature.h for the valid bit + arch/x86/include/asm/cpufeatures.h for the valid bit numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index ea97697..4cb404f 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,7 +1,7 @@ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H -#include +#include #include struct cpu_features { diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 637097e..f72498d 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,7 +17,7 @@ #include "../include/asm/required-features.h" #include "../include/asm/disabled-features.h" -#include "../include/asm/cpufeature.h" +#include "../include/asm/cpufeatures.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 07d2c6c..27226df 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0e98716..0857b1a 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index a3fcfc9..cd4df93 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 75175f9..c6ab2eb 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 77d8c51..4c52283 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a..7853b53 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d929..0109ac6 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,7 +3,7 @@ */ #include -#include +#include #include /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7c912fe..429d54d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85b..99afb66 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -152,12 +152,6 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection" /* - * This must be included *after* the definition of ALTERNATIVE due to - * - */ -#include - -/* * Alternative instructions for different CPU types or capabilities. * * This allows to use optimized instructions even on generic binary diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6..0899cfc 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1..02e799f 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H +#include + #ifdef CONFIG_64BIT /* popcnt %edi, %eax -- redundant REX prefix for alignment */ #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index ad19841..9733361 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -2,6 +2,7 @@ #define ASM_X86_CMPXCHG_H #include +#include #include /* Provides LOCK_PREFIX */ /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bbf166e..3cce9f3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,289 +1,7 @@ -/* - * Defines x86 CPU feature bits - */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - -#define NCAPINTS 16 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ - -/* - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - -/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ -/* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ - -/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - -/* Other features, Linux-defined mapping, word 3 */ -/* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - -/* - * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7. - * - * Reuse free bits when adding new feature flags! - */ - -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ - -/* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ - -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ - -/* - * BUG word(s) - */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) - -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100644 index 0000000..0ceb6ad --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,288 @@ +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +#ifndef _ASM_X86_REQUIRED_FEATURES_H +#include +#endif + +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include +#endif + +/* + * Defines x86 CPU feature bits + */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440d..d01199d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * High level FPU state handling functions: diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 78162f8..d0afb05 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -1,7 +1,7 @@ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H -#include +#include static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index c70689b..0deeb2d 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -3,6 +3,8 @@ #include +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50c..491a3d9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -13,7 +13,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include @@ -24,7 +24,6 @@ struct vm86; #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ba665eb..db33330 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -15,7 +15,7 @@ #include #include -#include +#include /* "Raw" instruction opcodes */ #define __ASM_CLAC .byte 0x0f,0x01,0xca diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dfcf072..20a3de5 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,6 @@ #endif #include #include -#include extern int smp_num_siblings; extern unsigned int num_processors; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b5510..c0778fc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -#include +#include #include struct thread_info { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029..0bb31cb 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c..3076986 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5803130..faa7b52 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -64,7 +64,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6..6608c03 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e..15e47c1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648b..9299e3b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c523..341449c 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d..fbb5e90 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710..6988c74 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include " + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149..74f1d90 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7a..a19a663 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4..b3c2a69 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The e820 map is the map that gets modified e.g. with command line parameters diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae2..af11129 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5..be0ebbb 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616..7f3550a 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include static struct class *msr_class; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35..014ea59 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. */ -#include +#include #include verify_cpu: diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a2fe51b..65be7cf 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,5 @@ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 009f982..24ef1c2 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,7 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 982ce34..fba3430 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bb..a0de849 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include -#include +#include #include /* diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ca2afdd..90ce01b 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,7 @@ * - Copyright 2011 Fenghua Yu */ #include -#include +#include #include #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad..c9c8122 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include +#include #include .weak memset diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eac..f65a33f 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include #include +#include static int disable_nx; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 50d86c0..660a83c 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 174781a..00c3190 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index d62de8b..1234818 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -17,7 +17,7 @@ #include #ifdef CONFIG_X86 -#include /* for boot_cpu_has below */ +#include /* for boot_cpu_has below */ #endif #define TEST(bit, op, c_op, val) \ -- cgit v0.10.2 From bc696ca05f5a8927329ec276a892341e006b00ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:05 +0100 Subject: x86/cpufeature: Replace the old static_cpu_has() with safe variant So the old one didn't work properly before alternatives had run. And it was supposed to provide an optimized JMP because the assumption was that the offset it is jumping to is within a signed byte and thus a two-byte JMP. So I did an x86_64 allyesconfig build and dumped all possible sites where static_cpu_has() was used. The optimization amounted to all in all 12(!) places where static_cpu_has() had generated a 2-byte JMP. Which has saved us a whopping 36 bytes! This clearly is not worth the trouble so we can remove it. The only place where the optimization might count - in __switch_to() - we will handle differently. But that's not subject of this patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed9..68a2d1f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -350,16 +350,6 @@ config DEBUG_IMR_SELFTEST If unsure say N here. -config X86_DEBUG_STATIC_CPU_HAS - bool "Debug alternatives" - depends on DEBUG_KERNEL - ---help--- - This option causes additional code to be generated which - fails if static_cpu_has() is used before alternatives have - run. - - If unsure, say N. - config X86_DEBUG_FPU bool "Debug the x86 FPU code" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3cce9f3..a261cf2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,103 +125,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* - * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! */ #if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern void warn_pre_alternatives(void); -extern bool __static_cpu_has_safe(u16 bit); +extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically * patch the target code for additional performance. */ -static __always_inline __pure bool __static_cpu_has(u16 bit) -{ -#ifdef CC_HAVE_ASM_GOTO - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - - /* - * Catch too early usage of this before alternatives - * have run. - */ - asm_volatile_goto("1: jmp %l[t_warn]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* 1: do replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (X86_FEATURE_ALWAYS) : : t_warn); - -#endif - - asm_volatile_goto("1: jmp %l[t_no]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (bit) : : t_no); - return true; - t_no: - return false; - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - t_warn: - warn_pre_alternatives(); - return false; -#endif - -#else /* CC_HAVE_ASM_GOTO */ - - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $0,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 3f - .\n" - " .word %P1\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $1,%0\n" - "4:\n" - ".previous\n" - : "=qm" (flag) : "i" (bit)); - return flag; - -#endif /* CC_HAVE_ASM_GOTO */ -} - -#define static_cpu_has(bit) \ -( \ - __builtin_constant_p(boot_cpu_has(bit)) ? \ - boot_cpu_has(bit) : \ - __builtin_constant_p(bit) ? \ - __static_cpu_has(bit) : \ - boot_cpu_has(bit) \ -) - -static __always_inline __pure bool _static_cpu_has_safe(u16 bit) +static __always_inline __pure bool _static_cpu_has(u16 bit) { #ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" @@ -255,7 +171,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) t_no: return false; t_dynamic: - return __static_cpu_has_safe(bit); + return __static_cpu_has(bit); #else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ @@ -293,22 +209,21 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has_safe(bit) : flag); + return (flag == 2 ? __static_cpu_has(bit) : flag); #endif /* CC_HAVE_ASM_GOTO */ } -#define static_cpu_has_safe(bit) \ +#define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - _static_cpu_has_safe(bit) \ + _static_cpu_has(bit) \ ) #else /* * gcc 3.x is too stupid to do the static test; fall back to dynamic. */ #define static_cpu_has(bit) boot_cpu_has(bit) -#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) @@ -316,7 +231,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d01199d..c2e46eb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void); */ static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); + return static_cpu_has(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVE); + return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has_safe(X86_FEATURE_FXSR); + return static_cpu_has(X86_FEATURE_FXSR); } /* @@ -301,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -323,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -461,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c..ab5c2c6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de..ee49981 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,19 +1475,11 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) +inline bool __static_cpu_has(u16 bit) { return boot_cpu_has(bit); } -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); +EXPORT_SYMBOL_GPL(__static_cpu_has); static void bsp_resume(void) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b85..3dce1ca 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d47..3a4b39a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void) if (!all_cpu_data) return -ENOMEM; - if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { + if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) { pr_info("intel_pstate: HWP enabled\n"); hwp_active++; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd08e29..d928649 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -930,7 +930,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; -- cgit v0.10.2 From a362bf9f5e7dd659b96d01382da7b855f4e5a7a1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 27 Jan 2016 09:43:25 +0100 Subject: x86/cpufeature: Get rid of the non-asm goto variant I can simply quote hpa from the mail: "Get rid of the non-asm goto variant and just fall back to dynamic if asm goto is unavailable. It doesn't make any sense, really, if it is supposed to be safe, and by now the asm goto-capable gcc is in more wide use. (Originally the gcc 3.x fallback to pure dynamic didn't exist, either.)" Booy, am I lazy. Cleanup the whole CC_HAVE_ASM_GOTO ifdeffery too, while at it. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160127084325.GB30712@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a261cf2..9048c1b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -129,17 +129,16 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * fast paths and boot_cpu_has() otherwise! */ -#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). - * These are only valid after alternatives have run, but will statically - * patch the target code for additional performance. + * These will statically patch the target code for additional + * performance. */ static __always_inline __pure bool _static_cpu_has(u16 bit) { -#ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " @@ -172,45 +171,6 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) return false; t_dynamic: return __static_cpu_has(bit); -#else - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $2,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ - " .word %P2\n" /* always replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $0,%0\n" - "4:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 5f - .\n" /* repl offset */ - " .word %P1\n" /* feature bit */ - " .byte 4b - 3b\n" /* src len */ - " .byte 6f - 5f\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "5: movb $1,%0\n" - "6:\n" - ".previous\n" - : "=qm" (flag) - : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has(bit) : flag); -#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -221,7 +181,8 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) ) #else /* - * gcc 3.x is too stupid to do the static test; fall back to dynamic. + * Fall back to dynamic for gcc versions which don't support asm goto. Should be + * a minority now anyway. */ #define static_cpu_has(bit) boot_cpu_has(bit) #endif -- cgit v0.10.2 From 337e4cc84021212a87b04b77b65cccc49304909e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:07 +0100 Subject: x86/alternatives: Add an auxilary section Add .altinstr_aux for additional instructions which will be used before and/or during patching. All stuff which needs more sophisticated patching should go there. See next patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf1..92dc211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -195,6 +195,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -- cgit v0.10.2 From 2476f2fa20568bd5d9e09cd35bcd73e99a6f4cc6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 27 Jan 2016 09:45:25 +0100 Subject: x86/alternatives: Discard dynamic check after init Move the code to do the dynamic check to the altinstr_aux section so that it is discarded after alternatives have run and a static branch has been chosen. This way we're changing the dynamic branch from C code to assembly, which makes it *substantially* smaller while avoiding a completely unnecessary call to an out of line function. Signed-off-by: Brian Gerst [ Changed it to do TESTB, as hpa suggested. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kristen Carlson Accardi Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Ross Zwisler Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452972124-7380-1-git-send-email-brgerst@gmail.com Link: http://lkml.kernel.org/r/20160127084525.GC30712@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9048c1b..9fba7a5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -130,8 +130,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; */ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern bool __static_cpu_has(u16 bit); - /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These will statically patch the target code for additional @@ -139,7 +137,7 @@ extern bool __static_cpu_has(u16 bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp %l[t_dynamic]\n" + asm_volatile_goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -164,13 +162,20 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) " .byte 0\n" /* repl len */ " .byte 0\n" /* pad len */ ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS) - : : t_dynamic, t_no); + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); + t_yes: return true; t_no: return false; - t_dynamic: - return __static_cpu_has(bit); } #define static_cpu_has(bit) \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ee49981..079d83f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,12 +1475,6 @@ void cpu_init(void) } #endif -inline bool __static_cpu_has(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) -- cgit v0.10.2 From 8c725306993198f845038dc9e45a1267099867a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:09 +0100 Subject: x86/vdso: Use static_cpu_has() ... and simplify and speed up a tad. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 429d54d..10f7045 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -285,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + if (static_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* -- cgit v0.10.2 From b7765086b7c5a5be029a739c2caa161da51c2076 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:26 -0800 Subject: x86/entry/64: Fix an IRQ state error on ptregs-using syscalls I messed up the IRQ state when jumping off the fast path due to invocation of a ptregs-using syscall. This bug shouldn't have had any impact yet, but it would have caused problems with subsequent context tracking cleanups. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/ab92cd365fb7b0a56869e920017790d96610fdca.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 567aa52..9f7bb80 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,8 @@ entry_SYSCALL_64_fastpath: /* * This call instruction is handled specially in stub_ptregs_64. - * It might end up jumping to the slow path. If it jumps, RAX is - * clobbered. + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. */ call *sys_call_table(, %rax, 8) .Lentry_SYSCALL_64_after_fastpath_call: @@ -315,17 +315,24 @@ END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) /* * Syscalls marked as needing ptregs land here. - * If we are on the fast path, we need to save the extra regs. - * If we are on the slow path, the extra regs are already saved. + * If we are on the fast path, we need to save the extra regs, + * which we achieve by trying again on the slow path. If we are on + * the slow path, the extra regs are already saved. * * RAX stores a pointer to the C function implementing the syscall. + * IRQs are on. */ cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) jne 1f - /* Called from fast path -- pop return address and jump to slow path */ + /* + * Called from fast path -- disable IRQs again, pop return address + * and jump to slow path + */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF popq %rax - jmp entry_SYSCALL64_slow_path /* called from fast path */ + jmp entry_SYSCALL64_slow_path 1: /* Called from C */ -- cgit v0.10.2 From eb2a54c3271cb6443ae93ec44a91687b60c559a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:27 -0800 Subject: x86/entry/64: Fix fast-path syscall return register state I was fishing RIP (i.e. RCX) out of pt_regs->cx and RFLAGS (i.e. R11) out of pt_regs->r11. While it usually worked (pt_regs started out with CX == IP and R11 == FLAGS), it was very fragile. In particular, it broke sys_iopl() because sys_iopl() forgot to mark itself as using ptregs. Undo that part of the syscall rework. There was no compelling reason to do it this way. While I'm at it, load RCX and R11 before the other regs to be a little friendlier to the CPU, as they will be the first of the reloaded registers to be used. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/a85f8360c397e48186a9bc3e565ad74307a7b011.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f7bb80..70eadb0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,9 @@ entry_SYSCALL_64_fastpath: LOCKDEP_SYS_EXIT TRACE_IRQS_ON /* user mode is traced as IRQs on */ - RESTORE_C_REGS + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp USERGS_SYSRET64 -- cgit v0.10.2 From bb56968a37a44070de92d5690c4b08dd98a5d3f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:28 -0800 Subject: x86/syscalls/64: Mark sys_iopl() as using ptregs sys_iopl() both reads and writes pt_regs->flags. Mark it as using ptregs. This isn't strictly necessary, as pt_regs->flags is available even in the fast path, but this is very lightweight now that we have syscall qualifiers and it could avoid some pain down the road. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3de0ca692fa8bf414c5e3d7afe3e6195d1a10e1f.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dcf107c..2e5b565 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl sys_iopl +172 common iopl sys_iopl/ptregs 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module -- cgit v0.10.2 From d99e1bd175f4291ddb6e62b22bb5bdbe3976389a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 25 Jan 2016 20:41:46 +0100 Subject: x86/entry/traps: Refactor preemption and interrupt flag handling Make the preemption and interrupt flag handling more readable by removing preempt_conditional_sti() and preempt_conditional_cli() helpers and using preempt_disable() and preempt_enable_no_resched() instead. Rename contitional_sti() and conditional_cli() to the more understandable cond_local_irq_enable() and cond_local_irq_disable() respectively, while at it. Suggested-by: Borislav Petkov Signed-off-by: Alexander Kuleshov [ Boris: massage text. ] Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: Wang Nan Link: http://lkml.kernel.org/r/1453750913-4781-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a..410e8e2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - preempt_count_dec(); } void ist_enter(struct pt_regs *regs) @@ -286,7 +272,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +354,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, error_code); @@ -443,7 +429,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -517,9 +503,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -648,12 +636,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -673,7 +663,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: @@ -696,7 +687,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { if (!fixup_exception(regs)) { @@ -743,7 +734,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void @@ -756,7 +747,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +756,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); -- cgit v0.10.2 From c1a0bf347c40dd4b0a5bb10fdf4de76a1fbbbe8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 09:57:34 +0100 Subject: x86/mm/numa: Clean up numa_clear_kernel_node_hotplug() So we fixed an overflow bug in numa_clear_kernel_node_hotplug(): 2b54ab3c66d4 ("x86/mm/numa: Fix memory corruption on 32-bit NUMA kernels") ... and the bug was indirectly caused by poor coding style, such as using start/end local variables unnecessarily, which lost the physaddr_t type. So make the code more readable and try to fully comment all the thinking behind the logic. No change in functionality. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f809..ede4506 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,65 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; + struct numa_memblk *mb = numa_meminfo.blk + i; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; + if (!node_isset(mb->nid, reserved_nodemask)) + continue; - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } -- cgit v0.10.2 From 5f7ee246850ba18a6a7bcb3d5eddb6db68354688 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 12:14:55 +0100 Subject: x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug() numa_clear_kernel_node_hotplug() uses memblock_set_node() without checking for failures. memblock_set_node() is a complex function that might extend the memblock array - which extension might fail - so check for this possibility. It's not supposed to happen (because realistically if we have so little memory that this fails then we likely won't be able to boot anyway), but do the check nevertheless. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ede4506..f70c1ff 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,8 +491,10 @@ static void __init numa_clear_kernel_node_hotplug(void) */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* -- cgit v0.10.2 From 8dd5032d9c540111dd673078738d137a998d6c3f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Feb 2016 22:51:27 +0100 Subject: x86/asm/bitops: Force inlining of test_and_set_bit and friends Sometimes GCC mysteriously doesn't inline very small functions we expect to be inlined, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 Arguably, GCC should do better, but GCC people aren't willing to invest time into it and are asking to use __always_inline instead. With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os here's an example of functions getting deinlined many times: test_and_set_bit (166 copies, ~1260 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f ab 3e lock bts %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq test_and_clear_bit (124 copies, ~1000 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq change_bit (3 copies, 8 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f bb 3e lock btc %rdi,(%rsi) 5d pop %rbp c3 retq clear_bit_unlock (2 copies, 11 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 5d pop %rbp c3 retq This patch works it around via s/inline/__always_inline/. Code size decrease by ~13.5k after the patch: text data bss dec filename 92110727 20826144 36417536 149354407 vmlinux.before 92097234 20826176 36417536 149340946 vmlinux.after Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Rientjes Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b95..7766d1c 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } @@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; @@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline unsigned long ffz(unsigned long word) +static __always_inline unsigned long ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ -static inline int ffs(int x) +static __always_inline int ffs(int x) { int r; @@ -434,7 +434,7 @@ static inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static inline int fls(int x) +static __always_inline int fls(int x) { int r; -- cgit v0.10.2 From 69e0210fd01ff157d332102219aaf5c26ca8069b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:18 +0300 Subject: x86/kasan: Clear kasan_zero_page after TLB flush Currently we clear kasan_zero_page before __flush_tlb_all(). This works with current implementation of native_flush_tlb[_global]() because it doesn't cause do any writes to kasan shadow memory. But any subtle change made in native_flush_tlb*() could break this. Also current code seems doesn't work for paravirt guests (lguest). Only after the TLB flush we can be sure that kasan_zero_page is not used as early shadow anymore (instrumented code will not write to it). So it should cleared it only after the TLB flush. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf2..303e470 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,16 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear it, since after the TLB flush + * no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } -- cgit v0.10.2 From 063fb3e56f6dd29b2633b678b837e1d904200e6f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:19 +0300 Subject: x86/kasan: Write protect kasan zero shadow After kasan_init() executed, no one is allowed to write to kasan_zero_page, so write protect it. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 303e470..1b1110f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -125,10 +125,16 @@ void __init kasan_init(void) /* * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear it, since after the TLB flush - * no one should write to it. + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. */ memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); -- cgit v0.10.2 From 060a402a1ddb551455ee410de2eadd3349f2801b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029..8b57683 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit v0.10.2 From d12a72b844a49d4162f24cefdab30bed3f86730e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:58 -0800 Subject: x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID This adds a chicken bit to turn off INVPCID in case something goes wrong. It's an early_param() because we do TLB flushes before we parse __setup() parameters. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf0..e4c4d2a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nointroute [IA-64] + noinvpcid [X86] Disable the INVPCID cpu feature. + nojitter [IA-64] Disables jitter checking for ITC timers. no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de..f4d0aa6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit v0.10.2 From d8bced79af1db6734f66b42064cc773cada2ce99 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8b57683..fc9a2fd 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,6 +152,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit v0.10.2 From ce1143aa60273220a9f89012f2aaaed04f97e9a2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 25 Jan 2016 23:06:49 -0800 Subject: x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache() DMI cacheability is very confused on x86. dmi_early_remap() uses early_ioremap(), which uses FIXMAP_PAGE_IO, which is __PAGE_KERNEL_IO, which is __PAGE_KERNEL, which is cached. Don't ask me why this makes any sense. dmi_remap() uses ioremap(), which requests an uncached mapping. However, on non-EFI systems, the DMI data generally lives between 0xf0000 and 0x100000, which is in the legacy ISA range, which triggers a special case in the PAT code that overrides the cache mode requested by ioremap() and forces a WB mapping. On a UEFI boot, however, the DMI table can live at any physical address. On my laptop, it's around 0x77dd0000. That's nowhere near the legacy ISA range, so the ioremap() implicit uncached type is honored and we end up with a UC- mapping. UC- is a very, very slow way to read from main memory, so dmi_walk() is likely to take much longer than necessary. Given that, even on UEFI, we do early cached DMI reads, it seems safe to just ask for cached access. Switch to ioremap_cache(). I haven't tried to benchmark this, but I'd guess it saves several milliseconds of boot time. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean Delvare Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/3147c38e51f439f3c8911db34c7d4ab22d854915.1453791969.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 535192f..3c69fed 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len) /* Use early IO mappings for DMI because it's initialized early */ #define dmi_early_remap early_ioremap #define dmi_early_unmap early_iounmap -#define dmi_remap ioremap +#define dmi_remap ioremap_cache #define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ -- cgit v0.10.2 From dd7b6847670a84b7bb7c38f8e69b2f12059bca66 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 25 Jan 2016 12:25:15 -0500 Subject: x86/mm: Honour passed pgprot in track_pfn_insert() and track_pfn_remap() track_pfn_insert() overwrites the pgprot that is passed in with a value based on the VMA's page_prot. This is a problem for people trying to do clever things with the new vm_insert_pfn_prot() as it will simply overwrite the passed protection flags. If we use the current value of the pgprot as the base, then it will behave as people are expecting. Also fix track_pfn_remap() in the same way. Signed-off-by: Matthew Wilcox Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1453742717-10326-2-git-send-email-matthew.r.wilcox@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536..04e2e71 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; -- cgit v0.10.2 From 4ecd16ec7059390b430af34bd8bc3ca2b5dcef9a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:06 -0800 Subject: x86/fpu: Fix math emulation in eager fpu mode Systems without an FPU are generally old and therefore use lazy FPU switching. Unsurprisingly, math emulation in eager FPU mode is a bit buggy. Fix it. There were two bugs involving kernel code trying to use the FPU registers in eager mode even if they didn't exist and one BUG_ON() that was incorrect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/b4b8d112436bd6fab866e1b4011131507e8d7fbe.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440d..a1f78a9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -589,7 +589,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = new_fpu->fpstate_active && + fpu.preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c..08e1e11 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,7 +423,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use. */ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a..87f80fe 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -750,7 +750,6 @@ dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { -- cgit v0.10.2 From 5ed73f40735c68d8a656b46d09b1885d3b8740ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:07 -0800 Subject: x86/fpu: Fix FNSAVE usage in eagerfpu mode In eager fpu mode, having deactivated FPU without immediately reloading some other context is illegal. Therefore, to recover from FNSAVE, we can't just deactivate the state -- we need to reload it if we're not actively context switching. We had this wrong in fpu__save() and fpu__copy(). Fix both. __kernel_fpu_begin() was fine -- add a comment. This fixes a warning triggerable with nofxsr eagerfpu=on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/60662444e13c76f06e23c15c5dcdba31b4ac3d67.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 08e1e11..7a9244d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -259,7 +267,11 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); } -- cgit v0.10.2 From a20d7297045f7fdcd676c15243192eb0e95a4306 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:08 -0800 Subject: x86/fpu: Fold fpu_copy() into fpu__copy() Splitting it into two functions needlessly obfuscated the code. While we're at it, improve the comment slightly. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/3eb5a63a9c5c84077b2677a7dfe684eef96fe59e.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7a9244d..299b58b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -231,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. - */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* @@ -251,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -274,16 +274,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } -- cgit v0.10.2 From c6ab109f7e0eae3bae3bb10f8ddb0df67735c150 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:09 -0800 Subject: x86/fpu: Speed up lazy FPU restores slightly If we have an FPU, there's no need to check CR0 for FPU emulation. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/980004297e233c27066d54e71382c44cdd36ef7c.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87f80fe..36a9c01 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -752,7 +752,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; conditional_sti(regs); -- cgit v0.10.2 From 58122bf1d856a4ea9581d62a07c557d997d46a19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:10 -0800 Subject: x86/fpu: Default eagerfpu=on on all CPUs We have eager and lazy FPU modes, introduced in: 304bceda6a18 ("x86, fpu: use non-lazy fpu restore for processors supporting xsave") The result is rather messy. There are two code paths in almost all of the FPU code, and only one of them (the eager case) is tested frequently, since most kernel developers have new enough hardware that we use eagerfpu. It seems that, on any remotely recent hardware, eagerfpu is a win: glibc uses SSE2, so laziness is probably overoptimistic, and, in any case, manipulating TS is far slower that saving and restoring the full state. (Stores to CR0.TS are serializing and are poorly optimized.) To try to shake out any latent issues on old hardware, this changes the default to eager on all CPUs. If no performance or functionality problems show up, a subsequent patch could remove lazy mode entirely. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/ac290de61bf08d9cfc2664a4f5080257ffc1075a.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7..471fe27 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -260,7 +260,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + * + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -271,7 +274,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that. ] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. @@ -348,15 +351,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. - */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) -- cgit v0.10.2 From e2c7698cd61f11d4077fdb28148b2d31b82ac848 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc9a2fd..d0cce90 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. */ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v0.10.2 From f2cc8e0791c70833758101d9756609a08dd601ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 00:19:18 +0100 Subject: x86/cpufeature: Speed up cpu_feature_enabled() When GCC cannot do constant folding for this macro, it falls back to cpu_has(). But static_cpu_has() is optimal and it works at all times now. So use it and speedup the fallback case. Before we had this: mov 0x99d674(%rip),%rdx # ffffffff81b0d9f4 shr $0x2e,%rdx and $0x1,%edx jne ffffffff811704e9 After alternatives patching, it turns into: jmp 0xffffffff81170390 nopl (%rax) ... callq ffffffff81056e00 ffffffff81170390: mov 0x170(%r12),%rdi Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455578358-28347-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9fba7a5..68e4e82 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -88,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * is not relevant. */ #define cpu_feature_enabled(bit) \ - (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ - cpu_has(&boot_cpu_data, bit)) + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) -- cgit v0.10.2