diff options
author | Scott Wood <scottwood@freescale.com> | 2015-02-13 22:12:06 (GMT) |
---|---|---|
committer | Scott Wood <scottwood@freescale.com> | 2015-02-13 22:19:22 (GMT) |
commit | 6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch) | |
tree | f558a94f1553814cc122ab8d9e04c0ebad5262a5 /arch/x86 | |
parent | fcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff) | |
download | linux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz |
Reset to 3.12.37
Diffstat (limited to 'arch/x86')
116 files changed, 1399 insertions, 676 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index edbb857..5b5c6ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,7 +21,6 @@ config X86_64 ### Arch settings config X86 def_bool y - select HAVE_PREEMPT_LAZY select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK @@ -124,6 +123,7 @@ config X86 select COMPAT_OLD_SIGACTION if IA32_EMULATION select RTC_LIB select HAVE_DEBUG_STACKOVERFLOW + select ARCH_SUPPORTS_ATOMIC_RMW config INSTRUCTION_DECODER def_bool y @@ -180,11 +180,8 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_GENERIC_SPINLOCK - def_bool PREEMPT_RT_FULL - config RWSEM_XCHGADD_ALGORITHM - def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL + def_bool y config GENERIC_CALIBRATE_DELAY def_bool y @@ -474,7 +471,7 @@ config X86_MDFLD select MFD_INTEL_MSIC ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin - Internet Device(MID) platform. + Internet Device(MID) platform. Unlike standard x86 PCs, Medfield does not have many legacy devices nor standard legacy replacement devices/features. e.g. Medfield does not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. @@ -821,7 +818,7 @@ config IOMMU_HELPER config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL - select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL + select CPUMASK_OFFSTACK ---help--- Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. @@ -864,7 +861,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI + depends on X86_32 && !SMP && !X86_32_NON_STANDARD ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -875,6 +872,10 @@ config X86_UP_APIC performance counters), and the NMI watchdog which detects hard lockups. +config X86_UP_APIC_MSI + def_bool y + select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI + config X86_UP_IOAPIC bool "IO-APIC support on uniprocessors" depends on X86_UP_APIC @@ -976,10 +977,27 @@ config VM86 default y depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run 16-bit legacy - code on X86 processors. It also may be needed by software like - XFree86 to initialize some video cards via BIOS. Disabling this - option saves about 6k. + This option is required by programs like DOSEMU to run + 16-bit real mode legacy code on x86 processors. It also may + be needed by software like XFree86 to initialize some video + cards via BIOS. Disabling this option saves about 6K. + +config X86_16BIT + bool "Enable support for 16-bit segments" if EXPERT + default y + ---help--- + This option is required by programs like Wine to run 16-bit + protected mode legacy code on x86 processors. Disabling + this option saves about 300 bytes on i386, or around 6K text + plus 16K runtime memory on x86-64, + +config X86_ESPFIX32 + def_bool y + depends on X86_16BIT && X86_32 + +config X86_ESPFIX64 + def_bool y + depends on X86_16BIT && X86_64 config TOSHIBA tristate "Toshiba Laptop support" @@ -1594,6 +1612,7 @@ config EFI config EFI_STUB bool "EFI stub support" depends on EFI + select RELOCATABLE ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@ -1885,6 +1904,10 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on X86_64 && HUGETLB_PAGE && MIGRATION + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b7388a4..9b883a8 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -865,6 +865,9 @@ fail: * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create * one for us). + * + * The caller is responsible for filling out ->code32_start in the + * returned boot_params. */ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) { @@ -921,8 +924,6 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) hdr->vid_mode = 0xffff; hdr->boot_flag = 0xAA55; - hdr->code32_start = (__u64)(unsigned long)image->image_base; - hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 5d6f689..b1bd969 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -50,6 +50,13 @@ ENTRY(efi_pe_entry) pushl %eax pushl %esi pushl %ecx + + call reloc +reloc: + popl %ecx + subl reloc, %ecx + movl %ecx, BP_code32_start(%eax) + sub $0x4, %esp ENTRY(efi_stub_entry) @@ -63,12 +70,7 @@ ENTRY(efi_stub_entry) hlt jmp 1b 2: - call 3f -3: - popl %eax - subl $3b, %eax - subl BP_pref_address(%esi), %eax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leal preferred_addr(%eax), %eax jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index c337422..a558403 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -215,6 +215,8 @@ ENTRY(efi_pe_entry) cmpq $0,%rax je 1f mov %rax, %rdx + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rdx) popq %rsi popq %rdi @@ -228,12 +230,7 @@ ENTRY(efi_stub_entry) hlt jmp 1b 2: - call 3f -3: - popq %rax - subq $3b, %rax - subq BP_pref_address(%rsi), %rax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leaq preferred_addr(%rax), %rax jmp *%rax diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 434f077..1b05afd 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -401,6 +401,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, unsigned char *output, unsigned long output_len) { + unsigned char *output_orig = output; + real_mode = rmode; sanitize_boot_params(real_mode); @@ -439,7 +441,12 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); - handle_relocations(output, output_len); + /* + * 32-bit always performs relocations. 64-bit relocations are only + * needed if kASLR has chosen a different load address. + */ + if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig) + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9ec06a1..4257124 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -91,10 +91,9 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct floppy boot is not supported. " - .ascii "Use a boot loader program instead.\r\n" + .ascii "Use a boot loader.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot ...\r\n" + .ascii "Remove disk and press any key to reboot...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -108,7 +107,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 3 # nr_sections + .word 4 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -250,6 +249,25 @@ section_table: .word 0 # NumberOfLineNumbers .long 0x60500020 # Characteristics (section flags) + # + # The offset & size fields are filled in by build.c. + # + .ascii ".bss" + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 + .long 0 # Size of initialized data + # on disk + .long 0x0 + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0xc8000080 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index c941d6a..687dd28 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -141,7 +141,7 @@ static void usage(void) #ifdef CONFIG_EFI_STUB -static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset) { unsigned int pe_header; unsigned short num_sections; @@ -162,10 +162,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz put_unaligned_le32(size, section + 0x8); /* section header vma field */ - put_unaligned_le32(offset, section + 0xc); + put_unaligned_le32(vma, section + 0xc); /* section header 'size of initialised data' field */ - put_unaligned_le32(size, section + 0x10); + put_unaligned_le32(datasz, section + 0x10); /* section header 'file offset' field */ put_unaligned_le32(offset, section + 0x14); @@ -177,6 +177,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz } } +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + update_pecoff_section_header_fields(section_name, offset, size, size, offset); +} + static void update_pecoff_setup_and_reloc(unsigned int size) { u32 setup_offset = 0x200; @@ -201,9 +206,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) pe_header = get_unaligned_le32(&buf[0x3c]); - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - /* * Size of code: Subtract the size of the first sector (512 bytes) * which includes the header. @@ -218,6 +220,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) update_pecoff_section_header(".text", text_start, text_sz); } +static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz) +{ + unsigned int pe_header; + unsigned int bss_sz = init_sz - file_sz; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of uninitialized data */ + put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]); + + /* Size of image */ + put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); + + update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0); +} + #endif /* CONFIG_EFI_STUB */ @@ -269,6 +287,9 @@ int main(int argc, char ** argv) int fd; void *kernel; u32 crc = 0xffffffffUL; +#ifdef CONFIG_EFI_STUB + unsigned int init_sz; +#endif /* Defaults for old kernel */ #ifdef CONFIG_X86_32 @@ -339,7 +360,9 @@ int main(int argc, char ** argv) put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB - update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); + update_pecoff_text(setup_sectors * 512, i + (sys_size * 16)); + init_sz = get_unaligned_le32(&buf[0x260]); + update_pecoff_bss(i + (sys_size * 16), init_sz); #ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */ efi_stub_entry -= 0x200; diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index aafe8ce..e26984f 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -66,5 +66,5 @@ module_exit(aes_fini); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); -MODULE_ALIAS("aes-asm"); +MODULE_ALIAS_CRYPTO("aes"); +MODULE_ALIAS_CRYPTO("aes-asm"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 3fbe870..f89e749 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -252,14 +252,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { - kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); - kernel_fpu_end(); + nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } + kernel_fpu_end(); return err; } @@ -276,14 +276,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { - kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); - kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } + kernel_fpu_end(); return err; } @@ -300,14 +300,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { - kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); - kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } + kernel_fpu_end(); return err; } @@ -324,14 +324,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { - kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); - kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } + kernel_fpu_end(); return err; } @@ -364,20 +364,18 @@ static int ctr_crypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - kernel_fpu_begin(); aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); - kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } if (walk.nbytes) { - kernel_fpu_begin(); ctr_crypt_final(ctx, &walk); - kernel_fpu_end(); err = blkcipher_walk_done(desc, &walk, 0); } + kernel_fpu_end(); return err; } @@ -1375,4 +1373,4 @@ module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); +MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333..1477cfc 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -481,5 +481,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); +MODULE_ALIAS_CRYPTO("blowfish"); +MODULE_ALIAS_CRYPTO("blowfish-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 414fe5d..da710fc 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -582,5 +582,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 37fd0c0..883e1af 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -574,5 +574,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index c171dcb..5c8b626 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1725,5 +1725,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 2d48e83..d416069 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -60,7 +60,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, bool enc) { - bool fpu_enabled; + bool fpu_enabled = false; struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes; @@ -76,7 +76,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, u8 *wsrc = walk->src.virt.addr; u8 *wdst = walk->dst.virt.addr; - fpu_enabled = cast5_fpu_begin(false, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { @@ -104,9 +104,10 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, } while (nbytes >= bsize); done: - cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } + + cast5_fpu_end(fpu_enabled); return err; } @@ -230,7 +231,7 @@ done: static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled; + bool fpu_enabled = false; struct blkcipher_walk walk; int err; @@ -239,11 +240,12 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(false, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); nbytes = __cbc_decrypt(desc, &walk); - cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } + + cast5_fpu_end(fpu_enabled); return err; } @@ -313,7 +315,7 @@ done: static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled; + bool fpu_enabled = false; struct blkcipher_walk walk; int err; @@ -322,12 +324,13 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(false, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); nbytes = __ctr_crypt(desc, &walk); - cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } + cast5_fpu_end(fpu_enabled); + if (walk.nbytes) { ctr_crypt_final(desc, &walk); err = blkcipher_walk_done(desc, &walk, 0); @@ -491,4 +494,4 @@ module_exit(cast5_exit); MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast5"); +MODULE_ALIAS_CRYPTO("cast5"); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 8d0dfb8..c197562 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -611,4 +611,4 @@ module_exit(cast6_exit); MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast6"); +MODULE_ALIAS_CRYPTO("cast6"); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 9d014a7..1937fc1 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -197,5 +197,5 @@ module_exit(crc32_pclmul_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32"); -MODULE_ALIAS("crc32-pclmul"); +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 6812ad9..28640c3 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -280,5 +280,5 @@ MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.c MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32c"); -MODULE_ALIAS("crc32c-intel"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-intel"); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 7845d7f..b6c67bf 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -147,5 +147,5 @@ MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crct10dif"); -MODULE_ALIAS("crct10dif-pclmul"); +MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-pclmul"); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 98d7a18..f368ba2 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/crypto.h> #include <asm/i387.h> struct crypto_fpu_ctx { @@ -159,3 +160,5 @@ void __exit crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } + +MODULE_ALIAS_CRYPTO("fpu"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index d785cf2..a8d6f69 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -341,4 +341,4 @@ module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " "acclerated by PCLMULQDQ-NI"); -MODULE_ALIAS("ghash"); +MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 4a2bd21..432f1d76 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, void *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = 128 / 8; unsigned int nbytes, i, func_bytes; - bool fpu_enabled; + bool fpu_enabled = false; int err; err = blkcipher_walk_virt(desc, walk); @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, u8 *wdst = walk->dst.virt.addr; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, false, nbytes); + desc, fpu_enabled, nbytes); for (i = 0; i < gctx->num_funcs; i++) { func_bytes = bsize * gctx->funcs[i].num_blocks; @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, } done: - glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } + glue_fpu_end(fpu_enabled); return err; } @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled; + bool fpu_enabled = false; struct blkcipher_walk walk; int err; @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, while ((nbytes = walk.nbytes)) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, false, nbytes); + desc, fpu_enabled, nbytes); nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); - glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } + glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); @@ -278,7 +278,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled; + bool fpu_enabled = false; struct blkcipher_walk walk; int err; @@ -287,12 +287,13 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, while ((nbytes = walk.nbytes) >= bsize) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, false, nbytes); + desc, fpu_enabled, nbytes); nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); - glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } + glue_fpu_end(fpu_enabled); + if (walk.nbytes) { glue_ctr_crypt_final_128bit( gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); @@ -347,7 +348,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, void *tweak_ctx, void *crypt_ctx) { const unsigned int bsize = 128 / 8; - bool fpu_enabled; + bool fpu_enabled = false; struct blkcipher_walk walk; int err; @@ -360,21 +361,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, /* set minimum length to bsize, for tweak_fn */ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, false, + desc, fpu_enabled, nbytes < bsize ? bsize : nbytes); + /* calculate first value of T */ tweak_fn(tweak_ctx, walk.iv, walk.iv); - glue_fpu_end(fpu_enabled); while (nbytes) { - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, false, nbytes); nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); - glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); nbytes = walk.nbytes; } + + glue_fpu_end(fpu_enabled); + return err; } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 5e8e677..399a29d 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -119,5 +119,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); -MODULE_ALIAS("salsa20"); -MODULE_ALIAS("salsa20-asm"); +MODULE_ALIAS_CRYPTO("salsa20"); +MODULE_ALIAS_CRYPTO("salsa20-asm"); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 23aabc6..cb57caf 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -558,5 +558,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("serpent"); -MODULE_ALIAS("serpent-asm"); +MODULE_ALIAS_CRYPTO("serpent"); +MODULE_ALIAS_CRYPTO("serpent-asm"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 9ae83cf..0a86e8b 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -617,4 +617,4 @@ module_exit(serpent_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 97a356e..279f389 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -618,4 +618,4 @@ module_exit(serpent_sse2_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 4a11a9d..29e1060 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -237,4 +237,4 @@ module_exit(sha1_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha1"); +MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index e52947f..4dc100d 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -318,5 +318,5 @@ module_exit(sha256_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha256"); -MODULE_ALIAS("sha384"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha224"); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f30cd10..26a5898 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) /* save number of bits */ bits[1] = cpu_to_be64(sctx->count[0] << 3); - bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61; + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); /* Pad out to 112 mod 128 and append length */ index = sctx->count[0] & 0x7f; @@ -326,5 +326,5 @@ module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha512"); -MODULE_ALIAS("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha384"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index a62ba54..c8c12c1 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -579,4 +579,4 @@ module_exit(twofish_exit); MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("twofish"); +MODULE_ALIAS_CRYPTO("twofish"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 0a52023..77e06c2 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -96,5 +96,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 13e63b3..56d8a08 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -495,5 +495,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4299eb0..92a2e93 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -151,6 +151,16 @@ ENTRY(ia32_sysenter_target) 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -184,6 +194,8 @@ sysexit_from_sys_call: TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 + CFI_RESTORE_STATE + #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common movl %esi,%r9d /* 6th arg: 4th syscall arg */ @@ -226,7 +238,6 @@ sysexit_from_sys_call: .endm sysenter_auditsys: - CFI_RESTORE_STATE auditsys_entry_common movl %ebp,%r9d /* reload 6th syscall arg */ jmp sysenter_dispatch @@ -235,6 +246,11 @@ sysexit_audit: auditsys_exit sysexit_from_sys_call #endif +sysenter_fix_flags: + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq_cfi + jmp sysenter_flags_fixed + sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 89270b4..c2f19a8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -203,6 +203,7 @@ #define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ +#define X86_FEATURE_VMMCALL (8*32+15) /* Prefer vmmcall to vmcall */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index b90e5df..f6aaf7d 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -251,7 +251,8 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } -#define _LDT_empty(info) \ +/* This intentionally ignores lm, since 32-bit apps don't have that field. */ +#define LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ @@ -261,11 +262,18 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) (info)->seg_not_present == 1 && \ (info)->useable == 0) -#ifdef CONFIG_X86_64 -#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) -#else -#define LDT_empty(info) (_LDT_empty(info)) -#endif +/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */ +static inline bool LDT_zero(const struct user_desc *info) +{ + return (info->base_addr == 0 && + info->limit == 0 && + info->contents == 0 && + info->read_exec_only == 0 && + info->seg_32bit == 0 && + info->limit_in_pages == 0 && + info->seg_not_present == 0 && + info->useable == 0); +} static inline void clear_LDT(void) { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9c999c1..01f15b2 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -155,8 +155,9 @@ do { \ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) -#define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) || \ + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) #if __USER32_DS != __USER_DS # error "The following code assumes __USER32_DS == __USER_DS" diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h new file mode 100644 index 0000000..99efebb --- /dev/null +++ b/arch/x86/include/asm/espfix.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_ESPFIX_H +#define _ASM_X86_ESPFIX_H + +#ifdef CONFIG_X86_64 + +#include <asm/percpu.h> + +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +extern void init_espfix_bsp(void); +extern void init_espfix_ap(void); + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a809121..68c0539 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 0ea10f27..cb6cfcd 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -25,6 +25,7 @@ extern void irq_ctx_init(int cpu); #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> +extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index bba3cf8..0a8b519 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void) #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ -#define INTERRUPT_RETURN iretq +#define INTERRUPT_RETURN jmp native_iret #define USERGS_SYSRET64 \ swapgs; \ sysretq; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c76ff74..7cb77dd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -455,7 +455,7 @@ struct kvm_vcpu_arch { bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; - u32 pat; + u64 pat; int switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; @@ -473,6 +473,7 @@ struct kvm_vcpu_arch { u64 mmio_gva; unsigned access; gfn_t mmio_gfn; + u64 mmio_gen; struct kvm_pmu pmu; @@ -973,6 +974,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } +static inline u64 get_canonical(u64 la) +{ + return ((int64_t)la << 16) >> 16; +} + +static inline bool is_noncanonical_address(u64 la) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la) != la; +#else + return false; +#endif +} + #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -1031,7 +1046,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c7678e4..e62cf89 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <asm/processor.h> +#include <asm/alternative.h> #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); @@ -16,10 +17,15 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -/* This instruction is vmcall. On non-VT architectures, it will generate a - * trap that we will then rewrite to the appropriate instruction. +#ifdef CONFIG_DEBUG_RODATA +#define KVM_HYPERCALL \ + ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) +#else +/* On AMD processors, vmcall will generate a trap that we will + * then rewrite to the appropriate instruction. */ #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" +#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17d..3a52ee0 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 43dcd80..d1d2972 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 2d88344..b1609f2 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 942a086..68e9f00 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -232,6 +232,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #define ARCH_HAS_USER_SINGLE_STEP_INFO +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. + */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 3475554..ad1d8ec 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -64,6 +64,8 @@ static inline void x86_ce4100_early_setup(void) { } #ifndef _SETUP +#include <asm/espfix.h> + /* * This is set up by the setup-routine at boot-time */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 6ec0792..35e67a4 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -23,19 +23,6 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; -/* - * Because some traps use the IST stack, we must keep preemption - * disabled while calling do_trap(), but do_trap() may call - * force_sig_info() which will grab the signal spin_locks for the - * task, which in PREEMPT_RT_FULL are mutexes. By defining - * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set - * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the - * trap. - */ -#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) -#define ARCH_RT_DELAYS_SIGNAL_SEND -#endif - #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 64fb5cb..6a99859 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -57,7 +57,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 uninitialized_var(canary); + u64 canary; u64 tsc; #ifdef CONFIG_X86_64 @@ -68,16 +68,8 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. - * - * For preempt-rt we need to weaken the randomness a bit, as - * we can't call into the random generator from atomic context - * due to locking constraints. We just leave canary - * uninitialized and use the TSC based randomness on top of - * it. */ -#ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); -#endif tsc = __native_read_tsc(); canary += tsc + (tsc << 32UL); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f08e527..bbbac92 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -30,8 +30,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - int preempt_lazy_count; /* 0 => lazy preemptable, - <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -83,7 +81,6 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ @@ -108,7 +105,6 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) @@ -149,7 +145,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY) + _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ @@ -158,8 +154,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) -#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) - #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e6d90ba..04905bf 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr) */ static inline void __flush_tlb_up(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); } static inline void flush_tlb_all(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb_all(); } diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 2a46ca7..2874be9 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -34,7 +34,7 @@ static inline unsigned int __getcpu(void) native_read_tscp(&p); } else { /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); } return p; diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index 46727eb..6e1aaf7 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -28,6 +28,13 @@ struct user_desc { unsigned int seg_not_present:1; unsigned int useable:1; #ifdef __x86_64__ + /* + * Because this bit is not present in 32-bit user code, user + * programs can pass uninitialized values here. Therefore, in + * any context in which a user_desc comes from a 32-bit program, + * the kernel must act as though lm == 0, regardless of the + * actual value. + */ unsigned int lm:1; #endif }; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 0e79420..990a2fe 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -67,6 +67,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 +#define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 @@ -114,6 +115,7 @@ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a5408b9..32f1140 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a7eb82d..7170f17 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1282,7 +1282,7 @@ void setup_local_APIC(void) unsigned int value, queued; int i, j, acked = 0; unsigned long long tsc = 0, ntsc; - long long max_loops = cpu_khz; + long long max_loops = cpu_khz ? cpu_khz : 1000000; if (cpu_has_tsc) rdtscll(tsc); @@ -1379,7 +1379,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc) { + if (cpu_has_tsc && cpu_khz) { rdtscll(ntsc); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5816e6a..e63a5bd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2396,8 +2396,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ - if (unlikely(irqd_is_setaffinity_pending(data) && - !irqd_irq_inprogress(data))) { + if (unlikely(irqd_is_setaffinity_pending(data))) { mask_ioapic(cfg); return true; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a36d9cf..2861082 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,7 +33,6 @@ void common(void) { OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_preempt_count, thread_info, preempt_count); - OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28233b9..ee51e67 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -509,6 +509,13 @@ static void early_init_amd(struct cpuinfo_x86 *c) } #endif + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) { u64 val; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3533e2c..00cc6f7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,6 +144,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { + if (strlen(s)) + return 0; setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); setup_clear_cpu_cap(X86_FEATURE_AVX); @@ -1135,7 +1137,7 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| - X86_EFLAGS_IOPL|X86_EFLAGS_AC); + X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 87c0be5..f4a9985 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -154,6 +154,21 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_ERMS); } } + + /* + * Intel Quark Core DevMan_001.pdf section 6.4.11 + * "The operating system also is required to invalidate (i.e., flush) + * the TLB when any changes are made to any of the page table entries. + * The operating system must reload CR3 to cause the TLB to be flushed" + * + * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should + * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified + */ + if (c->x86 == 5 && c->x86_model == 9) { + pr_info("Disabling PGE capability bit\n"); + setup_clear_cpu_cap(X86_FEATURE_PGE); + } } #ifdef CONFIG_X86_32 @@ -369,6 +384,13 @@ static void init_intel(struct cpuinfo_x86 *c) detect_extended_topology(c); l2 = init_intel_cacheinfo(c); + + /* Detect legacy cache sizes if init_intel_cacheinfo did not */ + if (l2 == 0) { + cpu_detect_cache_sizes(c); + l2 = c->x86_cache_size; + } + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ @@ -483,6 +505,13 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) */ if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) size = 256; + + /* + * Intel Quark SoC X1000 contains a 4-way set associative + * 16K cache with a 16 byte cache line and 256 lines per tag + */ + if ((c->x86 == 5) && (c->x86_model == 9)) + size = 16; return size; } #endif @@ -688,7 +717,8 @@ static const struct cpu_dev intel_cpu_dev = { [3] = "OverDrive PODP5V83", [4] = "Pentium MMX", [7] = "Mobile Pentium 75 - 200", - [8] = "Mobile Pentium MMX" + [8] = "Mobile Pentium MMX", + [9] = "Quark SoC X1000", } }, { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3a7ab0b..b3218cd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -18,7 +18,6 @@ #include <linux/rcupdate.h> #include <linux/kobject.h> #include <linux/uaccess.h> -#include <linux/kthread.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/percpu.h> @@ -42,7 +41,6 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> -#include <linux/jiffies.h> #include <asm/processor.h> #include <asm/mce.h> @@ -1270,7 +1268,7 @@ void mce_log_therm_throt_event(__u64 status) static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct hrtimer, mce_timer); +static DEFINE_PER_CPU(struct timer_list, mce_timer); static unsigned long mce_adjust_timer_default(unsigned long interval) { @@ -1280,10 +1278,13 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) +static void mce_timer_fn(unsigned long data) { + struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + WARN_ON(smp_processor_id() != data); + if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -1304,11 +1305,9 @@ static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) __this_cpu_write(mce_next_interval, iv); /* Might have become 0 after CMCI storm subsided */ if (iv) { - hrtimer_forward_now(timer, ns_to_ktime( - jiffies_to_usecs(iv) * 1000ULL)); - return HRTIMER_RESTART; + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); } - return HRTIMER_NORESTART; } /* @@ -1316,37 +1315,28 @@ static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) */ void mce_timer_kick(unsigned long interval) { - struct hrtimer *t = &__get_cpu_var(mce_timer); + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; unsigned long iv = __this_cpu_read(mce_next_interval); - if (hrtimer_active(t)) { - s64 exp; - s64 intv_us; - - intv_us = jiffies_to_usecs(interval); - exp = ktime_to_us(hrtimer_expires_remaining(t)); - if (intv_us < exp) { - hrtimer_cancel(t); - hrtimer_start_range_ns(t, - ns_to_ktime(intv_us * 1000), - 0, HRTIMER_MODE_REL_PINNED); - } + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); } else { - hrtimer_start_range_ns(t, - ns_to_ktime(jiffies_to_usecs(interval) * 1000ULL), - 0, HRTIMER_MODE_REL_PINNED); + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); } if (interval < iv) __this_cpu_write(mce_next_interval, interval); } -/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */ +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - hrtimer_cancel(&per_cpu(mce_timer, cpu)); + del_timer_sync(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1356,63 +1346,6 @@ static void mce_do_trigger(struct work_struct *work) static DECLARE_WORK(mce_trigger_work, mce_do_trigger); -static void __mce_notify_work(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (mce_helper[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); -} - -#ifdef CONFIG_PREEMPT_RT_FULL -struct task_struct *mce_notify_helper; - -static int mce_notify_helper_thread(void *unused) -{ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - if (kthread_should_stop()) - break; - __mce_notify_work(); - } - return 0; -} - -static int mce_notify_work_init(void) -{ - mce_notify_helper = kthread_run(mce_notify_helper_thread, NULL, - "mce-notify"); - if (!mce_notify_helper) - return -ENOMEM; - - return 0; -} - -static void mce_notify_work(void) -{ - wake_up_process(mce_notify_helper); -} -#else -static void mce_notify_work(void) -{ - __mce_notify_work(); -} -static inline int mce_notify_work_init(void) { return 0; } -#endif - /* * Notify the user(s) about new machine check events. * Can be called from interrupt context, but not from machine check/NMI @@ -1420,8 +1353,19 @@ static inline int mce_notify_work_init(void) { return 0; } */ int mce_notify_irq(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + if (test_and_clear_bit(0, &mce_need_notify)) { - mce_notify_work(); + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + if (mce_helper[0]) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + return 1; } return 0; @@ -1692,7 +1636,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct hrtimer *t) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { unsigned long iv = mce_adjust_timer(check_interval * HZ); @@ -1701,17 +1645,16 @@ static void mce_start_timer(unsigned int cpu, struct hrtimer *t) if (mca_cfg.ignore_ce || !iv) return; - hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL), - 0, HRTIMER_MODE_REL_PINNED); + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t, smp_processor_id()); } static void __mcheck_cpu_init_timer(void) { - struct hrtimer *t = &__get_cpu_var(mce_timer); + struct timer_list *t = &__get_cpu_var(mce_timer); unsigned int cpu = smp_processor_id(); - hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - t->function = mce_timer_fn; + setup_timer(t, mce_timer_fn, cpu); mce_start_timer(cpu, t); } @@ -2386,8 +2329,6 @@ static void mce_disable_cpu(void *h) if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - hrtimer_cancel(&__get_cpu_var(mce_timer)); - if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < mca_cfg.banks; i++) { @@ -2414,7 +2355,6 @@ static void mce_reenable_cpu(void *h) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } - __mcheck_cpu_init_timer(); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -2422,6 +2362,7 @@ static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: @@ -2437,9 +2378,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; } @@ -2501,8 +2444,6 @@ static __init int mcheck_init_device(void) /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); - err = mce_notify_work_init(); - return err; } device_initcall_sync(mcheck_init_device); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 71a39f3..6474807 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -63,6 +63,7 @@ static struct clocksource hyperv_cs = { .rating = 400, /* use this when running on Hyperv*/ .read = read_hv_clock, .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init ms_hyperv_init_platform(void) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2..0e25a1b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5edd3c0..c7106f1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index cc16faa..53bd272 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -279,14 +279,16 @@ struct extra_reg { u64 config_mask; u64 valid_mask; int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; }; #define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i, \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ } #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f31a165..b400d0b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1365,6 +1365,15 @@ again: intel_pmu_lbr_read(); /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* * PEBS overflow sets bit 62 in the global status register */ if (__test_and_clear_bit(62, (unsigned long *)&status)) { @@ -2135,6 +2144,41 @@ static void intel_snb_check_microcode(void) } } +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. + */ +static bool check_msr(unsigned long msr, u64 mask) +{ + u64 val_old, val_new, val_tmp; + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + if (rdmsrl_safe(msr, &val_old)) + return false; + + /* + * Only change the bits which can be updated by wrmsrl. + */ + val_tmp = val_old ^ mask; + if (wrmsrl_safe(msr, val_tmp) || + rdmsrl_safe(msr, &val_new)) + return false; + + if (val_new != val_tmp) + return false; + + /* Here it's sure that the MSR can be safely accessed. + * Restore the old value and return. + */ + wrmsrl(msr, val_old); + + return true; +} + static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; @@ -2198,7 +2242,8 @@ __init int intel_pmu_init(void) union cpuid10_ebx ebx; struct event_constraint *c; unsigned int unused; - int version; + struct extra_reg *er; + int version, i; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -2243,10 +2288,7 @@ __init int intel_pmu_init(void) if (version > 1) x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - /* - * v2 and above have a perf capabilities MSR - */ - if (version > 1) { + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); @@ -2404,6 +2446,9 @@ __init int intel_pmu_init(void) case 62: /* IvyBridge EP */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -2503,6 +2548,34 @@ __init int intel_pmu_init(void) } } + /* + * Access LBR MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support LBR MSR + * Check all LBT MSR here. + * Disable LBR access if any LBR MSRs can not be accessed. + */ + if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + x86_pmu.lbr_nr = 0; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && + check_msr(x86_pmu.lbr_to + i, 0xffffUL))) + x86_pmu.lbr_nr = 0; + } + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. + */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } + } + /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { x86_pmu.max_period = x86_pmu.cntval_mask; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 4118f9f..3e1cfbb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2764,6 +2764,17 @@ static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); } +/* + * Using uncore_pmu_event_init pmu event_init callback + * as a detection point for uncore events. + */ +static int uncore_pmu_event_init(struct perf_event *event); + +static bool is_uncore_event(struct perf_event *event) +{ + return event->pmu->event_init == uncore_pmu_event_init; +} + static int uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) { @@ -2778,13 +2789,18 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, b return -EINVAL; n = box->n_events; - box->event_list[n] = leader; - n++; + + if (is_uncore_event(leader)) { + box->event_list[n] = leader; + n++; + } + if (!dogrp) return n; list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) + if (!is_uncore_event(event) || + event->state <= PERF_EVENT_STATE_OFF) continue; if (n >= max_count) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207..66e274a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ STACKFAULT_STACK-1 ] = "#SS", [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e491bfd..1f1c33d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -364,22 +364,14 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all +need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jnz 1f - - cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? - jnz restore_all - testl $_TIF_NEED_RESCHED_LAZY, %ecx jz restore_all - -1: testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - movl TI_flags(%ebp), %ecx # need_resched set ? - testl $_TIF_NEED_RESCHED_MASK, %ecx - jnz 1b - jmp restore_all + jmp need_resched END(resume_kernel) #endif CFI_ENDPROC @@ -442,8 +434,9 @@ sysenter_past_esp: jnz sysenter_audit sysenter_do_call: cmpl $(NR_syscalls), %eax - jae syscall_badsys + jae sysenter_badsys call *sys_call_table(,%eax,4) +sysenter_after_call: movl %eax,PT_EAX(%esp) LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) @@ -524,6 +517,7 @@ ENTRY(system_call) jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) +syscall_after_call: movl %eax,PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT @@ -538,6 +532,7 @@ syscall_exit: restore_all: TRACE_IRQS_IRET restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -548,6 +543,7 @@ restore_all_notrace: cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS +#endif restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code irq_return: @@ -560,13 +556,9 @@ ENTRY(iret_exc) .previous _ASM_EXTABLE(irq_return,iret_exc) +#ifdef CONFIG_X86_ESPFIX32 CFI_RESTORE_STATE ldt_ss: - larl PT_OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? - jnz restore_nocheck # allright, normal return - #ifdef CONFIG_PARAVIRT /* * The kernel can't run on a non-flat stack if paravirt mode @@ -608,6 +600,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck +#endif CFI_ENDPROC ENDPROC(system_call) @@ -615,7 +608,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testl $_TIF_NEED_RESCHED_MASK, %ecx + testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: call schedule @@ -628,7 +621,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testl $_TIF_NEED_RESCHED_MASK, %ecx + testb $_TIF_NEED_RESCHED, %cl jnz work_resched work_notifysig: # deal with pending signals and @@ -698,8 +691,13 @@ syscall_fault: END(syscall_fault) syscall_badsys: - movl $-ENOSYS,PT_EAX(%esp) - jmp resume_userspace + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call END(syscall_badsys) CFI_ENDPROC /* @@ -715,6 +713,7 @@ END(syscall_badsys) * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. */ +#ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ @@ -724,8 +723,10 @@ END(syscall_badsys) pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 +#endif .endm .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax @@ -736,6 +737,7 @@ END(syscall_badsys) /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: +#endif .endm /* @@ -1356,11 +1358,13 @@ END(debug) ENTRY(nmi) RING0_INT_FRAME ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl_cfi %eax je nmi_espfix_stack +#endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl_cfi %eax @@ -1400,6 +1404,7 @@ nmi_debug_stack_check: FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * @@ -1421,6 +1426,7 @@ nmi_espfix_stack: lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 jmp irq_return +#endif CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index abca4f4..e965606 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -58,6 +58,7 @@ #include <asm/asm.h> #include <asm/context_tracking.h> #include <asm/smap.h> +#include <asm/pgtable_types.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -658,8 +659,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - testl $_TIF_NEED_RESCHED_MASK,%edx - jz sysret_signal + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -771,8 +772,8 @@ GLOBAL(int_with_check) /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - testl $_TIF_NEED_RESCHED_MASK,%edx - jz int_very_careful + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1041,38 +1042,58 @@ restore_args: irq_return: INTERRUPT_RETURN - _ASM_EXTABLE(irq_return, bad_iret) -#ifdef CONFIG_PARAVIRT ENTRY(native_iret) - iretq - _ASM_EXTABLE(native_iret, bad_iret) + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif - .section .fixup,"ax" -bad_iret: +.global native_irq_return_iret +native_irq_return_iret: /* - * The iret traps when the %cs or %ss being restored is bogus. - * We've lost the original trap vector and error code. - * #GPF is the most likely one to get for an invalid selector. - * So pretend we completed the iret and took the #GPF in user mode. - * - * We are now running with the kernel GS after exception recovery. - * But error_entry expects us to have user GS to match the user %cs, - * so swap back. + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. */ - pushq $0 + iretq +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi SWAPGS - jmp general_protection - - .previous + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp native_irq_return_iret +#endif /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - testl $_TIF_NEED_RESCHED_MASK,%edx - jz retint_signal + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1105,22 +1126,16 @@ retint_signal: ENTRY(retint_kernel) cmpl $0,TI_preempt_count(%rcx) jnz retint_restore_args - bt $TIF_NEED_RESCHED,TI_flags(%rcx) - jc 1f - - cmpl $0,TI_preempt_lazy_count(%rcx) - jnz retint_restore_args - bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx) + bt $TIF_NEED_RESCHED,TI_flags(%rcx) jnc retint_restore_args - -1: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr #endif - CFI_ENDPROC END(common_interrupt) + /* * End of kprobes section */ @@ -1347,7 +1362,6 @@ bad_gs: jmp 2b .previous -#ifndef CONFIG_PREEMPT_RT_FULL /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC @@ -1367,7 +1381,6 @@ ENTRY(call_softirq) ret CFI_ENDPROC END(call_softirq) -#endif #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback @@ -1483,7 +1496,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +errorentry stack_segment do_stack_segment #ifdef CONFIG_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 @@ -1537,7 +1550,7 @@ paranoid_userspace: movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED_MASK,%ebx + testl $_TIF_NEED_RESCHED,%ebx jnz paranoid_schedule movl %ebx,%edx /* arg3: thread flags */ TRACE_IRQS_ON @@ -1593,16 +1606,15 @@ error_sti: /* * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. */ error_kernelspace: incl %ebx - leaq irq_return(%rip),%rcx + leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) - je error_swapgs + je error_bad_iret movl %ecx,%eax /* zero extend */ cmpq %rax,RIP+8(%rsp) je bstep_iret @@ -1613,7 +1625,15 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - jmp error_swapgs + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti CFI_ENDPROC END(error_entry) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 0000000..94d857f --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,208 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> +#include <asm/espfix.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. + */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e625319..f8ab203 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -297,16 +297,7 @@ int ftrace_int3_handler(struct pt_regs *regs) static int ftrace_write(unsigned long ip, const char *val, int size) { - /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. - * - * For 32bit kernels, these mappings are same and we can use - * kernel identity mapping to modify code. - */ - if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa_symbol(ip)); + ip = text_ip_addr(ip); return probe_kernel_write((void *)ip, val, size); } @@ -659,8 +650,8 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code, ret = -EPERM; goto out; } - run_sync(); out: + run_sync(); return ret; fail_update: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687..3910078 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -262,6 +262,83 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU + +/* These two declarations are only used in check_irq_vectors_for_cpu_disable() + * below, which is protected by stop_machine(). Putting them on the stack + * results in a stack frame overflow. Dynamically allocating could result in a + * failure so declare these two cpumasks as global. + */ +static struct cpumask affinity_new, online_new; + +/* + * This cpu is going to be removed and its vectors migrated to the remaining + * online cpus. Check to see if there are enough vectors in the remaining cpus. + * This function is protected by stop_machine(). + */ +int check_irq_vectors_for_cpu_disable(void) +{ + int irq, cpu; + unsigned int this_cpu, vector, this_count, count; + struct irq_desc *desc; + struct irq_data *data; + + this_cpu = smp_processor_id(); + cpumask_copy(&online_new, cpu_online_mask); + cpu_clear(this_cpu, online_new); + + this_count = 0; + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + irq = __this_cpu_read(vector_irq[vector]); + if (irq >= 0) { + desc = irq_to_desc(irq); + data = irq_desc_get_irq_data(desc); + cpumask_copy(&affinity_new, data->affinity); + cpu_clear(this_cpu, affinity_new); + + /* Do not count inactive or per-cpu irqs. */ + if (!irq_has_action(irq) || irqd_is_per_cpu(data)) + continue; + + /* + * A single irq may be mapped to multiple + * cpu's vector_irq[] (for example IOAPIC cluster + * mode). In this case we have two + * possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer + * a subset of the online cpus but the affinity + * mask is not zero; that is the down'd cpu is the + * last online cpu in a user set affinity mask. + */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; + } + } + + count = 0; + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] < 0) + count++; + } + } + + if (count < this_count) { + pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", + this_cpu, this_count, count); + return -ERANGE; + } + return 0; +} + /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 9da1bc7..4186755 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -149,7 +149,6 @@ void irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void) { unsigned long flags; @@ -180,7 +179,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -#endif bool handle_irq(unsigned irq, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 831f247..d04d3ec 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -88,7 +88,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifndef CONFIG_PREEMPT_RT_FULL + extern void call_softirq(void); asmlinkage void do_softirq(void) @@ -108,4 +108,3 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } -#endif diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3d21f7b..1de84e3 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -38,7 +38,6 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) exiting_irq(); } -#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -49,4 +48,3 @@ void arch_irq_work_raise(void) apic_wait_icr_idle(); #endif } -#endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f96..a1f5b18 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1017,6 +1017,15 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_IF; trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); + + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer to get messed up. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); return 1; } @@ -1042,24 +1051,25 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->ip - 1); struct jprobe *jp = container_of(p, struct jprobe, kp); + void *saved_sp = kcb->jprobe_saved_sp; if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != kcb->jprobe_saved_sp) { + if (stack_addr(regs) != saved_sp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk(KERN_ERR "current sp %p does not match saved sp %p\n", - stack_addr(regs), kcb->jprobe_saved_sp); + stack_addr(regs), saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); show_regs(regs); BUG(); } + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), - kcb->jprobes_stack, - MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); preempt_enable_no_resched(); return 1; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f022c54..e725933 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -280,7 +280,14 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1570e07..23457e5 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -262,7 +262,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc9873..c37886d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34..a1da673 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(pv_cpu_ops, iret, "iretq"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, iret); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6ce0537..884f98f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,7 +36,6 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> -#include <linux/highmem.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -220,35 +219,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); -#ifdef CONFIG_PREEMPT_RT_FULL -static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) -{ - int i; - - /* - * Clear @prev's kmap_atomic mappings - */ - for (i = 0; i < prev_p->kmap_idx; i++) { - int idx = i + KM_TYPE_NR * smp_processor_id(); - pte_t *ptep = kmap_pte - idx; - - kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); - } - /* - * Restore @next_p's kmap_atomic mappings - */ - for (i = 0; i < next_p->kmap_idx; i++) { - int idx = i + KM_TYPE_NR * smp_processor_id(); - - if (!pte_none(next_p->kmap_pte[i])) - set_pte(kmap_pte - idx, next_p->kmap_pte[i]); - } -} -#else -static inline void -switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } -#endif - /* * switch_to(x,y) should switch tasks from x to y. @@ -328,8 +298,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); - switch_kmaps(prev_p, next_p); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7461f50..0686fe3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1441,15 +1441,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, force_sig_info(SIGTRAP, &info, tsk); } - -#ifdef CONFIG_X86_32 -# define IS_IA32 1 -#elif defined CONFIG_IA32_EMULATION -# define IS_IA32 is_compat_task() -#else -# define IS_IA32 0 -#endif - /* * We must return the syscall number to actually look up in the table. * This can be -1L to skip running any syscall at all. @@ -1487,7 +1478,7 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (IS_IA32) + if (is_ia32_task()) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2a26819..80eab01 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail) void arch_remove_reservations(struct resource *avail) { - /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + /* + * Trim out BIOS area (high 2MB) and E820 regions. We do not remove + * the low 1MB unconditionally, as this area is needed for some ISA + * cards requiring a memory range, e.g. the i82365 PCMCIA controller. + */ if (avail->flags & IORESOURCE_MEM) { - if (avail->start < BIOS_END) - avail->start = BIOS_END; resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); remove_e820_regions(avail); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ecfe089..b88fc86 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -673,6 +673,11 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * handler too. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); + /* + * Ensure the signal handler starts with the new fpu state. + */ + if (used_math()) + drop_init_fpu(current); } signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } @@ -739,14 +744,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ -#ifdef ARCH_RT_DELAYS_SIGNAL_SEND - if (unlikely(current->forced_info.si_signo)) { - struct task_struct *t = current; - force_sig_info(t->forced_info.si_signo, &t->forced_info, t); - t->forced_info.si_signo = 0; - } -#endif - if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6cacab6..9ccb7ef 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -265,6 +265,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(); +#endif + + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. @@ -1278,6 +1285,9 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_sibling_mask(cpu)) cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(cpu_sibling_mask(cpu)); cpumask_clear(cpu_core_mask(cpu)); c->phys_proc_id = 0; @@ -1310,6 +1320,12 @@ void cpu_disable_common(void) int native_cpu_disable(void) { + int ret; + + ret = check_irq_vectors_for_cpu_disable(); + if (ret) + return ret; + clear_local_APIC(); cpu_disable_common(); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09..7fc5e84 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,58 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + /* + * For historical reasons (i.e. no one ever documented how any + * of the segmentation APIs work), user programs can and do + * assume that a struct user_desc that's all zeros except for + * entry_number means "no segment at all". This never actually + * worked. In fact, up to Linux 3.19, a struct user_desc like + * this would create a 16-bit read-write segment with base and + * limit both equal to zero. + * + * That was close enough to "no segment at all" until we + * hardened this function to disallow 16-bit TLS segments. Fix + * it up by interpreting these zeroed segments the way that they + * were almost certainly intended to be interpreted. + * + * The correct way to ask for "no segment at all" is to specify + * a user_desc that satisfies LDT_empty. To keep everything + * working, we accept both. + * + * Note that there's a similar kludge in modify_ldt -- look at + * the distinction between modes 1 and 0x11. + */ + if (LDT_empty(info) || LDT_zero(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -40,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info)) + if (LDT_empty(info) || LDT_zero(info)) desc->a = desc->b = 0; else fill_ldt(desc, info); @@ -66,6 +118,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +247,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +261,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6663bb5..88f3d3b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -86,21 +86,9 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void conditional_sti_ist(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs) { -#ifdef CONFIG_X86_64 - /* - * X86_64 uses a per CPU stack on the IST for certain traps - * like int3. The task can not be preempted when using one - * of these stacks, thus preemption must be disabled, otherwise - * the stack can be corrupted if the task is scheduled out, - * and another task comes in and uses this stack. - * - * On x86_32 the task keeps its own stack and it is OK if the - * task schedules out. - */ inc_preempt_count(); -#endif if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -111,13 +99,11 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void conditional_cli_ist(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); -#ifdef CONFIG_X86_64 dec_preempt_count(); -#endif } static int __kprobes @@ -235,33 +221,41 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - enum ctx_state prev_state; - - prev_state = exception_enter(); - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - conditional_sti_ist(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - conditional_cli_ist(regs); - } - exception_exit(prev_state); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -361,9 +355,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - conditional_sti_ist(regs); + preempt_conditional_sti(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: exception_exit(prev_state); @@ -375,7 +369,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage notrace __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -394,6 +388,35 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) *regs = *eregs; return regs; } + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage __visible notrace __kprobes +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} #endif /* @@ -469,12 +492,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - conditional_sti_ist(regs); + preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); goto exit; } @@ -494,7 +517,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: @@ -766,7 +789,7 @@ void __init trap_init(void) set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); set_intr_gate(X86_TRAP_TS, &invalid_TSS); set_intr_gate(X86_TRAP_NP, &segment_not_present); - set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_SS, stack_segment); set_intr_gate(X86_TRAP_GP, &general_protection); set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); set_intr_gate(X86_TRAP_MF, &coprocessor_error); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d4..cefe57c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -386,7 +386,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - pr_err("Fast TSC calibration failed\n"); + pr_info("Fast TSC calibration failed\n"); return 0; success: @@ -974,14 +974,17 @@ void __init tsc_init(void) x86_init.timers.tsc_pre_init(); - if (!cpu_has_tsc) + if (!cpu_has_tsc) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1f96f93..09ce23a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -125,10 +125,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 422fd82..f5869fc 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -268,8 +268,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) return -1; - drop_init_fpu(tsk); /* trigger finit */ - return 0; } @@ -399,8 +397,11 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) set_used_math(); } - if (use_eager_fpu()) + if (use_eager_fpu()) { + preempt_disable(); math_state_restore(); + preempt_enable(); + } return err; } else { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 92e6f4a..ab1d459 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -498,11 +498,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - register_address_increment(ctxt, &ctxt->_eip, rel); -} - static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); @@ -576,6 +571,38 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + int cs_l) +{ + switch (ctxt->op_bytes) { + case 2: + ctxt->_eip = (u16)dst; + break; + case 4: + ctxt->_eip = (u32)dst; + break; + case 8: + if ((cs_l && is_noncanonical_address(dst)) || + (!cs_l && (dst & ~(u32)-1))) + return emulate_gp(ctxt, 0); + ctxt->_eip = dst; + break; + default: + WARN(1, "unsupported eip assignment size\n"); + } + return X86EMUL_CONTINUE; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); +} + +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + return assign_eip_near(ctxt, ctxt->_eip + rel); +} + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; @@ -1964,13 +1991,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + break; ctxt->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -2002,16 +2031,21 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - return em_pop(ctxt); + int rc; + unsigned long eip; + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + return assign_eip_near(ctxt, eip); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long cs; + int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2021,6 +2055,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; + /* Outer-privilege level return is not implemented */ + if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) + return X86EMUL_UNHANDLEABLE; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); return rc; } @@ -2279,7 +2316,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; - u64 msr_data; + u64 msr_data, rcx, rdx; int usermode; u16 cs_sel = 0, ss_sel = 0; @@ -2295,6 +2332,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) else usermode = X86EMUL_MODE_PROT32; + rcx = reg_read(ctxt, VCPU_REGS_RCX); + rdx = reg_read(ctxt, VCPU_REGS_RDX); + cs.dpl = 3; ss.dpl = 3; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); @@ -2312,6 +2352,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; + if (is_noncanonical_address(rcx) || + is_noncanonical_address(rdx)) + return emulate_gp(ctxt, 0); break; } cs_sel |= SELECTOR_RPL_MASK; @@ -2320,8 +2363,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); - *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); + ctxt->_eip = rdx; + *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; } @@ -2860,10 +2903,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt) static int em_call(struct x86_emulate_ctxt *ctxt) { + int rc; long rel = ctxt->src.val; ctxt->src.val = (unsigned long)ctxt->_eip; - jmp_rel(ctxt, rel); + rc = jmp_rel(ctxt, rel); + if (rc != X86EMUL_CONTINUE) + return rc; return em_push(ctxt); } @@ -2895,11 +2941,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; + unsigned long eip; - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_near(ctxt, eip); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); @@ -3189,20 +3236,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_in(struct x86_emulate_ctxt *ctxt) @@ -4554,7 +4605,7 @@ special_insn: break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; @@ -4583,7 +4634,7 @@ special_insn: break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -4703,7 +4754,7 @@ twobyte_insn: break; case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 518d864..298781d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) return; timer = &pit->pit_state.timer; + mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_pit *pit) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d86ff15..92bbb39 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { + /* Note that we never get here with APIC virtualization enabled. */ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) apic->highest_isr_cache = vec; } +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + /* + * Note that isr_count is always 1, and highest_isr_cache + * is always -1, with APIC virtualization enabled. + */ + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + struct kvm_vcpu *vcpu; + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; + + /* + * We do get here for APIC virtualization enabled if the guest + * uses the Hyper-V APIC enlightenment. In this case we may need + * to trigger a new interrupt delivery by writing the SVI field; + * on the other hand isr_count and highest_isr_cache are unused + * and must be left alone. + */ + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); + else { --apic->isr_count; - BUG_ON(apic->isr_count < 0); - apic->highest_isr_cache = -1; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; + } } int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) @@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - /* Note that isr_count is always 1 with vid enabled */ - if (!apic->isr_count) - return -1; - if (likely(apic->highest_isr_cache != -1)) - return apic->highest_isr_cache; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + /* Note that we never get here with APIC virtualization enabled. */ + if (vector == -1) return -1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 74dd129..8ad01b4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -198,16 +198,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* - * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, - * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation - * number. + * the low bit of the generation number is always presumed to be zero. + * This disables mmio caching during memslot updates. The concept is + * similar to a seqcount but instead of retrying the access we just punt + * and ignore the cache. + * + * spte bits 3-11 are used as bits 1-9 of the generation number, + * the bits 52-61 are used as bits 10-19 of the generation number. */ -#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_LOW_SHIFT 2 #define MMIO_SPTE_GEN_HIGH_SHIFT 52 -#define MMIO_GEN_SHIFT 19 -#define MMIO_GEN_LOW_SHIFT 9 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_SHIFT 20 +#define MMIO_GEN_LOW_SHIFT 10 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) @@ -3161,7 +3165,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - vcpu_clear_mmio_info(vcpu, ~0ul); + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -4424,8 +4428,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { - printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); + if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 612c717..5dcdff5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3204,7 +3204,7 @@ static int wrmsr_interception(struct vcpu_svm *svm) msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, &msr)) { + if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3486,9 +3486,9 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } return svm_exit_handlers[exit_code](svm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 59181e6..c7663b1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2540,12 +2540,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } @@ -5113,7 +5116,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -6385,6 +6388,12 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6430,6 +6439,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, }; static const int kvm_vmx_max_exit_handlers = @@ -6656,7 +6666,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6812,10 +6822,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1b7b17..790551b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -225,20 +225,25 @@ static void kvm_shared_msr_cpu_online(void) shared_msr_update(i, shared_msrs_global.msrs[i]); } -void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + int err; if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; + return 0; smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + if (err) + return 1; + if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); smsr->registered = true; } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); @@ -910,7 +915,6 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -918,8 +922,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); */ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { + switch (msr->index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(msr->data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + msr->data = get_canonical(msr->data); + } return kvm_x86_ops->set_msr(vcpu, msr); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention @@ -1073,7 +1103,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -4842,7 +4871,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -5493,13 +5522,6 @@ int kvm_arch_init(void *opaque) goto out; } -#ifdef CONFIG_PREEMPT_RT_FULL - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); - return -EOPNOTSUPP; - } -#endif - r = kvm_mmu_module_init(); if (r) goto out_free_percpu; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3186542..7626d3e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -78,15 +78,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; + vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; +} + +static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation; } /* - * Clear the mmio cache info for the given gva, - * specially, if gva is ~0ul, we clear all mmio cache info. + * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we + * clear all mmio cache info. */ +#define MMIO_GVA_ANY (~(gva_t)0) + static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) { - if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) + if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) return; vcpu->arch.mmio_gva = 0; @@ -94,7 +102,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) { - if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && + vcpu->arch.mmio_gva == (gva & PAGE_MASK)) return true; return false; @@ -102,7 +111,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn && + vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) return true; return false; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0002a3a..3620928 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,11 +30,13 @@ struct pg_state { unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; + unsigned long lines; }; struct addr_marker { unsigned long start_address; const char *name; + unsigned long max_lines; }; /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -45,6 +47,7 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, + ESPFIX_START_NR, HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, @@ -67,6 +70,7 @@ static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, + { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -163,7 +167,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, pgprot_t new_prot, int level) { pgprotval_t prot, cur; - static const char units[] = "KMGTPE"; + static const char units[] = "BKMGTPE"; /* * If we have a "break" in the series, we need to flush the state that @@ -178,6 +182,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_prot = new_prot; st->level = level; st->marker = address_markers; + st->lines = 0; seq_printf(m, "---[ %s ]---\n", st->marker->name); } else if (prot != cur || level != st->level || st->current_address >= st->marker[1].start_address) { @@ -188,17 +193,21 @@ static void note_page(struct seq_file *m, struct pg_state *st, /* * Now print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - - delta = (st->current_address - st->start_address) >> 10; - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); } - seq_printf(m, "%9lu%c ", delta, *unit); - printk_prot(m, st->current_prot, st->level); + st->lines++; /* * We print markers for special areas of address space, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b5c8e37..5b90bbcad9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1098,7 +1098,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(!mm || pagefault_disabled())) { + if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 7f96844..4500142 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -32,7 +32,6 @@ EXPORT_SYMBOL(kunmap); */ void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - pte_t pte = mk_pte(page, prot); unsigned long vaddr; int idx, type; @@ -46,10 +45,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); -#ifdef CONFIG_PREEMPT_RT_FULL - current->kmap_pte[type] = pte; -#endif - set_pte(kmap_pte-idx, pte); + set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -92,9 +88,6 @@ void __kunmap_atomic(void *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. */ -#ifdef CONFIG_PREEMPT_RT_FULL - current->kmap_pte[type] = __pte(0); -#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); arch_flush_lazy_mmu_mode(); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 9d980d8..fa029fb 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -int pmd_huge_support(void) -{ - return 0; -} #else struct page * @@ -80,11 +75,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } - -int pmd_huge_support(void) -{ - return 1; -} #endif /* x86_64 also uses this file */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 104d56a..b599241 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1110,7 +1110,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1121,7 +1121,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 62377d6..7b179b4 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -56,7 +56,6 @@ EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - pte_t pte = pfn_pte(pfn, prot); unsigned long vaddr; int idx, type; @@ -65,12 +64,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - WARN_ON(!pte_none(*(kmap_pte - idx))); - -#ifdef CONFIG_PREEMPT_RT_FULL - current->kmap_pte[type] = pte; -#endif - set_pte(kmap_pte - idx, pte); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -116,9 +110,6 @@ iounmap_atomic(void __iomem *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. */ -#ifdef CONFIG_PREEMPT_RT_FULL - current->kmap_pte[type] = __pte(0); -#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580c..94bd247 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return 1; + + WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + + return 0; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ + pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) - return NULL; - WARN_ON_ONCE(is_ram); - } + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; /* * Mappings have to be page-aligned diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bb32480..aabdf76 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -389,7 +389,7 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; - phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; return (phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfa537a..5da29d0 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. ] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3..dd8dda1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -151,44 +151,19 @@ void flush_tlb_current_task(void) preempt_disable(); - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } -/* - * It can find out the THP large page, or - * HUGETLB page in tlb_flush when THP disabled - */ -static inline unsigned long has_large_page(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - unsigned long addr = ALIGN(start, HPAGE_SIZE); - for (; addr < end; addr += HPAGE_SIZE) { - pgd = pgd_offset(mm, addr); - if (likely(!pgd_none(*pgd))) { - pud = pud_offset(pgd, addr); - if (likely(!pud_none(*pud))) { - pmd = pmd_offset(pud, addr); - if (likely(!pmd_none(*pmd))) - if (pmd_large(*pmd)) - return addr; - } - } - } - return 0; -} - void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { unsigned long addr; unsigned act_entries, tlb_entries = 0; + unsigned long nr_base_pages; preempt_disable(); if (current->active_mm != mm) @@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, tlb_entries = tlb_lli_4k[ENTRIES]; else tlb_entries = tlb_lld_4k[ENTRIES]; + /* Assume all of TLB entries was occupied by this task */ - act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; + act_entries = tlb_entries >> tlb_flushall_shift; + act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; + nr_base_pages = (end - start) >> PAGE_SHIFT; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + if (nr_base_pages > act_entries) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { - if (has_large_page(mm, start, end)) { - local_flush_tlb(); - goto flush_all; - } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index db6b1ab..96a159a 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; + } else if (res->flags & IORESOURCE_MEM) { + /* The low 1MB range is reserved for ISA cards */ + if (start < BIOS_END) + start = BIOS_END; } return start; } diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65d..63a8993 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -212,10 +212,10 @@ 203 common sched_setaffinity sys_sched_setaffinity 204 common sched_getaffinity sys_sched_getaffinity 205 64 set_thread_area -206 common io_setup sys_io_setup +206 64 io_setup sys_io_setup 207 common io_destroy sys_io_destroy 208 common io_getevents sys_io_getevents -209 common io_submit sys_io_submit +209 64 io_submit sys_io_submit 210 common io_cancel sys_io_cancel 211 64 get_thread_area 212 common lookup_dcookie sys_lookup_dcookie @@ -356,3 +356,5 @@ 540 x32 process_vm_writev compat_sys_process_vm_writev 541 x32 setsockopt compat_sys_setsockopt 542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 531d426..bd16d6c 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -34,7 +34,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723..9578308 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -46,7 +46,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 431e875..ab6ba35 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -117,30 +117,45 @@ subsys_initcall(init_vdso); struct linux_binprm; -/* Put the vdso above the (randomized) stack with another randomized offset. - This way there is no hole in the middle of address space. - To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits */ +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ static unsigned long vdso_addr(unsigned long start, unsigned len) { unsigned long addr, end; unsigned offset; - end = (start + PMD_SIZE - 1) & PMD_MASK; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; if (end >= TASK_SIZE_MAX) end = TASK_SIZE_MAX; end -= len; - /* This loses some more bits than a modulo, but is cheaper */ - offset = get_random_int() & (PTRS_PER_PTE - 1); - addr = start + (offset << PAGE_SHIFT); - if (addr >= end) - addr = end; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } /* - * page-align it here so that get_unmapped_area doesn't - * align it wrongfully again to the next page. addr can come in 4K - * unaligned here as a result of stack start randomization. + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. */ - addr = PAGE_ALIGN(addr); addr = align_vdso_addr(addr); return addr; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index be6b860..ba81b54 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index ee36589..90bfa52 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -445,7 +445,7 @@ void xen_setup_timer(int cpu) irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, IRQF_DISABLED|IRQF_PERCPU| IRQF_NOBALANCING|IRQF_TIMER| - IRQF_FORCE_RESUME, + IRQF_FORCE_RESUME|IRQF_EARLY_RESUME, name, NULL); memcpy(evt, xen_clockevent, sizeof(*evt)); |