summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorScott Wood <scottwood@freescale.com>2015-02-13 22:12:06 (GMT)
committerScott Wood <scottwood@freescale.com>2015-02-13 22:19:22 (GMT)
commit6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch)
treef558a94f1553814cc122ab8d9e04c0ebad5262a5 /arch/x86
parentfcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff)
downloadlinux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz
Reset to 3.12.37
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig47
-rw-r--r--arch/x86/boot/compressed/eboot.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S14
-rw-r--r--arch/x86/boot/compressed/head_64.S9
-rw-r--r--arch/x86/boot/compressed/misc.c9
-rw-r--r--arch/x86/boot/header.S26
-rw-r--r--arch/x86/boot/tools/build.c37
-rw-r--r--arch/x86/crypto/aes_glue.c4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c26
-rw-r--r--arch/x86/crypto/blowfish_glue.c4
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c4
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c4
-rw-r--r--arch/x86/crypto/camellia_glue.c4
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c23
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c2
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c4
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/fpu.c3
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/crypto/glue_helper.c31
-rw-r--r--arch/x86/crypto/salsa20_glue.c4
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c4
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c2
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c2
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c2
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c4
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c6
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c2
-rw-r--r--arch/x86/crypto/twofish_glue.c4
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c4
-rw-r--r--arch/x86/ia32/ia32entry.S18
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/desc.h20
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/espfix.h16
-rw-r--r--arch/x86/include/asm/hugetlb.h1
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h19
-rw-r--r--arch/x86/include/asm/kvm_para.h10
-rw-r--r--arch/x86/include/asm/page_32_types.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h11
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h2
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/signal.h13
-rw-r--r--arch/x86/include/asm/stackprotector.h10
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/include/asm/vsyscall.h2
-rw-r--r--arch/x86/include/uapi/asm/ldt.h7
-rw-r--r--arch/x86/include/uapi/asm/vmx.h2
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c3
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c7
-rw-r--r--arch/x86/kernel/cpu/common.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c32
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c131
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event.h12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c83
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c22
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/entry_32.S48
-rw-r--r--arch/x86/kernel/entry_64.S108
-rw-r--r--arch/x86/kernel/espfix_64.c208
-rw-r--r--arch/x86/kernel/ftrace.c13
-rw-r--r--arch/x86/kernel/irq.c77
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c3
-rw-r--r--arch/x86/kernel/irq_work.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c20
-rw-r--r--arch/x86/kernel/kvm.c9
-rw-r--r--arch/x86/kernel/kvmclock.c1
-rw-r--r--arch/x86/kernel/ldt.c5
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/process_32.c32
-rw-r--r--arch/x86/kernel/ptrace.c11
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/signal.c13
-rw-r--r--arch/x86/kernel/smpboot.c16
-rw-r--r--arch/x86/kernel/tls.c62
-rw-r--r--arch/x86/kernel/traps.c101
-rw-r--r--arch/x86/kernel/tsc.c7
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kernel/xsave.c7
-rw-r--r--arch/x86/kvm/emulate.c103
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/lapic.c62
-rw-r--r--arch/x86/kvm/mmu.c24
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/vmx.c24
-rw-r--r--arch/x86/kvm/x86.c48
-rw-r--r--arch/x86/kvm/x86.h20
-rw-r--r--arch/x86/mm/dump_pagetables.c31
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/highmem_32.c9
-rw-r--r--arch/x86/mm/hugetlbpage.c10
-rw-r--r--arch/x86/mm/init_64.c11
-rw-r--r--arch/x86/mm/iomap_32.c11
-rw-r--r--arch/x86/mm/ioremap.c26
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pgtable.c21
-rw-r--r--arch/x86/mm/tlb.c52
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/syscalls/syscall_64.tbl6
-rw-r--r--arch/x86/um/sys_call_table_32.c2
-rw-r--r--arch/x86/um/sys_call_table_64.c2
-rw-r--r--arch/x86/vdso/vma.c43
-rw-r--r--arch/x86/xen/spinlock.c5
-rw-r--r--arch/x86/xen/time.c2
116 files changed, 1399 insertions, 676 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index edbb857..5b5c6ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,6 @@ config X86_64
### Arch settings
config X86
def_bool y
- select HAVE_PREEMPT_LAZY
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -124,6 +123,7 @@ config X86
select COMPAT_OLD_SIGACTION if IA32_EMULATION
select RTC_LIB
select HAVE_DEBUG_STACKOVERFLOW
+ select ARCH_SUPPORTS_ATOMIC_RMW
config INSTRUCTION_DECODER
def_bool y
@@ -180,11 +180,8 @@ config ARCH_MAY_HAVE_PC_FDC
def_bool y
depends on ISA_DMA_API
-config RWSEM_GENERIC_SPINLOCK
- def_bool PREEMPT_RT_FULL
-
config RWSEM_XCHGADD_ALGORITHM
- def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
+ def_bool y
config GENERIC_CALIBRATE_DELAY
def_bool y
@@ -474,7 +471,7 @@ config X86_MDFLD
select MFD_INTEL_MSIC
---help---
Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
- Internet Device(MID) platform.
+ Internet Device(MID) platform.
Unlike standard x86 PCs, Medfield does not have many legacy devices
nor standard legacy replacement devices/features. e.g. Medfield does
not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
@@ -821,7 +818,7 @@ config IOMMU_HELPER
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
- select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
+ select CPUMASK_OFFSTACK
---help---
Enable maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
@@ -864,7 +861,7 @@ source "kernel/Kconfig.preempt"
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
@@ -875,6 +872,10 @@ config X86_UP_APIC
performance counters), and the NMI watchdog which detects hard
lockups.
+config X86_UP_APIC_MSI
+ def_bool y
+ select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
+
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
@@ -976,10 +977,27 @@ config VM86
default y
depends on X86_32
---help---
- This option is required by programs like DOSEMU to run 16-bit legacy
- code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ This option is required by programs like DOSEMU to run
+ 16-bit real mode legacy code on x86 processors. It also may
+ be needed by software like XFree86 to initialize some video
+ cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
+ default y
+ ---help---
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+ plus 16K runtime memory on x86-64,
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+ def_bool y
+ depends on X86_16BIT && X86_64
config TOSHIBA
tristate "Toshiba Laptop support"
@@ -1594,6 +1612,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
depends on EFI
+ select RELOCATABLE
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
@@ -1885,6 +1904,10 @@ config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on X86_64 && HUGETLB_PAGE && MIGRATION
+
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index b7388a4..9b883a8 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -865,6 +865,9 @@ fail:
* Because the x86 boot code expects to be passed a boot_params we
* need to create one ourselves (usually the bootloader would create
* one for us).
+ *
+ * The caller is responsible for filling out ->code32_start in the
+ * returned boot_params.
*/
struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
{
@@ -921,8 +924,6 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
hdr->vid_mode = 0xffff;
hdr->boot_flag = 0xAA55;
- hdr->code32_start = (__u64)(unsigned long)image->image_base;
-
hdr->type_of_loader = 0x21;
/* Convert unicode cmdline to ascii */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 5d6f689..b1bd969 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -50,6 +50,13 @@ ENTRY(efi_pe_entry)
pushl %eax
pushl %esi
pushl %ecx
+
+ call reloc
+reloc:
+ popl %ecx
+ subl reloc, %ecx
+ movl %ecx, BP_code32_start(%eax)
+
sub $0x4, %esp
ENTRY(efi_stub_entry)
@@ -63,12 +70,7 @@ ENTRY(efi_stub_entry)
hlt
jmp 1b
2:
- call 3f
-3:
- popl %eax
- subl $3b, %eax
- subl BP_pref_address(%esi), %eax
- add BP_code32_start(%esi), %eax
+ movl BP_code32_start(%esi), %eax
leal preferred_addr(%eax), %eax
jmp *%eax
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c337422..a558403 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -215,6 +215,8 @@ ENTRY(efi_pe_entry)
cmpq $0,%rax
je 1f
mov %rax, %rdx
+ leaq startup_32(%rip), %rax
+ movl %eax, BP_code32_start(%rdx)
popq %rsi
popq %rdi
@@ -228,12 +230,7 @@ ENTRY(efi_stub_entry)
hlt
jmp 1b
2:
- call 3f
-3:
- popq %rax
- subq $3b, %rax
- subq BP_pref_address(%rsi), %rax
- add BP_code32_start(%esi), %eax
+ movl BP_code32_start(%esi), %eax
leaq preferred_addr(%rax), %rax
jmp *%rax
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 434f077..1b05afd 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -401,6 +401,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
unsigned char *output,
unsigned long output_len)
{
+ unsigned char *output_orig = output;
+
real_mode = rmode;
sanitize_boot_params(real_mode);
@@ -439,7 +441,12 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
debug_putstr("\nDecompressing Linux... ");
decompress(input_data, input_len, NULL, NULL, output, NULL, error);
parse_elf(output);
- handle_relocations(output, output_len);
+ /*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if kASLR has chosen a different load address.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig)
+ handle_relocations(output, output_len);
debug_putstr("done.\nBooting the kernel.\n");
return;
}
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1..4257124 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0
#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index c941d6a..687dd28 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -141,7 +141,7 @@ static void usage(void)
#ifdef CONFIG_EFI_STUB
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -162,10 +162,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);
/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);
/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);
/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -177,6 +177,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -201,9 +206,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
pe_header = get_unaligned_le32(&buf[0x3c]);
- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -218,6 +220,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
#endif /* CONFIG_EFI_STUB */
@@ -269,6 +287,9 @@ int main(int argc, char ** argv)
int fd;
void *kernel;
u32 crc = 0xffffffffUL;
+#ifdef CONFIG_EFI_STUB
+ unsigned int init_sz;
+#endif
/* Defaults for old kernel */
#ifdef CONFIG_X86_32
@@ -339,7 +360,9 @@ int main(int argc, char ** argv)
put_unaligned_le32(sys_size, &buf[0x1f4]);
#ifdef CONFIG_EFI_STUB
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */
efi_stub_entry -= 0x200;
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index aafe8ce..e26984f 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -66,5 +66,5 @@ module_exit(aes_fini);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
-MODULE_ALIAS("aes-asm");
+MODULE_ALIAS_CRYPTO("aes");
+MODULE_ALIAS_CRYPTO("aes-asm");
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3fbe870..f89e749 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -252,14 +252,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
- kernel_fpu_begin();
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
- kernel_fpu_end();
+ nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();
return err;
}
@@ -276,14 +276,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
- kernel_fpu_begin();
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
- kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();
return err;
}
@@ -300,14 +300,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
- kernel_fpu_begin();
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
- kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();
return err;
}
@@ -324,14 +324,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
- kernel_fpu_begin();
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
- kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ kernel_fpu_end();
return err;
}
@@ -364,20 +364,18 @@ static int ctr_crypt(struct blkcipher_desc *desc,
err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- kernel_fpu_begin();
aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
- kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
if (walk.nbytes) {
- kernel_fpu_begin();
ctr_crypt_final(ctx, &walk);
- kernel_fpu_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
+ kernel_fpu_end();
return err;
}
@@ -1375,4 +1373,4 @@ module_exit(aesni_exit);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 50ec333..1477cfc 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -481,5 +481,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
+MODULE_ALIAS_CRYPTO("blowfish");
+MODULE_ALIAS_CRYPTO("blowfish-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 414fe5d..da710fc 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -582,5 +582,5 @@ module_exit(camellia_aesni_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 37fd0c0..883e1af 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -574,5 +574,5 @@ module_exit(camellia_aesni_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index c171dcb..5c8b626 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1725,5 +1725,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 2d48e83..d416069 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -60,7 +60,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
bool enc)
{
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes;
@@ -76,7 +76,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
u8 *wsrc = walk->src.virt.addr;
u8 *wdst = walk->dst.virt.addr;
- fpu_enabled = cast5_fpu_begin(false, nbytes);
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
@@ -104,9 +104,10 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
} while (nbytes >= bsize);
done:
- cast5_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, walk, nbytes);
}
+
+ cast5_fpu_end(fpu_enabled);
return err;
}
@@ -230,7 +231,7 @@ done:
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
@@ -239,11 +240,12 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(false, nbytes);
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
nbytes = __cbc_decrypt(desc, &walk);
- cast5_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+
+ cast5_fpu_end(fpu_enabled);
return err;
}
@@ -313,7 +315,7 @@ done:
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
@@ -322,12 +324,13 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(false, nbytes);
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
nbytes = __ctr_crypt(desc, &walk);
- cast5_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ cast5_fpu_end(fpu_enabled);
+
if (walk.nbytes) {
ctr_crypt_final(desc, &walk);
err = blkcipher_walk_done(desc, &walk, 0);
@@ -491,4 +494,4 @@ module_exit(cast5_exit);
MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("cast5");
+MODULE_ALIAS_CRYPTO("cast5");
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 8d0dfb8..c197562 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -611,4 +611,4 @@ module_exit(cast6_exit);
MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("cast6");
+MODULE_ALIAS_CRYPTO("cast6");
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 9d014a7..1937fc1 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -197,5 +197,5 @@ module_exit(crc32_pclmul_mod_fini);
MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crc32");
-MODULE_ALIAS("crc32-pclmul");
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-pclmul");
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 6812ad9..28640c3 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -280,5 +280,5 @@ MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.c
MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crc32c");
-MODULE_ALIAS("crc32c-intel");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-intel");
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 7845d7f..b6c67bf 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -147,5 +147,5 @@ MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crct10dif");
-MODULE_ALIAS("crct10dif-pclmul");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-pclmul");
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 98d7a18..f368ba2 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/crypto.h>
#include <asm/i387.h>
struct crypto_fpu_ctx {
@@ -159,3 +160,5 @@ void __exit crypto_fpu_exit(void)
{
crypto_unregister_template(&crypto_fpu_tmpl);
}
+
+MODULE_ALIAS_CRYPTO("fpu");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index d785cf2..a8d6f69 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -341,4 +341,4 @@ module_exit(ghash_pclmulqdqni_mod_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
"acclerated by PCLMULQDQ-NI");
-MODULE_ALIAS("ghash");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 4a2bd21..432f1d76 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
void *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = 128 / 8;
unsigned int nbytes, i, func_bytes;
- bool fpu_enabled;
+ bool fpu_enabled = false;
int err;
err = blkcipher_walk_virt(desc, walk);
@@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
u8 *wdst = walk->dst.virt.addr;
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, false, nbytes);
+ desc, fpu_enabled, nbytes);
for (i = 0; i < gctx->num_funcs; i++) {
func_bytes = bsize * gctx->funcs[i].num_blocks;
@@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
}
done:
- glue_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, walk, nbytes);
}
+ glue_fpu_end(fpu_enabled);
return err;
}
@@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
struct scatterlist *src, unsigned int nbytes)
{
const unsigned int bsize = 128 / 8;
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
@@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
while ((nbytes = walk.nbytes)) {
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, false, nbytes);
+ desc, fpu_enabled, nbytes);
nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
- glue_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ glue_fpu_end(fpu_enabled);
return err;
}
EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
@@ -278,7 +278,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
struct scatterlist *src, unsigned int nbytes)
{
const unsigned int bsize = 128 / 8;
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
@@ -287,12 +287,13 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
while ((nbytes = walk.nbytes) >= bsize) {
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, false, nbytes);
+ desc, fpu_enabled, nbytes);
nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
- glue_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
+ glue_fpu_end(fpu_enabled);
+
if (walk.nbytes) {
glue_ctr_crypt_final_128bit(
gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
@@ -347,7 +348,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
void *tweak_ctx, void *crypt_ctx)
{
const unsigned int bsize = 128 / 8;
- bool fpu_enabled;
+ bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
@@ -360,21 +361,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
/* set minimum length to bsize, for tweak_fn */
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, false,
+ desc, fpu_enabled,
nbytes < bsize ? bsize : nbytes);
+
/* calculate first value of T */
tweak_fn(tweak_ctx, walk.iv, walk.iv);
- glue_fpu_end(fpu_enabled);
while (nbytes) {
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, false, nbytes);
nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
- glue_fpu_end(fpu_enabled);
err = blkcipher_walk_done(desc, &walk, nbytes);
nbytes = walk.nbytes;
}
+
+ glue_fpu_end(fpu_enabled);
+
return err;
}
EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 5e8e677..399a29d 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -119,5 +119,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS("salsa20");
-MODULE_ALIAS("salsa20-asm");
+MODULE_ALIAS_CRYPTO("salsa20");
+MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 23aabc6..cb57caf 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -558,5 +558,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("serpent");
-MODULE_ALIAS("serpent-asm");
+MODULE_ALIAS_CRYPTO("serpent");
+MODULE_ALIAS_CRYPTO("serpent-asm");
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 9ae83cf..0a86e8b 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -617,4 +617,4 @@ module_exit(serpent_exit);
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("serpent");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 97a356e..279f389 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -618,4 +618,4 @@ module_exit(serpent_sse2_exit);
MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("serpent");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 4a11a9d..29e1060 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -237,4 +237,4 @@ module_exit(sha1_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha1");
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index e52947f..4dc100d 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -318,5 +318,5 @@ module_exit(sha256_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha256");
-MODULE_ALIAS("sha384");
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha224");
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10..26a5898 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
@@ -326,5 +326,5 @@ module_exit(sha512_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha512");
-MODULE_ALIAS("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha384");
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index a62ba54..c8c12c1 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -579,4 +579,4 @@ module_exit(twofish_exit);
MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("twofish");
+MODULE_ALIAS_CRYPTO("twofish");
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 0a52023..77e06c2 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -96,5 +96,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 13e63b3..56d8a08 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -495,5 +495,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb0..92a2e93 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,16 @@ ENTRY(ia32_sysenter_target)
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
+
+ /*
+ * Sysenter doesn't filter flags, so we need to clear NT
+ * ourselves. To save a few cycles, we can check whether
+ * NT was set instead of doing an unconditional popfq.
+ */
+ testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+ jnz sysenter_fix_flags
+sysenter_flags_fixed:
+
orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
CFI_REMEMBER_STATE
@@ -184,6 +194,8 @@ sysexit_from_sys_call:
TRACE_IRQS_ON
ENABLE_INTERRUPTS_SYSEXIT32
+ CFI_RESTORE_STATE
+
#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
movl %esi,%r9d /* 6th arg: 4th syscall arg */
@@ -226,7 +238,6 @@ sysexit_from_sys_call:
.endm
sysenter_auditsys:
- CFI_RESTORE_STATE
auditsys_entry_common
movl %ebp,%r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
@@ -235,6 +246,11 @@ sysexit_audit:
auditsys_exit sysexit_from_sys_call
#endif
+sysenter_fix_flags:
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+ popfq_cfi
+ jmp sysenter_flags_fixed
+
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 89270b4..c2f19a8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -203,6 +203,7 @@
#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+#define X86_FEATURE_VMMCALL (8*32+15) /* Prefer vmmcall to vmcall */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index b90e5df..f6aaf7d 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -251,7 +251,8 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
-#define _LDT_empty(info) \
+/* This intentionally ignores lm, since 32-bit apps don't have that field. */
+#define LDT_empty(info) \
((info)->base_addr == 0 && \
(info)->limit == 0 && \
(info)->contents == 0 && \
@@ -261,11 +262,18 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
(info)->seg_not_present == 1 && \
(info)->useable == 0)
-#ifdef CONFIG_X86_64
-#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
-#else
-#define LDT_empty(info) (_LDT_empty(info))
-#endif
+/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */
+static inline bool LDT_zero(const struct user_desc *info)
+{
+ return (info->base_addr == 0 &&
+ info->limit == 0 &&
+ info->contents == 0 &&
+ info->read_exec_only == 0 &&
+ info->seg_32bit == 0 &&
+ info->limit_in_pages == 0 &&
+ info->seg_not_present == 0 &&
+ info->useable == 0);
+}
static inline void clear_LDT(void)
{
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9c999c1..01f15b2 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -155,8 +155,9 @@ do { \
#define elf_check_arch(x) \
((x)->e_machine == EM_X86_64)
-#define compat_elf_check_arch(x) \
- (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64)
+#define compat_elf_check_arch(x) \
+ (elf_check_arch_ia32(x) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
#if __USER32_DS != __USER_DS
# error "The following code assumes __USER32_DS == __USER_DS"
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 0000000..99efebb
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index a809121..68c0539 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ ptep_clear_flush(vma, addr, ptep);
}
static inline int huge_pte_none(pte_t pte)
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 0ea10f27..cb6cfcd 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,6 +25,7 @@ extern void irq_ctx_init(int cpu);
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
+extern int check_irq_vectors_for_cpu_disable(void);
extern void fixup_irqs(void);
extern void irq_force_complete_move(int);
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf8..0a8b519 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c76ff74..7cb77dd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -455,7 +455,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
int switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
@@ -473,6 +473,7 @@ struct kvm_vcpu_arch {
u64 mmio_gva;
unsigned access;
gfn_t mmio_gfn;
+ u64 mmio_gen;
struct kvm_pmu pmu;
@@ -973,6 +974,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
+static inline u64 get_canonical(u64 la)
+{
+ return ((int64_t)la << 16) >> 16;
+}
+
+static inline bool is_noncanonical_address(u64 la)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la) != la;
+#else
+ return false;
+#endif
+}
+
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -1031,7 +1046,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_define_shared_msr(unsigned index, u32 msr);
-void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c7678e4..e62cf89 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <asm/processor.h>
+#include <asm/alternative.h>
#include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
@@ -16,10 +17,15 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */
-/* This instruction is vmcall. On non-VT architectures, it will generate a
- * trap that we will then rewrite to the appropriate instruction.
+#ifdef CONFIG_DEBUG_RODATA
+#define KVM_HYPERCALL \
+ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
+#else
+/* On AMD processors, vmcall will generate a trap that we will
+ * then rewrite to the appropriate instruction.
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+#endif
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f48b17d..3a52ee0 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -20,7 +20,6 @@
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define STACKFAULT_STACK 0
#define DOUBLEFAULT_STACK 1
#define NMI_STACK 0
#define DEBUG_STACK 0
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 43dcd80..d1d2972 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -14,12 +14,11 @@
#define IRQ_STACK_ORDER 2
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
-#define STACKFAULT_STACK 1
-#define DOUBLEFAULT_STACK 2
-#define NMI_STACK 3
-#define DEBUG_STACK 4
-#define MCE_STACK 5
-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 2
+#define DEBUG_STACK 3
+#define MCE_STACK 4
+#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d88344..b1609f2 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
#define EARLY_DYNAMIC_PAGE_TABLES 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a086..68e9f00 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -232,6 +232,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 3475554..ad1d8ec 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -64,6 +64,8 @@ static inline void x86_ce4100_early_setup(void) { }
#ifndef _SETUP
+#include <asm/espfix.h>
+
/*
* This is set up by the setup-routine at boot-time
*/
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 6ec0792..35e67a4 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -23,19 +23,6 @@ typedef struct {
unsigned long sig[_NSIG_WORDS];
} sigset_t;
-/*
- * Because some traps use the IST stack, we must keep preemption
- * disabled while calling do_trap(), but do_trap() may call
- * force_sig_info() which will grab the signal spin_locks for the
- * task, which in PREEMPT_RT_FULL are mutexes. By defining
- * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
- * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
- * trap.
- */
-#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64)
-#define ARCH_RT_DELAYS_SIGNAL_SEND
-#endif
-
#ifndef CONFIG_COMPAT
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 64fb5cb..6a99859 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -57,7 +57,7 @@
*/
static __always_inline void boot_init_stack_canary(void)
{
- u64 uninitialized_var(canary);
+ u64 canary;
u64 tsc;
#ifdef CONFIG_X86_64
@@ -68,16 +68,8 @@ static __always_inline void boot_init_stack_canary(void)
* of randomness. The TSC only matters for very early init,
* there it already has some randomness on most systems. Later
* on during the bootup the random pool has true entropy too.
- *
- * For preempt-rt we need to weaken the randomness a bit, as
- * we can't call into the random generator from atomic context
- * due to locking constraints. We just leave canary
- * uninitialized and use the TSC based randomness on top of
- * it.
*/
-#ifndef CONFIG_PREEMPT_RT_FULL
get_random_bytes(&canary, sizeof(canary));
-#endif
tsc = __native_read_tsc();
canary += tsc + (tsc << 32UL);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f08e527..bbbac92 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -30,8 +30,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
int preempt_count; /* 0 => preemptable,
<0 => BUG */
- int preempt_lazy_count; /* 0 => lazy preemptable,
- <0 => BUG */
mm_segment_t addr_limit;
struct restart_block restart_block;
void __user *sysenter_return;
@@ -83,7 +81,6 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
-#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
@@ -108,7 +105,6 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
-#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
@@ -149,7 +145,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
- _TIF_USER_RETURN_NOTIFY)
+ _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
@@ -158,8 +154,6 @@ struct thread_info {
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
-#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
-
#define PREEMPT_ACTIVE 0x10000000
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90ba..04905bf 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
static inline void __flush_tlb_one(unsigned long addr)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
*/
static inline void __flush_tlb_up(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}
static inline void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index 2a46ca7..2874be9 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -34,7 +34,7 @@ static inline unsigned int __getcpu(void)
native_read_tscp(&p);
} else {
/* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
}
return p;
diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
index 46727eb..6e1aaf7 100644
--- a/arch/x86/include/uapi/asm/ldt.h
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -28,6 +28,13 @@ struct user_desc {
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
+ /*
+ * Because this bit is not present in 32-bit user code, user
+ * programs can pass uninitialized values here. Therefore, in
+ * any context in which a user_desc comes from a 32-bit program,
+ * the kernel must act as though lm == 0, regardless of the
+ * actual value.
+ */
unsigned int lm:1;
#endif
};
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 0e79420..990a2fe 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
@@ -114,6 +115,7 @@
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a5408b9..32f1140 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-y += syscall_$(BITS).o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a7eb82d..7170f17 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1282,7 +1282,7 @@ void setup_local_APIC(void)
unsigned int value, queued;
int i, j, acked = 0;
unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz;
+ long long max_loops = cpu_khz ? cpu_khz : 1000000;
if (cpu_has_tsc)
rdtscll(tsc);
@@ -1379,7 +1379,7 @@ void setup_local_APIC(void)
break;
}
if (queued) {
- if (cpu_has_tsc) {
+ if (cpu_has_tsc && cpu_khz) {
rdtscll(ntsc);
max_loops = (cpu_khz << 10) - (ntsc - tsc);
} else
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5816e6a..e63a5bd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2396,8 +2396,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
{
/* If we are moving the irq we need to mask it */
- if (unlikely(irqd_is_setaffinity_pending(data) &&
- !irqd_irq_inprogress(data))) {
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
mask_ioapic(cfg);
return true;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index a36d9cf..2861082 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -33,7 +33,6 @@ void common(void) {
OFFSET(TI_status, thread_info, status);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_preempt_count, thread_info, preempt_count);
- OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 28233b9..ee51e67 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -509,6 +509,13 @@ static void early_init_amd(struct cpuinfo_x86 *c)
}
#endif
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf) {
u64 val;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3533e2c..00cc6f7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -144,6 +144,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
+ if (strlen(s))
+ return 0;
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
setup_clear_cpu_cap(X86_FEATURE_AVX);
@@ -1135,7 +1137,7 @@ void syscall_init(void)
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC);
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
/*
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 87c0be5..f4a9985 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -154,6 +154,21 @@ static void early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
+ * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
+ * to be modified
+ */
+ if (c->x86 == 5 && c->x86_model == 9) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
}
#ifdef CONFIG_X86_32
@@ -369,6 +384,13 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_extended_topology(c);
l2 = init_intel_cacheinfo(c);
+
+ /* Detect legacy cache sizes if init_intel_cacheinfo did not */
+ if (l2 == 0) {
+ cpu_detect_cache_sizes(c);
+ l2 = c->x86_cache_size;
+ }
+
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
@@ -483,6 +505,13 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
*/
if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if ((c->x86 == 5) && (c->x86_model == 9))
+ size = 16;
return size;
}
#endif
@@ -688,7 +717,8 @@ static const struct cpu_dev intel_cpu_dev = {
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
- [8] = "Mobile Pentium MMX"
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
}
},
{ .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3a7ab0b..b3218cd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -18,7 +18,6 @@
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
-#include <linux/kthread.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
@@ -42,7 +41,6 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
-#include <linux/jiffies.h>
#include <asm/processor.h>
#include <asm/mce.h>
@@ -1270,7 +1268,7 @@ void mce_log_therm_throt_event(__u64 status)
static unsigned long check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct hrtimer, mce_timer);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
@@ -1280,10 +1278,13 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default;
-static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
+static void mce_timer_fn(unsigned long data)
{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned long iv;
+ WARN_ON(smp_processor_id() != data);
+
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
@@ -1304,11 +1305,9 @@ static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
__this_cpu_write(mce_next_interval, iv);
/* Might have become 0 after CMCI storm subsided */
if (iv) {
- hrtimer_forward_now(timer, ns_to_ktime(
- jiffies_to_usecs(iv) * 1000ULL));
- return HRTIMER_RESTART;
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
}
- return HRTIMER_NORESTART;
}
/*
@@ -1316,37 +1315,28 @@ static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
*/
void mce_timer_kick(unsigned long interval)
{
- struct hrtimer *t = &__get_cpu_var(mce_timer);
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (hrtimer_active(t)) {
- s64 exp;
- s64 intv_us;
-
- intv_us = jiffies_to_usecs(interval);
- exp = ktime_to_us(hrtimer_expires_remaining(t));
- if (intv_us < exp) {
- hrtimer_cancel(t);
- hrtimer_start_range_ns(t,
- ns_to_ktime(intv_us * 1000),
- 0, HRTIMER_MODE_REL_PINNED);
- }
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
} else {
- hrtimer_start_range_ns(t,
- ns_to_ktime(jiffies_to_usecs(interval) * 1000ULL),
- 0, HRTIMER_MODE_REL_PINNED);
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
}
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
-/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */
+/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
int cpu;
for_each_online_cpu(cpu)
- hrtimer_cancel(&per_cpu(mce_timer, cpu));
+ del_timer_sync(&per_cpu(mce_timer, cpu));
}
static void mce_do_trigger(struct work_struct *work)
@@ -1356,63 +1346,6 @@ static void mce_do_trigger(struct work_struct *work)
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-static void __mce_notify_work(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-}
-
-#ifdef CONFIG_PREEMPT_RT_FULL
-struct task_struct *mce_notify_helper;
-
-static int mce_notify_helper_thread(void *unused)
-{
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- if (kthread_should_stop())
- break;
- __mce_notify_work();
- }
- return 0;
-}
-
-static int mce_notify_work_init(void)
-{
- mce_notify_helper = kthread_run(mce_notify_helper_thread, NULL,
- "mce-notify");
- if (!mce_notify_helper)
- return -ENOMEM;
-
- return 0;
-}
-
-static void mce_notify_work(void)
-{
- wake_up_process(mce_notify_helper);
-}
-#else
-static void mce_notify_work(void)
-{
- __mce_notify_work();
-}
-static inline int mce_notify_work_init(void) { return 0; }
-#endif
-
/*
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
@@ -1420,8 +1353,19 @@ static inline int mce_notify_work_init(void) { return 0; }
*/
int mce_notify_irq(void)
{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_notify_work();
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
return 1;
}
return 0;
@@ -1692,7 +1636,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
unsigned long iv = mce_adjust_timer(check_interval * HZ);
@@ -1701,17 +1645,16 @@ static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
if (mca_cfg.ignore_ce || !iv)
return;
- hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
- 0, HRTIMER_MODE_REL_PINNED);
+ t->expires = round_jiffies(jiffies + iv);
+ add_timer_on(t, smp_processor_id());
}
static void __mcheck_cpu_init_timer(void)
{
- struct hrtimer *t = &__get_cpu_var(mce_timer);
+ struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned int cpu = smp_processor_id();
- hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- t->function = mce_timer_fn;
+ setup_timer(t, mce_timer_fn, cpu);
mce_start_timer(cpu, t);
}
@@ -2386,8 +2329,6 @@ static void mce_disable_cpu(void *h)
if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
- hrtimer_cancel(&__get_cpu_var(mce_timer));
-
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
for (i = 0; i < mca_cfg.banks; i++) {
@@ -2414,7 +2355,6 @@ static void mce_reenable_cpu(void *h)
if (b->init)
wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
}
- __mcheck_cpu_init_timer();
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
@@ -2422,6 +2362,7 @@ static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
@@ -2437,9 +2378,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
}
@@ -2501,8 +2444,6 @@ static __init int mcheck_init_device(void)
/* register character device /dev/mcelog */
misc_register(&mce_chrdev_device);
- err = mce_notify_work_init();
-
return err;
}
device_initcall_sync(mcheck_init_device);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 71a39f3..6474807 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -63,6 +63,7 @@ static struct clocksource hyperv_cs = {
.rating = 400, /* use this when running on Hyperv*/
.read = read_hv_clock,
.mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static void __init ms_hyperv_init_platform(void)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2..0e25a1b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5edd3c0..c7106f1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index cc16faa..53bd272 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -279,14 +279,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f31a165..b400d0b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1365,6 +1365,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2135,6 +2144,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2198,7 +2242,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2243,10 +2288,7 @@ __init int intel_pmu_init(void)
if (version > 1)
x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
- /*
- * v2 and above have a perf capabilities MSR
- */
- if (version > 1) {
+ if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
@@ -2404,6 +2446,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2503,6 +2548,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 4118f9f..3e1cfbb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -2764,6 +2764,17 @@ static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
}
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_uncore_event(struct perf_event *event)
+{
+ return event->pmu->event_init == uncore_pmu_event_init;
+}
+
static int
uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
{
@@ -2778,13 +2789,18 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, b
return -EINVAL;
n = box->n_events;
- box->event_list[n] = leader;
- n++;
+
+ if (is_uncore_event(leader)) {
+ box->event_list[n] = leader;
+ n++;
+ }
+
if (!dogrp)
return n;
list_for_each_entry(event, &leader->sibling_list, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
+ if (!is_uncore_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
continue;
if (n >= max_count)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index addb207..66e274a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = {
[ DEBUG_STACK-1 ] = "#DB",
[ NMI_STACK-1 ] = "NMI",
[ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ STACKFAULT_STACK-1 ] = "#SS",
[ MCE_STACK-1 ] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
[ N_EXCEPTION_STACKS ...
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index e491bfd..1f1c33d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -364,22 +364,14 @@ ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_all
+need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
- jnz 1f
-
- cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
- jnz restore_all
- testl $_TIF_NEED_RESCHED_LAZY, %ecx
jz restore_all
-
-1: testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
- movl TI_flags(%ebp), %ecx # need_resched set ?
- testl $_TIF_NEED_RESCHED_MASK, %ecx
- jnz 1b
- jmp restore_all
+ jmp need_resched
END(resume_kernel)
#endif
CFI_ENDPROC
@@ -442,8 +434,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -524,6 +517,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -538,6 +532,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -548,6 +543,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -560,13 +556,9 @@ ENTRY(iret_exc)
.previous
_ASM_EXTABLE(irq_return,iret_exc)
+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
@@ -608,6 +600,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)
@@ -615,7 +608,7 @@ ENDPROC(system_call)
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
- testl $_TIF_NEED_RESCHED_MASK, %ecx
+ testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
call schedule
@@ -628,7 +621,7 @@ work_resched:
andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
# than syscall tracing?
jz restore_all
- testl $_TIF_NEED_RESCHED_MASK, %ecx
+ testb $_TIF_NEED_RESCHED, %cl
jnz work_resched
work_notifysig: # deal with pending signals and
@@ -698,8 +691,13 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
/*
@@ -715,6 +713,7 @@ END(syscall_badsys)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -724,8 +723,10 @@ END(syscall_badsys)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -736,6 +737,7 @@ END(syscall_badsys)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm
/*
@@ -1356,11 +1358,13 @@ END(debug)
ENTRY(nmi)
RING0_INT_FRAME
ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1400,6 +1404,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1421,6 +1426,7 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index abca4f4..e965606 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -658,8 +659,8 @@ sysret_check:
/* Handle reschedules */
/* edx: work, edi: workmask */
sysret_careful:
- testl $_TIF_NEED_RESCHED_MASK,%edx
- jz sysret_signal
+ bt $TIF_NEED_RESCHED,%edx
+ jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
@@ -771,8 +772,8 @@ GLOBAL(int_with_check)
/* First do a reschedule test. */
/* edx: work, edi: workmask */
int_careful:
- testl $_TIF_NEED_RESCHED_MASK,%edx
- jz int_very_careful
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
@@ -1041,38 +1042,58 @@ restore_args:
irq_return:
INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return, bad_iret)
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
- iretq
- _ASM_EXTABLE(native_iret, bad_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
#endif
- .section .fixup,"ax"
-bad_iret:
+.global native_irq_return_iret
+native_irq_return_iret:
/*
- * The iret traps when the %cs or %ss being restored is bogus.
- * We've lost the original trap vector and error code.
- * #GPF is the most likely one to get for an invalid selector.
- * So pretend we completed the iret and took the #GPF in user mode.
- *
- * We are now running with the kernel GS after exception recovery.
- * But error_entry expects us to have user GS to match the user %cs,
- * so swap back.
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in do_double_fault.
+ * Other faults here are fatal.
*/
- pushq $0
+ iretq
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
SWAPGS
- jmp general_protection
-
- .previous
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
+#endif
/* edi: workmask, edx: work */
retint_careful:
CFI_RESTORE_STATE
- testl $_TIF_NEED_RESCHED_MASK,%edx
- jz retint_signal
+ bt $TIF_NEED_RESCHED,%edx
+ jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
@@ -1105,22 +1126,16 @@ retint_signal:
ENTRY(retint_kernel)
cmpl $0,TI_preempt_count(%rcx)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jc 1f
-
- cmpl $0,TI_preempt_lazy_count(%rcx)
- jnz retint_restore_args
- bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
jnc retint_restore_args
-
-1: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
+ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
+
/*
* End of kprobes section
*/
@@ -1347,7 +1362,6 @@ bad_gs:
jmp 2b
.previous
-#ifndef CONFIG_PREEMPT_RT_FULL
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
@@ -1367,7 +1381,6 @@ ENTRY(call_softirq)
ret
CFI_ENDPROC
END(call_softirq)
-#endif
#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
@@ -1483,7 +1496,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+errorentry stack_segment do_stack_segment
#ifdef CONFIG_XEN
zeroentry xen_debug do_debug
zeroentry xen_int3 do_int3
@@ -1537,7 +1550,7 @@ paranoid_userspace:
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED_MASK,%ebx
+ testl $_TIF_NEED_RESCHED,%ebx
jnz paranoid_schedule
movl %ebx,%edx /* arg3: thread flags */
TRACE_IRQS_ON
@@ -1593,16 +1606,15 @@ error_sti:
/*
* There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report an truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ je error_bad_iret
movl %ecx,%eax /* zero extend */
cmpq %rax,RIP+8(%rsp)
je bstep_iret
@@ -1613,7 +1625,15 @@ error_kernelspace:
bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
- jmp error_swapgs
+ /* fall through */
+
+error_bad_iret:
+ SWAPGS
+ mov %rsp,%rdi
+ call fixup_bad_iret
+ mov %rax,%rsp
+ decl %ebx /* Return to usergs */
+ jmp error_sti
CFI_ENDPROC
END(error_entry)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 0000000..94d857f
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index e625319..f8ab203 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -297,16 +297,7 @@ int ftrace_int3_handler(struct pt_regs *regs)
static int ftrace_write(unsigned long ip, const char *val, int size)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
+ ip = text_ip_addr(ip);
return probe_kernel_write((void *)ip, val, size);
}
@@ -659,8 +650,8 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
ret = -EPERM;
goto out;
}
- run_sync();
out:
+ run_sync();
return ret;
fail_update:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687..3910078 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -262,6 +262,83 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
#ifdef CONFIG_HOTPLUG_CPU
+
+/* These two declarations are only used in check_irq_vectors_for_cpu_disable()
+ * below, which is protected by stop_machine(). Putting them on the stack
+ * results in a stack frame overflow. Dynamically allocating could result in a
+ * failure so declare these two cpumasks as global.
+ */
+static struct cpumask affinity_new, online_new;
+
+/*
+ * This cpu is going to be removed and its vectors migrated to the remaining
+ * online cpus. Check to see if there are enough vectors in the remaining cpus.
+ * This function is protected by stop_machine().
+ */
+int check_irq_vectors_for_cpu_disable(void)
+{
+ int irq, cpu;
+ unsigned int this_cpu, vector, this_count, count;
+ struct irq_desc *desc;
+ struct irq_data *data;
+
+ this_cpu = smp_processor_id();
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpu_clear(this_cpu, online_new);
+
+ this_count = 0;
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ irq = __this_cpu_read(vector_irq[vector]);
+ if (irq >= 0) {
+ desc = irq_to_desc(irq);
+ data = irq_desc_get_irq_data(desc);
+ cpumask_copy(&affinity_new, data->affinity);
+ cpu_clear(this_cpu, affinity_new);
+
+ /* Do not count inactive or per-cpu irqs. */
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ continue;
+
+ /*
+ * A single irq may be mapped to multiple
+ * cpu's vector_irq[] (for example IOAPIC cluster
+ * mode). In this case we have two
+ * possibilities:
+ *
+ * 1) the resulting affinity mask is empty; that is
+ * this the down'd cpu is the last cpu in the irq's
+ * affinity mask, or
+ *
+ * 2) the resulting affinity mask is no longer
+ * a subset of the online cpus but the affinity
+ * mask is not zero; that is the down'd cpu is the
+ * last online cpu in a user set affinity mask.
+ */
+ if (cpumask_empty(&affinity_new) ||
+ !cpumask_subset(&affinity_new, &online_new))
+ this_count++;
+ }
+ }
+
+ count = 0;
+ for_each_online_cpu(cpu) {
+ if (cpu == this_cpu)
+ continue;
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] < 0)
+ count++;
+ }
+ }
+
+ if (count < this_count) {
+ pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
+ this_cpu, this_count, count);
+ return -ERANGE;
+ }
+ return 0;
+}
+
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9da1bc7..4186755 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -149,7 +149,6 @@ void irq_ctx_init(int cpu)
cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
-#ifndef CONFIG_PREEMPT_RT_FULL
asmlinkage void do_softirq(void)
{
unsigned long flags;
@@ -180,7 +179,6 @@ asmlinkage void do_softirq(void)
local_irq_restore(flags);
}
-#endif
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 831f247..d04d3ec 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -88,7 +88,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
return true;
}
-#ifndef CONFIG_PREEMPT_RT_FULL
+
extern void call_softirq(void);
asmlinkage void do_softirq(void)
@@ -108,4 +108,3 @@ asmlinkage void do_softirq(void)
}
local_irq_restore(flags);
}
-#endif
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 3d21f7b..1de84e3 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -38,7 +38,6 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
exiting_irq();
}
-#ifndef CONFIG_PREEMPT_RT_FULL
void arch_irq_work_raise(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
@@ -49,4 +48,3 @@ void arch_irq_work_raise(void)
apic_wait_icr_idle();
#endif
}
-#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 79a3f96..a1f5b18 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1017,6 +1017,15 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->flags &= ~X86_EFLAGS_IF;
trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
+
+ /*
+ * jprobes use jprobe_return() which skips the normal return
+ * path of the function, and this messes up the accounting of the
+ * function graph tracer to get messed up.
+ *
+ * Pause function graph tracing while performing the jprobe function.
+ */
+ pause_graph_tracing();
return 1;
}
@@ -1042,24 +1051,25 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
struct jprobe *jp = container_of(p, struct jprobe, kp);
+ void *saved_sp = kcb->jprobe_saved_sp;
if ((addr > (u8 *) jprobe_return) &&
(addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != kcb->jprobe_saved_sp) {
+ if (stack_addr(regs) != saved_sp) {
struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
printk(KERN_ERR
"current sp %p does not match saved sp %p\n",
- stack_addr(regs), kcb->jprobe_saved_sp);
+ stack_addr(regs), saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
show_regs(regs);
BUG();
}
+ /* It's OK to start function graph tracing again */
+ unpause_graph_tracing();
*regs = kcb->jprobe_saved_regs;
- memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
- kcb->jprobes_stack,
- MIN_STACK_SIZE(kcb->jprobe_saved_sp));
+ memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
preempt_enable_no_resched();
return 1;
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f022c54..e725933 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -280,7 +280,14 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+
+ /*
+ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+ * guest kernel works like a bare metal kernel with additional
+ * features, and paravirt_enabled is about features that are
+ * missing.
+ */
+ pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1570e07..23457e5 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -262,7 +262,6 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
- pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc9873..c37886d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34..a1da673 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6ce0537..884f98f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,7 +36,6 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
-#include <linux/highmem.h>
#include <asm/pgtable.h>
#include <asm/ldt.h>
@@ -220,35 +219,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
}
EXPORT_SYMBOL_GPL(start_thread);
-#ifdef CONFIG_PREEMPT_RT_FULL
-static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
-{
- int i;
-
- /*
- * Clear @prev's kmap_atomic mappings
- */
- for (i = 0; i < prev_p->kmap_idx; i++) {
- int idx = i + KM_TYPE_NR * smp_processor_id();
- pte_t *ptep = kmap_pte - idx;
-
- kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
- }
- /*
- * Restore @next_p's kmap_atomic mappings
- */
- for (i = 0; i < next_p->kmap_idx; i++) {
- int idx = i + KM_TYPE_NR * smp_processor_id();
-
- if (!pte_none(next_p->kmap_pte[i]))
- set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
- }
-}
-#else
-static inline void
-switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
-#endif
-
/*
* switch_to(x,y) should switch tasks from x to y.
@@ -328,8 +298,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
- switch_kmaps(prev_p, next_p);
-
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7461f50..0686fe3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,15 +1441,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
force_sig_info(SIGTRAP, &info, tsk);
}
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
-#endif
-
/*
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
@@ -1487,7 +1478,7 @@ long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
- if (IS_IA32)
+ if (is_ia32_task())
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_ax,
regs->bx, regs->cx,
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819..80eab01 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ecfe089..b88fc86 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -673,6 +673,11 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* handler too.
*/
regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ if (used_math())
+ drop_init_fpu(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
@@ -739,14 +744,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
- if (unlikely(current->forced_info.si_signo)) {
- struct task_struct *t = current;
- force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
- t->forced_info.si_signo = 0;
- }
-#endif
-
if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6cacab6..9ccb7ef 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -265,6 +265,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -1278,6 +1285,9 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_sibling_mask(cpu))
cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(cpu_sibling_mask(cpu));
cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
@@ -1310,6 +1320,12 @@ void cpu_disable_common(void)
int native_cpu_disable(void)
{
+ int ret;
+
+ ret = check_irq_vectors_for_cpu_disable();
+ if (ret)
+ return ret;
+
clear_local_APIC();
cpu_disable_common();
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index f7fec09..7fc5e84 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -27,6 +27,58 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -40,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info))
+ if (LDT_empty(info) || LDT_zero(info))
desc->a = desc->b = 0;
else
fill_ldt(desc, info);
@@ -66,6 +118,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -192,6 +247,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
@@ -205,6 +261,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6663bb5..88f3d3b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -86,21 +86,9 @@ static inline void conditional_sti(struct pt_regs *regs)
local_irq_enable();
}
-static inline void conditional_sti_ist(struct pt_regs *regs)
+static inline void preempt_conditional_sti(struct pt_regs *regs)
{
-#ifdef CONFIG_X86_64
- /*
- * X86_64 uses a per CPU stack on the IST for certain traps
- * like int3. The task can not be preempted when using one
- * of these stacks, thus preemption must be disabled, otherwise
- * the stack can be corrupted if the task is scheduled out,
- * and another task comes in and uses this stack.
- *
- * On x86_32 the task keeps its own stack and it is OK if the
- * task schedules out.
- */
inc_preempt_count();
-#endif
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
@@ -111,13 +99,11 @@ static inline void conditional_cli(struct pt_regs *regs)
local_irq_disable();
}
-static inline void conditional_cli_ist(struct pt_regs *regs)
+static inline void preempt_conditional_cli(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_disable();
-#ifdef CONFIG_X86_64
dec_preempt_count();
-#endif
}
static int __kprobes
@@ -235,33 +221,41 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
-#endif
DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
BUS_ADRALN, 0)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
- conditional_sti_ist(regs);
- do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
- conditional_cli_ist(regs);
- }
- exception_exit(prev_state);
-}
-
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, modify
+ * the stack to make it look like we just entered the #GP
+ * handler from user space, similar to bad_iret.
+ */
+ if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *normal_regs = task_pt_regs(current);
+
+ /* Fake a #GP(0) from userspace. */
+ memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
+ normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&normal_regs->orig_ax;
+ return;
+ }
+#endif
+
exception_enter();
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -361,9 +355,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
- conditional_sti_ist(regs);
+ preempt_conditional_sti(regs);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
- conditional_cli_ist(regs);
+ preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
exception_exit(prev_state);
@@ -375,7 +369,7 @@ exit:
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage notrace __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -394,6 +388,35 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*regs = *eregs;
return regs;
}
+
+struct bad_iret_stack {
+ void *error_entry_ret;
+ struct pt_regs regs;
+};
+
+asmlinkage __visible notrace __kprobes
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want move our stack frame to task_pt_regs
+ * and we want to pretend that the exception came from the
+ * iret target.
+ */
+ struct bad_iret_stack *new_stack =
+ container_of(task_pt_regs(current),
+ struct bad_iret_stack, regs);
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ BUG_ON(!user_mode_vm(&new_stack->regs));
+ return new_stack;
+}
#endif
/*
@@ -469,12 +492,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_inc();
/* It's safe to allow irq's after DR6 has been saved */
- conditional_sti_ist(regs);
+ preempt_conditional_sti(regs);
if (regs->flags & X86_VM_MASK) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
- conditional_cli_ist(regs);
+ preempt_conditional_cli(regs);
debug_stack_usage_dec();
goto exit;
}
@@ -494,7 +517,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
- conditional_cli_ist(regs);
+ preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
@@ -766,7 +789,7 @@ void __init trap_init(void)
set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
set_intr_gate(X86_TRAP_TS, &invalid_TSS);
set_intr_gate(X86_TRAP_NP, &segment_not_present);
- set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(X86_TRAP_SS, stack_segment);
set_intr_gate(X86_TRAP_GP, &general_protection);
set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
set_intr_gate(X86_TRAP_MF, &coprocessor_error);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d4..cefe57c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -386,7 +386,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- pr_err("Fast TSC calibration failed\n");
+ pr_info("Fast TSC calibration failed\n");
return 0;
success:
@@ -974,14 +974,17 @@ void __init tsc_init(void)
x86_init.timers.tsc_pre_init();
- if (!cpu_has_tsc)
+ if (!cpu_has_tsc) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
+ }
tsc_khz = x86_platform.calibrate_tsc();
cpu_khz = tsc_khz;
if (!tsc_khz) {
mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1f96f93..09ce23a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -125,10 +125,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd82..f5869fc 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -268,8 +268,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
return -1;
- drop_init_fpu(tsk); /* trigger finit */
-
return 0;
}
@@ -399,8 +397,11 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
set_used_math();
}
- if (use_eager_fpu())
+ if (use_eager_fpu()) {
+ preempt_disable();
math_state_restore();
+ preempt_enable();
+ }
return err;
} else {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 92e6f4a..ab1d459 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -498,11 +498,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
}
-static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- register_address_increment(ctxt, &ctxt->_eip, rel);
-}
-
static u32 desc_limit_scaled(struct desc_struct *desc)
{
u32 limit = get_desc_limit(desc);
@@ -576,6 +571,38 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, NM_VECTOR, 0, false);
}
+static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ int cs_l)
+{
+ switch (ctxt->op_bytes) {
+ case 2:
+ ctxt->_eip = (u16)dst;
+ break;
+ case 4:
+ ctxt->_eip = (u32)dst;
+ break;
+ case 8:
+ if ((cs_l && is_noncanonical_address(dst)) ||
+ (!cs_l && (dst & ~(u32)-1)))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = dst;
+ break;
+ default:
+ WARN(1, "unsupported eip assignment size\n");
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
{
u16 selector;
@@ -1964,13 +1991,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
case 2: /* call near abs */ {
long int old_eip;
old_eip = ctxt->_eip;
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ break;
ctxt->src.val = old_eip;
rc = em_push(ctxt);
break;
}
case 4: /* jmp abs */
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
break;
case 5: /* jmp far */
rc = em_jmp_far(ctxt);
@@ -2002,16 +2031,21 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- return em_pop(ctxt);
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
}
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long cs;
+ int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2021,6 +2055,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -2279,7 +2316,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
- u64 msr_data;
+ u64 msr_data, rcx, rdx;
int usermode;
u16 cs_sel = 0, ss_sel = 0;
@@ -2295,6 +2332,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
else
usermode = X86EMUL_MODE_PROT32;
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
cs.dpl = 3;
ss.dpl = 3;
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
@@ -2312,6 +2352,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
+ if (is_noncanonical_address(rcx) ||
+ is_noncanonical_address(rdx))
+ return emulate_gp(ctxt, 0);
break;
}
cs_sel |= SELECTOR_RPL_MASK;
@@ -2320,8 +2363,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
- *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->_eip = rdx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
}
@@ -2860,10 +2903,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt)
static int em_call(struct x86_emulate_ctxt *ctxt)
{
+ int rc;
long rel = ctxt->src.val;
ctxt->src.val = (unsigned long)ctxt->_eip;
- jmp_rel(ctxt, rel);
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
return em_push(ctxt);
}
@@ -2895,11 +2941,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ unsigned long eip;
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
if (rc != X86EMUL_CONTINUE)
return rc;
rsp_increment(ctxt, ctxt->src.val);
@@ -3189,20 +3236,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
static int em_loop(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE;
+
register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_jcxz(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE;
+
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_in(struct x86_emulate_ctxt *ctxt)
@@ -4554,7 +4605,7 @@ special_insn:
break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x8d: /* lea r16/r32, m */
ctxt->dst.val = ctxt->src.addr.mem.ea;
@@ -4583,7 +4634,7 @@ special_insn:
break;
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -4703,7 +4754,7 @@ twobyte_insn:
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 518d864..298781d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
return;
timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
}
static void destroy_pit_timer(struct kvm_pit *pit)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d86ff15..92bbb39 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
+ /* Note that we never get here with APIC virtualization enabled. */
+
if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
apic->highest_isr_cache = vec;
}
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ struct kvm_vcpu *vcpu;
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
+ else {
--apic->isr_count;
- BUG_ON(apic->isr_count < 0);
- apic->highest_isr_cache = -1;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
}
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
- int result;
-
- /* Note that isr_count is always 1 with vid enabled */
- if (!apic->isr_count)
- return -1;
- if (likely(apic->highest_isr_cache != -1))
- return apic->highest_isr_cache;
-
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
+ /* Note that we never get here with APIC virtualization enabled. */
+
if (vector == -1)
return -1;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 74dd129..8ad01b4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -198,16 +198,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
@@ -3161,7 +3165,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -4424,8 +4428,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
- printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
+ printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 612c717..5dcdff5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3204,7 +3204,7 @@ static int wrmsr_interception(struct vcpu_svm *svm)
msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, &msr)) {
+ if (kvm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
} else {
@@ -3486,9 +3486,9 @@ static int handle_exit(struct kvm_vcpu *vcpu)
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
return svm_exit_handlers[exit_code](svm);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 59181e6..c7663b1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2540,12 +2540,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
- kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
}
break;
}
@@ -5113,7 +5116,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
msr.data = data;
msr.index = ecx;
msr.host_initiated = false;
- if (vmx_set_msr(vcpu, &msr) != 0) {
+ if (kvm_set_msr(vcpu, &msr) != 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -6385,6 +6388,12 @@ static int handle_invept(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6430,6 +6439,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
[EXIT_REASON_INVEPT] = handle_invept,
+ [EXIT_REASON_INVVPID] = handle_invvpid,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6656,7 +6666,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
- case EXIT_REASON_INVEPT:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
/*
* VMX instructions trap unconditionally. This allows L1 to
* emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6812,10 +6822,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = exit_reason;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
- return 0;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1b7b17..790551b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -225,20 +225,25 @@ static void kvm_shared_msr_cpu_online(void)
shared_msr_update(i, shared_msrs_global.msrs[i]);
}
-void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int err;
if (((value ^ smsr->values[slot].curr) & mask) == 0)
- return;
+ return 0;
smsr->values[slot].curr = value;
- wrmsrl(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ if (err)
+ return 1;
+
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
smsr->registered = true;
}
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
@@ -910,7 +915,6 @@ void kvm_enable_efer_bits(u64 mask)
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
-
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -918,8 +922,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
+ switch (msr->index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(msr->data))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ msr->data = get_canonical(msr->data);
+ }
return kvm_x86_ops->set_msr(vcpu, msr);
}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
@@ -1073,7 +1103,6 @@ static inline u64 get_kernel_ns(void)
{
struct timespec ts;
- WARN_ON(preemptible());
ktime_get_ts(&ts);
monotonic_to_bootbased(&ts);
return timespec_to_ns(&ts);
@@ -4842,7 +4871,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -5493,13 +5522,6 @@ int kvm_arch_init(void *opaque)
goto out;
}
-#ifdef CONFIG_PREEMPT_RT_FULL
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
- return -EOPNOTSUPP;
- }
-#endif
-
r = kvm_mmu_module_init();
if (r)
goto out_free_percpu;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3186542..7626d3e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -78,15 +78,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
}
/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
*/
+#define MMIO_GVA_ANY (~(gva_t)0)
+
static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
{
- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
return;
vcpu->arch.mmio_gva = 0;
@@ -94,7 +102,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
{
- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
return true;
return false;
@@ -102,7 +111,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
return true;
return false;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a..3620928 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,13 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +47,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +70,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
+ st->lines = 0;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = (st->current_address - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;
/*
* We print markers for special areas of address space,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b5c8e37..5b90bbcad9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1098,7 +1098,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
*/
- if (unlikely(!mm || pagefault_disabled())) {
+ if (unlikely(in_atomic() || !mm)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 7f96844..4500142 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -32,7 +32,6 @@ EXPORT_SYMBOL(kunmap);
*/
void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
- pte_t pte = mk_pte(page, prot);
unsigned long vaddr;
int idx, type;
@@ -46,10 +45,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
-#ifdef CONFIG_PREEMPT_RT_FULL
- current->kmap_pte[type] = pte;
-#endif
- set_pte(kmap_pte-idx, pte);
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
@@ -92,9 +88,6 @@ void __kunmap_atomic(void *kvaddr)
* is a bad idea also, in case the page changes cacheability
* attributes or becomes a protected page in a hypervisor.
*/
-#ifdef CONFIG_PREEMPT_RT_FULL
- current->kmap_pte[type] = __pte(0);
-#endif
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
arch_flush_lazy_mmu_mode();
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 9d980d8..fa029fb 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
return NULL;
}
-
-int pmd_huge_support(void)
-{
- return 0;
-}
#else
struct page *
@@ -80,11 +75,6 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-
-int pmd_huge_support(void)
-{
- return 1;
-}
#endif
/* x86_64 also uses this file */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a..b599241 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1110,7 +1110,7 @@ void mark_rodata_ro(void)
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
- unsigned long all_end = PFN_ALIGN(&_end);
+ unsigned long all_end;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1121,7 +1121,16 @@ void mark_rodata_ro(void)
/*
* The rodata/data/bss/brk section (but not the kernel text!)
* should also be not-executable.
+ *
+ * We align all_end to PMD_SIZE because the existing mapping
+ * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+ * split the PMD and the reminder between _brk_end and the end
+ * of the PMD will remain mapped executable.
+ *
+ * Any PMD which was setup after the one which covers _brk_end
+ * has been zapped already via cleanup_highmem().
*/
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 62377d6..7b179b4 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -56,7 +56,6 @@ EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
- pte_t pte = pfn_pte(pfn, prot);
unsigned long vaddr;
int idx, type;
@@ -65,12 +64,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- WARN_ON(!pte_none(*(kmap_pte - idx)));
-
-#ifdef CONFIG_PREEMPT_RT_FULL
- current->kmap_pte[type] = pte;
-#endif
- set_pte(kmap_pte - idx, pte);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
@@ -116,9 +110,6 @@ iounmap_atomic(void __iomem *kvaddr)
* is a bad idea also, in case the page changes cacheability
* attributes or becomes a protected page in a hypervisor.
*/
-#ifdef CONFIG_PREEMPT_RT_FULL
- current->kmap_pte[type] = __pte(0);
-#endif
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 799580c..94bd247 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480..aabdf76 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -389,7 +389,7 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
psize = page_level_size(level);
pmask = page_level_mask(level);
offset = virt_addr & ~pmask;
- phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
return (phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a..5da29d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3..dd8dda1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
preempt_disable();
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = ALIGN(start, HPAGE_SIZE);
- for (; addr < end; addr += HPAGE_SIZE) {
- pgd = pgd_offset(mm, addr);
- if (likely(!pgd_none(*pgd))) {
- pud = pud_offset(pgd, addr);
- if (likely(!pud_none(*pud))) {
- pmd = pmd_offset(pud, addr);
- if (likely(!pmd_none(*pmd)))
- if (pmd_large(*pmd))
- return addr;
- }
- }
- }
- return 0;
-}
-
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
preempt_disable();
if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
+
/* Assume all of TLB entries was occupied by this task */
- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
- if (has_large_page(mm, start, end)) {
- local_flush_tlb();
- goto flush_all;
- }
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index db6b1ab..96a159a 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res,
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
}
return start;
}
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65d..63a8993 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -212,10 +212,10 @@
203 common sched_setaffinity sys_sched_setaffinity
204 common sched_getaffinity sys_sched_getaffinity
205 64 set_thread_area
-206 common io_setup sys_io_setup
+206 64 io_setup sys_io_setup
207 common io_destroy sys_io_destroy
208 common io_getevents sys_io_getevents
-209 common io_submit sys_io_submit
+209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
212 common lookup_dcookie sys_lookup_dcookie
@@ -356,3 +356,5 @@
540 x32 process_vm_writev compat_sys_process_vm_writev
541 x32 setsockopt compat_sys_setsockopt
542 x32 getsockopt compat_sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 531d426..bd16d6c 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -34,7 +34,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723..9578308 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -46,7 +46,7 @@ typedef void (*sys_call_ptr_t)(void);
extern void sys_ni_syscall(void);
-const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 431e875..ab6ba35 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -117,30 +117,45 @@ subsys_initcall(init_vdso);
struct linux_binprm;
-/* Put the vdso above the (randomized) stack with another randomized offset.
- This way there is no hole in the middle of address space.
- To save memory make sure it is still in the same PTE as the stack top.
- This doesn't give that many random bits */
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
unsigned long addr, end;
unsigned offset;
- end = (start + PMD_SIZE - 1) & PMD_MASK;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
if (end >= TASK_SIZE_MAX)
end = TASK_SIZE_MAX;
end -= len;
- /* This loses some more bits than a modulo, but is cheaper */
- offset = get_random_int() & (PTRS_PER_PTE - 1);
- addr = start + (offset << PAGE_SHIFT);
- if (addr >= end)
- addr = end;
+
+ if (end > start) {
+ offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
/*
- * page-align it here so that get_unmapped_area doesn't
- * align it wrongfully again to the next page. addr can come in 4K
- * unaligned here as a result of stack start randomization.
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
*/
- addr = PAGE_ALIGN(addr);
addr = align_vdso_addr(addr);
return addr;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index be6b860..ba81b54 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void)
printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
return;
}
-
+ printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
pv_lock_ops.unlock_kick = xen_unlock_kick;
}
@@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void)
if (!xen_pvspin)
return 0;
+ if (!xen_domain())
+ return 0;
+
static_key_slow_inc(&paravirt_ticketlocks_enabled);
return 0;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index ee36589..90bfa52 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -445,7 +445,7 @@ void xen_setup_timer(int cpu)
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_DISABLED|IRQF_PERCPU|
IRQF_NOBALANCING|IRQF_TIMER|
- IRQF_FORCE_RESUME,
+ IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
name, NULL);
memcpy(evt, xen_clockevent, sizeof(*evt));