summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig67
-rw-r--r--arch/x86/boot/bitops.h8
-rw-r--r--arch/x86/boot/boot.h18
-rw-r--r--arch/x86/boot/compressed/Makefile18
-rw-r--r--arch/x86/boot/compressed/eboot.c2
-rw-r--r--arch/x86/boot/compressed/kaslr.c251
-rw-r--r--arch/x86/boot/compressed/misc.c49
-rw-r--r--arch/x86/boot/compressed/misc.h25
-rw-r--r--arch/x86/boot/compressed/pagetable.c29
-rw-r--r--arch/x86/boot/cpu.c2
-rw-r--r--arch/x86/boot/cpucheck.c33
-rw-r--r--arch/x86/boot/cpuflags.c1
-rw-r--r--arch/x86/boot/cpuflags.h1
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/entry/common.c6
-rw-r--r--arch/x86/entry/entry_32.S11
-rw-r--r--arch/x86/entry/entry_64.S11
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl4
-rw-r--r--arch/x86/entry/thunk_64.S6
-rw-r--r--arch/x86/entry/vdso/Makefile5
-rw-r--r--arch/x86/entry/vdso/vdso32/sigreturn.S8
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S7
-rw-r--r--arch/x86/entry/vdso/vma.c47
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c10
-rw-r--r--arch/x86/events/core.c25
-rw-r--r--arch/x86/events/intel/core.c201
-rw-r--r--arch/x86/events/intel/cstate.c47
-rw-r--r--arch/x86/events/intel/lbr.c124
-rw-r--r--arch/x86/events/intel/rapl.c32
-rw-r--r--arch/x86/events/intel/uncore.c88
-rw-r--r--arch/x86/events/intel/uncore.h5
-rw-r--r--arch/x86/events/intel/uncore_snb.c67
-rw-r--r--arch/x86/events/intel/uncore_snbep.c96
-rw-r--r--arch/x86/events/msr.c63
-rw-r--r--arch/x86/events/perf_event.h12
-rw-r--r--arch/x86/include/asm/Kbuild6
-rw-r--r--arch/x86/include/asm/apic.h1
-rw-r--r--arch/x86/include/asm/apm.h6
-rw-r--r--arch/x86/include/asm/arch_hweight.h24
-rw-r--r--arch/x86/include/asm/archrandom.h132
-rw-r--r--arch/x86/include/asm/asm.h12
-rw-r--r--arch/x86/include/asm/atomic.h51
-rw-r--r--arch/x86/include/asm/atomic64_32.h25
-rw-r--r--arch/x86/include/asm/atomic64_64.h53
-rw-r--r--arch/x86/include/asm/bios_ebda.h2
-rw-r--r--arch/x86/include/asm/bitops.h50
-rw-r--r--arch/x86/include/asm/checksum_32.h3
-rw-r--r--arch/x86/include/asm/compat.h11
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/cpufeatures.h6
-rw-r--r--arch/x86/include/asm/efi.h10
-rw-r--r--arch/x86/include/asm/kaslr.h15
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/local.h16
-rw-r--r--arch/x86/include/asm/mutex_32.h2
-rw-r--r--arch/x86/include/asm/mutex_64.h6
-rw-r--r--arch/x86/include/asm/page_64_types.h11
-rw-r--r--arch/x86/include/asm/percpu.h17
-rw-r--r--arch/x86/include/asm/pgtable.h30
-rw-r--r--arch/x86/include/asm/pgtable_64.h26
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h15
-rw-r--r--arch/x86/include/asm/pgtable_types.h8
-rw-r--r--arch/x86/include/asm/preempt.h2
-rw-r--r--arch/x86/include/asm/processor.h20
-rw-r--r--arch/x86/include/asm/rmwcc.h20
-rw-r--r--arch/x86/include/asm/rwsem.h35
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/include/asm/sync_bitops.h18
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/topology.h9
-rw-r--r--arch/x86/include/asm/uaccess.h33
-rw-r--r--arch/x86/include/asm/unistd.h2
-rw-r--r--arch/x86/include/asm/x86_init.h7
-rw-r--r--arch/x86/kernel/apic/apic.c6
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c1
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c1
-rw-r--r--arch/x86/kernel/apic/io_apic.c18
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/asm-offsets.c4
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/rdrand.c4
-rw-r--r--arch/x86/kernel/dumpstack.c25
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c16
-rw-r--r--arch/x86/kernel/early-quirks.c105
-rw-r--r--arch/x86/kernel/ebda.c114
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/platform-quirks.c4
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/kernel/signal_compat.c108
-rw-r--r--arch/x86/kernel/smpboot.c26
-rw-r--r--arch/x86/kernel/vm86_32.c5
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c3
-rw-r--r--arch/x86/lguest/boot.c17
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/copy_user_64.S8
-rw-r--r--arch/x86/lib/csum-wrappers_64.c1
-rw-r--r--arch/x86/lib/getuser.S20
-rw-r--r--arch/x86/lib/hweight.S77
-rw-r--r--arch/x86/lib/kaslr.c90
-rw-r--r--arch/x86/lib/putuser.S10
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/dump_pagetables.c16
-rw-r--r--arch/x86/mm/extable.c15
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_64.c204
-rw-r--r--arch/x86/mm/kasan_init_64.c4
-rw-r--r--arch/x86/mm/kaslr.c172
-rw-r--r--arch/x86/mm/pageattr.c49
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/platform/efi/efi.c17
-rw-r--r--arch/x86/platform/efi/efi_32.c3
-rw-r--r--arch/x86/platform/efi/efi_64.c26
-rw-r--r--arch/x86/platform/uv/bios_uv.c3
-rw-r--r--arch/x86/ras/mce_amd_inj.c58
-rw-r--r--arch/x86/realmode/init.c5
-rw-r--r--arch/x86/xen/apic.c1
-rw-r--r--arch/x86/xen/enlighten.c4
134 files changed, 2319 insertions, 1064 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9a94da..5977fea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,7 +49,6 @@ config X86
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
- select ARCH_WANT_OPTIONAL_GPIOLIB
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
select CLKSRC_I8253 if X86_32
@@ -294,11 +293,6 @@ config X86_32_LAZY_GS
def_bool y
depends on X86_32 && !CC_STACKPROTECTOR
-config ARCH_HWEIGHT_CFLAGS
- string
- default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
- default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -643,7 +637,7 @@ config STA2X11
select X86_DMA_REMAP
select SWIOTLB
select MFD_STA2X11
- select ARCH_REQUIRE_GPIOLIB
+ select GPIOLIB
default n
---help---
This adds support for boards based on the STA2X11 IO-Hub,
@@ -1934,21 +1928,26 @@ config RANDOMIZE_BASE
attempts relying on knowledge of the location of kernel
code internals.
- The kernel physical and virtual address can be randomized
- from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that
- using RANDOMIZE_BASE reduces the memory space available to
- kernel modules from 1.5GB to 1GB.)
+ On 64-bit, the kernel physical and virtual addresses are
+ randomized separately. The physical address will be anywhere
+ between 16MB and the top of physical memory (up to 64TB). The
+ virtual address will be randomized from 16MB up to 1GB (9 bits
+ of entropy). Note that this also reduces the memory space
+ available to kernel modules from 1.5GB to 1GB.
+
+ On 32-bit, the kernel physical and virtual addresses are
+ randomized together. They will be randomized from 16MB up to
+ 512MB (8 bits of entropy).
Entropy is generated using the RDRAND instruction if it is
supported. If RDTSC is supported, its value is mixed into
the entropy pool as well. If neither RDRAND nor RDTSC are
- supported, then entropy is read from the i8254 timer.
-
- Since the kernel is built using 2GB addressing, and
- PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of
- entropy is theoretically possible. Currently, with the
- default value for PHYSICAL_ALIGN and due to page table
- layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits.
+ supported, then entropy is read from the i8254 timer. The
+ usable entropy is limited by the kernel being built using
+ 2GB addressing, and that PHYSICAL_ALIGN must be at a
+ minimum of 2MB. As a result, only 10 bits of entropy are
+ theoretically possible, but the implementations are further
+ limited due to memory layouts.
If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot
time. To enable it, boot with "kaslr" on the kernel command
@@ -1988,6 +1987,38 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
+config RANDOMIZE_MEMORY
+ bool "Randomize the kernel memory sections"
+ depends on X86_64
+ depends on RANDOMIZE_BASE
+ default RANDOMIZE_BASE
+ ---help---
+ Randomizes the base virtual address of kernel memory sections
+ (physical memory mapping, vmalloc & vmemmap). This security feature
+ makes exploits relying on predictable memory locations less reliable.
+
+ The order of allocations remains unchanged. Entropy is generated in
+ the same way as RANDOMIZE_BASE. Current implementation in the optimal
+ configuration have in average 30,000 different possible virtual
+ addresses for each memory section.
+
+ If unsure, say N.
+
+config RANDOMIZE_MEMORY_PHYSICAL_PADDING
+ hex "Physical memory mapping padding" if EXPERT
+ depends on RANDOMIZE_MEMORY
+ default "0xa" if MEMORY_HOTPLUG
+ default "0x0"
+ range 0x1 0x40 if MEMORY_HOTPLUG
+ range 0x0 0x40
+ ---help---
+ Define the padding in terabytes added to the existing physical
+ memory size during kernel memory randomization. It is useful
+ for memory hotplug support but reduces the entropy available for
+ address randomization.
+
+ If unsure, leave at the default value.
+
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
depends on SMP
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 878e4b9..0d41d68 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -16,14 +16,16 @@
#define BOOT_BITOPS_H
#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
-static inline int constant_test_bit(int nr, const void *addr)
+#include <linux/types.h>
+
+static inline bool constant_test_bit(int nr, const void *addr)
{
const u32 *p = (const u32 *)addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
-static inline int variable_test_bit(int nr, const void *addr)
+static inline bool variable_test_bit(int nr, const void *addr)
{
- u8 v;
+ bool v;
const u32 *p = (const u32 *)addr;
asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 9011a88..e5612f3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,6 +24,7 @@
#include <linux/types.h>
#include <linux/edd.h>
#include <asm/setup.h>
+#include <asm/asm.h>
#include "bitops.h"
#include "ctype.h"
#include "cpuflags.h"
@@ -176,18 +177,18 @@ static inline void wrgs32(u32 v, addr_t addr)
}
/* Note: these only return true/false, not a signed return value! */
-static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
+static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
{
- u8 diff;
- asm volatile("fs; repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ bool diff;
+ asm volatile("fs; repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
-static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
+static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
{
- u8 diff;
- asm volatile("gs; repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ bool diff;
+ asm volatile("gs; repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
@@ -294,6 +295,7 @@ static inline int cmdline_find_option_bool(const char *option)
/* cpu.c, cpucheck.c */
int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
+int check_knl_erratum(void);
int validate_cpu(void);
/* early_serial_console.c */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f135688..536ccfc 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -85,7 +85,25 @@ vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
$(objtree)/drivers/firmware/efi/libstub/lib.a
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
+# The compressed kernel is built with -fPIC/-fPIE so that a boot loader
+# can place it anywhere in memory and it will still run. However, since
+# it is executed as-is without any ELF relocation processing performed
+# (and has already had all relocation sections stripped from the binary),
+# none of the code can use data relocations (e.g. static assignments of
+# pointer values), since they will be meaningless at runtime. This check
+# will refuse to link the vmlinux if any of these relocations are found.
+quiet_cmd_check_data_rel = DATAREL $@
+define cmd_check_data_rel
+ for obj in $(filter %.o,$^); do \
+ readelf -S $$obj | grep -qF .rel.local && { \
+ echo "error: $$obj has data relocations!" >&2; \
+ exit 1; \
+ } || true; \
+ done
+endef
+
$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
+ $(call if_changed,check_data_rel)
$(call if_changed,ld)
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 52fef60..ff574da 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -757,7 +757,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
struct boot_params *boot_params;
struct apm_bios_info *bi;
struct setup_header *hdr;
- struct efi_info *efi;
efi_loaded_image_t *image;
void *options, *handle;
efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
@@ -800,7 +799,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
memset(boot_params, 0x0, 0x4000);
hdr = &boot_params->hdr;
- efi = &boot_params->efi_info;
bi = &boot_params->apm_bios_info;
/* Copy the second sector to boot_params */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index cfeb025..a66854d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -12,10 +12,6 @@
#include "misc.h"
#include "error.h"
-#include <asm/msr.h>
-#include <asm/archrandom.h>
-#include <asm/e820.h>
-
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
@@ -26,26 +22,6 @@
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
-#define I8254_PORT_CONTROL 0x43
-#define I8254_PORT_COUNTER0 0x40
-#define I8254_CMD_READBACK 0xC0
-#define I8254_SELECT_COUNTER0 0x02
-#define I8254_STATUS_NOTREADY 0x40
-static inline u16 i8254(void)
-{
- u16 status, timer;
-
- do {
- outb(I8254_PORT_CONTROL,
- I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
- status = inb(I8254_PORT_COUNTER0);
- timer = inb(I8254_PORT_COUNTER0);
- timer |= inb(I8254_PORT_COUNTER0) << 8;
- } while (status & I8254_STATUS_NOTREADY);
-
- return timer;
-}
-
static unsigned long rotate_xor(unsigned long hash, const void *area,
size_t size)
{
@@ -62,7 +38,7 @@ static unsigned long rotate_xor(unsigned long hash, const void *area,
}
/* Attempt to create a simple but unpredictable starting entropy. */
-static unsigned long get_random_boot(void)
+static unsigned long get_boot_seed(void)
{
unsigned long hash = 0;
@@ -72,50 +48,8 @@ static unsigned long get_random_boot(void)
return hash;
}
-static unsigned long get_random_long(const char *purpose)
-{
-#ifdef CONFIG_X86_64
- const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
-#else
- const unsigned long mix_const = 0x3f39e593UL;
-#endif
- unsigned long raw, random = get_random_boot();
- bool use_i8254 = true;
-
- debug_putstr(purpose);
- debug_putstr(" KASLR using");
-
- if (has_cpuflag(X86_FEATURE_RDRAND)) {
- debug_putstr(" RDRAND");
- if (rdrand_long(&raw)) {
- random ^= raw;
- use_i8254 = false;
- }
- }
-
- if (has_cpuflag(X86_FEATURE_TSC)) {
- debug_putstr(" RDTSC");
- raw = rdtsc();
-
- random ^= raw;
- use_i8254 = false;
- }
-
- if (use_i8254) {
- debug_putstr(" i8254");
- random ^= i8254();
- }
-
- /* Circular multiply for better bit diffusion */
- asm("mul %3"
- : "=a" (random), "=d" (raw)
- : "a" (random), "rm" (mix_const));
- random += raw;
-
- debug_putstr("...\n");
-
- return random;
-}
+#define KASLR_COMPRESSED_BOOT
+#include "../../lib/kaslr.c"
struct mem_vector {
unsigned long start;
@@ -132,17 +66,6 @@ enum mem_avoid_index {
static struct mem_vector mem_avoid[MEM_AVOID_MAX];
-static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
-{
- /* Item at least partially before region. */
- if (item->start < region->start)
- return false;
- /* Item at least partially after region. */
- if (item->start + item->size > region->start + region->size)
- return false;
- return true;
-}
-
static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
/* Item one is entirely before item two. */
@@ -296,6 +219,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
if (mem_overlaps(img, &mem_avoid[i]) &&
mem_avoid[i].start < earliest) {
*overlap = mem_avoid[i];
+ earliest = overlap->start;
is_overlapping = true;
}
}
@@ -310,6 +234,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
*overlap = avoid;
+ earliest = overlap->start;
is_overlapping = true;
}
@@ -319,8 +244,6 @@ static bool mem_avoid_overlap(struct mem_vector *img,
return is_overlapping;
}
-static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN];
-
struct slot_area {
unsigned long addr;
int num;
@@ -351,36 +274,44 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
}
}
-static void slots_append(unsigned long addr)
-{
- /* Overflowing the slots list should be impossible. */
- if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN)
- return;
-
- slots[slot_max++] = addr;
-}
-
static unsigned long slots_fetch_random(void)
{
+ unsigned long slot;
+ int i;
+
/* Handle case of no slots stored. */
if (slot_max == 0)
return 0;
- return slots[get_random_long("Physical") % slot_max];
+ slot = kaslr_get_random_long("Physical") % slot_max;
+
+ for (i = 0; i < slot_area_index; i++) {
+ if (slot >= slot_areas[i].num) {
+ slot -= slot_areas[i].num;
+ continue;
+ }
+ return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
+ }
+
+ if (i == slot_area_index)
+ debug_putstr("slots_fetch_random() failed!?\n");
+ return 0;
}
static void process_e820_entry(struct e820entry *entry,
unsigned long minimum,
unsigned long image_size)
{
- struct mem_vector region, img, overlap;
+ struct mem_vector region, overlap;
+ struct slot_area slot_area;
+ unsigned long start_orig;
/* Skip non-RAM entries. */
if (entry->type != E820_RAM)
return;
- /* Ignore entries entirely above our maximum. */
- if (entry->addr >= KERNEL_IMAGE_SIZE)
+ /* On 32-bit, ignore entries entirely above our maximum. */
+ if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
return;
/* Ignore entries entirely below our minimum. */
@@ -390,31 +321,55 @@ static void process_e820_entry(struct e820entry *entry,
region.start = entry->addr;
region.size = entry->size;
- /* Potentially raise address to minimum location. */
- if (region.start < minimum)
- region.start = minimum;
+ /* Give up if slot area array is full. */
+ while (slot_area_index < MAX_SLOT_AREA) {
+ start_orig = region.start;
- /* Potentially raise address to meet alignment requirements. */
- region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+ /* Potentially raise address to minimum location. */
+ if (region.start < minimum)
+ region.start = minimum;
- /* Did we raise the address above the bounds of this e820 region? */
- if (region.start > entry->addr + entry->size)
- return;
+ /* Potentially raise address to meet alignment needs. */
+ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
- /* Reduce size by any delta from the original address. */
- region.size -= region.start - entry->addr;
+ /* Did we raise the address above this e820 region? */
+ if (region.start > entry->addr + entry->size)
+ return;
- /* Reduce maximum size to fit end of image within maximum limit. */
- if (region.start + region.size > KERNEL_IMAGE_SIZE)
- region.size = KERNEL_IMAGE_SIZE - region.start;
+ /* Reduce size by any delta from the original address. */
+ region.size -= region.start - start_orig;
- /* Walk each aligned slot and check for avoided areas. */
- for (img.start = region.start, img.size = image_size ;
- mem_contains(&region, &img) ;
- img.start += CONFIG_PHYSICAL_ALIGN) {
- if (mem_avoid_overlap(&img, &overlap))
- continue;
- slots_append(img.start);
+ /* On 32-bit, reduce region size to fit within max size. */
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ region.start + region.size > KERNEL_IMAGE_SIZE)
+ region.size = KERNEL_IMAGE_SIZE - region.start;
+
+ /* Return if region can't contain decompressed kernel */
+ if (region.size < image_size)
+ return;
+
+ /* If nothing overlaps, store the region and return. */
+ if (!mem_avoid_overlap(&region, &overlap)) {
+ store_slot_info(&region, image_size);
+ return;
+ }
+
+ /* Store beginning of region if holds at least image_size. */
+ if (overlap.start > region.start + image_size) {
+ struct mem_vector beginning;
+
+ beginning.start = region.start;
+ beginning.size = overlap.start - region.start;
+ store_slot_info(&beginning, image_size);
+ }
+
+ /* Return if overlap extends to or past end of region. */
+ if (overlap.start + overlap.size >= region.start + region.size)
+ return;
+
+ /* Clip off the overlapping region and start over. */
+ region.size -= overlap.start - region.start + overlap.size;
+ region.start = overlap.start + overlap.size;
}
}
@@ -431,6 +386,10 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
for (i = 0; i < boot_params->e820_entries; i++) {
process_e820_entry(&boot_params->e820_map[i], minimum,
image_size);
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+ break;
+ }
}
return slots_fetch_random();
@@ -454,7 +413,7 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
CONFIG_PHYSICAL_ALIGN + 1;
- random_addr = get_random_long("Virtual") % slots;
+ random_addr = kaslr_get_random_long("Virtual") % slots;
return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}
@@ -463,48 +422,54 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
* Since this function examines addresses much more numerically,
* it takes the input and output pointers as 'unsigned long'.
*/
-unsigned char *choose_random_location(unsigned long input,
- unsigned long input_size,
- unsigned long output,
- unsigned long output_size)
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
{
- unsigned long choice = output;
- unsigned long random_addr;
+ unsigned long random_addr, min_addr;
+
+ /* By default, keep output position unchanged. */
+ *virt_addr = *output;
-#ifdef CONFIG_HIBERNATION
- if (!cmdline_find_option_bool("kaslr")) {
- warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected).");
- goto out;
- }
-#else
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
- goto out;
+ return;
}
-#endif
boot_params->hdr.loadflags |= KASLR_FLAG;
+ /* Prepare to add new identity pagetables on demand. */
+ initialize_identity_maps();
+
/* Record the various known unsafe memory ranges. */
- mem_avoid_init(input, input_size, output);
+ mem_avoid_init(input, input_size, *output);
+
+ /*
+ * Low end of the randomization range should be the
+ * smaller of 512M or the initial kernel image
+ * location:
+ */
+ min_addr = min(*output, 512UL << 20);
/* Walk e820 and find a random address. */
- random_addr = find_random_phys_addr(output, output_size);
+ random_addr = find_random_phys_addr(min_addr, output_size);
if (!random_addr) {
warn("KASLR disabled: could not find suitable E820 region!");
- goto out;
+ } else {
+ /* Update the new physical address location. */
+ if (*output != random_addr) {
+ add_identity_map(random_addr, output_size);
+ *output = random_addr;
+ }
}
- /* Always enforce the minimum. */
- if (random_addr < choice)
- goto out;
-
- choice = random_addr;
-
- add_identity_map(choice, output_size);
-
/* This actually loads the identity pagetable on x86_64. */
finalize_identity_maps();
-out:
- return (unsigned char *)choice;
+
+ /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
+ if (IS_ENABLED(CONFIG_X86_64))
+ random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
+ *virt_addr = random_addr;
}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f14db4e..b3c5a5f0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -170,7 +170,8 @@ void __puthex(unsigned long value)
}
#if CONFIG_X86_NEED_RELOCS
-static void handle_relocations(void *output, unsigned long output_len)
+static void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
{
int *reloc;
unsigned long delta, map, ptr;
@@ -182,11 +183,6 @@ static void handle_relocations(void *output, unsigned long output_len)
* and where it was actually loaded.
*/
delta = min_addr - LOAD_PHYSICAL_ADDR;
- if (!delta) {
- debug_putstr("No relocation needed... ");
- return;
- }
- debug_putstr("Performing relocations... ");
/*
* The kernel contains a table of relocation addresses. Those
@@ -198,6 +194,20 @@ static void handle_relocations(void *output, unsigned long output_len)
map = delta - __START_KERNEL_map;
/*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if KASLR has chosen a different starting address offset
+ * from __START_KERNEL_map.
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ delta = virt_addr - LOAD_PHYSICAL_ADDR;
+
+ if (!delta) {
+ debug_putstr("No relocation needed... ");
+ return;
+ }
+ debug_putstr("Performing relocations... ");
+
+ /*
* Process relocations: 32 bit relocations first then 64 bit after.
* Three sets of binary relocations are added to the end of the kernel
* before compression. Each relocation table entry is the kernel
@@ -250,7 +260,8 @@ static void handle_relocations(void *output, unsigned long output_len)
#endif
}
#else
-static inline void handle_relocations(void *output, unsigned long output_len)
+static inline void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
{ }
#endif
@@ -327,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned long output_len)
{
const unsigned long kernel_total_size = VO__end - VO__text;
- unsigned char *output_orig = output;
+ unsigned long virt_addr = (unsigned long)output;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -366,13 +377,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
* the entire decompressed kernel plus relocation table, or the
* entire decompressed kernel plus .bss and .brk sections.
*/
- output = choose_random_location((unsigned long)input_data, input_len,
- (unsigned long)output,
- max(output_len, kernel_total_size));
+ choose_random_location((unsigned long)input_data, input_len,
+ (unsigned long *)&output,
+ max(output_len, kernel_total_size),
+ &virt_addr);
/* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
- error("Destination address inappropriately aligned");
+ error("Destination physical address inappropriately aligned");
+ if (virt_addr & (MIN_KERNEL_ALIGN - 1))
+ error("Destination virtual address inappropriately aligned");
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
@@ -382,19 +396,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#endif
#ifndef CONFIG_RELOCATABLE
if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Wrong destination address");
+ error("Destination address does not match LOAD_PHYSICAL_ADDR");
+ if ((unsigned long)output != virt_addr)
+ error("Destination virtual address changed when not relocatable");
#endif
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
parse_elf(output);
- /*
- * 32-bit always performs relocations. 64-bit relocations are only
- * needed if kASLR has chosen a different load address.
- */
- if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig)
- handle_relocations(output, output_len);
+ handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b6fec1f..1c8355e 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -67,28 +67,33 @@ int cmdline_find_option_bool(const char *option);
#if CONFIG_RANDOMIZE_BASE
/* kaslr.c */
-unsigned char *choose_random_location(unsigned long input_ptr,
- unsigned long input_size,
- unsigned long output_ptr,
- unsigned long output_size);
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr);
/* cpuflags.c */
bool has_cpuflag(int flag);
#else
-static inline
-unsigned char *choose_random_location(unsigned long input_ptr,
- unsigned long input_size,
- unsigned long output_ptr,
- unsigned long output_size)
+static inline void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
{
- return (unsigned char *)output_ptr;
+ /* No change from existing output location. */
+ *virt_addr = *output;
}
#endif
#ifdef CONFIG_X86_64
+void initialize_identity_maps(void);
void add_identity_map(unsigned long start, unsigned long size);
void finalize_identity_maps(void);
extern unsigned char _pgtable[];
#else
+static inline void initialize_identity_maps(void)
+{ }
static inline void add_identity_map(unsigned long start, unsigned long size)
{ }
static inline void finalize_identity_maps(void)
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 34b95df..56589d0 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -2,6 +2,9 @@
* This code is used on x86_64 to create page table identity mappings on
* demand by building up a new set of page tables (or appending to the
* existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016 Yinghai Lu
+ * Copyright (C) 2016 Kees Cook
*/
/*
@@ -17,6 +20,9 @@
/* These actually do the work of building the kernel identity maps. */
#include <asm/init.h>
#include <asm/pgtable.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"
/* Used by pgtable.h asm code to force instruction serialization. */
@@ -59,9 +65,21 @@ static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
static unsigned long level4p;
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info = {
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+};
+
/* Locates and clears a region for a new top level page table. */
-static void prepare_level4(void)
+void initialize_identity_maps(void)
{
+ /* Init mapping_info with run-time function/buffer pointers. */
+ mapping_info.alloc_pgt_page = alloc_pgt_page;
+ mapping_info.context = &pgt_data;
+
/*
* It should be impossible for this not to already be true,
* but since calling this a second time would rewind the other
@@ -96,17 +114,8 @@ static void prepare_level4(void)
*/
void add_identity_map(unsigned long start, unsigned long size)
{
- struct x86_mapping_info mapping_info = {
- .alloc_pgt_page = alloc_pgt_page,
- .context = &pgt_data,
- .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
- };
unsigned long end = start + size;
- /* Make sure we have a top level page table ready to use. */
- if (!level4p)
- prepare_level4();
-
/* Align boundary to 2M. */
start = round_down(start, PMD_SIZE);
end = round_up(end, PMD_SIZE);
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 29207f6..26240dd 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -93,6 +93,8 @@ int validate_cpu(void)
show_cap_strs(err_flags);
putchar('\n');
return -1;
+ } else if (check_knl_erratum()) {
+ return -1;
} else {
return 0;
}
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 1fd7d57..4ad7d70 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -24,6 +24,7 @@
# include "boot.h"
#endif
#include <linux/types.h>
+#include <asm/intel-family.h>
#include <asm/processor-flags.h>
#include <asm/required-features.h>
#include <asm/msr-index.h>
@@ -175,6 +176,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
}
}
+ if (!err)
+ err = check_knl_erratum();
if (err_flags_ptr)
*err_flags_ptr = err ? err_flags : NULL;
@@ -185,3 +188,33 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
return (cpu.level < req_level || err) ? -1 : 0;
}
+
+int check_knl_erratum(void)
+{
+ /*
+ * First check for the affected model/family:
+ */
+ if (!is_intel() ||
+ cpu.family != 6 ||
+ cpu.model != INTEL_FAM6_XEON_PHI_KNL)
+ return 0;
+
+ /*
+ * This erratum affects the Accessed/Dirty bits, and can
+ * cause stray bits to be set in !Present PTEs. We have
+ * enough bits in our 64-bit PTEs (which we have on real
+ * 64-bit mode or PAE) to avoid using these troublesome
+ * bits. But, we do not have enough space in our 32-bit
+ * PTEs. So, refuse to run on 32-bit non-PAE kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE))
+ return 0;
+
+ puts("This 32-bit kernel can not run on this Xeon Phi x200\n"
+ "processor due to a processor erratum. Use a 64-bit\n"
+ "kernel, or enable PAE in this 32-bit kernel.\n\n");
+
+ return -1;
+}
+
+
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 431fa5f..6687ab9 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -102,6 +102,7 @@ void get_cpuflags(void)
cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
&cpu.flags[0]);
cpu.level = (tfms >> 8) & 15;
+ cpu.family = cpu.level;
cpu.model = (tfms >> 4) & 15;
if (cpu.level >= 6)
cpu.model += ((tfms >> 16) & 0xf) << 4;
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 4cb404f..15ad56a 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -6,6 +6,7 @@
struct cpu_features {
int level; /* Family, or 64 for x86-64 */
+ int family; /* Family, always */
int model;
u32 flags[NCAPINTS];
};
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 318b846..cc3bd58 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -17,7 +17,7 @@
int memcmp(const void *s1, const void *s2, size_t len)
{
- u8 diff;
+ bool diff;
asm("repe; cmpsb; setnz %0"
: "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ec138e5..9e1e27d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -40,10 +40,10 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
-__visible void enter_from_user_mode(void)
+__visible inline void enter_from_user_mode(void)
{
CT_WARN_ON(ct_state() != CONTEXT_USER);
- user_exit();
+ user_exit_irqoff();
}
#else
static inline void enter_from_user_mode(void) {}
@@ -274,7 +274,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
ti->status &= ~TS_COMPAT;
#endif
- user_enter();
+ user_enter_irqoff();
}
#define SYSCALL_EXIT_WORK_FLAGS \
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 983e5d3..0b56666 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1153,3 +1153,14 @@ ENTRY(async_page_fault)
jmp error_code
END(async_page_fault)
#endif
+
+ENTRY(rewind_stack_do_exit)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
+
+ call do_exit
+1: jmp 1b
+END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9ee0da1..b846875 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1423,3 +1423,14 @@ ENTRY(ignore_sysret)
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+
+ENTRY(rewind_stack_do_exit)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+
+ call do_exit
+1: jmp 1b
+END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 555263e..e9ce9c7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -374,5 +374,5 @@
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat compat_sys_execveat/ptregs
-534 x32 preadv2 compat_sys_preadv2
-535 x32 pwritev2 compat_sys_pwritev2
+546 x32 preadv2 compat_sys_preadv64v2
+547 x32 pwritev2 compat_sys_pwritev64v2
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 027aec4..627ecbc 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -33,7 +33,7 @@
.endif
call \func
- jmp restore
+ jmp .L_restore
_ASM_NOKPROBE(\name)
.endm
@@ -54,7 +54,7 @@
#if defined(CONFIG_TRACE_IRQFLAGS) \
|| defined(CONFIG_DEBUG_LOCK_ALLOC) \
|| defined(CONFIG_PREEMPT)
-restore:
+.L_restore:
popq %r11
popq %r10
popq %r9
@@ -66,5 +66,5 @@ restore:
popq %rdi
popq %rbp
ret
- _ASM_NOKPROBE(restore)
+ _ASM_NOKPROBE(.L_restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 253b72e..68b63fd 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -134,7 +134,7 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o
+targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
targets += vdso32/vclock_gettime.o
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
@@ -156,7 +156,8 @@ $(obj)/vdso32.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
$(obj)/vdso32/note.o \
- $(obj)/vdso32/system_call.o
+ $(obj)/vdso32/system_call.o \
+ $(obj)/vdso32/sigreturn.o
$(call if_changed,vdso)
#
diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
index d7ec4e2..20633e0 100644
--- a/arch/x86/entry/vdso/vdso32/sigreturn.S
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -1,11 +1,3 @@
-/*
- * Common code for the sigreturn entry points in vDSO images.
- * So far this code is the same for both int80 and sysenter versions.
- * This file is #include'd by int80.S et al to define them first thing.
- * The kernel assumes that the addresses of these routines are constant
- * for all vDSO implementations.
- */
-
#include <linux/linkage.h>
#include <asm/unistd_32.h>
#include <asm/asm-offsets.h>
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 0109ac6..ed4bc97 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -2,16 +2,11 @@
* AT_SYSINFO entry point
*/
+#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
-/*
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#include "sigreturn.S"
-
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index ab220ac..3329844 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <linux/ptrace.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
@@ -97,10 +98,40 @@ static int vdso_fault(const struct vm_special_mapping *sm,
return 0;
}
-static const struct vm_special_mapping text_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
-};
+static void vdso_fix_landing(const struct vdso_image *image,
+ struct vm_area_struct *new_vma)
+{
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long vdso_land = image->sym_int80_landing_pad;
+ unsigned long old_land_addr = vdso_land +
+ (unsigned long)current->mm->context.vdso;
+
+ /* Fixing userspace landing - look at do_fast_syscall_32 */
+ if (regs->ip == old_land_addr)
+ regs->ip = new_vma->vm_start + vdso_land;
+ }
+#endif
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+ const struct vdso_image *image = current->mm->context.vdso_image;
+
+ if (image->size != new_size)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
+ return -EFAULT;
+
+ vdso_fix_landing(image, new_vma);
+ current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+ return 0;
+}
static int vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -151,6 +182,12 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
struct vm_area_struct *vma;
unsigned long addr, text_start;
int ret = 0;
+
+ static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+ };
static const struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.fault = vvar_fault,
@@ -185,7 +222,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
image->size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- &text_mapping);
+ &vdso_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 174c254..75fc719 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -96,7 +96,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
* XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_error, this could go away.
+ * sig_on_uaccess_err, this could go away.
*/
if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
@@ -125,7 +125,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_error;
+ int prev_sig_on_uaccess_err;
long ret;
/*
@@ -221,8 +221,8 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
*/
- prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
- current_thread_info()->sig_on_uaccess_error = 1;
+ prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
+ current->thread.sig_on_uaccess_err = 1;
ret = -EFAULT;
switch (vsyscall_nr) {
@@ -243,7 +243,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
break;
}
- current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+ current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
check_fault:
if (ret == -EFAULT) {
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 26ced53..dfebbde2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -263,7 +263,7 @@ static bool check_hw_exists(void)
msr_fail:
pr_cont("Broken PMU hardware detected, using software events only.\n");
- pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+ printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
reg, val_new);
@@ -1622,6 +1622,29 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha
}
EXPORT_SYMBOL_GPL(events_sysfs_show);
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_ht_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_ht_attr, attr);
+
+ /*
+ * Report conditional events depending on Hyper-Threading.
+ *
+ * This is overly conservative as usually the HT special
+ * handling is not needed if the other CPU thread is idle.
+ *
+ * Note this does not (and cannot) handle the case when thread
+ * siblings are invisible, for example with virtualization
+ * if they are owned by some other guest. The user tool
+ * has to re-read when a thread sibling gets onlined later.
+ */
+ return sprintf(page, "%s",
+ topology_max_smt_threads() > 1 ?
+ pmu_attr->event_str_ht :
+ pmu_attr->event_str_noht);
+}
+
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
EVENT_ATTR(cache-references, CACHE_REFERENCES );
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9b4f9d3..0974ba1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -16,6 +16,7 @@
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
+#include <asm/intel-family.h>
#include <asm/apic.h>
#include "../perf_event.h"
@@ -185,7 +186,7 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
-struct event_constraint intel_skl_event_constraints[] = {
+static struct event_constraint intel_skl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
@@ -204,10 +205,8 @@ struct event_constraint intel_skl_event_constraints[] = {
};
static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
- INTEL_UEVENT_EXTRA_REG(0x01b7,
- MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
- INTEL_UEVENT_EXTRA_REG(0x02b7,
- MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1),
EVENT_EXTRA_END
};
@@ -243,14 +242,51 @@ EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
-struct attribute *nhm_events_attrs[] = {
+static struct attribute *nhm_events_attrs[] = {
EVENT_PTR(mem_ld_nhm),
NULL,
};
-struct attribute *snb_events_attrs[] = {
+/*
+ * topdown events for Intel Core CPUs.
+ *
+ * The events are all in slots, which is a free slot in a 4 wide
+ * pipeline. Some events are already reported in slots, for cycle
+ * events we multiply by the pipeline width (4).
+ *
+ * With Hyper Threading on, topdown metrics are either summed or averaged
+ * between the threads of a core: (count_t0 + count_t1).
+ *
+ * For the average case the metric is always scaled to pipeline width,
+ * so we use factor 2 ((count_t0 + count_t1) / 2 * 4)
+ */
+
+EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
+ "event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */
+ "event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */
+EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
+ "event=0xe,umask=0x1"); /* uops_issued.any */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
+ "event=0xc2,umask=0x2"); /* uops_retired.retire_slots */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
+ "event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
+ "event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */
+ "event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
+ "4", "2");
+
+static struct attribute *snb_events_attrs[] = {
EVENT_PTR(mem_ld_snb),
EVENT_PTR(mem_st_snb),
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
NULL,
};
@@ -280,7 +316,7 @@ static struct event_constraint intel_hsw_event_constraints[] = {
EVENT_CONSTRAINT_END
};
-struct event_constraint intel_bdw_event_constraints[] = {
+static struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
@@ -1361,6 +1397,29 @@ static __initconst const u64 atom_hw_cache_event_ids
},
};
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
+/* no_alloc_cycles.not_delivered */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
+ "event=0xca,umask=0x50");
+EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
+ "event=0xc2,umask=0x10");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
+ "event=0xc2,umask=0x10");
+
+static struct attribute *slm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_slm),
+ EVENT_PTR(td_total_slots_scale_slm),
+ EVENT_PTR(td_fetch_bubbles_slm),
+ EVENT_PTR(td_fetch_bubbles_scale_slm),
+ EVENT_PTR(td_slots_issued_slm),
+ EVENT_PTR(td_slots_retired_slm),
+ NULL
+};
+
static struct extra_reg intel_slm_extra_regs[] __read_mostly =
{
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
@@ -3290,11 +3349,11 @@ static int intel_snb_pebs_broken(int cpu)
u32 rev = UINT_MAX; /* default to broken for unknown models */
switch (cpu_data(cpu).x86_model) {
- case 42: /* SNB */
+ case INTEL_FAM6_SANDYBRIDGE:
rev = 0x28;
break;
- case 45: /* SNB-EP */
+ case INTEL_FAM6_SANDYBRIDGE_X:
switch (cpu_data(cpu).x86_mask) {
case 6: rev = 0x618; break;
case 7: rev = 0x70c; break;
@@ -3331,6 +3390,13 @@ static void intel_snb_check_microcode(void)
}
}
+static bool is_lbr_from(unsigned long msr)
+{
+ unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr;
+
+ return x86_pmu.lbr_from <= msr && msr < lbr_from_nr;
+}
+
/*
* Under certain circumstances, access certain MSR may cause #GP.
* The function tests if the input MSR can be safely accessed.
@@ -3351,13 +3417,24 @@ static bool check_msr(unsigned long msr, u64 mask)
* Only change the bits which can be updated by wrmsrl.
*/
val_tmp = val_old ^ mask;
+
+ if (is_lbr_from(msr))
+ val_tmp = lbr_from_signext_quirk_wr(val_tmp);
+
if (wrmsrl_safe(msr, val_tmp) ||
rdmsrl_safe(msr, &val_new))
return false;
+ /*
+ * Quirk only affects validation in wrmsr(), so wrmsrl()'s value
+ * should equal rdmsrl()'s even with the quirk.
+ */
if (val_new != val_tmp)
return false;
+ if (is_lbr_from(msr))
+ val_old = lbr_from_signext_quirk_wr(val_old);
+
/* Here it's sure that the MSR can be safely accessed.
* Restore the old value and return.
*/
@@ -3466,6 +3543,13 @@ static struct attribute *hsw_events_attrs[] = {
EVENT_PTR(cycles_ct),
EVENT_PTR(mem_ld_hsw),
EVENT_PTR(mem_st_hsw),
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
NULL
};
@@ -3537,15 +3621,15 @@ __init int intel_pmu_init(void)
* Install the hw-cache-events table:
*/
switch (boot_cpu_data.x86_model) {
- case 14: /* 65nm Core "Yonah" */
+ case INTEL_FAM6_CORE_YONAH:
pr_cont("Core events, ");
break;
- case 15: /* 65nm Core2 "Merom" */
+ case INTEL_FAM6_CORE2_MEROM:
x86_add_quirk(intel_clovertown_quirk);
- case 22: /* 65nm Core2 "Merom-L" */
- case 23: /* 45nm Core2 "Penryn" */
- case 29: /* 45nm Core2 "Dunnington (MP) */
+ case INTEL_FAM6_CORE2_MEROM_L:
+ case INTEL_FAM6_CORE2_PENRYN:
+ case INTEL_FAM6_CORE2_DUNNINGTON:
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -3556,9 +3640,9 @@ __init int intel_pmu_init(void)
pr_cont("Core2 events, ");
break;
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -3586,11 +3670,11 @@ __init int intel_pmu_init(void)
pr_cont("Nehalem events, ");
break;
- case 28: /* 45nm Atom "Pineview" */
- case 38: /* 45nm Atom "Lincroft" */
- case 39: /* 32nm Atom "Penwell" */
- case 53: /* 32nm Atom "Cloverview" */
- case 54: /* 32nm Atom "Cedarview" */
+ case INTEL_FAM6_ATOM_PINEVIEW:
+ case INTEL_FAM6_ATOM_LINCROFT:
+ case INTEL_FAM6_ATOM_PENWELL:
+ case INTEL_FAM6_ATOM_CLOVERVIEW:
+ case INTEL_FAM6_ATOM_CEDARVIEW:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -3602,9 +3686,9 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
- case 55: /* 22nm Atom "Silvermont" */
- case 76: /* 14nm Atom "Airmont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case INTEL_FAM6_ATOM_SILVERMONT1:
+ case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_AIRMONT:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -3616,11 +3700,12 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
x86_pmu.extra_regs = intel_slm_extra_regs;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.cpu_events = slm_events_attrs;
pr_cont("Silvermont events, ");
break;
- case 92: /* 14nm Atom "Goldmont" */
- case 95: /* 14nm Atom "Goldmont Denverton" */
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_DENVERTON:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -3643,9 +3728,9 @@ __init int intel_pmu_init(void)
pr_cont("Goldmont events, ");
break;
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_WESTMERE_EX:
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -3672,8 +3757,8 @@ __init int intel_pmu_init(void)
pr_cont("Westmere events, ");
break;
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_SANDYBRIDGE_X:
x86_add_quirk(intel_sandybridge_quirk);
x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
@@ -3686,7 +3771,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
- if (boot_cpu_data.x86_model == 45)
+ if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -3708,8 +3793,8 @@ __init int intel_pmu_init(void)
pr_cont("SandyBridge events, ");
break;
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE_X:
x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -3725,7 +3810,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
x86_pmu.pebs_prec_dist = true;
- if (boot_cpu_data.x86_model == 62)
+ if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -3743,10 +3828,10 @@ __init int intel_pmu_init(void)
break;
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 69: /* 22nm Haswell ULT */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
x86_add_quirk(intel_ht_bug);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -3770,10 +3855,10 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -3806,7 +3891,7 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
- case 87: /* Knights Landing Xeon Phi */
+ case INTEL_FAM6_XEON_PHI_KNL:
memcpy(hw_cache_event_ids,
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs,
@@ -3824,16 +3909,22 @@ __init int intel_pmu_init(void)
pr_cont("Knights Landing events, ");
break;
- case 142: /* 14nm Kabylake Mobile */
- case 158: /* 14nm Kabylake Desktop */
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
- case 85: /* 14nm Skylake Server */
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_skl();
+ /* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
+ event_attr_td_recovery_bubbles.event_str_noht =
+ "event=0xd,umask=0x1,cmask=1";
+ event_attr_td_recovery_bubbles.event_str_ht =
+ "event=0xd,umask=0x1,cmask=1,any=1";
+
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
@@ -3914,6 +4005,8 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
+ if (x86_pmu.lbr_nr)
+ pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
/*
* Access extra MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support offcore event
@@ -3946,16 +4039,14 @@ __init int intel_pmu_init(void)
*/
static __init int fixup_ht_bug(void)
{
- int cpu = smp_processor_id();
- int w, c;
+ int c;
/*
* problem not present on this CPU model, nothing to do
*/
if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
return 0;
- w = cpumask_weight(topology_sibling_cpumask(cpu));
- if (w > 1) {
+ if (topology_max_smt_threads() > 1) {
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
return 0;
}
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9ba4e41..4c7638b 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -89,6 +89,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "../perf_event.h"
MODULE_LICENSE("GPL");
@@ -511,37 +512,37 @@ static const struct cstate_model slm_cstates __initconst = {
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) }
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
- X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */
- X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */
- X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates),
- X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */
- X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */
- X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates),
- X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */
- X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */
+ X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates),
- X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */
- X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */
+ X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates),
- X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */
- X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */
- X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates),
- X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
- X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */
- X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */
- X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
- X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */
- X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */
- X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */
- X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates),
- X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */
- X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */
+ X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e2b40c..707d358 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -77,9 +77,11 @@ static enum {
LBR_IND_JMP |\
LBR_FAR)
-#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
-#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
-#define LBR_FROM_FLAG_ABORT (1ULL << 61)
+#define LBR_FROM_FLAG_MISPRED BIT_ULL(63)
+#define LBR_FROM_FLAG_IN_TX BIT_ULL(62)
+#define LBR_FROM_FLAG_ABORT BIT_ULL(61)
+
+#define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59))
/*
* x86control flow change classification
@@ -235,6 +237,97 @@ enum {
LBR_VALID,
};
+/*
+ * For formats with LBR_TSX flags (e.g. LBR_FORMAT_EIP_FLAGS2), bits 61:62 in
+ * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when
+ * TSX is not supported they have no consistent behavior:
+ *
+ * - For wrmsr(), bits 61:62 are considered part of the sign extension.
+ * - For HW updates (branch captures) bits 61:62 are always OFF and are not
+ * part of the sign extension.
+ *
+ * Therefore, if:
+ *
+ * 1) LBR has TSX format
+ * 2) CPU has no TSX support enabled
+ *
+ * ... then any value passed to wrmsr() must be sign extended to 63 bits and any
+ * value from rdmsr() must be converted to have a 61 bits sign extension,
+ * ignoring the TSX flags.
+ */
+static inline bool lbr_from_signext_quirk_needed(void)
+{
+ int lbr_format = x86_pmu.intel_cap.lbr_format;
+ bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
+ boot_cpu_has(X86_FEATURE_RTM);
+
+ return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX);
+}
+
+DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
+
+/* If quirk is enabled, ensure sign extension is 63 bits: */
+inline u64 lbr_from_signext_quirk_wr(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63.
+ *
+ * Quirk is enabled when TSX is disabled. Therefore TSX bits
+ * in val are always OFF and must be changed to be sign
+ * extension bits. Since bits 59:60 are guaranteed to be
+ * part of the sign extension bits, we can just copy them
+ * to 61:62.
+ */
+ val |= (LBR_FROM_SIGNEXT_2MSB & val) << 2;
+ }
+ return val;
+}
+
+/*
+ * If quirk is needed, ensure sign extension is 61 bits:
+ */
+u64 lbr_from_signext_quirk_rd(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Quirk is on when TSX is not enabled. Therefore TSX
+ * flags must be read as OFF.
+ */
+ val &= ~(LBR_FROM_FLAG_IN_TX | LBR_FROM_FLAG_ABORT);
+ }
+ return val;
+}
+
+static inline void wrlbr_from(unsigned int idx, u64 val)
+{
+ val = lbr_from_signext_quirk_wr(val);
+ wrmsrl(x86_pmu.lbr_from + idx, val);
+}
+
+static inline void wrlbr_to(unsigned int idx, u64 val)
+{
+ wrmsrl(x86_pmu.lbr_to + idx, val);
+}
+
+static inline u64 rdlbr_from(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(x86_pmu.lbr_from + idx, val);
+
+ return lbr_from_signext_quirk_rd(val);
+}
+
+static inline u64 rdlbr_to(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(x86_pmu.lbr_to + idx, val);
+
+ return val;
+}
+
static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{
int i;
@@ -251,8 +344,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
tos = task_ctx->tos;
for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
- wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
- wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
+ wrlbr_to (lbr_idx, task_ctx->lbr_to[i]);
+
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
@@ -262,9 +356,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{
- int i;
unsigned lbr_idx, mask;
u64 tos;
+ int i;
if (task_ctx->lbr_callstack_users == 0) {
task_ctx->lbr_stack_state = LBR_NONE;
@@ -275,8 +369,8 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
tos = intel_pmu_lbr_tos();
for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
- rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
- rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ task_ctx->lbr_from[i] = rdlbr_from(lbr_idx);
+ task_ctx->lbr_to[i] = rdlbr_to(lbr_idx);
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
@@ -452,8 +546,8 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
- rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
- rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+ from = rdlbr_from(lbr_idx);
+ to = rdlbr_to(lbr_idx);
if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
@@ -956,7 +1050,6 @@ void __init intel_pmu_lbr_init_core(void)
* SW branch filter usage:
* - compensate for lack of HW filter
*/
- pr_cont("4-deep LBR, ");
}
/* nehalem/westmere */
@@ -977,7 +1070,6 @@ void __init intel_pmu_lbr_init_nhm(void)
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
- pr_cont("16-deep LBR, ");
}
/* sandy bridge */
@@ -997,7 +1089,6 @@ void __init intel_pmu_lbr_init_snb(void)
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
- pr_cont("16-deep LBR, ");
}
/* haswell */
@@ -1011,7 +1102,8 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
- pr_cont("16-deep LBR, ");
+ if (lbr_from_signext_quirk_needed())
+ static_branch_enable(&lbr_from_quirk_key);
}
/* skylake */
@@ -1031,7 +1123,6 @@ __init void intel_pmu_lbr_init_skl(void)
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
- pr_cont("32-deep LBR, ");
}
/* atom */
@@ -1057,7 +1148,6 @@ void __init intel_pmu_lbr_init_atom(void)
* SW branch filter usage:
* - compensate for lack of HW filter
*/
- pr_cont("8-deep LBR, ");
}
/* slm */
@@ -1088,6 +1178,4 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = snb_lbr_sel_map;
-
- pr_cont("8-deep LBR, ");
}
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index e30eef4..d0c58b3 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -55,6 +55,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "../perf_event.h"
MODULE_LICENSE("GPL");
@@ -786,26 +787,27 @@ static const struct intel_rapl_init_fun skl_rapl_init __initconst = {
};
static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
- X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */
- X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init),
- X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */
- X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
- X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */
- X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */
- X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */
- X86_RAPL_MODEL_MATCH(70, hsw_rapl_init), /* Haswell GT3e */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
- X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */
- X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */
- X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */
- X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init),
- X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
- X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */
- X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index fce7406..59b4974 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1,4 +1,5 @@
#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "uncore.h"
static struct intel_uncore_type *empty_uncore[] = { NULL, };
@@ -882,7 +883,7 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
+ struct intel_uncore_pmu *pmu = NULL;
struct intel_uncore_box *box;
int phys_id, pkg, ret;
@@ -903,20 +904,37 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
}
type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+
/*
- * for performance monitoring unit with multiple boxes,
- * each box has a different function id.
- */
- pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
- /* Knights Landing uses a common PCI device ID for multiple instances of
- * an uncore PMU device type. There is only one entry per device type in
- * the knl_uncore_pci_ids table inspite of multiple devices present for
- * some device types. Hence PCI device idx would be 0 for all devices.
- * So increment pmu pointer to point to an unused array element.
+ * Some platforms, e.g. Knights Landing, use a common PCI device ID
+ * for multiple instances of an uncore PMU device type. We should check
+ * PCI slot and func to indicate the uncore box.
*/
- if (boot_cpu_data.x86_model == 87) {
- while (pmu->func_id >= 0)
- pmu++;
+ if (id->driver_data & ~0xffff) {
+ struct pci_driver *pci_drv = pdev->driver;
+ const struct pci_device_id *ids = pci_drv->id_table;
+ unsigned int devfn;
+
+ while (ids && ids->vendor) {
+ if ((ids->vendor == pdev->vendor) &&
+ (ids->device == pdev->device)) {
+ devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
+ UNCORE_PCI_DEV_FUNC(ids->driver_data));
+ if (devfn == pdev->devfn) {
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
+ break;
+ }
+ }
+ ids++;
+ }
+ if (pmu == NULL)
+ return -ENODEV;
+ } else {
+ /*
+ * for performance monitoring unit with multiple boxes,
+ * each box has a different function id.
+ */
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
@@ -956,7 +974,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
static void uncore_pci_remove(struct pci_dev *pdev)
{
- struct intel_uncore_box *box = pci_get_drvdata(pdev);
+ struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
int i, phys_id, pkg;
@@ -1361,30 +1379,32 @@ static const struct intel_uncore_init_fun knl_uncore_init __initconst = {
};
static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
+ .cpu_init = skl_uncore_cpu_init,
.pci_init = skl_uncore_pci_init,
};
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
- X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */
- X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init),
- X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */
- X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init),
- X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */
- X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */
- X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */
- X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */
- X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */
- X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */
- X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */
- X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */
- X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */
- X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. Xeon E7 */
- X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */
- X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */
- X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */
- X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */
- X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */
- X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 79766b9..d6063e4 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -15,7 +15,11 @@
#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
+#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \
+ ((dev << 24) | (func << 16) | (type << 8) | idx)
#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_DEV(data) ((data >> 24) & 0xff)
+#define UNCORE_PCI_DEV_FUNC(data) ((data >> 16) & 0xff)
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
@@ -360,6 +364,7 @@ int bdw_uncore_pci_init(void);
int skl_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
+void skl_uncore_cpu_init(void);
int snb_pci2phy_map_init(int devid);
/* perf_event_intel_uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 96531d2..97a69db 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1,4 +1,4 @@
-/* Nehalem/SandBridge/Haswell uncore support */
+/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */
#include "uncore.h"
/* Uncore IMC PCI IDs */
@@ -9,6 +9,7 @@
#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f
+#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x190c
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -64,6 +65,10 @@
#define NHM_UNC_PERFEVTSEL0 0x3c0
#define NHM_UNC_UNCORE_PMC0 0x3b0
+/* SKL uncore global control */
+#define SKL_UNC_PERF_GLOBAL_CTL 0xe01
+#define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1)
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
@@ -179,6 +184,60 @@ void snb_uncore_cpu_init(void)
snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
}
+static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops skl_uncore_msr_ops = {
+ .init_box = skl_uncore_msr_init_box,
+ .exit_box = skl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &skl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *skl_msr_uncores[] = {
+ &skl_uncore_cbox,
+ &snb_uncore_arb,
+ NULL,
+};
+
+void skl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = skl_msr_uncores;
+ if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ snb_uncore_arb.ops = &skl_uncore_msr_ops;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -544,6 +603,11 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+
{ /* end: all zeroes */ },
};
@@ -587,6 +651,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */
+ IMC_DEV(SKL_U_IMC, &skl_uncore_pci_driver), /* 6th Gen Core U */
{ /* end marker */ }
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 874e8bd..824e540 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -2164,21 +2164,101 @@ static struct intel_uncore_type *knl_pci_uncores[] = {
*/
static const struct pci_device_id knl_uncore_pci_ids[] = {
- { /* MC UClk */
+ { /* MC0 UClk */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
- .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 0, KNL_PCI_UNCORE_MC_UCLK, 0),
},
- { /* MC DClk Channel */
+ { /* MC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 0, KNL_PCI_UNCORE_MC_UCLK, 1),
+ },
+ { /* MC0 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 2, KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* MC0 DClk CH 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 3, KNL_PCI_UNCORE_MC_DCLK, 1),
+ },
+ { /* MC0 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 4, KNL_PCI_UNCORE_MC_DCLK, 2),
+ },
+ { /* MC1 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 2, KNL_PCI_UNCORE_MC_DCLK, 3),
+ },
+ { /* MC1 DClk CH 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
- .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 3, KNL_PCI_UNCORE_MC_DCLK, 4),
+ },
+ { /* MC1 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 4, KNL_PCI_UNCORE_MC_DCLK, 5),
+ },
+ { /* EDC0 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, KNL_PCI_UNCORE_EDC_UCLK, 1),
+ },
+ { /* EDC2 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(17, 0, KNL_PCI_UNCORE_EDC_UCLK, 2),
+ },
+ { /* EDC3 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, KNL_PCI_UNCORE_EDC_UCLK, 3),
},
- { /* EDC UClk */
+ { /* EDC4 UClk */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
- .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(19, 0, KNL_PCI_UNCORE_EDC_UCLK, 4),
+ },
+ { /* EDC5 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(20, 0, KNL_PCI_UNCORE_EDC_UCLK, 5),
+ },
+ { /* EDC6 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 0, KNL_PCI_UNCORE_EDC_UCLK, 6),
+ },
+ { /* EDC7 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 0, KNL_PCI_UNCORE_EDC_UCLK, 7),
+ },
+ { /* EDC0 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(24, 2, KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* EDC1 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(25, 2, KNL_PCI_UNCORE_EDC_ECLK, 1),
+ },
+ { /* EDC2 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(26, 2, KNL_PCI_UNCORE_EDC_ECLK, 2),
+ },
+ { /* EDC3 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(27, 2, KNL_PCI_UNCORE_EDC_ECLK, 3),
+ },
+ { /* EDC4 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(28, 2, KNL_PCI_UNCORE_EDC_ECLK, 4),
+ },
+ { /* EDC5 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(29, 2, KNL_PCI_UNCORE_EDC_ECLK, 5),
+ },
+ { /* EDC6 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(30, 2, KNL_PCI_UNCORE_EDC_ECLK, 6),
},
- { /* EDC EClk */
+ { /* EDC7 EClk */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
- .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(31, 2, KNL_PCI_UNCORE_EDC_ECLK, 7),
},
{ /* M2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 85ef3c2..50b3a05 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -1,4 +1,5 @@
#include <linux/perf_event.h>
+#include <asm/intel-family.h>
enum perf_msr_id {
PERF_MSR_TSC = 0,
@@ -34,39 +35,43 @@ static bool test_intel(int idx)
return false;
switch (boot_cpu_data.x86_model) {
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
-
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
-
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
-
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
-
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 69: /* 22nm Haswell ULT */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
-
- case 55: /* 22nm Atom "Silvermont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
- case 76: /* 14nm Atom "Airmont" */
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
+
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE2:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_WESTMERE_EX:
+
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_SANDYBRIDGE_X:
+
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE_X:
+
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
+
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+
+ case INTEL_FAM6_ATOM_SILVERMONT1:
+ case INTEL_FAM6_ATOM_SILVERMONT2:
+ case INTEL_FAM6_ATOM_AIRMONT:
if (idx == PERF_MSR_SMI)
return true;
break;
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8bd764d..8c4a477 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -668,6 +668,14 @@ static struct perf_pmu_events_attr event_attr_##v = { \
.event_str = str, \
};
+#define EVENT_ATTR_STR_HT(_name, v, noht, ht) \
+static struct perf_pmu_events_ht_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str_noht = noht, \
+ .event_str_ht = ht, \
+}
+
extern struct x86_pmu x86_pmu __read_mostly;
static inline bool x86_pmu_has_lbr_callstack(void)
@@ -803,6 +811,8 @@ struct attribute **merge_attr(struct attribute **a, struct attribute **b);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
#ifdef CONFIG_CPU_SUP_AMD
@@ -892,6 +902,8 @@ void intel_ds_init(void);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+u64 lbr_from_signext_quirk_wr(u64 val);
+
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index aeac434..2cfed17 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -1,5 +1,11 @@
+generated-y += syscalls_32.h
+generated-y += syscalls_64.h
+generated-y += unistd_32_ia32.h
+generated-y += unistd_64_x32.h
+generated-y += xen-hypercalls.h
+
genhdr-y += unistd_32.h
genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bc27611..f5befd4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -300,7 +300,6 @@ struct apic {
unsigned int (*get_apic_id)(unsigned long x);
unsigned long (*set_apic_id)(unsigned int id);
- unsigned long apic_id_mask;
int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
const struct cpumask *andmask,
diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h
index 20370c6..93eebc63 100644
--- a/arch/x86/include/asm/apm.h
+++ b/arch/x86/include/asm/apm.h
@@ -45,11 +45,11 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
: "memory", "cc");
}
-static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
- u32 ecx_in, u32 *eax)
+static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
+ u32 ecx_in, u32 *eax)
{
int cx, dx, si;
- u8 error;
+ bool error;
/*
* N.B. We do NOT need a cld after the BIOS call
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799f..e7cd631 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -4,8 +4,8 @@
#include <asm/cpufeatures.h>
#ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
/* popcnt %rdi, %rax */
#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
#define REG_IN "D"
@@ -17,19 +17,15 @@
#define REG_OUT "a"
#endif
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
- unsigned int res = 0;
+ unsigned int res;
asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ : "="REG_OUT (res)
+ : REG_IN (w));
return res;
}
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
#else
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
- unsigned long res = 0;
+ unsigned long res;
asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ : "="REG_OUT (res)
+ : REG_IN (w));
return res;
}
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 69f1366..5b0579a 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -25,8 +25,6 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
-#include <asm/alternative.h>
-#include <asm/nops.h>
#define RDRAND_RETRY_LOOPS 10
@@ -40,97 +38,91 @@
# define RDSEED_LONG RDSEED_INT
#endif
-#ifdef CONFIG_ARCH_RANDOM
+/* Unconditional execution of RDRAND and RDSEED */
-/* Instead of arch_get_random_long() when alternatives haven't run. */
-static inline int rdrand_long(unsigned long *v)
+static inline bool rdrand_long(unsigned long *v)
{
- int ok;
- asm volatile("1: " RDRAND_LONG "\n\t"
- "jc 2f\n\t"
- "decl %0\n\t"
- "jnz 1b\n\t"
- "2:"
- : "=r" (ok), "=a" (*v)
- : "0" (RDRAND_RETRY_LOOPS));
- return ok;
+ bool ok;
+ unsigned int retry = RDRAND_RETRY_LOOPS;
+ do {
+ asm volatile(RDRAND_LONG "\n\t"
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ if (ok)
+ return true;
+ } while (--retry);
+ return false;
+}
+
+static inline bool rdrand_int(unsigned int *v)
+{
+ bool ok;
+ unsigned int retry = RDRAND_RETRY_LOOPS;
+ do {
+ asm volatile(RDRAND_INT "\n\t"
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ if (ok)
+ return true;
+ } while (--retry);
+ return false;
}
-/* A single attempt at RDSEED */
static inline bool rdseed_long(unsigned long *v)
{
- unsigned char ok;
+ bool ok;
asm volatile(RDSEED_LONG "\n\t"
- "setc %0"
- : "=qm" (ok), "=a" (*v));
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
return ok;
}
-#define GET_RANDOM(name, type, rdrand, nop) \
-static inline int name(type *v) \
-{ \
- int ok; \
- alternative_io("movl $0, %0\n\t" \
- nop, \
- "\n1: " rdrand "\n\t" \
- "jc 2f\n\t" \
- "decl %0\n\t" \
- "jnz 1b\n\t" \
- "2:", \
- X86_FEATURE_RDRAND, \
- ASM_OUTPUT2("=r" (ok), "=a" (*v)), \
- "0" (RDRAND_RETRY_LOOPS)); \
- return ok; \
-}
-
-#define GET_SEED(name, type, rdseed, nop) \
-static inline int name(type *v) \
-{ \
- unsigned char ok; \
- alternative_io("movb $0, %0\n\t" \
- nop, \
- rdseed "\n\t" \
- "setc %0", \
- X86_FEATURE_RDSEED, \
- ASM_OUTPUT2("=q" (ok), "=a" (*v))); \
- return ok; \
+static inline bool rdseed_int(unsigned int *v)
+{
+ bool ok;
+ asm volatile(RDSEED_INT "\n\t"
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ return ok;
}
-#ifdef CONFIG_X86_64
-
-GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5);
-GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4);
-
-GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5);
-GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4);
-
-#else
-
-GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3);
-GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
-
-GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4);
-GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4);
-
-#endif /* CONFIG_X86_64 */
-
+/* Conditional execution based on CPU type */
#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND)
#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED)
-#else
+/*
+ * These are the generic interfaces; they must not be declared if the
+ * stubs in <linux/random.h> are to be invoked,
+ * i.e. CONFIG_ARCH_RANDOM is not defined.
+ */
+#ifdef CONFIG_ARCH_RANDOM
-static inline int rdrand_long(unsigned long *v)
+static inline bool arch_get_random_long(unsigned long *v)
{
- return 0;
+ return arch_has_random() ? rdrand_long(v) : false;
}
-static inline bool rdseed_long(unsigned long *v)
+static inline bool arch_get_random_int(unsigned int *v)
{
- return 0;
+ return arch_has_random() ? rdrand_int(v) : false;
}
-#endif /* CONFIG_ARCH_RANDOM */
+static inline bool arch_get_random_seed_long(unsigned long *v)
+{
+ return arch_has_random_seed() ? rdseed_long(v) : false;
+}
+
+static inline bool arch_get_random_seed_int(unsigned int *v)
+{
+ return arch_has_random_seed() ? rdseed_int(v) : false;
+}
extern void x86_init_rdrand(struct cpuinfo_x86 *c);
+#else /* !CONFIG_ARCH_RANDOM */
+
+static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { }
+
+#endif /* !CONFIG_ARCH_RANDOM */
+
#endif /* ASM_X86_ARCHRANDOM_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index f5063b6..7acb51c 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -42,6 +42,18 @@
#define _ASM_SI __ASM_REG(si)
#define _ASM_DI __ASM_REG(di)
+/*
+ * Macros to generate condition code outputs from inline assembly,
+ * The output operand must be type "bool".
+ */
+#ifdef __GCC_ASM_FLAG_OUTPUTS__
+# define CC_SET(c) "\n\t/* output condition code " #c "*/\n"
+# define CC_OUT(c) "=@cc" #c
+#else
+# define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n"
+# define CC_OUT(c) [_cc_ ## c] "=qm"
+#endif
+
/* Exception table entry */
#ifdef __ASSEMBLY__
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 3e86742..14635c5 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -75,9 +75,9 @@ static __always_inline void atomic_sub(int i, atomic_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
}
/**
@@ -112,9 +112,9 @@ static __always_inline void atomic_dec(atomic_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static __always_inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline bool atomic_dec_and_test(atomic_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
+ GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
}
/**
@@ -125,9 +125,9 @@ static __always_inline int atomic_dec_and_test(atomic_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static __always_inline int atomic_inc_and_test(atomic_t *v)
+static __always_inline bool atomic_inc_and_test(atomic_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
+ GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
}
/**
@@ -139,9 +139,9 @@ static __always_inline int atomic_inc_and_test(atomic_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static __always_inline int atomic_add_negative(int i, atomic_t *v)
+static __always_inline bool atomic_add_negative(int i, atomic_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
}
/**
@@ -171,6 +171,16 @@ static __always_inline int atomic_sub_return(int i, atomic_t *v)
#define atomic_inc_return(v) (atomic_add_return(1, v))
#define atomic_dec_return(v) (atomic_sub_return(1, v))
+static __always_inline int atomic_fetch_add(int i, atomic_t *v)
+{
+ return xadd(&v->counter, i);
+}
+
+static __always_inline int atomic_fetch_sub(int i, atomic_t *v)
+{
+ return xadd(&v->counter, -i);
+}
+
static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
return cmpxchg(&v->counter, old, new);
@@ -190,10 +200,29 @@ static inline void atomic_##op(int i, atomic_t *v) \
: "memory"); \
}
-ATOMIC_OP(and)
-ATOMIC_OP(or)
-ATOMIC_OP(xor)
+#define ATOMIC_FETCH_OP(op, c_op) \
+static inline int atomic_fetch_##op(int i, atomic_t *v) \
+{ \
+ int old, val = atomic_read(v); \
+ for (;;) { \
+ old = atomic_cmpxchg(v, val, val c_op i); \
+ if (old == val) \
+ break; \
+ val = old; \
+ } \
+ return old; \
+}
+
+#define ATOMIC_OPS(op, c_op) \
+ ATOMIC_OP(op) \
+ ATOMIC_FETCH_OP(op, c_op)
+
+ATOMIC_OPS(and, &)
+ATOMIC_OPS(or , |)
+ATOMIC_OPS(xor, ^)
+#undef ATOMIC_OPS
+#undef ATOMIC_FETCH_OP
#undef ATOMIC_OP
/**
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index a984111..71d7705 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -320,10 +320,29 @@ static inline void atomic64_##op(long long i, atomic64_t *v) \
c = old; \
}
-ATOMIC64_OP(and, &)
-ATOMIC64_OP(or, |)
-ATOMIC64_OP(xor, ^)
+#define ATOMIC64_FETCH_OP(op, c_op) \
+static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \
+{ \
+ long long old, c = 0; \
+ while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
+ c = old; \
+ return old; \
+}
+
+ATOMIC64_FETCH_OP(add, +)
+
+#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
+
+#define ATOMIC64_OPS(op, c_op) \
+ ATOMIC64_OP(op, c_op) \
+ ATOMIC64_FETCH_OP(op, c_op)
+
+ATOMIC64_OPS(and, &)
+ATOMIC64_OPS(or, |)
+ATOMIC64_OPS(xor, ^)
+#undef ATOMIC64_OPS
+#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_OP
#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 0373510..89ed2f6 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -70,9 +70,9 @@ static inline void atomic64_sub(long i, atomic64_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static inline int atomic64_sub_and_test(long i, atomic64_t *v)
+static inline bool atomic64_sub_and_test(long i, atomic64_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", "e");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
}
/**
@@ -109,9 +109,9 @@ static __always_inline void atomic64_dec(atomic64_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline int atomic64_dec_and_test(atomic64_t *v)
+static inline bool atomic64_dec_and_test(atomic64_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e");
+ GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
}
/**
@@ -122,9 +122,9 @@ static inline int atomic64_dec_and_test(atomic64_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static inline int atomic64_inc_and_test(atomic64_t *v)
+static inline bool atomic64_inc_and_test(atomic64_t *v)
{
- GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e");
+ GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
}
/**
@@ -136,9 +136,9 @@ static inline int atomic64_inc_and_test(atomic64_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static inline int atomic64_add_negative(long i, atomic64_t *v)
+static inline bool atomic64_add_negative(long i, atomic64_t *v)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", "s");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
}
/**
@@ -158,6 +158,16 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
return atomic64_add_return(-i, v);
}
+static inline long atomic64_fetch_add(long i, atomic64_t *v)
+{
+ return xadd(&v->counter, i);
+}
+
+static inline long atomic64_fetch_sub(long i, atomic64_t *v)
+{
+ return xadd(&v->counter, -i);
+}
+
#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
@@ -180,7 +190,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
* Atomically adds @a to @v, so long as it was not @u.
* Returns the old value of @v.
*/
-static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
+static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
{
long c, old;
c = atomic64_read(v);
@@ -229,10 +239,29 @@ static inline void atomic64_##op(long i, atomic64_t *v) \
: "memory"); \
}
-ATOMIC64_OP(and)
-ATOMIC64_OP(or)
-ATOMIC64_OP(xor)
+#define ATOMIC64_FETCH_OP(op, c_op) \
+static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
+{ \
+ long old, val = atomic64_read(v); \
+ for (;;) { \
+ old = atomic64_cmpxchg(v, val, val c_op i); \
+ if (old == val) \
+ break; \
+ val = old; \
+ } \
+ return old; \
+}
+
+#define ATOMIC64_OPS(op, c_op) \
+ ATOMIC64_OP(op) \
+ ATOMIC64_FETCH_OP(op, c_op)
+
+ATOMIC64_OPS(and, &)
+ATOMIC64_OPS(or, |)
+ATOMIC64_OPS(xor, ^)
+#undef ATOMIC64_OPS
+#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_OP
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 2b00c77..4b7b8e7 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -17,7 +17,7 @@ static inline unsigned int get_bios_ebda(void)
return address; /* 0 means none */
}
-void reserve_ebda_region(void);
+void reserve_bios_regions(void);
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
/*
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 7766d1c..68557f52 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -201,9 +201,9 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c);
}
/**
@@ -213,7 +213,7 @@ static __always_inline int test_and_set_bit(long nr, volatile unsigned long *add
*
* This is the same as test_and_set_bit on x86.
*/
-static __always_inline int
+static __always_inline bool
test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
return test_and_set_bit(nr, addr);
@@ -228,13 +228,13 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*/
-static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
asm("bts %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
+ CC_SET(c)
+ : CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
return oldbit;
}
@@ -247,9 +247,9 @@ static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *a
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c);
}
/**
@@ -268,25 +268,25 @@ static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *a
* accessed from a hypervisor on the same CPU if running in a VM: don't change
* this without also updating arch/x86/kernel/kvm.c
*/
-static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
asm volatile("btr %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
+ CC_SET(c)
+ : CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
return oldbit;
}
/* WARNING: non atomic and it can be reordered! */
-static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
asm volatile("btc %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
+ CC_SET(c)
+ : CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
return oldbit;
@@ -300,24 +300,24 @@ static __always_inline int __test_and_change_bit(long nr, volatile unsigned long
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
+ GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c);
}
-static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
+static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
-static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
return oldbit;
@@ -329,7 +329,7 @@ static __always_inline int variable_test_bit(long nr, volatile const unsigned lo
* @nr: bit number to test
* @addr: Address to start counting from
*/
-static int test_bit(int nr, const volatile unsigned long *addr);
+static bool test_bit(int nr, const volatile unsigned long *addr);
#endif
#define test_bit(nr, addr) \
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 532f85e..7b53743 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -2,8 +2,7 @@
#define _ASM_X86_CHECKSUM_32_H
#include <linux/in6.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* computes the checksum of a memory block at buff, length len,
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 5a3b2c1..a188061 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -40,6 +40,7 @@ typedef s32 compat_long_t;
typedef s64 __attribute__((aligned(4))) compat_s64;
typedef u32 compat_uint_t;
typedef u32 compat_ulong_t;
+typedef u32 compat_u32;
typedef u64 __attribute__((aligned(4))) compat_u64;
typedef u32 compat_uptr_t;
@@ -181,6 +182,16 @@ typedef struct compat_siginfo {
/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
struct {
unsigned int _addr; /* faulting insn/memory ref. */
+ short int _addr_lsb; /* Valid LSB of the reported address. */
+ union {
+ /* used when si_code=SEGV_BNDERR */
+ struct {
+ compat_uptr_t _lower;
+ compat_uptr_t _upper;
+ } _addr_bnd;
+ /* used when si_code=SEGV_PKUERR */
+ compat_u32 _pkey;
+ };
} _sigfault;
/* SIGPOLL */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 678637a..59d34c5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -17,7 +17,6 @@ static inline void prefill_possible_map(void) {}
#define cpu_physical_id(cpu) boot_cpu_physical_apicid
#define safe_smp_processor_id() 0
-#define stack_smp_processor_id() 0
#endif /* CONFIG_SMP */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4a41348..c64b1e9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -301,10 +301,6 @@
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
-#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */
-
-
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
@@ -312,5 +308,7 @@
*/
#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
+#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 78d1e74..d0bb76d 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -41,10 +41,9 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
/*
* Wrap all the virtual calls in a way that forces the parameters on the stack.
*/
-#define arch_efi_call_virt(f, args...) \
+#define arch_efi_call_virt(p, f, args...) \
({ \
- ((efi_##f##_t __attribute__((regparm(0)))*) \
- efi.systab->runtime->f)(args); \
+ ((efi_##f##_t __attribute__((regparm(0)))*) p->f)(args); \
})
#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size)
@@ -81,8 +80,8 @@ struct efi_scratch {
} \
})
-#define arch_efi_call_virt(f, args...) \
- efi_call((void *)efi.systab->runtime->f, args) \
+#define arch_efi_call_virt(p, f, args...) \
+ efi_call((void *)p->f, args) \
#define arch_efi_call_virt_teardown() \
({ \
@@ -125,7 +124,6 @@ extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
extern int __init efi_alloc_page_tables(void);
extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
-extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init old_map_region(efi_memory_desc_t *md);
extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_update_mappings(void);
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
new file mode 100644
index 0000000..2674ee3
--- /dev/null
+++ b/arch/x86/include/asm/kaslr.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_KASLR_H_
+#define _ASM_KASLR_H_
+
+unsigned long kaslr_get_random_long(const char *purpose);
+
+#ifdef CONFIG_RANDOMIZE_MEMORY
+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+
+void kernel_randomize_memory(void);
+#else
+static inline void kernel_randomize_memory(void) { }
+#endif /* CONFIG_RANDOMIZE_MEMORY */
+
+#endif
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index e5f5dc9..1ef9d58 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
unsigned long *sp, unsigned long bp);
+extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 4ad6560..7511978 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -50,9 +50,9 @@ static inline void local_sub(long i, local_t *l)
* true if the result is zero, or false for all
* other cases.
*/
-static inline int local_sub_and_test(long i, local_t *l)
+static inline bool local_sub_and_test(long i, local_t *l)
{
- GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", "e");
+ GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e);
}
/**
@@ -63,9 +63,9 @@ static inline int local_sub_and_test(long i, local_t *l)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline int local_dec_and_test(local_t *l)
+static inline bool local_dec_and_test(local_t *l)
{
- GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e");
+ GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e);
}
/**
@@ -76,9 +76,9 @@ static inline int local_dec_and_test(local_t *l)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static inline int local_inc_and_test(local_t *l)
+static inline bool local_inc_and_test(local_t *l)
{
- GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e");
+ GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e);
}
/**
@@ -90,9 +90,9 @@ static inline int local_inc_and_test(local_t *l)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static inline int local_add_negative(long i, local_t *l)
+static inline bool local_add_negative(long i, local_t *l)
{
- GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", "s");
+ GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s);
}
/**
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
index 85e6cda..e9355a8 100644
--- a/arch/x86/include/asm/mutex_32.h
+++ b/arch/x86/include/asm/mutex_32.h
@@ -101,7 +101,7 @@ static inline int __mutex_fastpath_trylock(atomic_t *count,
int (*fail_fn)(atomic_t *))
{
/* cmpxchg because it never induces a false contention state. */
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
return 1;
return 0;
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 07537a4..d985075 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -118,10 +118,10 @@ do { \
static inline int __mutex_fastpath_trylock(atomic_t *count,
int (*fail_fn)(atomic_t *))
{
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
return 1;
- else
- return 0;
+
+ return 0;
}
#endif /* _ASM_X86_MUTEX_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d5c2f8b..9215e05 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,6 +1,10 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
+#ifndef __ASSEMBLY__
+#include <asm/kaslr.h>
+#endif
+
#ifdef CONFIG_KASAN
#define KASAN_STACK_ORDER 1
#else
@@ -32,7 +36,12 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
-#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
+#ifdef CONFIG_RANDOMIZE_MEMORY
+#define __PAGE_OFFSET page_offset_base
+#else
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#endif /* CONFIG_RANDOMIZE_MEMORY */
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e0ba66c..e02e3f8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -510,14 +510,15 @@ do { \
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
({ \
- int old__; \
- asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (var) \
+ bool old__; \
+ asm volatile("btr %2,"__percpu_arg(1)"\n\t" \
+ CC_SET(c) \
+ : CC_OUT(c) (old__), "+m" (var) \
: "dIr" (bit)); \
old__; \
})
-static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
const unsigned long __percpu *addr)
{
unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
@@ -529,14 +530,14 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
#endif
}
-static inline int x86_this_cpu_variable_test_bit(int nr,
+static inline bool x86_this_cpu_variable_test_bit(int nr,
const unsigned long __percpu *addr)
{
- int oldbit;
+ bool oldbit;
asm volatile("bt "__percpu_arg(2)",%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
return oldbit;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1a27396..437feb4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -480,7 +480,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
static inline int pte_none(pte_t pte)
{
- return !pte.pte;
+ return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
}
#define __HAVE_ARCH_PTE_SAME
@@ -552,7 +552,8 @@ static inline int pmd_none(pmd_t pmd)
{
/* Only check low word on 32-bit platforms, since it might be
out of sync with upper half. */
- return (unsigned long)native_pmd_val(pmd) == 0;
+ unsigned long val = native_pmd_val(pmd);
+ return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0;
}
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
@@ -616,7 +617,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
- return native_pud_val(pud) == 0;
+ return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}
static inline int pud_present(pud_t pud)
@@ -694,6 +695,12 @@ static inline int pgd_bad(pgd_t pgd)
static inline int pgd_none(pgd_t pgd)
{
+ /*
+ * There is no need to do a workaround for the KNL stray
+ * A/D bit erratum here. PGDs only point to page tables
+ * except on 32-bit non-PAE which is not supported on
+ * KNL.
+ */
return !native_pgd_val(pgd);
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -729,6 +736,23 @@ extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
+#ifdef CONFIG_X86_64
+/* Realmode trampoline initialization. */
+extern pgd_t trampoline_pgd_entry;
+static inline void __meminit init_trampoline_default(void)
+{
+ /* Default trampoline pgd value */
+ trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+}
+# ifdef CONFIG_RANDOMIZE_MEMORY
+void __meminit init_trampoline(void);
+# else
+# define init_trampoline init_trampoline_default
+# endif
+#else
+static inline void init_trampoline(void) { }
+#endif
+
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2ee7811..7e8ec7a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -140,18 +140,32 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
-/* Encode and de-code a swap entry */
+/*
+ * Encode and de-code a swap entry
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
+ * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+ * there. We also need to avoid using A and D because of an
+ * erratum where they can be incorrectly set by hardware on
+ * non-present PTEs.
+ */
+#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
#define SWP_TYPE_BITS 5
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+/* Place the offset above the type: */
+#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
& ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (_PAGE_BIT_PRESENT + 1)) \
- | ((offset) << SWP_OFFSET_SHIFT) })
+ ((type) << (SWP_TYPE_FIRST_BIT)) \
+ | ((offset) << SWP_OFFSET_FIRST_BIT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index e6844df..6fdef9e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -5,6 +5,7 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
+#include <asm/kaslr.h>
/*
* These are used to make use of C type-checking..
@@ -53,10 +54,16 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc90000000000, UL)
-#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffea0000000000, UL)
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_SIZE_TB _AC(32, UL)
+#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
+#define VMEMMAP_START _AC(0xffffea0000000000, UL)
+#ifdef CONFIG_RANDOMIZE_MEMORY
+#define VMALLOC_START vmalloc_base
+#else
+#define VMALLOC_START __VMALLOC_BASE
+#endif /* CONFIG_RANDOMIZE_MEMORY */
+#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7b5efe2..f1218f5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -70,6 +70,12 @@
_PAGE_PKEY_BIT2 | \
_PAGE_PKEY_BIT3)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#else
+#define _PAGE_KNL_ERRATUM_MASK 0
+#endif
+
#ifdef CONFIG_KMEMCHECK
#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
#else
@@ -475,8 +481,6 @@ extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
-void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
- unsigned numpages);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index d397deb..17f2186 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -81,7 +81,7 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
+ GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 965c5d2..63def95 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -372,6 +372,10 @@ extern unsigned int fpu_user_xstate_size;
struct perf_event;
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -420,6 +424,11 @@ struct thread_struct {
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+ mm_segment_t addr_limit;
+
+ unsigned int sig_on_uaccess_err:1;
+ unsigned int uaccess_err:1; /* uaccess failed */
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
@@ -491,11 +500,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
@@ -717,6 +721,7 @@ static inline void spin_lock_prefetch(const void *x)
.sp0 = TOP_OF_INIT_STACK, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
+ .addr_limit = KERNEL_DS, \
}
extern unsigned long thread_saved_pc(struct task_struct *tsk);
@@ -766,8 +771,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX TASK_SIZE_MAX
-#define INIT_THREAD { \
- .sp0 = TOP_OF_INIT_STACK \
+#define INIT_THREAD { \
+ .sp0 = TOP_OF_INIT_STACK, \
+ .addr_limit = KERNEL_DS, \
}
/*
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 8f7866a..661dd30 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,11 +1,13 @@
#ifndef _ASM_X86_RMWcc
#define _ASM_X86_RMWcc
-#ifdef CC_HAVE_ASM_GOTO
+#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
+
+/* Use asm goto */
#define __GEN_RMWcc(fullop, var, cc, ...) \
do { \
- asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \
+ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
: : "m" (var), ## __VA_ARGS__ \
: "memory" : cc_label); \
return 0; \
@@ -19,15 +21,17 @@ cc_label: \
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
-#else /* !CC_HAVE_ASM_GOTO */
+#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
+/* Use flags output or a set instruction */
#define __GEN_RMWcc(fullop, var, cc, ...) \
do { \
- char c; \
- asm volatile (fullop "; set" cc " %1" \
- : "+m" (var), "=qm" (c) \
+ bool c; \
+ asm volatile (fullop ";" CC_SET(cc) \
+ : "+m" (var), CC_OUT(cc) (c) \
: __VA_ARGS__ : "memory"); \
- return c != 0; \
+ return c; \
} while (0)
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
@@ -36,6 +40,6 @@ do { \
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
-#endif /* CC_HAVE_ASM_GOTO */
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
#endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 453744c..8dbc762 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -77,7 +77,7 @@ static inline void __down_read(struct rw_semaphore *sem)
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline bool __down_read_trylock(struct rw_semaphore *sem)
{
long result, tmp;
asm volatile("# beginning __down_read_trylock\n\t"
@@ -93,7 +93,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
: "+m" (sem->count), "=&a" (result), "=&r" (tmp)
: "i" (RWSEM_ACTIVE_READ_BIAS)
: "memory", "cc");
- return result >= 0 ? 1 : 0;
+ return result >= 0;
}
/*
@@ -134,9 +134,10 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline bool __down_write_trylock(struct rw_semaphore *sem)
{
- long result, tmp;
+ bool result;
+ long tmp0, tmp1;
asm volatile("# beginning __down_write_trylock\n\t"
" mov %0,%1\n\t"
"1:\n\t"
@@ -144,14 +145,14 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
/* was the active mask 0 before? */
" jnz 2f\n\t"
" mov %1,%2\n\t"
- " add %3,%2\n\t"
+ " add %4,%2\n\t"
LOCK_PREFIX " cmpxchg %2,%0\n\t"
" jnz 1b\n\t"
"2:\n\t"
- " sete %b1\n\t"
- " movzbl %b1, %k1\n\t"
+ CC_SET(e)
"# ending __down_write_trylock\n\t"
- : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
+ : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1),
+ CC_OUT(e) (result)
: "er" (RWSEM_ACTIVE_WRITE_BIAS)
: "memory", "cc");
return result;
@@ -213,23 +214,5 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
: "memory", "cc");
}
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
-{
- asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
- : "+m" (sem->count)
- : "er" (delta));
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
-{
- return delta + xadd(&sem->count, delta);
-}
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 2138c9a..dd1e7d6 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -81,9 +81,9 @@ static inline int __const_sigismember(sigset_t *set, int _sig)
static inline int __gen_sigismember(sigset_t *set, int _sig)
{
- int ret;
- asm("btl %2,%1\n\tsbbl %0,%0"
- : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
+ unsigned char ret;
+ asm("btl %2,%1\n\tsetc %0"
+ : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
return ret;
}
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 66b0573..0576b61 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -172,12 +172,6 @@ extern int safe_smp_processor_id(void);
#elif defined(CONFIG_X86_64_SMP)
#define raw_smp_processor_id() (this_cpu_read(cpu_number))
-#define stack_smp_processor_id() \
-({ \
- struct thread_info *ti; \
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
- ti->cpu; \
-})
#define safe_smp_processor_id() smp_processor_id()
#endif
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index f28a24b..cbf8847 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -79,10 +79,10 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr)
*/
static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ unsigned char oldbit;
- asm volatile("lock; bts %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
+ asm volatile("lock; bts %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
}
@@ -97,10 +97,10 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr)
*/
static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ unsigned char oldbit;
- asm volatile("lock; btr %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
+ asm volatile("lock; btr %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
}
@@ -115,10 +115,10 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
*/
static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ unsigned char oldbit;
- asm volatile("lock; btc %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
+ asm volatile("lock; btc %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 30c133a..89bff04 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,9 +57,6 @@ struct thread_info {
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
- mm_segment_t addr_limit;
- unsigned int sig_on_uaccess_error:1;
- unsigned int uaccess_err:1; /* uaccess failed */
};
#define INIT_THREAD_INFO(tsk) \
@@ -67,7 +64,6 @@ struct thread_info {
.task = &tsk, \
.flags = 0, \
.cpu = 0, \
- .addr_limit = KERNEL_DS, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -186,11 +182,6 @@ static inline unsigned long current_stack_pointer(void)
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
#endif
-/* Load thread_info address into "reg" */
-#define GET_THREAD_INFO(reg) \
- _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
- _ASM_SUB $(THREAD_SIZE),reg ;
-
/*
* ASM operand which evaluates to a 'thread_info' address of
* the current task, if it is known that "reg" is exactly "off"
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 7f991bd5..e346572 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,6 +129,14 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern unsigned int __max_logical_packages;
#define topology_max_packages() (__max_logical_packages)
+
+extern int __max_smt_threads;
+
+static inline int topology_max_smt_threads(void)
+{
+ return __max_smt_threads;
+}
+
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
extern int topology_phys_to_logical_pkg(unsigned int pkg);
#else
@@ -136,6 +144,7 @@ extern int topology_phys_to_logical_pkg(unsigned int pkg);
static inline int
topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
+static inline int topology_max_smt_threads(void) { return 1; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2982387..c03bfb6 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -29,12 +29,12 @@
#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
#define get_ds() (KERNEL_DS)
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
+#define get_fs() (current->thread.addr_limit)
+#define set_fs(x) (current->thread.addr_limit = (x))
#define segment_eq(a, b) ((a).seg == (b).seg)
-#define user_addr_max() (current_thread_info()->addr_limit.seg)
+#define user_addr_max() (current->thread.addr_limit.seg)
#define __addr_ok(addr) \
((unsigned long __force)(addr) < user_addr_max())
@@ -342,7 +342,26 @@ do { \
} while (0)
#ifdef CONFIG_X86_32
-#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
+#define __get_user_asm_u64(x, ptr, retval, errret) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ asm volatile(ASM_STAC "\n" \
+ "1: movl %2,%%eax\n" \
+ "2: movl %3,%%edx\n" \
+ "3: " ASM_CLAC "\n" \
+ ".section .fixup,\"ax\"\n" \
+ "4: mov %4,%0\n" \
+ " xorl %%eax,%%eax\n" \
+ " xorl %%edx,%%edx\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (retval), "=A"(x) \
+ : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \
+ "i" (errret), "0" (retval)); \
+})
+
#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
#else
#define __get_user_asm_u64(x, ptr, retval, errret) \
@@ -429,7 +448,7 @@ do { \
#define __get_user_nocheck(x, ptr, size) \
({ \
int __gu_err; \
- unsigned long __gu_val; \
+ __inttype(*(ptr)) __gu_val; \
__uaccess_begin(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
__uaccess_end(); \
@@ -468,13 +487,13 @@ struct __large_struct { unsigned long buf[100]; };
* uaccess_try and catch
*/
#define uaccess_try do { \
- current_thread_info()->uaccess_err = 0; \
+ current->thread.uaccess_err = 0; \
__uaccess_begin(); \
barrier();
#define uaccess_catch(err) \
__uaccess_end(); \
- (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
+ (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
} while (0)
/**
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2b19caa..32712a9 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -26,6 +26,8 @@
# define __ARCH_WANT_COMPAT_SYS_GETDENTS64
# define __ARCH_WANT_COMPAT_SYS_PREADV64
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
+# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
+# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
# endif
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 4dcdf74..66c15a0 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -168,14 +168,15 @@ struct x86_legacy_devices {
* struct x86_legacy_features - legacy x86 features
*
* @rtc: this device has a CMOS real-time clock present
- * @ebda_search: it's safe to search for the EBDA signature in the hardware's
- * low RAM
+ * @reserve_bios_regions: boot code will search for the EBDA address and the
+ * start of the 640k - 1M BIOS region. If false, the platform must
+ * ensure that its memory map correctly reserves sub-1MB regions as needed.
* @devices: legacy x86 devices, refer to struct x86_legacy_devices
* documentation for further details.
*/
struct x86_legacy_features {
int rtc;
- int ebda_search;
+ int reserve_bios_regions;
struct x86_legacy_devices devices;
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 60078a6..f943d2f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2045,7 +2045,7 @@ int generic_processor_info(int apicid, int version)
int thiscpu = max + disabled_cpus - 1;
pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i almost"
+ "APIC: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
@@ -2057,7 +2057,7 @@ int generic_processor_info(int apicid, int version)
int thiscpu = max + disabled_cpus;
pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+ "APIC: NR_CPUS/possible_cpus limit of %i reached."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
@@ -2085,7 +2085,7 @@ int generic_processor_info(int apicid, int version)
if (topology_update_package_map(apicid, cpu) < 0) {
int thiscpu = max + disabled_cpus;
- pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+ pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
thiscpu, apicid);
disabled_cpus++;
return -ENOSPC;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 76f89e2..0487477 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -181,7 +181,6 @@ static struct apic apic_flat = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
@@ -278,7 +277,6 @@ static struct apic apic_physflat = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 13d19ed..2cebf59 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -141,7 +141,6 @@ struct apic apic_noop = {
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index ab5c2c6..714d4fd 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -269,7 +269,6 @@ static const struct apic apic_numachip1 __refconst = {
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
- .apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
@@ -321,7 +320,6 @@ static const struct apic apic_numachip2 __refconst = {
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
- .apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cf9bd89..06dbaa4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -171,7 +171,6 @@ static struct apic apic_bigsmp = {
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 446702e..e587295 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2567,29 +2567,25 @@ static struct resource * __init ioapic_setup_resources(void)
unsigned long n;
struct resource *res;
char *mem;
- int i, num = 0;
+ int i;
- for_each_ioapic(i)
- num++;
- if (num == 0)
+ if (nr_ioapics == 0)
return NULL;
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
- n *= num;
+ n *= nr_ioapics;
mem = alloc_bootmem(n);
res = (void *)mem;
- mem += sizeof(struct resource) * num;
+ mem += sizeof(struct resource) * nr_ioapics;
- num = 0;
for_each_ioapic(i) {
- res[num].name = mem;
- res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
- ioapics[i].iomem_res = &res[num];
- num++;
+ ioapics[i].iomem_res = &res[i];
}
ioapic_resources = res;
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index f316e34..93edfa0 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -101,7 +101,6 @@ static struct apic apic_default = {
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index aca8b75..24170d0 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -270,7 +270,6 @@ static struct apic apic_x2apic_cluster = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a1242e2..4f13f54f 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2900315..5a58c91 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -582,7 +582,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 674134e..2bd5c6f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -31,7 +31,9 @@ void common(void) {
BLANK();
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
- OFFSET(TI_addr_limit, thread_info, addr_limit);
+
+ BLANK();
+ OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0fe6953f..d22a7b9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1452,7 +1452,7 @@ void cpu_init(void)
struct task_struct *me;
struct tss_struct *t;
unsigned long v;
- int cpu = stack_smp_processor_id();
+ int cpu = raw_smp_processor_id();
int i;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 34c89a3..83f1a98 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
return;
mce_setup(&m);
- m.bank = 1;
+ m.bank = -1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 92e5e37..58af630 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr)
}
if (rdmsrl_safe(msr, &v)) {
- WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
+ WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
/*
* Return zero in case the access faulted. This should
* not happen normally but can happen if the CPU does
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 10b0661..7b7f3be 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = {
EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index f6f50c4..cfa97ff 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup);
*/
#define SANITY_CHECK_LOOPS 8
+#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
int i;
@@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
return;
}
}
-#endif
}
+#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ef8017c..92e8f0a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -87,7 +87,7 @@ static inline int valid_stack_ptr(struct task_struct *task,
else
return 0;
}
- return p > t && p < t + THREAD_SIZE - size;
+ return p >= t && p < t + THREAD_SIZE - size;
}
unsigned long
@@ -98,6 +98,14 @@ print_context_stack(struct task_struct *task,
{
struct stack_frame *frame = (struct stack_frame *)bp;
+ /*
+ * If we overflowed the stack into a guard page, jump back to the
+ * bottom of the usable stack.
+ */
+ if ((unsigned long)task_stack_page(task) - (unsigned long)stack <
+ PAGE_SIZE)
+ stack = (unsigned long *)task_stack_page(task);
+
while (valid_stack_ptr(task, stack, sizeof(*stack), end)) {
unsigned long addr;
@@ -197,6 +205,11 @@ void show_stack(struct task_struct *task, unsigned long *sp)
show_stack_log_lvl(task, NULL, sp, bp, "");
}
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");
+}
+
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
@@ -226,6 +239,8 @@ unsigned long oops_begin(void)
EXPORT_SYMBOL_GPL(oops_begin);
NOKPROBE_SYMBOL(oops_begin);
+void __noreturn rewind_stack_do_exit(int signr);
+
void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
@@ -247,7 +262,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
- do_exit(signr);
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ */
+ rewind_stack_do_exit(signr);
}
NOKPROBE_SYMBOL(oops_end);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index fef917e..948d77d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -96,7 +96,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
int i;
if (sp == NULL) {
- if (task)
+ if (regs)
+ sp = (unsigned long *)regs->sp;
+ else if (task)
sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d558a8a..6dede08 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -264,7 +264,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
* back trace for this cpu:
*/
if (sp == NULL) {
- if (task)
+ if (regs)
+ sp = (unsigned long *)regs->sp;
+ else if (task)
sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
@@ -272,6 +274,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
+ unsigned long word;
+
if (stack >= irq_stack && stack <= irq_stack_end) {
if (stack == irq_stack_end) {
stack = (unsigned long *) (irq_stack_end[-1]);
@@ -281,12 +285,18 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (kstack_end(stack))
break;
}
+
+ if (probe_kernel_address(stack, word))
+ break;
+
if ((i % STACKSLOTS_PER_LINE) == 0) {
if (i != 0)
pr_cont("\n");
- printk("%s %016lx", log_lvl, *stack++);
+ printk("%s %016lx", log_lvl, word);
} else
- pr_cont(" %016lx", *stack++);
+ pr_cont(" %016lx", word);
+
+ stack++;
touch_nmi_watchdog();
}
preempt_enable();
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bca14c8..57b7137 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -11,7 +11,11 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -21,6 +25,9 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -590,6 +604,61 @@ static void __init force_disable_hpet(int num, int slot, int func)
#endif
}
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ dev_err("Cannot power up Apple AirPort card\n");
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ dev_err("Cannot iomap Apple AirPort card\n");
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
@@ -603,12 +672,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -638,9 +701,13 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -649,7 +716,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -658,6 +725,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -685,25 +753,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
+
+ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
if (!(type & 0x80))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
index afe65df..4312f8a 100644
--- a/arch/x86/kernel/ebda.c
+++ b/arch/x86/kernel/ebda.c
@@ -6,66 +6,92 @@
#include <asm/bios_ebda.h>
/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
* The BIOS places the EBDA/XBDA at the top of conventional
* memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
*
- * This functions is deliberately very conservative. Losing
- * memory in the bottom megabyte is rarely a problem, as long
- * as we have enough memory to install the trampoline. Using
- * memory that is in use by the BIOS or by some DMA device
- * the BIOS didn't shut down *is* a big problem.
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
*/
-#define BIOS_LOWMEM_KILOBYTES 0x413
-#define LOWMEM_CAP 0x9f000U /* Absolute maximum */
-#define INSANE_CUTOFF 0x20000U /* Less than this = insane */
+#define BIOS_RAM_SIZE_KB_PTR 0x413
-void __init reserve_ebda_region(void)
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
{
- unsigned int lowmem, ebda_addr;
+ unsigned int bios_start, ebda_start;
/*
- * To determine the position of the EBDA and the
- * end of conventional memory, we need to look at
- * the BIOS data area. In a paravirtual environment
- * that area is absent. We'll just have to assume
- * that the paravirt case can handle memory setup
- * correctly, without our help.
+ * NOTE: In a paravirtual environment the BIOS reserved
+ * area is absent. We'll just have to assume that the
+ * paravirt case can handle memory setup correctly,
+ * without our help.
*/
- if (!x86_platform.legacy.ebda_search)
+ if (!x86_platform.legacy.reserve_bios_regions)
return;
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
/*
- * Note: some old Dells seem to need 4k EBDA without
- * reporting so, so just consider the memory above 0x9f000
- * to be off limits (bugzilla 2990).
+ * BIOS RAM size is encoded in kilobytes, convert it
+ * to bytes to get a first guess at where the BIOS
+ * firmware area starts:
*/
+ bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+ bios_start <<= 10;
- /* If the EBDA address is below 128K, assume it is bogus */
- if (ebda_addr < INSANE_CUTOFF)
- ebda_addr = LOWMEM_CAP;
+ /*
+ * If bios_start is less than 128K, assume it is bogus
+ * and bump it up to 640K. Similarly, if bios_start is above 640K,
+ * don't trust it.
+ */
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
- /* If lowmem is less than 128K, assume it is bogus */
- if (lowmem < INSANE_CUTOFF)
- lowmem = LOWMEM_CAP;
+ /* Get the start address of the EBDA page: */
+ ebda_start = get_bios_ebda();
- /* Use the lower of the lowmem and EBDA markers as the cutoff */
- lowmem = min(lowmem, ebda_addr);
- lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */
+ /*
+ * If the EBDA start address is sane and is below the BIOS region,
+ * then also reserve everything from the EBDA start address up to
+ * the BIOS region.
+ */
+ if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
- /* reserve all memory between lowmem and the 1MB mark */
- memblock_reserve(lowmem, 0x100000 - lowmem);
+ /* Reserve all memory between bios_start and the 1MB mark: */
+ memblock_reserve(bios_start, 0x100000 - bios_start);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d784bb5..2dda0bc 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -26,7 +26,7 @@ static void __init i386_default_early_setup(void)
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
- reserve_ebda_region();
+ reserve_bios_regions();
}
asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b72fb0b..99d48e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
copy_bootdata(__va(real_mode_data));
x86_early_init_platform_quirks();
- reserve_ebda_region();
+ reserve_bios_regions();
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 5df831e..9f8efc9 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,7 +38,7 @@
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
+L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
L4_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
@@ -299,6 +299,7 @@ ENTRY(secondary_startup_64)
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
+ENDPROC(secondary_startup_64)
#include "verify_cpu.S"
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 64341aa..d40ee8a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(___preempt_schedule);
EXPORT_SYMBOL(___preempt_schedule_notrace);
#endif
+
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eea2a6f..1ef5e48 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
if (!has_steal_clock)
return;
- memset(st, 0, sizeof(*st));
-
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_info("kvm-stealtime: cpu %d, msr %llx\n",
cpu, (unsigned long long) slow_virt_to_phys(st));
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index b2f8a33..24a5030 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -7,12 +7,12 @@
void __init x86_early_init_platform_quirks(void)
{
x86_platform.legacy.rtc = 1;
- x86_platform.legacy.ebda_search = 0;
+ x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 1;
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_PC:
- x86_platform.legacy.ebda_search = 1;
+ x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4e7b39..a261658 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/mmu_context.h>
+#include <asm/kaslr.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -942,6 +943,8 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
+ kernel_randomize_memory();
+
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index dc3c0b1..b44564b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -1,11 +1,104 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
+/*
+ * The compat_siginfo_t structure and handing code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the compat_siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+static inline void signal_compat_build_tests(void)
+{
+ int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
+
+ /*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+ BUILD_BUG_ON(NSIGILL != 8);
+ BUILD_BUG_ON(NSIGFPE != 8);
+ BUILD_BUG_ON(NSIGSEGV != 4);
+ BUILD_BUG_ON(NSIGBUS != 5);
+ BUILD_BUG_ON(NSIGTRAP != 4);
+ BUILD_BUG_ON(NSIGCHLD != 6);
+ BUILD_BUG_ON(NSIGSYS != 1);
+
+ /* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+ /*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * move and are always at the beginning:
+ */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
+#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
+
+ /*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
+#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
+
+ CHECK_CSI_OFFSET(_kill);
+ CHECK_CSI_SIZE (_kill, 2*sizeof(int));
+ CHECK_SI_SIZE (_kill, 2*sizeof(int));
+
+ CHECK_CSI_OFFSET(_timer);
+ CHECK_CSI_SIZE (_timer, 5*sizeof(int));
+ CHECK_SI_SIZE (_timer, 6*sizeof(int));
+
+ CHECK_CSI_OFFSET(_rt);
+ CHECK_CSI_SIZE (_rt, 3*sizeof(int));
+ CHECK_SI_SIZE (_rt, 4*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigchld);
+ CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
+ CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigchld_x32);
+ CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
+ /* no _sigchld_x32 in the generic siginfo_t */
+
+ CHECK_CSI_OFFSET(_sigfault);
+ CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
+ CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigpoll);
+ CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
+ CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigsys);
+ CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
+ CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+
+ /* any new si_fields should be added here */
+}
+
int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
{
int err = 0;
bool ia32 = test_thread_flag(TIF_IA32);
+ signal_compat_build_tests();
+
if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
return -EFAULT;
@@ -32,6 +125,21 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
&to->_sifields._pad[0]);
switch (from->si_code >> 16) {
case __SI_FAULT >> 16:
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR ||
+ from->si_code == BUS_MCEERR_AO))
+ put_user_ex(from->si_addr_lsb, &to->si_addr_lsb);
+
+ if (from->si_signo == SIGSEGV) {
+ if (from->si_code == SEGV_BNDERR) {
+ compat_uptr_t lower = (unsigned long)&to->si_lower;
+ compat_uptr_t upper = (unsigned long)&to->si_upper;
+ put_user_ex(lower, &to->si_lower);
+ put_user_ex(upper, &to->si_upper);
+ }
+ if (from->si_code == SEGV_PKUERR)
+ put_user_ex(from->si_pkey, &to->si_pkey);
+ }
break;
case __SI_SYS >> 16:
put_user_ex(from->si_syscall, &to->si_syscall);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fafe8b9..d0a5193 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -105,6 +105,9 @@ static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
+/* Maximum number of SMT threads on any online core */
+int __max_smt_threads __read_mostly;
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -493,7 +496,7 @@ void set_cpu_sibling_map(int cpu)
bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct cpuinfo_x86 *o;
- int i;
+ int i, threads;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
@@ -550,6 +553,10 @@ void set_cpu_sibling_map(int cpu)
if (match_die(c, o) && !topology_same_node(c, o))
primarily_use_numa_for_topology();
}
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
}
/* maps the cpu to the sched domain representing multi-core */
@@ -1285,7 +1292,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
- current_thread_info()->cpu = 0; /* needed? */
for_each_possible_cpu(i) {
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
@@ -1441,6 +1447,21 @@ __init void prefill_possible_map(void)
#ifdef CONFIG_HOTPLUG_CPU
+/* Recompute SMT state for all CPUs on offline */
+static void recompute_smt_state(void)
+{
+ int max_threads, cpu;
+
+ max_threads = 0;
+ for_each_online_cpu (cpu) {
+ int threads = cpumask_weight(topology_sibling_cpumask(cpu));
+
+ if (threads > max_threads)
+ max_threads = threads;
+ }
+ __max_smt_threads = max_threads;
+}
+
static void remove_siblinginfo(int cpu)
{
int sibling;
@@ -1465,6 +1486,7 @@ static void remove_siblinginfo(int cpu)
c->phys_proc_id = 0;
c->cpu_core_id = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
}
static void remove_cpu_from_maps(int cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 3dce1ca..01f30e5 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
{
- __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
- :"=r" (nr)
- :"m" (*bitmap), "r" (nr));
- return nr;
+ return test_bit(nr, bitmap->__map);
}
#define val_byte(val, n) (((__u8 *)&val)[n])
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index cd05942..f1aebfb 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
/*
* Export string functions. We normally rely on gcc builtin for most of these,
* but gcc sometimes decides not to inline them.
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 3847e73..25da5bc8 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1233,8 +1233,6 @@ static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
static void probe_pci_console(void)
{
u8 cap, common_cap = 0, device_cap = 0;
- /* Offset within BAR0 */
- u32 device_offset;
u32 device_len;
/* Avoid recursive printk into here. */
@@ -1258,24 +1256,16 @@ static void probe_pci_console(void)
u8 vndr = read_pci_config_byte(0, 1, 0, cap);
if (vndr == PCI_CAP_ID_VNDR) {
u8 type, bar;
- u32 offset, length;
type = read_pci_config_byte(0, 1, 0,
cap + offsetof(struct virtio_pci_cap, cfg_type));
bar = read_pci_config_byte(0, 1, 0,
cap + offsetof(struct virtio_pci_cap, bar));
- offset = read_pci_config(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, offset));
- length = read_pci_config(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, length));
switch (type) {
case VIRTIO_PCI_CAP_DEVICE_CFG:
- if (bar == 0) {
+ if (bar == 0)
device_cap = cap;
- device_offset = offset;
- device_len = length;
- }
break;
case VIRTIO_PCI_CAP_PCI_CFG:
console_access_cap = cap;
@@ -1297,13 +1287,16 @@ static void probe_pci_console(void)
* emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
* it should ignore the access.
*/
+ device_len = read_pci_config(0, 1, 0,
+ device_cap + offsetof(struct virtio_pci_cap, length));
if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
+ sizeof(u32))) {
printk(KERN_ERR "lguest: console missing emerg_wr field\n");
return;
}
- console_cfg_offset = device_offset;
+ console_cfg_offset = read_pci_config(0, 1, 0,
+ device_cap + offsetof(struct virtio_pci_cap, offset));
printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a5767..34a7413 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,8 +24,9 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 2b0ef26..bf603eb 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -17,11 +17,11 @@
/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
- GET_THREAD_INFO(%rax)
+ mov PER_CPU_VAR(current_task), %rax
movq %rdi,%rcx
addq %rdx,%rcx
jc bad_to_user
- cmpq TI_addr_limit(%rax),%rcx
+ cmpq TASK_addr_limit(%rax),%rcx
ja bad_to_user
ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
"jmp copy_user_generic_string", \
@@ -32,11 +32,11 @@ ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
- GET_THREAD_INFO(%rax)
+ mov PER_CPU_VAR(current_task), %rax
movq %rsi,%rcx
addq %rdx,%rcx
jc bad_from_user
- cmpq TI_addr_limit(%rax),%rcx
+ cmpq TASK_addr_limit(%rax),%rcx
ja bad_from_user
ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
"jmp copy_user_generic_string", \
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 28a6654..b6fcb9a 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
*/
#include <asm/checksum.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
#include <asm/smap.h>
/**
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 46668cd..0ef5128 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -35,8 +35,8 @@
.text
ENTRY(__get_user_1)
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
1: movzbl (%_ASM_AX),%edx
@@ -48,8 +48,8 @@ ENDPROC(__get_user_1)
ENTRY(__get_user_2)
add $1,%_ASM_AX
jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
2: movzwl -1(%_ASM_AX),%edx
@@ -61,8 +61,8 @@ ENDPROC(__get_user_2)
ENTRY(__get_user_4)
add $3,%_ASM_AX
jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
3: movl -3(%_ASM_AX),%edx
@@ -75,8 +75,8 @@ ENTRY(__get_user_8)
#ifdef CONFIG_X86_64
add $7,%_ASM_AX
jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
4: movq -7(%_ASM_AX),%rdx
@@ -86,8 +86,8 @@ ENTRY(__get_user_8)
#else
add $7,%_ASM_AX
jc bad_get_user_8
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user_8
ASM_STAC
4: movl -7(%_ASM_AX),%edx
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644
index 0000000..02de3d7
--- /dev/null
+++ b/arch/x86/lib/hweight.S
@@ -0,0 +1,77 @@
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ * %rdi: w
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+ movl %edi, %eax # w
+#endif
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movl %eax, %edx # w -> t
+ shrl %edx # t >>= 1
+ andl $0x55555555, %edx # t &= 0x55555555
+ subl %edx, %eax # w -= t
+
+ movl %eax, %edx # w -> t
+ shrl $2, %eax # w_tmp >>= 2
+ andl $0x33333333, %edx # t &= 0x33333333
+ andl $0x33333333, %eax # w_tmp &= 0x33333333
+ addl %edx, %eax # w = w_tmp + t
+
+ movl %eax, %edx # w -> t
+ shrl $4, %edx # t >>= 4
+ addl %edx, %eax # w_tmp += t
+ andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
+ imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
+ shrl $24, %eax # w = w_tmp >> 24
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ ret
+ENDPROC(__sw_hweight32)
+
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+ pushq %rdx
+
+ movq %rdi, %rdx # w -> t
+ movabsq $0x5555555555555555, %rax
+ shrq %rdx # t >>= 1
+ andq %rdx, %rax # t &= 0x5555555555555555
+ movabsq $0x3333333333333333, %rdx
+ subq %rax, %rdi # w -= t
+
+ movq %rdi, %rax # w -> t
+ shrq $2, %rdi # w_tmp >>= 2
+ andq %rdx, %rax # t &= 0x3333333333333333
+ andq %rdi, %rdx # w_tmp &= 0x3333333333333333
+ addq %rdx, %rax # w = w_tmp + t
+
+ movq %rax, %rdx # w -> t
+ shrq $4, %rdx # t >>= 4
+ addq %rdx, %rax # w_tmp += t
+ movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+ andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
+ movabsq $0x0101010101010101, %rdx
+ imulq %rdx, %rax # w_tmp *= 0x0101010101010101
+ shrq $56, %rax # w = w_tmp >> 56
+
+ popq %rdx
+ ret
+#else /* CONFIG_X86_32 */
+ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+ pushl %ecx
+
+ call __sw_hweight32
+ movl %eax, %ecx # stash away result
+ movl %edx, %eax # second part of input
+ call __sw_hweight32
+ addl %ecx, %eax # result
+
+ popl %ecx
+ ret
+#endif
+ENDPROC(__sw_hweight64)
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
new file mode 100644
index 0000000..f7dfeda
--- /dev/null
+++ b/arch/x86/lib/kaslr.c
@@ -0,0 +1,90 @@
+/*
+ * Entropy functions used on early boot for KASLR base and memory
+ * randomization. The base randomization is done in the compressed
+ * kernel and memory randomization is done early when the regular
+ * kernel starts. This file is included in the compressed kernel and
+ * normally linked in the regular.
+ */
+#include <asm/kaslr.h>
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+
+/*
+ * When built for the regular kernel, several functions need to be stubbed out
+ * or changed to their regular kernel equivalent.
+ */
+#ifndef KASLR_COMPRESSED_BOOT
+#include <asm/cpufeature.h>
+#include <asm/setup.h>
+
+#define debug_putstr(v) early_printk(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#define get_boot_seed() kaslr_offset()
+#endif
+
+#define I8254_PORT_CONTROL 0x43
+#define I8254_PORT_COUNTER0 0x40
+#define I8254_CMD_READBACK 0xC0
+#define I8254_SELECT_COUNTER0 0x02
+#define I8254_STATUS_NOTREADY 0x40
+static inline u16 i8254(void)
+{
+ u16 status, timer;
+
+ do {
+ outb(I8254_PORT_CONTROL,
+ I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+ status = inb(I8254_PORT_COUNTER0);
+ timer = inb(I8254_PORT_COUNTER0);
+ timer |= inb(I8254_PORT_COUNTER0) << 8;
+ } while (status & I8254_STATUS_NOTREADY);
+
+ return timer;
+}
+
+unsigned long kaslr_get_random_long(const char *purpose)
+{
+#ifdef CONFIG_X86_64
+ const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+ const unsigned long mix_const = 0x3f39e593UL;
+#endif
+ unsigned long raw, random = get_boot_seed();
+ bool use_i8254 = true;
+
+ debug_putstr(purpose);
+ debug_putstr(" KASLR using");
+
+ if (has_cpuflag(X86_FEATURE_RDRAND)) {
+ debug_putstr(" RDRAND");
+ if (rdrand_long(&raw)) {
+ random ^= raw;
+ use_i8254 = false;
+ }
+ }
+
+ if (has_cpuflag(X86_FEATURE_TSC)) {
+ debug_putstr(" RDTSC");
+ raw = rdtsc();
+
+ random ^= raw;
+ use_i8254 = false;
+ }
+
+ if (use_i8254) {
+ debug_putstr(" i8254");
+ random ^= i8254();
+ }
+
+ /* Circular multiply for better bit diffusion */
+ asm("mul %3"
+ : "=a" (random), "=d" (raw)
+ : "a" (random), "rm" (mix_const));
+ random += raw;
+
+ debug_putstr("...\n");
+
+ return random;
+}
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index e0817a1..c891ece 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -29,14 +29,14 @@
* as they get called from within inline assembly.
*/
-#define ENTER GET_THREAD_INFO(%_ASM_BX)
+#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX
#define EXIT ASM_CLAC ; \
ret
.text
ENTRY(__put_user_1)
ENTER
- cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
+ cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX
jae bad_put_user
ASM_STAC
1: movb %al,(%_ASM_CX)
@@ -46,7 +46,7 @@ ENDPROC(__put_user_1)
ENTRY(__put_user_2)
ENTER
- mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
sub $1,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
@@ -58,7 +58,7 @@ ENDPROC(__put_user_2)
ENTRY(__put_user_4)
ENTER
- mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
sub $3,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
@@ -70,7 +70,7 @@ ENDPROC(__put_user_4)
ENTRY(__put_user_8)
ENTER
- mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
sub $7,%_ASM_BX
cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0a42327..9f760cd 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -6,7 +6,7 @@
* Copyright 2002 Andi Kleen <ak@suse.de>
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Zero Userspace
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 62c0043..96d2b84 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -37,4 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 99bfb19..9a17250 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -72,9 +72,9 @@ static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
- { PAGE_OFFSET, "Low Kernel Mapping" },
- { VMALLOC_START, "vmalloc() Area" },
- { VMEMMAP_START, "Vmemmap" },
+ { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/* VMEMMAP_START */, "Vmemmap" },
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
@@ -434,8 +434,16 @@ void ptdump_walk_pgd_level_checkwx(void)
static int __init pt_dump_init(void)
{
+ /*
+ * Various markers are not compile-time constants, so assign them
+ * here.
+ */
+#ifdef CONFIG_X86_64
+ address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#endif
#ifdef CONFIG_X86_32
- /* Not a compile-time constant on x86-32 */
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 4bb53b8..832b98f 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,6 +1,7 @@
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/traps.h>
+#include <asm/kdebug.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
@@ -37,7 +38,7 @@ bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* Special hack for uaccess_err */
- current_thread_info()->uaccess_err = 1;
+ current->thread.uaccess_err = 1;
regs->ip = ex_fixup_addr(fixup);
return true;
}
@@ -46,8 +47,9 @@ EXPORT_SYMBOL(ex_handler_ext);
bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
- WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n",
- (unsigned int)regs->cx);
+ if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
/* Pretend that the read succeeded and returned 0. */
regs->ip = ex_fixup_addr(fixup);
@@ -60,9 +62,10 @@ EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
- WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n",
- (unsigned int)regs->cx,
- (unsigned int)regs->dx, (unsigned int)regs->ax);
+ if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
/* Pretend that the write succeeded. */
regs->ip = ex_fixup_addr(fixup);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d1fa7c..d22161a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -439,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later
* case just flush:
*/
- pgd = pgd_offset(current->active_mm, address);
+ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -737,7 +737,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* In this case we need to make sure we're not recursively
* faulting through the emulate_vsyscall() logic.
*/
- if (current_thread_info()->sig_on_uaccess_error && signal) {
+ if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
tsk->thread.cr2 = address;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 372aad2..cc82830 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,6 +17,7 @@
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
+#include <asm/kaslr.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -590,6 +591,9 @@ void __init init_mem_mapping(void)
/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS);
+ /* Init the trampoline, possibly with KASLR memory offset */
+ init_trampoline();
+
/*
* If the allocation is in bottom-up direction, we setup direct mapping
* in bottom-up, otherwise we setup direct mapping in top-down.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bce2e5d..53cc225 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -328,22 +328,30 @@ void __init cleanup_highmap(void)
}
}
+/*
+ * Create PTE level page table mapping for physical addresses.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
pgprot_t prot)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ pte_t *pte;
int i;
- pte_t *pte = pte_page + pte_index(addr);
+ pte = pte_page + pte_index(paddr);
+ i = pte_index(paddr);
- for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
- next = (addr & PAGE_MASK) + PAGE_SIZE;
- if (addr >= end) {
+ for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
+ paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
- !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pte(pte, __pte(0));
continue;
}
@@ -354,54 +362,61 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
* pagetable pages as RO. So assume someone who pre-setup
* these mappings are more intelligent.
*/
- if (pte_val(*pte)) {
+ if (!pte_none(*pte)) {
if (!after_bootmem)
pages++;
continue;
}
if (0)
- printk(" pte=%p addr=%lx pte=%016lx\n",
- pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
+ pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
- set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
- last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
}
update_page_count(PG_LEVEL_4K, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create PMD level page table mapping for physical addresses. The virtual
+ * and physical address have to be aligned at this level.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
unsigned long page_size_mask, pgprot_t prot)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
- int i = pmd_index(address);
+ int i = pmd_index(paddr);
- for (; i < PTRS_PER_PMD; i++, address = next) {
- pmd_t *pmd = pmd_page + pmd_index(address);
+ for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
+ pmd_t *pmd = pmd_page + pmd_index(paddr);
pte_t *pte;
pgprot_t new_prot = prot;
- next = (address & PMD_MASK) + PMD_SIZE;
- if (address >= end) {
+ paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
- !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PMD_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PMD_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pmd(pmd, __pmd(0));
continue;
}
- if (pmd_val(*pmd)) {
+ if (!pmd_none(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
- last_map_addr = phys_pte_init(pte, address,
- end, prot);
+ paddr_last = phys_pte_init(pte, paddr,
+ paddr_end, prot);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -420,7 +435,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (page_size_mask & (1 << PG_LEVEL_2M)) {
if (!after_bootmem)
pages++;
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
@@ -430,51 +445,65 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
+ pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
pte = alloc_low_page();
- last_map_addr = phys_pte_init(pte, address, end, new_prot);
+ paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
spin_lock(&init_mm.page_table_lock);
pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create PUD level page table mapping for physical addresses. The virtual
+ * and physical address do not have to be aligned at this level. KASLR can
+ * randomize virtual addresses up to this level.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
- unsigned long page_size_mask)
+phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
- int i = pud_index(addr);
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = pud_index(vaddr);
- for (; i < PTRS_PER_PUD; i++, addr = next) {
- pud_t *pud = pud_page + pud_index(addr);
+ for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud;
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- next = (addr & PUD_MASK) + PUD_SIZE;
- if (addr >= end) {
+ vaddr = (unsigned long)__va(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
- !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PUD_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PUD_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pud(pud, __pud(0));
continue;
}
- if (pud_val(*pud)) {
+ if (!pud_none(*pud)) {
if (!pud_large(*pud)) {
pmd = pmd_offset(pud, 0);
- last_map_addr = phys_pmd_init(pmd, addr, end,
- page_size_mask, prot);
+ paddr_last = phys_pmd_init(pmd, paddr,
+ paddr_end,
+ page_size_mask,
+ prot);
__flush_tlb_all();
continue;
}
@@ -493,7 +522,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
if (page_size_mask & (1 << PG_LEVEL_1G)) {
if (!after_bootmem)
pages++;
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
@@ -503,16 +532,16 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
pmd = alloc_low_page();
- last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
- prot);
+ paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
+ page_size_mask, prot);
spin_lock(&init_mm.page_table_lock);
pud_populate(&init_mm, pud, pmd);
@@ -522,38 +551,44 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
update_page_count(PG_LEVEL_1G, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
unsigned long __meminit
-kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
+kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
unsigned long page_size_mask)
{
bool pgd_changed = false;
- unsigned long next, last_map_addr = end;
- unsigned long addr;
+ unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
- addr = start;
+ paddr_last = paddr_end;
+ vaddr = (unsigned long)__va(paddr_start);
+ vaddr_end = (unsigned long)__va(paddr_end);
+ vaddr_start = vaddr;
- for (; start < end; start = next) {
- pgd_t *pgd = pgd_offset_k(start);
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ pgd_t *pgd = pgd_offset_k(vaddr);
pud_t *pud;
- next = (start & PGDIR_MASK) + PGDIR_SIZE;
+ vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
- last_map_addr = phys_pud_init(pud, __pa(start),
- __pa(end), page_size_mask);
+ paddr_last = phys_pud_init(pud, __pa(vaddr),
+ __pa(vaddr_end),
+ page_size_mask);
continue;
}
pud = alloc_low_page();
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
- page_size_mask);
+ paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+ page_size_mask);
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, pud);
@@ -562,11 +597,11 @@ kernel_physical_mapping_init(unsigned long start,
}
if (pgd_changed)
- sync_global_pgds(addr, end - 1, 0);
+ sync_global_pgds(vaddr_start, vaddr_end - 1, 0);
__flush_tlb_all();
- return last_map_addr;
+ return paddr_last;
}
#ifndef CONFIG_NUMA
@@ -673,7 +708,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
for (i = 0; i < PTRS_PER_PTE; i++) {
pte = pte_start + i;
- if (pte_val(*pte))
+ if (!pte_none(*pte))
return;
}
@@ -691,7 +726,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
for (i = 0; i < PTRS_PER_PMD; i++) {
pmd = pmd_start + i;
- if (pmd_val(*pmd))
+ if (!pmd_none(*pmd))
return;
}
@@ -702,27 +737,6 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
spin_unlock(&init_mm.page_table_lock);
}
-/* Return true if pgd is changed, otherwise return false. */
-static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
-{
- pud_t *pud;
- int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- pud = pud_start + i;
- if (pud_val(*pud))
- return false;
- }
-
- /* free a pud table */
- free_pagetable(pgd_page(*pgd), 0);
- spin_lock(&init_mm.page_table_lock);
- pgd_clear(pgd);
- spin_unlock(&init_mm.page_table_lock);
-
- return true;
-}
-
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
@@ -913,7 +927,6 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
unsigned long addr;
pgd_t *pgd;
pud_t *pud;
- bool pgd_changed = false;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@@ -924,13 +937,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
pud = (pud_t *)pgd_page_vaddr(*pgd);
remove_pud_table(pud, addr, next, direct);
- if (free_pud_table(pud, pgd))
- pgd_changed = true;
}
- if (pgd_changed)
- sync_global_pgds(start, end - 1, 1);
-
flush_tlb_all();
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 1b1110f..0493c17 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -54,8 +54,8 @@ static int kasan_die_handler(struct notifier_block *self,
void *data)
{
if (val == DIE_GPF) {
- pr_emerg("CONFIG_KASAN_INLINE enabled");
- pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+ pr_emerg("CONFIG_KASAN_INLINE enabled\n");
+ pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
}
return NOTIFY_OK;
}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
new file mode 100644
index 0000000..26dccd6
--- /dev/null
+++ b/arch/x86/mm/kaslr.c
@@ -0,0 +1,172 @@
+/*
+ * This file implements KASLR memory randomization for x86_64. It randomizes
+ * the virtual address space of kernel memory regions (physical memory
+ * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates
+ * exploits relying on predictable kernel addresses.
+ *
+ * Entropy is generated using the KASLR early boot functions now shared in
+ * the lib directory (originally written by Kees Cook). Randomization is
+ * done on PGD & PUD page table levels to increase possible addresses. The
+ * physical memory mapping code was adapted to support PUD level virtual
+ * addresses. This implementation on the best configuration provides 30,000
+ * possible virtual addresses in average for each memory region. An additional
+ * low memory page is used to ensure each CPU can start with a PGD aligned
+ * virtual address (for realmode).
+ *
+ * The order of each memory region is not changed. The feature looks at
+ * the available space for the regions based on different configuration
+ * options and randomizes the base and space between each. The size of the
+ * physical memory mapping is the available physical memory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/random.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/kaslr.h>
+
+#include "mm_internal.h"
+
+#define TB_SHIFT 40
+
+/*
+ * Virtual address start and end range for randomization. The end changes base
+ * on configuration to have the highest amount of space for randomization.
+ * It increases the possible random position for each randomized region.
+ *
+ * You need to add an if/def entry if you introduce a new memory region
+ * compatible with KASLR. Your entry must be in logical order with memory
+ * layout. For example, ESPFIX is before EFI because its virtual address is
+ * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * ensure that this order is correct and won't be changed.
+ */
+static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
+static const unsigned long vaddr_end = VMEMMAP_START;
+
+/* Default values */
+unsigned long page_offset_base = __PAGE_OFFSET_BASE;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base = __VMALLOC_BASE;
+EXPORT_SYMBOL(vmalloc_base);
+
+/*
+ * Memory regions randomized by KASLR (except modules that use a separate logic
+ * earlier during boot). The list is ordered based on virtual addresses. This
+ * order is kept after randomization.
+ */
+static __initdata struct kaslr_memory_region {
+ unsigned long *base;
+ unsigned long size_tb;
+} kaslr_regions[] = {
+ { &page_offset_base, 64/* Maximum */ },
+ { &vmalloc_base, VMALLOC_SIZE_TB },
+};
+
+/* Get size in bytes used by the memory region */
+static inline unsigned long get_padding(struct kaslr_memory_region *region)
+{
+ return (region->size_tb << TB_SHIFT);
+}
+
+/*
+ * Apply no randomization if KASLR was disabled at boot or if KASAN
+ * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
+ */
+static inline bool kaslr_memory_enabled(void)
+{
+ return kaslr_enabled() && !config_enabled(CONFIG_KASAN);
+}
+
+/* Initialize base and padding for each memory region randomized with KASLR */
+void __init kernel_randomize_memory(void)
+{
+ size_t i;
+ unsigned long vaddr = vaddr_start;
+ unsigned long rand, memory_tb;
+ struct rnd_state rand_state;
+ unsigned long remain_entropy;
+
+ if (!kaslr_memory_enabled())
+ return;
+
+ /*
+ * Update Physical memory mapping to available and
+ * add padding if needed (especially for memory hotplug support).
+ */
+ BUG_ON(kaslr_regions[0].base != &page_offset_base);
+ memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) +
+ CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+
+ /* Adapt phyiscal memory region size based on available memory */
+ if (memory_tb < kaslr_regions[0].size_tb)
+ kaslr_regions[0].size_tb = memory_tb;
+
+ /* Calculate entropy available between regions */
+ remain_entropy = vaddr_end - vaddr_start;
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
+ remain_entropy -= get_padding(&kaslr_regions[i]);
+
+ prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));
+
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) {
+ unsigned long entropy;
+
+ /*
+ * Select a random virtual address using the extra entropy
+ * available.
+ */
+ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
+ prandom_bytes_state(&rand_state, &rand, sizeof(rand));
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
+ vaddr += entropy;
+ *kaslr_regions[i].base = vaddr;
+
+ /*
+ * Jump the region and add a minimum padding based on
+ * randomization alignment.
+ */
+ vaddr += get_padding(&kaslr_regions[i]);
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
+ remain_entropy -= entropy;
+ }
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ pud_t *pud_page, *pud_page_tramp;
+ int i;
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ pud_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ pud_page = (pud_t *) pgd_page_vaddr(*pgd);
+
+ for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud, *pud_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ pud_tramp = pud_page_tramp + pud_index(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ *pud_tramp = *pud;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7a1f7bb..849dc09 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -101,7 +101,8 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ /* Do not reference physical address outside the kernel. */
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
#endif
@@ -112,6 +113,12 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr <= end;
+}
+
/*
* Flushing functions
*/
@@ -746,18 +753,6 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
return true;
}
-static bool try_to_free_pud_page(pud_t *pud)
-{
- int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++)
- if (!pud_none(pud[i]))
- return false;
-
- free_page((unsigned long)pud);
- return true;
-}
-
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -871,16 +866,6 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
-static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
-{
- pgd_t *pgd_entry = root + pgd_index(addr);
-
- unmap_pud_range(pgd_entry, addr, end);
-
- if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
- pgd_clear(pgd_entry);
-}
-
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -1113,7 +1098,12 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
ret = populate_pud(cpa, addr, pgd_entry, pgprot);
if (ret < 0) {
- unmap_pgd_range(cpa->pgd, addr,
+ /*
+ * Leave the PUD page in place in case some other CPU or thread
+ * already found it, but remove any useless entries we just
+ * added to it.
+ */
+ unmap_pud_range(pgd_entry, addr,
addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}
@@ -1185,7 +1175,7 @@ repeat:
return __cpa_process_fault(cpa, address, primary);
old_pte = *kpte;
- if (!pte_val(old_pte))
+ if (pte_none(old_pte))
return __cpa_process_fault(cpa, address, primary);
if (level == PG_LEVEL_4K) {
@@ -1316,7 +1306,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
* to touch the high mapped kernel as well:
*/
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
- within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+ within_inclusive(cpa->pfn, highmap_start_pfn(),
+ highmap_end_pfn())) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
@@ -1991,12 +1982,6 @@ out:
return retval;
}
-void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
- unsigned numpages)
-{
- unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
-}
-
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index fb0604f..db00e3e 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -755,11 +755,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
return 1;
while (cursor < to) {
- if (!devmem_is_allowed(pfn)) {
- pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
- current->comm, from, to - 1);
+ if (!devmem_is_allowed(pfn))
return 0;
- }
cursor += PAGE_SIZE;
pfn++;
}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 75cc097..e67ae0e6 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -47,7 +47,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
return;
}
pte = pte_offset_kernel(pmd, vaddr);
- if (pte_val(pteval))
+ if (!pte_none(pteval))
set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f93545e..17c8bbd 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -98,21 +98,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
return status;
}
-void efi_get_time(struct timespec *now)
-{
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS)
- pr_err("Oops: efitime: can't read time!\n");
-
- now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour,
- eft.minute, eft.second);
- now->tv_nsec = 0;
-}
-
void __init efi_find_mirror(void)
{
efi_memory_desc_t *md;
@@ -978,8 +963,6 @@ static void __init __efi_enter_virtual_mode(void)
* EFI mixed mode we need all of memory to be accessible when
* we pass parameters to the EFI runtime services in the
* thunking code.
- *
- * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
*/
free_pages((unsigned long)new_memmap, pg_shift);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 338402b..cef39b0 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -49,9 +49,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
return 0;
}
-void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
-{
-}
void __init efi_map_region(efi_memory_desc_t *md)
{
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index b226b3f..3e12c44 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -285,11 +285,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 0;
}
-void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
-{
- kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages);
-}
-
static void __init __map_region(efi_memory_desc_t *md, u64 va)
{
unsigned long flags = _PAGE_RW;
@@ -466,22 +461,17 @@ extern efi_status_t efi64_thunk(u32, ...);
#define efi_thunk(f, ...) \
({ \
efi_status_t __s; \
- unsigned long flags; \
- u32 func; \
- \
- efi_sync_low_kernel_mappings(); \
- local_irq_save(flags); \
+ unsigned long __flags; \
+ u32 __func; \
\
- efi_scratch.prev_cr3 = read_cr3(); \
- write_cr3((unsigned long)efi_scratch.efi_pgt); \
- __flush_tlb_all(); \
+ local_irq_save(__flags); \
+ arch_efi_call_virt_setup(); \
\
- func = runtime_service32(f); \
- __s = efi64_thunk(func, __VA_ARGS__); \
+ __func = runtime_service32(f); \
+ __s = efi64_thunk(__func, __VA_ARGS__); \
\
- write_cr3(efi_scratch.prev_cr3); \
- __flush_tlb_all(); \
- local_irq_restore(flags); \
+ arch_efi_call_virt_teardown(); \
+ local_irq_restore(__flags); \
\
__s; \
})
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 815fec6..66b2166 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -40,8 +40,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
*/
return BIOS_STATUS_UNIMPLEMENTED;
- ret = efi_call((void *)__va(tab->function), (u64)which,
- a1, a2, a3, a4, a5);
+ ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
return ret;
}
EXPORT_SYMBOL_GPL(uv_bios_call);
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index e69f470..1104515 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -241,6 +241,31 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
__func__, PCI_FUNC(F3->devfn), NBCFG);
}
+static void prepare_msrs(void *info)
+{
+ struct mce i_mce = *(struct mce *)info;
+ u8 b = i_mce.bank;
+
+ wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (i_mce.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+ } else {
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+ }
+
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+ } else {
+ wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+ }
+
+}
+
static void do_inject(void)
{
u64 mcg_status = 0;
@@ -287,36 +312,9 @@ static void do_inject(void)
toggle_hw_mce_inject(cpu, true);
- wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS,
- (u32)mcg_status, (u32)(mcg_status >> 32));
-
- if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (inj_type == DFR_INT_INJ) {
- wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b),
- (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
- wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b),
- (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
- } else {
- wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b),
- (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
- wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b),
- (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
- }
-
- wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b),
- (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
- } else {
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
- (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
- (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
- (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
- }
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
toggle_hw_mce_inject(cpu, false);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 0b7a63d..705e3ff 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,6 +8,9 @@
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
+/* Hold the pgd entry used on booting additional CPUs */
+pgd_t trampoline_pgd_entry;
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -84,7 +87,7 @@ void __init setup_real_mode(void)
*trampoline_cr4_features = __read_cr4();
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
- trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+ trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_level4_pgt[511].pgd;
#endif
}
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index db52a7f..44c88ad 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -177,7 +177,6 @@ static struct apic xen_pv_apic = {
.get_apic_id = xen_get_apic_id,
.set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
- .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 760789a..0f87db2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -521,9 +521,7 @@ static void set_aliased_prot(void *v, pgprot_t prot)
preempt_disable();
- pagefault_disable(); /* Avoid warnings due to being atomic. */
- __get_user(dummy, (unsigned char __user __force *)v);
- pagefault_enable();
+ probe_kernel_read(&dummy, v, 1);
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();