diff options
Diffstat (limited to 'arch/x86')
57 files changed, 644 insertions, 482 deletions
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a66854d..6de58f1 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -430,9 +430,6 @@ void choose_random_location(unsigned long input, { unsigned long random_addr, min_addr; - /* By default, keep output position unchanged. */ - *virt_addr = *output; - if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); return; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b3c5a5f0..c945acd 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -338,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned long output_len) { const unsigned long kernel_total_size = VO__end - VO__text; - unsigned long virt_addr = (unsigned long)output; + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -397,7 +397,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #ifndef CONFIG_RELOCATABLE if ((unsigned long)output != LOAD_PHYSICAL_ADDR) error("Destination address does not match LOAD_PHYSICAL_ADDR"); - if ((unsigned long)output != virt_addr) + if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 1c8355e..766a521 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -81,8 +81,6 @@ static inline void choose_random_location(unsigned long input, unsigned long output_size, unsigned long *virt_addr) { - /* No change from existing output location. */ - *virt_addr = *output; } #endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd58..9e240fc 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820..113588d 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a3..a2ae689 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -157,8 +157,8 @@ LABEL skip_ %I .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ -178,8 +178,8 @@ LABEL skip_ %I vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -235,8 +235,8 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2) jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1cd792d..1eab79c 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm -#define K_BASE %r8 #define HASH_PTR %r9 +#define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 -#define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -446,6 +445,16 @@ .endm +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks - PRECALC_OFFSET = 0 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr - PRECALC_OFFSET = 128 + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ - cmp K_BASE, BUFFER_PTR - jne _begin + test BLOCKS_CTR, BLOCKS_CTR + jnz _begin .align 32 jmp _end .align 32 @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ - + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ - je _loop + test BLOCKS_CTR, BLOCKS_CTR + jz _loop mov TB, B @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ - - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper - lea K_XMM_AR(%rip), K_BASE - + /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR - lea 64(BUF), BUFFER_PTR2 - - shl $6, CNT /* mul by 64 */ - add BUF, CNT - add $64, CNT - mov CNT, BUFFER_END - cmp BUFFER_END, BUFFER_PTR2 - cmovae K_BASE, BUFFER_PTR2 + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index a78a069..ec9bee6 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -155,8 +155,8 @@ LABEL skip_ %I .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ -176,8 +176,8 @@ LABEL skip_ %I vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -234,8 +234,8 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a3..e7b0e7f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1215,6 +1215,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 38623e2..9604b25 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3bdb917..f0f197f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3164,13 +3164,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct cpu_hw_events *sibling; struct intel_excl_cntrs *c; - c = per_cpu(cpu_hw_events, i).excl_cntrs; + sibling = &per_cpu(cpu_hw_events, i); + c = sibling->excl_cntrs; if (c && c->core_id == core_id) { cpuc->kfree_on_online[1] = cpuc->excl_cntrs; cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; + if (!sibling->excl_thread_id) + cpuc->excl_thread_id = 1; break; } } @@ -3975,7 +3978,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fec8a46..1076c9a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 8b902b6..4c1b7ea 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,19 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +619,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } @@ -803,28 +804,21 @@ static int __init rapl_pmu_init(void) * Install callbacks. Core will call them for each online cpu. */ - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "PERF_X86_RAPL_PREP", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, "AP_PERF_X86_RAPL_ONLINE", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -835,7 +829,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 19d646a..aec6cc9 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2724277..afe8024 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636ba..6c98821 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1b02038..d4aea31 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 94aad63..7bcd138 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -245,12 +246,13 @@ extern int force_personality32; #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is above 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d34bd37..6c50201 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -304,13 +304,13 @@ static inline unsigned type in##bwl##_p(int port) \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ + : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ + : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ } BUILDIO(b, b, char) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7be..fc3c7e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,10 +293,10 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8e0a9fe..f9dd224 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,9 +116,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 78f3760..b601dda 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -405,6 +405,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0b1ff4c..fffb279 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,6 +7,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a300aa1..dead0f3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +#else +# define WARN_ON_IN_IRQ() +#endif + /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ - likely(!__range_not_ok(addr, size, user_addr_max())) +#define access_ok(type, addr, size) \ +({ \ + WARN_ON_IN_IRQ(); \ + likely(!__range_not_ok(addr, size, user_addr_max())); \ +}) /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047..8b678af 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -43,6 +43,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/smap.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> @@ -214,10 +215,12 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + stac(); asm volatile("call *%[call]" : __HYPERCALL_5PARAM : [call] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); + clac(); return (long)__res; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 931ced8..b89bef9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,10 +176,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -338,6 +343,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, struct mpc_intsrc mp_irq; /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f223491..e2ead34 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2070,7 +2070,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2128,11 +2128,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2184,23 +2182,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7249f15..cf89928 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2116,7 +2116,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic_irq(irq_get_chip_data(0)); + unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 3dfca7b..a5b47c1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -955,6 +955,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) const char *name = get_name(bank, NULL); int err = 0; + if (!dev) + return -ENODEV; + if (is_shared_bank(bank)) { nb = node_to_amd_nb(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0dea..13dbcc0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -34,6 +34,7 @@ #include <linux/mm.h> #include <asm/microcode_intel.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -1046,6 +1047,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { + pr_err_once("late loading on model 79 is disabled.\n"); + return true; + } + + return false; +} + static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { @@ -1054,6 +1067,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, const struct firmware *firmware; enum ucode_state ret; + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); @@ -1078,6 +1094,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b13..7052d9a 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -130,11 +130,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) { ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; + } + /* * In case of failure, mark all states as init: */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c21..3ec0d2d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -329,6 +329,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; } if (err || __copy_from_user(&env, buf, sizeof(env))) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9cf697c..77f17cb 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -141,7 +141,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -152,6 +153,8 @@ void kvm_async_pf_task_wait(u32 token) if (hlist_unhashed(&n.link)) break; + rcu_irq_exit(); + if (!n.halted) { local_irq_enable(); schedule(); @@ -160,11 +163,11 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ - rcu_irq_exit(); native_safe_halt(); local_irq_disable(); - rcu_irq_enter(); } + + rcu_irq_enter(); } if (!n.halted) finish_swait(&n.wq, &wait); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3..0887d2a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, fpu_switch); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0..feaab07 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1054,6 +1054,13 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 36171bc..9fe7b9e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -181,6 +181,12 @@ static void smp_callin(void) smp_store_cpu_info(cpuid); /* + * The topology information must be up to date before + * calibrate_delay() and notify_cpu_starting(). + */ + set_cpu_sibling_map(raw_smp_processor_id()); + + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as @@ -190,11 +196,6 @@ static void smp_callin(void) cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; pr_debug("Stack at about %p\n", &cpuid); - /* - * This must be done before setting cpu_online_mask - * or calling notify_cpu_starting. - */ - set_cpu_sibling_map(raw_smp_processor_id()); wmb(); notify_cpu_starting(cpuid); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index eea88fe..44bf5cf 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -694,6 +694,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -1381,12 +1382,10 @@ void __init tsc_init(void) unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); - struct cpumask *mask = topology_core_cpumask(cpu); + int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); + const struct cpumask *mask = topology_core_cpumask(cpu); - if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - - if (!mask) + if (tsc_disabled || !constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 649d8f2..91af75e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -456,7 +456,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled) + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); } else { entry->ebx = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2..9368fec 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9f676ad..72b737b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2543,7 +2543,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2592,11 +2592,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -2738,6 +2738,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } @@ -5312,6 +5313,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5326,6 +5328,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5359,7 +5362,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5388,7 +5391,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5442,7 +5445,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5f24127..d29c745 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3649,19 +3649,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear * PT_PAGE_SIZE_MASK in gpte if that is the case. */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } @@ -4169,6 +4169,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a011054..3736390 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -324,10 +324,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a850..5ab4a36 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 89b98e0..a8ae57a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2167,46 +2167,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); return; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -2455,7 +2453,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -2987,7 +2985,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3069,7 +3068,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -4756,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. + * + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); @@ -6474,7 +6486,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -9183,6 +9194,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -9992,6 +10010,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(vmx->nested.virtual_apic_page)); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } if (cpu_has_vmx_msr_bitmap() && @@ -10667,7 +10690,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -10996,6 +11019,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11011,7 +11065,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11021,34 +11074,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11070,10 +11109,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11089,44 +11133,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) @@ -11154,7 +11167,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11163,7 +11176,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) @@ -11206,12 +11224,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62cde4f..595f814 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4999,6 +4999,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.unusable) { memset(desc, 0, sizeof(*desc)); + if (base3) + *base3 = 0; return false; } @@ -5154,6 +5156,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5193,6 +5205,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5236,6 +5250,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5245,7 +5261,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5452,37 +5467,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5636,11 +5640,10 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags); @@ -6111,7 +6114,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, + &ctxt->exception); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b..04c067b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -84,7 +84,7 @@ ENTRY(copy_user_generic_unrolled) movl %edx,%ecx andl $63,%edx shrl $6,%ecx - jz 17f + jz .L_copy_short_string 1: movq (%rsi),%r8 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 @@ -105,7 +105,8 @@ ENTRY(copy_user_generic_unrolled) leaq 64(%rdi),%rdi decl %ecx jnz 1b -17: movl %edx,%ecx +.L_copy_short_string: + movl %edx,%ecx andl $7,%edx shrl $3,%ecx jz 20f @@ -221,6 +222,8 @@ EXPORT_SYMBOL(copy_user_generic_string) */ ENTRY(copy_user_enhanced_fast_string) ASM_STAC + cmpl $64,%edx + jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */ movl %edx,%ecx 1: rep movsb diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3..1dd7960 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,8 +191,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -208,7 +207,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -218,13 +217,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -239,7 +237,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -718,8 +716,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -744,7 +740,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -853,8 +849,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -902,7 +897,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -915,9 +910,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -925,6 +920,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -932,7 +931,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -975,7 +975,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1002,13 +1002,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1032,9 +1031,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1220,6 +1219,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1420,9 +1420,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd7..3e27ded 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -94,10 +94,10 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end, int removed) { - unsigned long address; + unsigned long addr; - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + const pgd_t *pgd_ref = pgd_offset_k(addr); struct page *page; /* @@ -113,7 +113,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) pgd_t *pgd; spinlock_t *pgt_lock; - pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgd = (pgd_t *)page_address(page) + pgd_index(addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -689,7 +689,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f8009..a75103e 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -293,7 +293,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } @@ -525,15 +525,7 @@ int mpx_handle_bd_fault(void) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault()) { - force_sig(SIGSEGV, current); - /* - * The force_sig() is essentially "handling" this - * exception, so we do not pass up the error - * from do_mpx_bt_fault(). - */ - } - return 0; + return do_mpx_bt_fault(); } /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 83e701f..89d7907 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -36,14 +36,14 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool boot_cpu_done; - -static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); -static void init_cache_modes(void); +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; void pat_disable(const char *reason) { - if (!__pat_enabled) + if (pat_disabled) return; if (boot_cpu_done) { @@ -51,10 +51,8 @@ void pat_disable(const char *reason) return; } - __pat_enabled = 0; + pat_disabled = true; pr_info("x86/PAT: %s\n", reason); - - init_cache_modes(); } static int __init nopat(char *str) @@ -66,7 +64,7 @@ early_param("nopat", nopat); bool pat_enabled(void) { - return !!__pat_enabled; + return pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -204,6 +202,8 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -224,6 +224,7 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; __init_cache_modes(pat); } @@ -241,10 +242,9 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -static void init_cache_modes(void) +void init_cache_modes(void) { u64 pat = 0; - static int init_cm_done; if (init_cm_done) return; @@ -286,8 +286,6 @@ static void init_cache_modes(void) } __init_cache_modes(pat); - - init_cm_done = 1; } /** @@ -305,10 +303,8 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { - init_cache_modes(); + if (pat_disabled) return; - } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a7655f6..75fb011 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, { struct flush_tlb_info info; - if (end == 0) - end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; @@ -393,7 +391,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); + flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); preempt_enable(); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 350f709..7913b69 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -212,8 +212,8 @@ static void arch_perfmon_setup_counters(void) eax.full = cpuid_eax(0xa); /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ - if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && - __this_cpu_read(cpu_info.x86_model) == 15) { + if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 15) { eax.split.version_id = 2; eax.split.num_counters = 2; eax.split.bit_width = 40; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6d52b94..20fa7c8 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -571,3 +571,35 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); + +/* + * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff] + * + * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to + * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used + * for soft poweroff and suspend-to-RAM. + * + * As far as we know, this is related to the address space, not to the Root + * Port itself. Attaching the quirk to the Root Port is a convenience, but + * it could probably also be a standalone DMI quirk. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=103211 + */ +static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *res; + + if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") && + !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) || + pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0)) + return; + + res = request_mem_region(0x7fa00000, 0x200000, + "MacBook Pro poweroff workaround"); + if (res) + dev_info(dev, "claimed %s %pR\n", res->name, res); + else + dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index dd6cfa4..75029d0 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -15,7 +15,7 @@ obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b..27186ad 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include <linux/err.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/spi/pxa2xx_spi.h> @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa..7dbdb78 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0c2fae8..73eb7fd 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -992,11 +992,12 @@ static void emit_relocs(int as_text, int use_real_mode) die("Segment relocations found but --realmode not specified\n"); /* Order the relocations for more efficient processing */ - sort_relocs(&relocs16); sort_relocs(&relocs32); #if ELF_BITS == 64 sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#else + sort_relocs(&relocs16); #endif /* Print the relocations */ |