diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/events/amd/ibs.c | 52 | ||||
-rw-r--r-- | arch/x86/events/perf_event.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/msr-index.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/pmem.h | 9 | ||||
-rw-r--r-- | arch/x86/include/asm/tlbflush.h | 6 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 30 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/apic.c | 12 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 2 |
17 files changed, 130 insertions, 51 deletions
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 3ea25c3..feb90f6 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,10 +28,46 @@ static u32 ibs_caps; #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +/* + * IBS states: + * + * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken + * and any further add()s must fail. + * + * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are + * complicated by the fact that the IBS hardware can send late NMIs (ie. after + * we've cleared the EN bit). + * + * In order to consume these late NMIs we have the STOPPED state, any NMI that + * happens after we've cleared the EN state will clear this bit and report the + * NMI handled (this is fundamentally racy in the face or multiple NMI sources, + * someone else can consume our BIT and our NMI will go unhandled). + * + * And since we cannot set/clear this separate bit together with the EN bit, + * there are races; if we cleared STARTED early, an NMI could land in + * between clearing STARTED and clearing the EN bit (in fact multiple NMIs + * could happen if the period is small enough), and consume our STOPPED bit + * and trigger streams of unhandled NMIs. + * + * If, however, we clear STARTED late, an NMI can hit between clearing the + * EN bit and clearing STARTED, still see STARTED set and process the event. + * If this event will have the VALID bit clear, we bail properly, but this + * is not a given. With VALID set we can end up calling pmu::stop() again + * (the throttle logic) and trigger the WARNs in there. + * + * So what we do is set STOPPING before clearing EN to avoid the pmu::stop() + * nesting, and clear STARTED late, so that we have a well defined state over + * the clearing of the EN bit. + * + * XXX: we could probably be using !atomic bitops for all this. + */ + enum ibs_states { IBS_ENABLED = 0, IBS_STARTED = 1, IBS_STOPPING = 2, + IBS_STOPPED = 3, IBS_MAX_STATES, }; @@ -377,11 +413,10 @@ static void perf_ibs_start(struct perf_event *event, int flags) perf_ibs_set_period(perf_ibs, hwc, &period); /* - * Set STARTED before enabling the hardware, such that - * a subsequent NMI must observe it. Then clear STOPPING - * such that we don't consume NMIs by accident. + * Set STARTED before enabling the hardware, such that a subsequent NMI + * must observe it. */ - set_bit(IBS_STARTED, pcpu->state); + set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); perf_ibs_enable_event(perf_ibs, hwc, period >> 4); @@ -396,6 +431,9 @@ static void perf_ibs_stop(struct perf_event *event, int flags) u64 config; int stopping; + if (test_and_set_bit(IBS_STOPPING, pcpu->state)) + return; + stopping = test_bit(IBS_STARTED, pcpu->state); if (!stopping && (hwc->state & PERF_HES_UPTODATE)) @@ -405,12 +443,12 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (stopping) { /* - * Set STOPPING before disabling the hardware, such that it + * Set STOPPED before disabling the hardware, such that it * must be visible to NMIs the moment we clear the EN bit, * at which point we can generate an !VALID sample which * we need to consume. */ - set_bit(IBS_STOPPING, pcpu->state); + set_bit(IBS_STOPPED, pcpu->state); perf_ibs_disable_event(perf_ibs, hwc, config); /* * Clear STARTED after disabling the hardware; if it were @@ -556,7 +594,7 @@ fail: * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ - if (test_and_clear_bit(IBS_STOPPING, pcpu->state)) + if (test_and_clear_bit(IBS_STOPPED, pcpu->state)) return 1; return 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 716d048..ad4dc7f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -800,6 +800,9 @@ ssize_t intel_event_sysfs_show(char *page, u64 config); struct attribute **merge_attr(struct attribute **a, struct attribute **b); +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -930,9 +933,6 @@ int p6_pmu_init(void); int knc_pmu_init(void); -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page); - static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f62a9f37..b7e3944 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,7 +43,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 -#define KVM_HALT_POLL_NS_DEFAULT 500000 +#define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 426e946..5b3c9a5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -167,6 +167,14 @@ #define MSR_PKG_C9_RESIDENCY 0x00000631 #define MSR_PKG_C10_RESIDENCY 0x00000632 +/* Interrupt Response Limit */ +#define MSR_PKGC3_IRTL 0x0000060a +#define MSR_PKGC6_IRTL 0x0000060b +#define MSR_PKGC7_IRTL 0x0000060c +#define MSR_PKGC8_IRTL 0x00000633 +#define MSR_PKGC9_IRTL 0x00000634 +#define MSR_PKGC10_IRTL 0x00000635 + /* Run Time Average Power Limiting (RAPL) Interface */ #define MSR_RAPL_POWER_UNIT 0x00000606 diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index bf8b35d..fbc5e92 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -47,6 +47,15 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, BUG(); } +static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src, + size_t n) +{ + if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) + return memcpy_mcsafe(dst, (void __force *) src, n); + memcpy(dst, (void __force *) src, n); + return 0; +} + /** * arch_wmb_pmem - synchronize writes to persistent memory * diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c24b422..1fde8d5 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -319,12 +319,6 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ -/* Not inlined due to inc_irq_stat not being defined yet */ -#define flush_tlb_local() { \ - inc_irq_stat(irq_tlb_count); \ - local_flush_tlb(); \ -} - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 0b445c2..ac780ca 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -384,6 +384,9 @@ static void intel_thermal_interrupt(void) { __u64 msr_val; + if (static_cpu_has(X86_FEATURE_HWP)) + wrmsrl_safe(MSR_HWP_STATUS, 0); + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); /* Check for violation of core thermal thresholds*/ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8efb839..bbbaa80 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -534,6 +534,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; + cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5ff3485..01bd7b7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1116,6 +1116,11 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: + /* don't bother userspace if it has no way to handle it */ + if (!vcpu_to_synic(vcpu)->active) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = param; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 443d2a5..1a2da0e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1369,7 +1369,7 @@ static void start_apic_timer(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " @@ -1402,7 +1402,7 @@ static void start_apic_timer(struct kvm_lapic *apic) expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -1868,7 +1868,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; /* @@ -2003,7 +2003,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70e95d0..1ff4dbb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -557,8 +557,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) !is_writable_pte(new_spte)) ret = true; - if (!shadow_accessed_mask) + if (!shadow_accessed_mask) { + /* + * We don't set page dirty when dropping non-writable spte. + * So do it now if the new spte is becoming non-writable. + */ + if (ret) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); return ret; + } /* * Flush TLB when accessed/dirty bits are changed in the page tables, @@ -605,7 +612,8 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) + if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : + PT_WRITABLE_MASK)) kvm_set_pfn_dirty(pfn); return 1; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b70df72..66b33b9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -173,10 +173,9 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); bool fault = (mmu->permissions[index] >> pte_access) & 1; + u32 errcode = PFERR_PRESENT_MASK; WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); - pfec |= PFERR_PRESENT_MASK; - if (unlikely(mmu->pkru_mask)) { u32 pkru_bits, offset; @@ -189,15 +188,15 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = pfec - 1 + + offset = (pfec & ~1) + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); pkru_bits &= mmu->pkru_mask >> offset; - pfec |= -pkru_bits & PFERR_PK_MASK; + errcode |= -pkru_bits & PFERR_PK_MASK; fault |= (pkru_bits != 0); } - return -(uint32_t)fault & pfec; + return -(u32)fault & errcode; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1d971c7..bc019f7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -360,7 +360,7 @@ retry_walk: goto error; if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { - errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; + errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 742d0f7..9b7798c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -700,7 +700,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } - kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) @@ -6095,12 +6094,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending) { - if (kvm_x86_ops->nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); - } + if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); } else if (kvm_cpu_has_injectable_intr(vcpu)) { /* * Because interrupts can be injected asynchronously, we are @@ -6569,10 +6566,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - else if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + else { + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + } if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -6590,8 +6589,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - kvm_load_guest_xcr0(vcpu); - vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -6618,6 +6615,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } + kvm_load_guest_xcr0(vcpu); + if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -6667,6 +6666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_put_guest_xcr0(vcpu); + /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); @@ -7314,7 +7315,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) * and assume host would use all available bits. * Guest xcr0 would be loaded later. */ - kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); @@ -7323,8 +7323,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) { vcpu->fpu_counter = 0; return; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8f4cc3d..fe9b9f7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -104,10 +104,8 @@ static void flush_tlb_func(void *info) inc_irq_stat(irq_tlb_count); - if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - if (!f->flush_end) - f->flush_end = f->flush_start + PAGE_SIZE; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { @@ -135,12 +133,20 @@ void native_flush_tlb_others(const struct cpumask *cpumask, unsigned long end) { struct flush_tlb_info info; + + if (end == 0) + end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); + if (end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (end - start) >> PAGE_SHIFT); + if (is_uv_system()) { unsigned int cpu; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index abf4901..db52a7f 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -66,7 +66,7 @@ static u32 xen_apic_read(u32 reg) ret = HYPERVISOR_platform_op(&op); if (ret) - return 0; + op.u.pcpu_info.apic_id = BAD_APICID; return op.u.pcpu_info.apic_id << 24; } @@ -142,6 +142,14 @@ static void xen_silent_inquire(int apicid) { } +static int xen_cpu_present_to_apicid(int cpu) +{ + if (cpu_present(cpu)) + return xen_get_apic_id(xen_apic_read(APIC_ID)); + else + return BAD_APICID; +} + static struct apic xen_pv_apic = { .name = "Xen PV", .probe = xen_apic_probe_pv, @@ -162,7 +170,7 @@ static struct apic xen_pv_apic = { .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, + .cpu_present_to_apicid = xen_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3c6d17f..719cf29 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -545,6 +545,8 @@ static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ * data back is to call: */ tick_nohz_idle_enter(); + + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } #else /* !CONFIG_HOTPLUG_CPU */ |