diff options
Diffstat (limited to 'arch/x86')
51 files changed, 681 insertions, 269 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a4f24f5..15b5cef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -112,7 +112,7 @@ config X86 select GENERIC_STRNLEN_USER select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_VIRT_TO_BUS + select VIRT_TO_BUS select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 @@ -1549,6 +1549,7 @@ config X86_SMAP config EFI bool "EFI runtime service support" depends on ACPI + select UCS2_STRING ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8a84501..5ef205c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -29,7 +29,6 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/piggy.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone -$(obj)/efi_stub_$(BITS).o: KBUILD_CLFAGS += -fshort-wchar -mno-red-zone ifeq ($(CONFIG_EFI_STUB), y) VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o @@ -43,7 +42,7 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -targets += vmlinux.bin.all vmlinux.relocs +targets += $(patsubst $(obj)/%,%,$(VMLINUX_OBJS)) vmlinux.bin.all vmlinux.relocs CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c205035..35ee62f 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -251,6 +251,51 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size) *size = len; } +static efi_status_t setup_efi_vars(struct boot_params *params) +{ + struct setup_data *data; + struct efi_var_bootdata *efidata; + u64 store_size, remaining_size, var_size; + efi_status_t status; + + if (sys_table->runtime->hdr.revision < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + status = efi_call_phys4((void *)sys_table->runtime->query_variable_info, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, &store_size, + &remaining_size, &var_size); + + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*efidata), &efidata); + + if (status != EFI_SUCCESS) + return status; + + efidata->data.type = SETUP_EFI_VARS; + efidata->data.len = sizeof(struct efi_var_bootdata) - + sizeof(struct setup_data); + efidata->data.next = 0; + efidata->store_size = store_size; + efidata->remaining_size = remaining_size; + efidata->max_var_size = var_size; + + if (data) + data->next = (unsigned long)efidata; + else + params->hdr.setup_data = (unsigned long)efidata; + +} + static efi_status_t setup_efi_pci(struct boot_params *params) { efi_pci_io_protocol *pci; @@ -1157,6 +1202,8 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, setup_graphics(boot_params); + setup_efi_vars(boot_params); + setup_efi_pci(boot_params); status = efi_call_phys3(sys_table->boottime->allocate_pool, diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 5b5e9cb..653668d 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -14,13 +14,29 @@ * analysis of kexec-tools; if other broken bootloaders initialize a * different set of fields we will need to figure out how to disambiguate. * + * Note: efi_info is commonly left uninitialized, but that field has a + * private magic, so it is better to leave it unchanged. */ static void sanitize_boot_params(struct boot_params *boot_params) { + /* + * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear + * this field. The purpose of this field is to guarantee + * compliance with the x86 boot spec located in + * Documentation/x86/boot.txt . That spec says that the + * *whole* structure should be cleared, after which only the + * portion defined by struct setup_header (boot_params->hdr) + * should be copied in. + * + * If you're having an issue because the sentinel is set, you + * need to change the whole structure to be cleared, not this + * (or any other) individual field, or you will soon have + * problems again. + */ if (boot_params->sentinel) { - /*fields in boot_params are not valid, clear them */ + /* fields in boot_params are left uninitialized, clear them */ memset(&boot_params->olpc_ofw_header, 0, - (char *)&boot_params->alt_mem_k - + (char *)&boot_params->efi_info - (char *)&boot_params->olpc_ofw_header); memset(&boot_params->kbd_status, 0, (char *)&boot_params->hdr - diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 60c89f3..2fb5d58 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -102,6 +102,13 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); +struct efi_var_bootdata { + struct setup_data data; + u64 store_size; + u64 remaining_size; + u64 max_var_size; +}; + #ifdef CONFIG_EFI static inline bool efi_is_native(void) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index bdd35db..a809121 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -2,6 +2,7 @@ #define _ASM_X86_HUGETLB_H #include <asm/page.h> +#include <asm-generic/hugetlb.h> static inline int is_hugepage_only_range(struct mm_struct *mm, diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d3ddd17..5a6d287 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -77,6 +77,7 @@ struct arch_specific_insn { * a post_handler or break_handler). */ int boostable; + bool if_modifier; }; struct arch_optimized_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 635a74d..4979778 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -414,8 +414,8 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - unsigned int time_offset; - struct page *time_page; + struct gfn_to_hva_cache pv_time; + bool pv_time_enabled; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5edd174..7361e47 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -703,7 +703,10 @@ static inline void arch_leave_lazy_mmu_mode(void) PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); } -void arch_flush_lazy_mmu_mode(void); +static inline void arch_flush_lazy_mmu_mode(void) +{ + PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush); +} static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 142236e..b3b0ec1 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -91,6 +91,7 @@ struct pv_lazy_ops { /* Set deferred update mode, used for batching operations. */ void (*enter)(void); void (*leave)(void); + void (*flush)(void); }; struct pv_time_ops { @@ -679,6 +680,7 @@ void paravirt_end_context_switch(struct task_struct *next); void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); +void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); u32 _paravirt_ident_32(u32); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 1ace47b..2e188d6 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -29,13 +29,13 @@ extern const unsigned long sys_call_table[]; */ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - return regs->orig_ax & __SYSCALL_MASK; + return regs->orig_ax; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { - regs->ax = regs->orig_ax & __SYSCALL_MASK; + regs->ax = regs->orig_ax; } static inline long syscall_get_error(struct task_struct *task, diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 4fef207..c779730 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -7,7 +7,7 @@ #define tlb_flush(tlb) \ { \ - if (tlb->fullmm == 0) \ + if (!tlb->fullmm && !tlb->need_flush_all) \ flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ else \ flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index c20d1ce..e709884 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -382,14 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } -extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); +extern int __must_check xen_physdev_op_compat(int, void *); static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); if (unlikely(rc == -ENOSYS)) - rc = HYPERVISOR_physdev_op_compat(cmd, arg); + rc = xen_physdev_op_compat(cmd, arg); return rc; } diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c15ddaf..0874424 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,6 +6,7 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 +#define SETUP_EFI_VARS 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 892ce40..7a060f4 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -44,6 +44,7 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a0e067d..c9496313 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -43,10 +43,10 @@ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ cpufeature = $(src)/../../include/asm/cpufeature.h targets += capflags.c -$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl deleted file mode 100644 index 091972e..0000000 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl -w -# -# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h -# - -($in, $out) = @ARGV; - -open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; -open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; - -print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n"; -print OUT "#include <asm/cpufeature.h>\n"; -print OUT "#endif\n"; -print OUT "\n"; -print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; - -%features = (); -$err = 0; - -while (defined($line = <IN>)) { - if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { - $macro = $1; - $feature = "\L$2"; - $tail = $3; - if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = "\L$1"; - } - - next if ($feature eq ''); - - if ($features{$feature}++) { - print STDERR "$in: duplicate feature name: $feature\n"; - $err++; - } - printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; - } -} -print OUT "};\n"; - -close(IN); -close(OUT); - -if ($err) { - unlink($out); - exit(1); -} - -exit(0); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 0000000..2bf6165 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# + +IN=$1 +OUT=$2 + +TABS="$(printf '\t\t\t\t\t')" +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + + # Iterate through any input lines starting with #define X86_FEATURE_ + sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. + VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" == '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" \ + "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + done + echo "};" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index a7d26d8..8f4be53 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -35,13 +35,6 @@ static bool __init ms_hyperv_platform(void) if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return false; - /* - * Xen emulates Hyper-V to support enlightened Windows. - * Check to see first if we are on a Xen Hypervisor. - */ - if (xen_cpuid_base()) - return false; - cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); @@ -82,12 +75,6 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); -#if IS_ENABLED(CONFIG_HYPERV) - /* - * Setup the IDT for hypervisor callback. - */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); -#endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { @@ -103,6 +90,11 @@ static irq_handler_t vmbus_isr; void hv_register_vmbus_handler(int irq, irq_handler_t handler) { + /* + * Setup the IDT for hypervisor callback. + */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + vmbus_irq = irq; vmbus_isr = handler; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 529c893..cc45deb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -101,6 +101,10 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -149,8 +153,14 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = }; static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), EVENT_EXTRA_END }; @@ -2093,7 +2103,10 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - x86_pmu.extra_regs = intel_snb_extra_regs; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; @@ -2119,7 +2132,10 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_ivb_event_constraints; x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - x86_pmu.extra_regs = intel_snb_extra_regs; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 826054a..26830f3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -314,10 +314,11 @@ int intel_pmu_drain_bts_buffer(void) if (top <= at) return 0; + memset(®s, 0, sizeof(regs)); + ds->bts_index = ds->bts_buffer_base; perf_sample_data_init(&data, 0, event->hw.last_period); - regs.ip = 0; /* * Prepare a generic sample, i.e. fill in the invariant fields. @@ -729,3 +730,13 @@ void intel_ds_init(void) } } } + +void perf_restore_debug_store(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3f06e61..7bfe318 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -375,6 +375,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) else p->ainsn.boostable = -1; + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + /* Also, displacement change doesn't affect the first byte */ p->opcode = p->ainsn.insn[0]; } @@ -434,7 +437,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - if (is_IF_modifier(p->ainsn.insn)) + if (p->ainsn.if_modifier) kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; } diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c index 577db84..833d51d 100644 --- a/arch/x86/kernel/microcode_core_early.c +++ b/arch/x86/kernel/microcode_core_early.c @@ -45,9 +45,6 @@ static int __cpuinit x86_vendor(void) u32 eax = 0x00000000; u32 ebx, ecx = 0, edx; - if (!have_cpuid_p()) - return X86_VENDOR_UNKNOWN; - native_cpuid(&eax, &ebx, &ecx, &edx); if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) @@ -59,18 +56,45 @@ static int __cpuinit x86_vendor(void) return X86_VENDOR_UNKNOWN; } +static int __cpuinit x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + int x86; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + x86 = (eax >> 8) & 0xf; + if (x86 == 15) + x86 += (eax >> 20) & 0xff; + + return x86; +} + void __init load_ucode_bsp(void) { - int vendor = x86_vendor(); + int vendor, x86; + + if (!have_cpuid_p()) + return; - if (vendor == X86_VENDOR_INTEL) + vendor = x86_vendor(); + x86 = x86_family(); + + if (vendor == X86_VENDOR_INTEL && x86 >= 6) load_ucode_intel_bsp(); } void __cpuinit load_ucode_ap(void) { - int vendor = x86_vendor(); + int vendor, x86; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); - if (vendor == X86_VENDOR_INTEL) + if (vendor == X86_VENDOR_INTEL && x86 >= 6) load_ucode_intel_ap(); } diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index 7890bc8..d893e8e 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -90,13 +90,13 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct microcode_intel ***mc_saved; mc_saved = (struct microcode_intel ***) - __pa_symbol(&mc_saved_data->mc_saved); + __pa_nodebug(&mc_saved_data->mc_saved); for (i = 0; i < mc_saved_data->mc_saved_count; i++) { struct microcode_intel *p; p = *(struct microcode_intel **) - __pa(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa(p); + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif @@ -562,7 +562,7 @@ scan_microcode(unsigned long start, unsigned long end, struct cpio_data cd; long offset = 0; #ifdef CONFIG_X86_32 - char *p = (char *)__pa_symbol(ucode_name); + char *p = (char *)__pa_nodebug(ucode_name); #else char *p = ucode_name; #endif @@ -630,8 +630,8 @@ static void __cpuinit print_ucode(struct ucode_cpu_info *uci) if (mc_intel == NULL) return; - delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info); - current_mc_date_p = (int *)__pa_symbol(¤t_mc_date); + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; *current_mc_date_p = mc_intel->hdr.date; @@ -659,8 +659,8 @@ static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) } #endif -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int __cpuinit apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -741,15 +741,15 @@ load_ucode_intel_bsp(void) #ifdef CONFIG_X86_32 struct boot_params *boot_params_p; - boot_params_p = (struct boot_params *)__pa_symbol(&boot_params); + boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); ramdisk_image = boot_params_p->hdr.ramdisk_image; ramdisk_size = boot_params_p->hdr.ramdisk_size; initrd_start_early = ramdisk_image; initrd_end_early = initrd_start_early + ramdisk_size; _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_symbol(&mc_saved_data), - (unsigned long *)__pa_symbol(&mc_saved_in_initrd), + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), initrd_start_early, initrd_end_early, &uci); #else ramdisk_image = boot_params.hdr.ramdisk_image; @@ -772,10 +772,10 @@ void __cpuinit load_ucode_intel_ap(void) unsigned long *initrd_start_p; mc_saved_in_initrd_p = - (unsigned long *)__pa_symbol(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start); - initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p); + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 17fff18..8bfb335 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -263,6 +263,18 @@ void paravirt_leave_lazy_mmu(void) leave_lazy(PARAVIRT_LAZY_MMU); } +void paravirt_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); @@ -292,18 +304,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return this_cpu_read(paravirt_lazy_mode); } -void arch_flush_lazy_mmu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -475,6 +475,7 @@ struct pv_mmu_ops pv_mmu_ops = { .lazy_mode = { .enter = paravirt_nop, .leave = paravirt_nop, + .flush = paravirt_nop, }, .set_fixmap = native_set_fixmap, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84d3285..fae9134 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -171,9 +171,15 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { + .wp_works_ok = -1, + .fdiv_bug = -1, +}; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .wp_works_ok = -1, + .fdiv_bug = -1, +}; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -501,11 +507,14 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX MAXMEM +# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) +# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM #endif static void __init reserve_crashkernel_low(void) @@ -515,19 +524,35 @@ static void __init reserve_crashkernel_low(void) unsigned long long low_base = 0, low_size = 0; unsigned long total_low_mem; unsigned long long base; + bool auto_set = false; int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + /* crashkernel=Y,low */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret != 0 || low_size <= 0) - return; + if (ret != 0) { + /* + * two parts from lib/swiotlb.c: + * swiotlb size: user specified with swiotlb= or default. + * swiotlb overflow buffer: now is hardcoded to 32k. + * We round it to 8M for other buffers that + * may need to stay low too. + */ + low_size = swiotlb_size_or_default() + (8UL<<20); + auto_set = true; + } else { + /* passed with crashkernel=0,low ? */ + if (!low_size) + return; + } low_base = memblock_find_in_range(low_size, (1ULL<<32), low_size, alignment); if (!low_base) { - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + if (!auto_set) + pr_info("crashkernel low reservation failed - No suitable area found.\n"); return; } @@ -548,14 +573,22 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; + bool high = false; int ret; total_mem = memblock_phys_mem_size(); + /* crashkernel=XM */ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; + if (ret != 0 || crash_size <= 0) { + /* crashkernel=X,high */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } /* 0 means: find the address automatically */ if (crash_base <= 0) { @@ -563,7 +596,9 @@ static void __init reserve_crashkernel(void) * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - CRASH_KERNEL_ADDR_MAX, crash_size, alignment); + high ? CRASH_KERNEL_ADDR_HIGH_MAX : + CRASH_KERNEL_ADDR_LOW_MAX, + crash_size, alignment); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a6ceaed..9f190a2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1365,9 +1365,8 @@ static inline void mwait_play_dead(void) unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - int i; void *mwait_ptr; - struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); + int i; if (!this_cpu_has(X86_FEATURE_MWAIT)) return; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 02b51dd..f77df1c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1857,7 +1857,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) if (!pv_eoi_enabled(vcpu)) return 0; return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr); + addr, sizeof(u8)); } void kvm_lapic_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f71500a..e172132 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1406,25 +1406,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; - void *shared_kaddr; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; kernel_ns = 0; host_tsc = 0; - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - /* * If the host uses TSC clock, then passthrough TSC as stable * to the guest. @@ -1436,6 +1426,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = ka->master_kernel_ns; } spin_unlock(&ka->pvclock_gtod_sync_lock); + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } if (!use_master_clock) { host_tsc = native_read_tsc(); kernel_ns = get_kernel_ns(); @@ -1463,7 +1462,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->time_page) + if (!vcpu->pv_time_enabled) return 0; /* @@ -1525,12 +1524,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page); - - guest_hv_clock = shared_kaddr + vcpu->time_offset; + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { pvclock_flags |= PVCLOCK_GUEST_STOPPED; @@ -1543,12 +1542,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - kunmap_atomic(shared_kaddr); - - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); return 0; } @@ -1827,7 +1823,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u32))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); @@ -1837,10 +1834,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + vcpu->arch.pv_time_enabled = false; } static void accumulate_steal_time(struct kvm_vcpu *vcpu) @@ -1947,6 +1941,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { + u64 gpa_offset; kvmclock_reset(vcpu); vcpu->arch.time = data; @@ -1956,14 +1951,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + gpa_offset = data & ~(PAGE_MASK | 1); - if (is_error_page(vcpu->arch.time_page)) - vcpu->arch.time_page = NULL; + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; break; } @@ -1980,7 +1975,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS)) + data & KVM_STEAL_VALID_BITS, + sizeof(struct kvm_steal_time))) return 1; vcpu->arch.st.msr_val = data; @@ -2967,7 +2963,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.time_page) + if (!vcpu->arch.pv_time_enabled) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -6718,6 +6714,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_wbinvd_dirty_mask; vcpu->arch.ia32_tsc_adjust_msr = 0x0; + vcpu->arch.pv_time_enabled = false; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1cbd89c..7114c63 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1334,6 +1334,7 @@ __init void lguest_init(void) pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; + pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; pv_mmu_ops.pte_update = lguest_pte_update; pv_mmu_ops.pte_update_defer = lguest_pte_update; diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 05928aa..906fea3 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -74,10 +74,10 @@ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) char c; unsigned zero_len; - for (; len; --len) { + for (; len; --len, to++) { if (__get_user_nocheck(c, from++, sizeof(char))) break; - if (__put_user_nocheck(c, to++, sizeof(char))) + if (__put_user_nocheck(c, to, sizeof(char))) break; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2b97525..0e88336 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -378,10 +378,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) + if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); - else + arch_flush_lazy_mmu_mode(); + } else { BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } /* * Below here mismatches are bugs because these lower tables diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 6f31ee5..252b8f5 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -137,5 +137,4 @@ void __init set_highmem_pages_init(void) add_highpages_with_active_regions(nid, zone_start_pfn, zone_end_pfn); } - totalram_pages += totalhigh_pages; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4903a03..fdc5dca 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -410,9 +410,8 @@ void __init init_mem_mapping(void) /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - /* xen has big range in reserved near end of ram, skip it at first */ - addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, - PAGE_SIZE); + /* xen has big range in reserved near end of ram, skip it at first.*/ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ @@ -516,11 +515,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; + free_reserved_page(virt_to_page(addr)); } #endif } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2d19001..3ac7e31 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -427,14 +427,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} - void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn) { @@ -448,7 +440,7 @@ void __init add_highpages_with_active_regions(int nid, start_pfn, end_pfn); for ( ; pfn < e_pfn; pfn++) if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn)); + free_highmem_page(pfn_to_page(pfn)); } } #else diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 474e28f..71ff55a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1011,11 +1011,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) flush_tlb_all(); } -void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) +void __ref vmemmap_free(unsigned long start, unsigned long end) { - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + nr_pages); - remove_pagetable(start, end, false); } @@ -1067,10 +1064,9 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - reservedpages = 0; - - /* this will put all low memory onto the freelists */ register_page_bootmem_info(); + + /* this will put all memory onto the freelists */ totalram_pages = free_all_bootmem(); absent_pages = absent_pages_in_range(0, max_pfn); @@ -1285,18 +1281,17 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr; unsigned long next; pgd_t *pgd; pud_t *pud; pmd_t *pmd; - for (; addr < end; addr = next) { - void *p = NULL; + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -1306,31 +1301,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (!pud) return -ENOMEM; - if (!cpu_has_pse) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - pmd = vmemmap_pmd_populate(pud, addr, node); - - if (!pmd) - return -ENOMEM; - - p = vmemmap_pte_populate(pmd, addr, node); + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p; - if (!p) - return -ENOMEM; - - addr_end = addr + PAGE_SIZE; - p_end = p + PAGE_SIZE; - } else { - next = pmd_addr_end(addr, end); - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (p) { pte_t entry; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); - if (!p) - return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); @@ -1347,15 +1325,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; - } else - vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; } - + pr_warn_once("vmemmap: falling back to regular page backing\n"); + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; } - sync_global_pgds((unsigned long)start_page, end - 1); return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + int err; + + if (cpu_has_pse) + err = vmemmap_populate_hugepages(start, end, node); + else + err = vmemmap_populate_basepages(start, end, node); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} + #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 78fe3f1..9a1e658 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -282,12 +282,7 @@ void iounmap(volatile void __iomem *addr) in parallel. Reuse of the virtual address is prevented by leaving it in the global lists until we're done with it. cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == (void __force *)addr) - break; - } - read_unlock(&vmlist_lock); + p = find_vm_area((void __force *)addr); if (!p) { printk(KERN_ERR "iounmap: bad address %p\n", addr); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 72fe01e..a71c4e2 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -114,14 +114,11 @@ void numa_clear_node(int cpu) */ void __init setup_node_to_cpumask_map(void) { - unsigned int node, num = 0; + unsigned int node; /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); /* allocate the map */ for (node = 0; node < nr_node_ids; node++) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b008656..0e38951 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -68,7 +68,7 @@ static int print_split(struct split_state *s) s->gpg++; i += GPS/PAGE_SIZE; } else if (level == PG_LEVEL_2M) { - if (!(pte_val(*pte) & _PAGE_PSE)) { + if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) { printk(KERN_ERR "%lx level %d but not PSE %Lx\n", addr, level, (u64)pte_val(*pte)); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 091934e..fb4e73e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -467,7 +467,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pte_pgprot(old_pte); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -478,12 +478,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL * for the ancient hardware that doesn't support it. */ - if (pgprot_val(new_prot) & _PAGE_PRESENT) - pgprot_val(new_prot) |= _PAGE_PSE | _PAGE_GLOBAL; + if (pgprot_val(req_prot) & _PAGE_PRESENT) + pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; else - pgprot_val(new_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); - new_prot = canon_pgprot(new_prot); + req_prot = canon_pgprot(req_prot); /* * old_pte points to the large page base address. So we need @@ -1413,6 +1413,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + arch_flush_lazy_mmu_mode(); } #ifdef CONFIG_HIBERNATION diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2610bd9..6574388 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -563,6 +563,13 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) if (base > __pa(high_memory-1)) return 0; + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 193350b..17fda6a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -58,6 +58,13 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + /* + * NOTE! For PAE, any changes to the top page-directory-pointer-table + * entries need a full cr3 reload to flush. + */ +#ifdef CONFIG_X86_PAE + tlb->need_flush_all = 1; +#endif tlb_remove_page(tlb, virt_to_page(pmd)); } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 901177d..305c68b 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/pci.h> +#include <linux/pci-acpi.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> @@ -170,6 +171,16 @@ void pcibios_fixup_bus(struct pci_bus *b) pcibios_fixup_device_resources(dev); } +void pcibios_add_bus(struct pci_bus *bus) +{ + acpi_pci_add_bus(bus); +} + +void pcibios_remove_bus(struct pci_bus *bus) +{ + acpi_pci_remove_bus(bus); +} + /* * Only use DMI information to set this if nothing was passed * on the kernel command line (which was parsed earlier). diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 94e7662..4a9be6d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -177,7 +177,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : "pcifront-msi", @@ -244,7 +244,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) dev_dbg(&dev->dev, "xen: msi already bound to pirq=%d\n", pirq); } - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", DOMID_SELF); @@ -326,7 +326,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } ret = xen_bind_pirq_msi_to_irq(dev, msidesc, - map_irq.pirq, map_irq.index, + map_irq.pirq, (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi", domid); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5f2ecaf..e4a86a6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -41,6 +41,7 @@ #include <linux/io.h> #include <linux/reboot.h> #include <linux/bcd.h> +#include <linux/ucs2_string.h> #include <asm/setup.h> #include <asm/efi.h> @@ -51,6 +52,13 @@ #define EFI_DEBUG 1 +/* + * There's some additional metadata associated with each + * variable. Intel's reference implementation is 60 bytes - bump that + * to account for potential alignment constraints + */ +#define VAR_METADATA_SIZE 64 + struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, .acpi = EFI_INVALID_TABLE_ADDR, @@ -69,6 +77,13 @@ struct efi_memory_map memmap; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static u64 efi_var_store_size; +static u64 efi_var_remaining_size; +static u64 efi_var_max_var_size; +static u64 boot_used_size; +static u64 boot_var_size; +static u64 active_size; + unsigned long x86_efi_facility; /* @@ -98,6 +113,15 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); +static bool efi_no_storage_paranoia; + +static int __init setup_storage_paranoia(char *arg) +{ + efi_no_storage_paranoia = true; + return 0; +} +early_param("efi_no_storage_paranoia", setup_storage_paranoia); + static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { @@ -162,8 +186,53 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - return efi_call_virt3(get_next_variable, - name_size, name, vendor); + efi_status_t status; + static bool finished = false; + static u64 var_size; + + status = efi_call_virt3(get_next_variable, + name_size, name, vendor); + + if (status == EFI_NOT_FOUND) { + finished = true; + if (var_size < boot_used_size) { + boot_var_size = boot_used_size - var_size; + active_size += boot_var_size; + } else { + printk(KERN_WARNING FW_BUG "efi: Inconsistent initial sizes\n"); + } + } + + if (boot_used_size && !finished) { + unsigned long size; + u32 attr; + efi_status_t s; + void *tmp; + + s = virt_efi_get_variable(name, vendor, &attr, &size, NULL); + + if (s != EFI_BUFFER_TOO_SMALL || !size) + return status; + + tmp = kmalloc(size, GFP_ATOMIC); + + if (!tmp) + return status; + + s = virt_efi_get_variable(name, vendor, &attr, &size, tmp); + + if (s == EFI_SUCCESS && (attr & EFI_VARIABLE_NON_VOLATILE)) { + var_size += size; + var_size += ucs2_strsize(name, 1024); + active_size += size; + active_size += VAR_METADATA_SIZE; + active_size += ucs2_strsize(name, 1024); + } + + kfree(tmp); + } + + return status; } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -172,9 +241,34 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); + efi_status_t status; + u32 orig_attr = 0; + unsigned long orig_size = 0; + + status = virt_efi_get_variable(name, vendor, &orig_attr, &orig_size, + NULL); + + if (status != EFI_BUFFER_TOO_SMALL) + orig_size = 0; + + status = efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); + + if (status == EFI_SUCCESS) { + if (orig_size) { + active_size -= orig_size; + active_size -= ucs2_strsize(name, 1024); + active_size -= VAR_METADATA_SIZE; + } + if (data_size) { + active_size += data_size; + active_size += ucs2_strsize(name, 1024); + active_size += VAR_METADATA_SIZE; + } + } + + return status; } static efi_status_t virt_efi_query_variable_info(u32 attr, @@ -682,6 +776,9 @@ void __init efi_init(void) char vendor[100] = "unknown"; int i = 0; void *tmp; + struct setup_data *data; + struct efi_var_bootdata *efi_var_data; + u64 pa_data; #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || @@ -699,6 +796,22 @@ void __init efi_init(void) if (efi_systab_init(efi_phys.systab)) return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*efi_var_data)); + if (data->type == SETUP_EFI_VARS) { + efi_var_data = (struct efi_var_bootdata *)data; + + efi_var_store_size = efi_var_data->store_size; + efi_var_remaining_size = efi_var_data->remaining_size; + efi_var_max_var_size = efi_var_data->max_var_size; + } + pa_data = data->next; + early_iounmap(data, sizeof(*efi_var_data)); + } + + boot_used_size = efi_var_store_size - efi_var_remaining_size; + set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); /* @@ -999,3 +1112,48 @@ u64 efi_mem_attributes(unsigned long phys_addr) } return 0; } + +/* + * Some firmware has serious problems when using more than 50% of the EFI + * variable store, i.e. it triggers bugs that can brick machines. Ensure that + * we never use more than this safe limit. + * + * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable + * store. + */ +efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) +{ + efi_status_t status; + u64 storage_size, remaining_size, max_size; + + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); + if (status != EFI_SUCCESS) + return status; + + if (!max_size && remaining_size > size) + printk_once(KERN_ERR FW_BUG "Broken EFI implementation" + " is returning MaxVariableSize=0\n"); + /* + * Some firmware implementations refuse to boot if there's insufficient + * space in the variable store. We account for that by refusing the + * write if permitting it would reduce the available space to under + * 50%. However, some firmware won't reclaim variable space until + * after the used (not merely the actively used) space drops below + * a threshold. We can approximate that case with the value calculated + * above. If both the firmware and our calculations indicate that the + * available space would drop below 50%, refuse the write. + */ + + if (!storage_size || size > remaining_size || + (max_size && size > max_size)) + return EFI_OUT_OF_RESOURCES; + + if (!efi_no_storage_paranoia && + ((active_size + size + VAR_METADATA_SIZE > storage_size / 2) && + (remaining_size - size < storage_size / 2))) + return EFI_OUT_OF_RESOURCES; + + return EFI_SUCCESS; +} +EXPORT_SYMBOL_GPL(efi_query_variable_store); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 120cee1..3c68768 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <linux/export.h> #include <linux/smp.h> +#include <linux/perf_event.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -228,6 +229,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); + perf_restore_debug_store(); } /* Needed by apm.c */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8e1c7b..ddbd54a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,6 +31,7 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/edd.h> #include <xen/xen.h> #include <xen/events.h> @@ -1306,6 +1307,55 @@ static const struct machine_ops xen_machine_ops __initconst = { .emergency_restart = xen_emergency_restart, }; +static void __init xen_boot_params_init_edd(void) +{ +#if IS_ENABLED(CONFIG_EDD) + struct xen_platform_op op; + struct edd_info *edd_info; + u32 *mbr_signature; + unsigned nr; + int ret; + + edd_info = boot_params.eddbuf; + mbr_signature = boot_params.edd_mbr_sig_buffer; + + op.cmd = XENPF_firmware_info; + + op.u.firmware_info.type = XEN_FW_DISK_INFO; + for (nr = 0; nr < EDDMAXNR; nr++) { + struct edd_info *info = edd_info + nr; + + op.u.firmware_info.index = nr; + info->params.length = sizeof(info->params); + set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, + &info->params); + ret = HYPERVISOR_dom0_op(&op); + if (ret) + break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x + C(device); + C(version); + C(interface_support); + C(legacy_max_cylinder); + C(legacy_max_head); + C(legacy_sectors_per_track); +#undef C + } + boot_params.eddbuf_entries = nr; + + op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; + for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { + op.u.firmware_info.index = nr; + ret = HYPERVISOR_dom0_op(&op); + if (ret) + break; + mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; + } + boot_params.edd_mbr_sig_buf_entries = nr; +#endif +} + /* * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected @@ -1508,6 +1558,8 @@ asmlinkage void __init xen_start_kernel(void) /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; + + xen_boot_params_init_edd(); } #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ @@ -1589,8 +1641,11 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: xen_vcpu_setup(cpu); - if (xen_have_vector_callback) + if (xen_have_vector_callback) { xen_init_lock_cpu(cpu); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_timer(cpu); + } break; default: break; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e8e3493..e006c18 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1467,8 +1467,6 @@ static void __init xen_write_cr3_init(unsigned long cr3) __xen_write_cr3(true, cr3); xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ - - pv_mmu_ops.write_cr3 = &xen_write_cr3; } #endif @@ -1750,14 +1748,18 @@ static void *m2v(phys_addr_t maddr) } /* Set the page permissions on an identity-mapped pages */ -static void set_page_prot(void *addr, pgprot_t prot) +static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } +static void set_page_prot(void *addr, pgprot_t prot) +{ + return set_page_prot_flags(addr, prot, UVMF_NONE); +} #ifdef CONFIG_X86_32 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { @@ -1841,12 +1843,12 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, unsigned long addr) { if (*pt_base == PFN_DOWN(__pa(addr))) { - set_page_prot((void *)addr, PAGE_KERNEL); + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); clear_page((void *)addr); (*pt_base)++; } if (*pt_end == PFN_DOWN(__pa(addr))) { - set_page_prot((void *)addr, PAGE_KERNEL); + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); clear_page((void *)addr); (*pt_end)--; } @@ -2122,6 +2124,7 @@ static void __init xen_post_allocator_init(void) #endif #ifdef CONFIG_X86_64 + pv_mmu_ops.write_cr3 = &xen_write_cr3; SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_mark_init_mm_pinned(); @@ -2197,6 +2200,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .lazy_mode = { .enter = paravirt_enter_lazy_mmu, .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, }, .set_fixmap = xen_set_fixmap, diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 09ea61d..0d466d7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -144,6 +144,13 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(xen_callfuncsingle_irq, cpu) = rc; + /* + * The IRQ worker on PVHVM goes through the native path and uses the + * IPI mechanism. + */ + if (xen_hvm_domain()) + return 0; + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, @@ -167,6 +174,9 @@ static int xen_smp_intr_init(unsigned int cpu) if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + if (xen_hvm_domain()) + return rc; + if (per_cpu(xen_irq_work, cpu) >= 0) unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); @@ -418,7 +428,7 @@ static int xen_cpu_disable(void) static void xen_cpu_die(unsigned int cpu) { - while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } @@ -426,7 +436,8 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); + if (!xen_hvm_domain()) + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); } @@ -657,11 +668,7 @@ static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) static void xen_hvm_cpu_die(unsigned int cpu) { - unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); + xen_cpu_die(cpu); native_cpu_die(cpu); } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index f7a080e..8b54603 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -364,6 +364,16 @@ void __cpuinit xen_init_lock_cpu(int cpu) int irq; const char *name; + WARN(per_cpu(lock_kicker_irq, cpu) > 0, "spinlock on CPU%d exists on IRQ%d!\n", + cpu, per_cpu(lock_kicker_irq, cpu)); + + /* + * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 + * (xen: disable PV spinlocks on HVM) + */ + if (xen_hvm_domain()) + return; + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, @@ -382,11 +392,26 @@ void __cpuinit xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { + /* + * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 + * (xen: disable PV spinlocks on HVM) + */ + if (xen_hvm_domain()) + return; + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); + per_cpu(lock_kicker_irq, cpu) = -1; } void __init xen_init_spinlocks(void) { + /* + * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 + * (xen: disable PV spinlocks on HVM) + */ + if (xen_hvm_domain()) + return; + BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); pv_lock_ops.spin_is_locked = xen_spin_is_locked; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0296a95..3d88bfd 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -377,7 +377,7 @@ static const struct clock_event_device xen_vcpuop_clockevent = { static const struct clock_event_device *xen_clockevent = &xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events) = { .irq = -1 }; static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) { @@ -401,6 +401,9 @@ void xen_setup_timer(int cpu) struct clock_event_device *evt; int irq; + evt = &per_cpu(xen_clock_events, cpu); + WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); name = kasprintf(GFP_KERNEL, "timer%d", cpu); @@ -413,7 +416,6 @@ void xen_setup_timer(int cpu) IRQF_FORCE_RESUME, name, NULL); - evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); evt->cpumask = cpumask_of(cpu); @@ -426,6 +428,7 @@ void xen_teardown_timer(int cpu) BUG_ON(cpu == 0); evt = &per_cpu(xen_clock_events, cpu); unbind_from_irqhandler(evt->irq, NULL); + evt->irq = -1; } void xen_setup_cpu_clockevents(void) @@ -497,7 +500,11 @@ static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); xen_setup_runstate_info(cpu); - xen_setup_timer(cpu); + /* + * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence + * doing it xen_hvm_cpu_notify (which gets called by smp_init during + * early bootup and also during CPU hotplug events). + */ xen_setup_cpu_clockevents(); } |