diff options
Diffstat (limited to 'arch/x86/kernel')
37 files changed, 579 insertions, 155 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c..cdb1b70 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o entry_$(BITS).o @@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a18fff3..3d525c6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { int rc, irq, trigger, polarity; + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + rc = acpi_get_override_irq(gsi, &trigger, &polarity); if (rc == 0) { trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; @@ -845,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) static int __init acpi_parse_sbf(struct acpi_table_header *table) { - struct acpi_table_boot *sb; - - sb = (struct acpi_table_boot *)table; - if (!sb) { - printk(KERN_WARNING PREFIX "Unable to map SBF\n"); - return -ENODEV; - } + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; sbf_port = sb->cmos_index; /* Save CMOS port */ @@ -865,13 +864,7 @@ static struct resource *hpet_res __initdata; static int __init acpi_parse_hpet(struct acpi_table_header *table) { - struct acpi_table_hpet *hpet_tbl; - - hpet_tbl = (struct acpi_table_hpet *)table; - if (!hpet_tbl) { - printk(KERN_WARNING PREFIX "Unable to map HPET\n"); - return -ENODEV; - } + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { printk(KERN_WARNING PREFIX "HPET timers must be located in " diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3136820..d1daead 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = __read_cr4(); header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b665d24..ad3639a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1580,8 +1580,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && - !hypervisor_x2apic_available())) { + !hypervisor_x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b15bffc..b5c8ff5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/debugreg.h> #include <asm/sections.h> #include <asm/vsyscall.h> @@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep); static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) - set_in_cr4(X86_CR4_SMEP); + cr4_set_bits(X86_CR4_SMEP); } static __init int setup_disable_smap(char *arg) @@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_SMAP)) { #ifdef CONFIG_X86_SMAP - set_in_cr4(X86_CR4_SMAP); + cr4_set_bits(X86_CR4_SMAP); #else - clear_in_cr4(X86_CR4_SMAP); + cr4_clear_bits(X86_CR4_SMAP); #endif } } @@ -1295,6 +1296,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. */ @@ -1313,7 +1320,7 @@ void cpu_init(void) pr_debug("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, @@ -1394,7 +1401,7 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); switch_to_new_gdt(cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c703507..6596433 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, int type, char *buf) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; - int n = 0; - - if (len > 1) { - const struct cpumask *mask; - - mask = to_cpumask(this_leaf->shared_cpu_map); - n = type ? - cpulist_scnprintf(buf, len-2, mask) : - cpumask_scnprintf(buf, len-2, mask); - buf[n++] = '\n'; - buf[n] = '\0'; - } - return n; + const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); + int ret; + + if (type) + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", + cpumask_pr_args(mask)); + else + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", + cpumask_pr_args(mask)); + buf[ret++] = '\n'; + buf[ret] = '\0'; + return ret; } static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cdfed79..3c036cb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,6 +44,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -151,14 +152,11 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; - int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); - if (ret == NOTIFY_STOP) - return; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); mce->finished = 0; wmb(); @@ -1452,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | m_fl, &all_banks); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index ec2663a..737b0ad 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) "Intel old style machine check architecture supported.\n"); /* Enable MCE: */ - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index bd5d46a..44f1382 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9e451b0..f8c81ba 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -138,8 +138,8 @@ static void prepare_set(void) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* @@ -171,7 +171,7 @@ static void post_set(void) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); } static void cyrix_set_arr(unsigned int reg, unsigned long base, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0e25a1b..7d74f7b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ @@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 143e5f5..b71a7f8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,8 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> @@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - if (x86_pmu.attr_rdpmc) - set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + return err; } +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; - if (!x86_pmu.attr_rdpmc) + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { @@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev, return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } -static void change_rdpmc(void *info) -{ - bool enable = !!(unsigned long)info; - - if (enable) - set_in_cr4(X86_CR4_PCE); - else - clear_in_cr4(X86_CR4_PCE); -} - static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) @@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (val > 2) + return -EINVAL; + if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; - if (!!val != !!x86_pmu.attr_rdpmc) { - x86_pmu.attr_rdpmc = !!val; - on_each_cpu(change_rdpmc, (void *)val, 1); + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. + */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); } + x86_pmu.attr_rdpmc = val; + return count; } @@ -1900,6 +1935,9 @@ static struct pmu pmu = { .event_init = x86_pmu_event_init, + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, @@ -1914,13 +1952,15 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; if (!sched_clock_stable()) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e6cdb0..df525d2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,8 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ + struct amd_nb { int nb_id; /* NorthBridge id */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b74ebc7..cf3df1d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err) printk("SMP "); #endif #ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); + printk("DEBUG_PAGEALLOC "); +#endif +#ifdef CONFIG_KASAN + printk("KASAN"); #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 01d1c18..a62536a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include <linux/usb/ehci_def.h> #include <linux/efi.h> #include <asm/efi.h> +#include <asm/pci_x86.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -76,7 +77,7 @@ static struct console early_vga_console = { /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ -static int early_serial_base = 0x3f8; /* ttyS0 */ +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define XMTRDY 0x20 @@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - outb(ch, early_serial_base + TXR); + serial_out(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n) } } +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + #define DEFAULT_BAUD 9600 static __init void early_serial_init(char *s) { - unsigned char c; unsigned divisor; - unsigned baud = DEFAULT_BAUD; + unsigned long baud = DEFAULT_BAUD; char *e; if (*s == ',') @@ -145,24 +188,124 @@ static __init void early_serial_init(char *s) s++; } - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + if (*s) { + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be ",B:D.F,baud" where B, D & F describe the + * location of a PCI device that must be a UART device. + */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + uint32_t classcode, bar0; + uint16_t cmdreg; + char *e; + + + /* + * First, part the param to get the BDF values + */ + if (*s == ',') + ++s; + + if (*s == 0) + return; + + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Second, find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ + return; + + /* + * Determine if it is IO or memory mapped + */ + if (bar0 & 0x01) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0&0xfffffffc; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + /* + * Lastly, initalize the hardware + */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. + */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) baud = DEFAULT_BAUD; } + /* Convert from baud to divisor value */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + + /* Set up the HW */ + early_serial_hw_init(divisor); } +#endif static struct console early_serial_console = { .name = "earlyser", @@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf) early_serial_init(buf + 4); early_console_register(&early_serial_console, keep); } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif if (!strncmp(buf, "vga", 3) && boot_params.screen_info.orig_video_isVGA == 1) { max_xpos = boot_params.screen_info.orig_video_cols; @@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf) early_console_register(&xenboot_console, keep); #endif #ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "mrst", 4)) { - mrst_early_console_init(); - early_console_register(&early_mrst_console, keep); - } - if (!strncmp(buf, "hsu", 3)) { hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d419..31e2d5b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp ret_from_intr CFI_ENDPROC ENDPROC(xen_hypervisor_callback) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db13655..10074ad 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1208,6 +1208,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) popq %rsp CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp error_exit CFI_ENDPROC END(xen_do_hypervisor_callback) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6c1b983..2911ef3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + cr4_init_shadow(); sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a86..c4f8d46 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,6 +27,7 @@ #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> +#include <asm/kasan.h> /* * Manage page tables very early on. @@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void) next_early_pgt = 0; - write_cr3(__pa(early_level4_pgt)); + write_cr3(__pa_nodebug(early_level4_pgt)); } /* Create a new PMD entry */ @@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) return -1; again: @@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + cr4_init_shadow(); + /* Kill off the identity-map trampoline */ reset_early_page_tables(); + kasan_map_early_shadow(early_level4_pgt); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; + kasan_map_early_shadow(init_level4_pgt); + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a..6fd514d9 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -514,8 +514,38 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#ifdef CONFIG_KASAN +#define FILL(VAL, COUNT) \ + .rept (COUNT) ; \ + .quad (VAL) ; \ + .endr + +NEXT_PAGE(kasan_zero_pte) + FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pmd) + FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pud) + FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) + +#undef FILL +#endif + + #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE + +#ifdef CONFIG_KASAN +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +NEXT_PAGE(kasan_zero_page) + .skip PAGE_SIZE +#endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9..3acbff4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81049ff..d5651fc 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -13,6 +13,7 @@ #include <asm/sigcontext.h> #include <asm/processor.h> #include <asm/math_emu.h> +#include <asm/tlbflush.h> #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/i387.h> @@ -193,7 +194,7 @@ void fpu_init(void) if (cpu_has_xmm) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) - set_in_cr4(cr4_mask); + cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 705ef8d..67b1cbe 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void) irq = __this_cpu_read(vector_irq[vector]); if (irq >= 0) { desc = irq_to_desc(irq); + if (!desc) + continue; + data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpu_clear(this_cpu, affinity_new); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 98f654d..6a1146e 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ - W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bb..0dd8d08 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 94f6434..e354cc6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -609,7 +609,7 @@ static inline void check_zero(void) u8 ret; u8 old; - old = ACCESS_ONCE(zero_stats); + old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) int cpu; u64 start; unsigned long flags; + __ticket_t head; if (in_nmi()) return; @@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking. */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) add_stats(RELEASED_SLOW, 1); for_each_cpu(cpu, &waiting_cpus) { const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == ticket) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == ticket) { add_stats(RELEASED_SLOW_KICKED, 1); kvm_kick_cpu(cpu); break; diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 0000000..ff3c3101d --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/page_types.h> +#include <asm/elf.h> +#include <asm/livepatch.h> + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ef00116..9bbb9b3 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/kasan.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/gfp.h> @@ -75,13 +76,22 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + void *p; + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, + + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC, NUMA_NO_NODE, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127dda..046e2d6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include <asm/fpu-internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> +#include <asm/tlbflush.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -141,7 +142,7 @@ void flush_thread(void) static void hard_disable_TSC(void) { - write_cr4(read_cr4() | X86_CR4_TSD); + cr4_set_bits(X86_CR4_TSD); } void disable_TSC(void) @@ -158,7 +159,7 @@ void disable_TSC(void) static void hard_enable_TSC(void) { - write_cr4(read_cr4() & ~X86_CR4_TSD); + cr4_clear_bits(X86_CR4_TSD); } static void enable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe..603c4f9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4_safe(); + cr4 = __read_cr4_safe(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a2c029..67fcc43 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe..cd96852 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d8071d..98dc931 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -89,6 +89,7 @@ #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/bugs.h> +#include <asm/kasan.h> #include <asm/vsyscall.h> #include <asm/cpu.h> @@ -1188,9 +1189,11 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_init(); + kasan_init(); + if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); + mmu_cr4_features = __read_cr4(); if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f..e504246 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8b96a94..81f8adb0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -66,27 +66,54 @@ * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs + * (why we support bound (62) then? it's similar, and similarly unused...) + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * 07,17,1f - pop es/ss/ds + * Normally not used in userspace, but would execute if used. + * Can cause GP or stack exception if tries to load wrong segment descriptor. + * We hesitate to run them under single step since kernel's handling + * of userspace single-stepping (TF flag) is fragile. + * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) + * on the same grounds that they are never used. + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -94,27 +121,61 @@ static volatile u32 good_insns_32[256 / 32] = { #define good_insns_32 NULL #endif -/* Good-instruction tables for 64-bit apps */ +/* Good-instruction tables for 64-bit apps. + * + * Genuinely invalid opcodes: + * 06,07 - formerly push/pop es + * 0e - formerly push cs + * 16,17 - formerly push/pop ss + * 1e,1f - formerly push/pop ds + * 27,2f,37,3f - formerly daa/das/aaa/aas + * 60,61 - formerly pusha/popa + * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) + * 82 - formerly redundant encoding of Group1 + * 9a - formerly call seg:ofs + * ce - formerly into + * d4,d5 - formerly aam/aad + * d6 - formerly undocumented salc + * ea - formerly jmp seg:ofs + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -122,49 +183,55 @@ static volatile u32 good_insns_64[256 / 32] = { #define good_insns_64 NULL #endif -/* Using this for both 64-bit and 32-bit apps */ +/* Using this for both 64-bit and 32-bit apps. + * Opcodes we don't support: + * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns + * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. + * Also encodes tons of other system insns if mod=11. + * Some are in fact non-system: xend, xtest, rdtscp, maybe more + * 0f 05 - syscall + * 0f 06 - clts (CPL0 insn) + * 0f 07 - sysret + * 0f 08 - invd (CPL0 insn) + * 0f 09 - wbinvd (CPL0 insn) + * 0f 0b - ud2 + * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) + * 0f 34 - sysenter + * 0f 35 - sysexit + * 0f 37 - getsec + * 0f 78 - vmread (Intel VMX. CPL0 insn) + * 0f 79 - vmwrite (Intel VMX. CPL0 insn) + * Note: with prefixes, these two opcodes are + * extrq/insertq/AVX512 convert vector ops. + * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], + * {rd,wr}{fs,gs}base,{s,l,m}fence. + * Why? They are all user-executable. + */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #undef W /* - * opcodes we'll probably never support: - * - * 6c-6d, e4-e5, ec-ed - in - * 6e-6f, e6-e7, ee-ef - out - * cc, cd - int3, int - * cf - iret - * d6 - illegal instruction - * f1 - int1/icebp - * f4 - hlt - * fa, fb - cli, sti - * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 - * - * invalid opcodes in 64-bit mode: - * - * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 - * 63 - we support this opcode in x86_64 but not in i386. - * * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 0406819..37d8fa4 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial); #undef memset #undef memmove +extern void *__memset(void *, int, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *__memmove(void *, const void *, __kernel_size_t); extern void *memset(void *, int, __kernel_size_t); extern void *memcpy(void *, const void *, __kernel_size_t); -extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *memmove(void *, const void *, __kernel_size_t); + +EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(__memmove); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); #ifndef CONFIG_DEBUG_VIRTUAL diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 0de1fae..34f66e5 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> +#include <asm/tlbflush.h> #include <asm/xcr.h> /* @@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - set_in_cr4(X86_CR4_OSXSAVE); + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } |