diff options
Diffstat (limited to 'arch/x86')
49 files changed, 1346 insertions, 182 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7cf916f..6a47bb2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86 select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) select CLKEVT_I8253 + select ARCH_HAVE_NMI_SAFE_CMPXCHG config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d..54edb207 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -672,7 +672,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad compat_sys_nfsservctl + .quad quiet_ni_syscall /* old nfsservctl */ .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 7b439d9..41935fa 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -27,8 +27,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->base2 = (info->base_addr & 0xff000000) >> 24; /* - * Don't allow setting of the lm bit. It is useless anyway - * because 64bit system calls require __USER_CS: + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. 
*/ desc->l = 0; } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d02804d..d8e8eef 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,8 +40,6 @@ #include <linux/compiler.h> #include <asm/page.h> -#include <xen/xen.h> - #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ @@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN +#include <xen/xen.h> struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index f9a3209..7e50f06 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,6 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vector 204 : legacy x86_64 vsyscall emulation * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * @@ -51,9 +50,6 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif -#ifdef CONFIG_X86_64 -# define VSYSCALL_EMU_VECTOR 0xcc -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. 
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2c76521..8e8b9a4 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,6 +41,7 @@ #include <asm/desc_defs.h> #include <asm/kmap_types.h> +#include <asm/pgtable_types.h> struct page; struct thread_struct; @@ -63,6 +64,11 @@ struct paravirt_callee_save { struct pv_info { unsigned int kernel_rpl; int shared_kernel_pmd; + +#ifdef CONFIG_X86_64 + u16 extra_user_64bit_cs; /* __USER_CS if none */ +#endif + int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2193715..0d1171c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 94e7618..3566454 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -131,6 +131,9 @@ struct pt_regs { #ifdef __KERNEL__ #include <linux/init.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt_types.h> +#endif struct cpuinfo_x86; struct task_struct; @@ -187,6 +190,22 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } +#ifdef CONFIG_X86_64 +static inline bool user_64bit_mode(struct pt_regs *regs) +{ +#ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 + * selector. We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS; +#else + /* Headers are too twisted for this to go in paravirt.h. 
*/ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; +#endif +} +#endif + /* * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode * when it traps. The previous stack will be directly underneath the saved diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index a518c0a..c59cc97 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -44,7 +44,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( - "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" : [lo]"=a"(product), [hi]"=d"(tmp) : "0"(delta), diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2bae0a5..0012d09 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,6 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -67,7 +66,6 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); -dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 705bf13..2010405 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall) __SYSCALL(__NR_quotactl, sys_quotactl) #define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) 
+__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* reserved for LiS/STREAMS */ #define __NR_getpmsg 181 @@ -681,6 +681,8 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_sendmmsg, sys_sendmmsg) #define __NR_setns 308 __SYSCALL(__NR_setns, sys_setns) +#define __NR_getcpu 309 +__SYSCALL(__NR_getcpu, sys_getcpu) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 6010707..eaea1d3 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -27,6 +27,12 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* + * Called on instruction fetch fault in vsyscall page. + * Returns true if handled. + */ +extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64a619d..7ff4669 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -39,7 +39,7 @@ typedef struct xpaddr { ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) extern unsigned long *machine_to_phys_mapping; -extern unsigned int machine_to_phys_order; +extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); @@ -87,7 +87,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) { + if (unlikely(mfn >= machine_to_phys_nr)) { pfn = ~0; goto try_override; } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0410557..82f2912 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,19 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -# -# vsyscalls (which work on the user stack) 
should have -# no stack-protector checks: -# -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) -CFLAGS_hpet.o := $(nostackp) -CFLAGS_paravirt.o := $(nostackp) -GCOV_PROFILE_vsyscall_64.o := n -GCOV_PROFILE_hpet.o := n -GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_paravirt.o := n - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5812404..f50e7fb 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. 
+ */ +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) +{ + if (!need_resched()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)&current_thread_info()->flags); + + __monitor((void *)&current_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(ax, cx); + } +} + void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index adc66c3..34b1859 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 08119a3..6b96110 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -149,7 +149,6 @@ struct set_mtrr_data { */ static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; /* @@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info) } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } -#endif return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf..cfa62ec 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1900,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 45fbb8f..f88af2c 100644 --- 
a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1590,6 +1590,7 @@ static __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ + case 45: /* SandyBridge, "Romley-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5c1a9197..f3f6f53 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,7 @@ #include <asm/ftrace.h> #include <asm/irq_vectors.h> #include <asm/cpufeature.h> +#include <asm/alternative-asm.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - .balign 4 - .long 661b - .long 663f - .word X86_FEATURE_XMM - .byte 662b-661b - .byte 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e13329d..6419bb0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1111,7 +1111,6 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error -zeroentry emulate_vsyscall do_emulate_vsyscall /* Reload gs selector with exception handling */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 613a793..d90272e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -307,6 +307,10 @@ struct pv_info pv_info = { .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = __USER_CS, +#endif }; struct pv_init_ops pv_init_ops = { diff 
--git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e1ba8cb..e7e3b01 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -438,29 +438,6 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)&current_thread_info()->flags); - - __monitor((void *)&current_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3d0dc5..7a3b651 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/cpuidle.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -109,7 +110,8 @@ void cpu_idle(void) local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); - pm_idle(); + if (cpuidle_idle_call()) + pm_idle(); start_critical_timings(); } tick_nohz_restart_sched_tick(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca6f7ab..f693e44 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/cpuidle.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -136,7 
+137,8 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); - pm_idle(); + if (cpuidle_idle_call()) + pm_idle(); start_critical_timings(); /* In many cases the interrupt that ended idle diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 7977f0c..c346d11 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) + if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index fbb0a04..bc19be3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9682ec5..6913369 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,12 +872,6 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif -#ifdef CONFIG_X86_64 - BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); - set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); - set_bit(VSYSCALL_EMU_VECTOR, used_vectors); -#endif - /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4aa9c54..0f703f1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -71,7 +71,6 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(6); /* RW_ */ #endif 
@@ -154,44 +153,16 @@ SECTIONS #ifdef CONFIG_X86_64 -#define VSYSCALL_ADDR (-10*1024*1024) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = ALIGN(4096); - __vsyscall_0 = .; - - . = VSYSCALL_ADDR; - .vsyscall : AT(VLOAD(.vsyscall)) { - *(.vsyscall_0) - - . = 1024; - *(.vsyscall_1) - - . = 2048; - *(.vsyscall_2) - - . = 4096; /* Pad the whole page. */ - } :user =0xcc - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); - -#undef VSYSCALL_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - + . = ALIGN(PAGE_SIZE); __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) #define __VVAR_KERNEL_LDS #include <asm/vvar.h> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dda7dff..18ae83d 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,9 +18,6 @@ * use the vDSO. 
*/ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> @@ -50,12 +47,36 @@ #include <asm/vgtod.h> #include <asm/traps.h> +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + void update_vsyscall_tz(void) { unsigned long flags; @@ -100,7 +121,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->cs, + message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } @@ -118,46 +139,39 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - local_irq_enable(); - /* - * Real 64-bit user mode code has cs == __USER_CS. Anything else - * is bogus. + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. */ - if (regs->cs != __USER_CS) { - /* - * If we trapped from kernel mode, we might as well OOPS now - * instead of returning to some random address and OOPSing - * then. 
- */ - BUG_ON(!user_mode(regs)); - /* Compat mode and non-compat 32-bit CS should both segfault. */ - warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc from 32-bit mode"); - goto sigsegv; + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; } - /* - * x86-ism here: regs->ip points to the instruction after the int 0xcc, - * and int 0xcc is two bytes long. - */ - vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc (exploit attempt?)"); + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } @@ -202,13 +216,11 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) regs->ip = caller; regs->sp += 8; - local_irq_disable(); - return; + return true; sigsegv: - regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); - local_irq_disable(); + return true; } /* @@ -256,15 +268,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) void __init map_vsyscall(void) { - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. 
*/ - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != + (unsigned long)VSYSCALL_START); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S index ffa845e..c9596a9 100644 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -7,21 +7,31 @@ */ #include <linux/linkage.h> + #include <asm/irq_vectors.h> +#include <asm/page_types.h> +#include <asm/unistd_64.h> + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret -/* The unused parts of the page are filled with 0xcc by the linker script. 
*/ + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret -.section .vsyscall_0, "a" -ENTRY(vsyscall_0) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_0) + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret -.section .vsyscall_1, "a" -ENTRY(vsyscall_1) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_1) + .balign 4096, 0xcc -.section .vsyscall_2, "a" -ENTRY(vsyscall_2) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_2) + .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h new file mode 100644 index 0000000..a8b2ede --- /dev/null +++ b/arch/x86/kernel/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_FILE vsyscall_trace +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 988724b..ff5790d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,8 @@ config KVM depends on HAVE_KVM # for device assignment: depends on PCI + # for TASKSTATS/TASK_DELAY_ACCT: + depends on NET select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES @@ -31,6 +33,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASKSTATS select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4d09df0..0d17c8c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -17,6 +17,7 @@ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... 
*/ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ +#include <asm/vsyscall.h> /* * Page fault error code bits: @@ -105,7 +106,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, * but for now it's good enough to assume that long * mode only uses well known segments or kernel. */ - return (!user_mode(regs)) || (regs->cs == __USER_CS); + return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ @@ -720,6 +721,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_START))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ae3cb23..039d913 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -360,6 +360,20 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } } + /* After the PCI-E bus has been walked and all devices discovered, + * configure any settings of the fabric that might be necessary. 
+ */ + if (bus) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + + pcie_bus_configure_settings(child, self->pcie_mpss); + } + } + if (!bus) kfree(sd); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index f61ccdd..1ea3877 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_X86_MRST) += mrst.o obj-$(CONFIG_X86_MRST) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o +obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 7000e74..58425ad 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -689,7 +689,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) irq_attr.trigger = 1; irq_attr.polarity = 1; io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); - } + } else + pentry->irq = 0; /* No irq */ + switch (pentry->type) { case SFI_DEV_TYPE_IPC: /* ID as IRQ is a hack that will go away */ diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c new file mode 100644 index 0000000..9281da7 --- /dev/null +++ b/arch/x86/platform/mrst/pmu.c @@ -0,0 +1,817 @@ +/* + * mrst/pmu.c - driver for MRST Power Management Unit + * + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/cpuidle.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/seq_file.h> +#include <linux/sfi.h> +#include <asm/intel_scu_ipc.h> +#include "pmu.h" + +#define IPCMSG_FW_REVISION 0xF4 + +struct mrst_device { + u16 pci_dev_num; /* DEBUG only */ + u16 lss; + u16 latest_request; + unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */ +}; + +/* + * complete list of MRST PCI devices + */ +static struct mrst_device mrst_devs[] = { +/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */ +/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */ +/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */ +/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */ +/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */ +/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */ +/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */ +/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */ +/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */ +/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */ +/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */ +/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */ +/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */ +/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */ +/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */ +/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */ +/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */ +/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */ +/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */ +/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */ +/* 20 */ { 0x0814, 
LSS_AUDIO_LPE }, /* Moorestown LPE DMA */ +/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */ + +/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */ + +/* 23 */ { 0x4102, 0 }, /* Lincroft */ +/* 24 */ { 0x4110, 0 }, /* Lincroft */ +}; + +/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */ +static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; +static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, + 0x0804, 0x0805, 0x080f, 0}; + +/* handle concurrent SMP invocations of pmu_pci_set_power_state() */ +static spinlock_t mrst_pmu_power_state_lock; + +static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */ +static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */ + +static int graphics_is_off; +static int lss_s0i3_enabled; +static bool mrst_pmu_s0i3_enable; + +/* debug counters */ +static u32 pmu_wait_ready_calls; +static u32 pmu_wait_ready_udelays; +static u32 pmu_wait_ready_udelays_max; +static u32 pmu_wait_done_calls; +static u32 pmu_wait_done_udelays; +static u32 pmu_wait_done_udelays_max; +static u32 pmu_set_power_state_entry; +static u32 pmu_set_power_state_send_cmd; + +static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num) +{ + int index = 0; + + if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815)) + index = pci_dev_num - 0x800; + else if (pci_dev_num == 0x084F) + index = 22; + else if (pci_dev_num == 0x4102) + index = 23; + else if (pci_dev_num == 0x4110) + index = 24; + + if (pci_dev_num != mrst_devs[index].pci_dev_num) { + WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num); + return 0; + } + + return &mrst_devs[index]; +} + +/** + * mrst_pmu_validate_cstates + * @dev: cpuidle_device + * + * Certain states are not appropriate for governor to pick in some cases. + * This function will be called as cpuidle_device's prepare callback and + * thus tells governor to ignore such states when selecting the next state + * to enter. 
+ */
+
+#define IDLE_STATE4_IS_C6	4
+#define IDLE_STATE5_IS_S0I3	5
+
+/*
+ * Returns a bitmask with bit N set for every idle state N that must not be
+ * entered right now; 0 means every state is allowed.
+ */
+int mrst_pmu_invalid_cstates(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * PMU is not initialized: disallow S0i3.  This must be tested
+	 * before pmu_read_busy_status(), which reads through pmu_reg.
+	 */
+	if (!pmu_reg)
+		return 1 << IDLE_STATE5_IS_S0I3;
+
+	/*
+	 * Demote to C4 if the PMU is busy.
+	 * Since LSS changes leave the busy bit clear...
+	 * busy means either the PMU is waiting for an ACK-C6 that
+	 * isn't coming due to an MWAIT that returned immediately;
+	 * or we returned from S0i3 successfully, and the PMU
+	 * is not done sending us interrupts.
+	 */
+	if (pmu_read_busy_status())
+		return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
+
+	/*
+	 * Disallow S0i3 if: CPU1 is active,
+	 * or if device LSS is insufficient, or the GPU is active,
+	 * or if it has been explicitly disabled.
+	 */
+	if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
+	    !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
+		return 1 << IDLE_STATE5_IS_S0I3;
+
+	return 0;
+}
+
+/*
+ * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
+ * DEBUG only.
+ */
+static void pmu_update_wake_counters(void)
+{
+	int lss;
+	u32 wake_status;
+
+	wake_status = pmu_read_wks();
+
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
+		if (wake_status & (1 << lss))
+			wake_counters[lss]++;
+	}
+}
+
+/*
+ * mrst_pmu_s0i3_entry - program the PMU for S0i3 and call mrst_s0i3_entry()
+ *
+ * Returns the value returned by mrst_s0i3_entry().
+ */
+int mrst_pmu_s0i3_entry(void)
+{
+	int status;
+
+	/* Clear any possible error conditions */
+	pmu_write_ics(0x300);
+
+	/* set wake control to current D-states */
+	pmu_write_wssc(S0I3_SSS_TARGET);
+
+	status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
+	pmu_update_wake_counters();
+	return status;
+}
+
+/*
+ * poll for maximum of 5ms for busy bit to clear
+ * (500 polls, 10us apart); returns 0 when idle, -EBUSY on timeout
+ */
+static int pmu_wait_ready(void)
+{
+	int udelays;
+
+	pmu_wait_ready_calls++;
+
+	for (udelays = 0; udelays < 500; ++udelays) {
+		if (udelays > pmu_wait_ready_udelays_max)
+			pmu_wait_ready_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(10);
+		pmu_wait_ready_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
+	 */
+	WARN_ONCE(1, "SCU not ready for 5ms");
+	return -EBUSY;
+}
+
+/*
+ * poll for maximum of 50ms for busy bit to clear
+ * (500 polls, 100us apart); returns 0 when idle, -EBUSY on timeout
+ */
+static int pmu_wait_done(void)
+{
+	int udelays;
+
+	pmu_wait_done_calls++;
+
+	for (udelays = 0; udelays < 500; ++udelays) {
+		if (udelays > pmu_wait_done_udelays_max)
+			pmu_wait_done_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(100);
+		pmu_wait_done_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_done_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_done_udelays
+	 */
+	WARN_ONCE(1, "SCU not done for 50ms");
+	return -EBUSY;
+}
+
+/* Returns the current value of the PMU's MSI-disable register. */
+u32 mrst_pmu_msi_is_disabled(void)
+{
+	return pmu_msi_is_disabled();
+}
+
+/* Writes 0 to the PMU's MSI-disable register. */
+void mrst_pmu_enable_msi(void)
+{
+	pmu_msi_enable();
+}
+
+/**
+ * pmu_irq - pmu driver interrupt handler
+ * Context: interrupt context
+ *
+ * Counts the interrupt by cause in pmu_irq_stats[] and acknowledges it.
+ * Returns IRQ_NONE when the PMU reports nothing pending.
+ */
+static irqreturn_t pmu_irq(int irq, void *dummy)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();
+
+	if (!pmu_ics.bits.pending)
+		return IRQ_NONE;
+
+	switch (pmu_ics.bits.cause) {
+	case INT_SPURIOUS:
+	case INT_CMD_DONE:
+	case INT_CMD_ERR:
+	case INT_WAKE_RX:
+	case INT_SS_ERROR:
+	case INT_S0IX_MISS:
+	case INT_NO_ACKC6:
+		pmu_irq_stats[pmu_ics.bits.cause]++;
+		break;
+	default:
+		pmu_irq_stats[INT_INVALID]++;
+	}
+
+	pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Translate PCI power management to MRST LSS D-states
+ */
+static int pci_2_mrst_state(int lss, pci_power_t pci_state)
+{
+	switch (pci_state) {
+	case PCI_D0:
+		/* an LSS with clock gating enabled idles in D0i1, not D0 */
+		if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
+			return D0i1;
+		else
+			return D0;
+	case PCI_D1:
+		return D0i1;
+	case PCI_D2:
+		return D0i2;
+	case PCI_D3hot:
+	case PCI_D3cold:
+		return D0i3;
+	default:
+		WARN(1, "pci_state %d\n", pci_state);
+		return 0;
+	}
+}
+
+/*
+ * pmu_issue_command - write @pm_ssc to PM_SSC and send a SET_CFG command
+ *
+ * Returns 0 once the command has been written, -1 if the PMU is busy.
+ */
+static int pmu_issue_command(u32 pm_ssc)
+{
+	union pmu_pm_set_cfg_cmd_t command;
+
+	if (pmu_read_busy_status()) {
+		pr_debug("pmu is busy, Operation not permitted\n");
+		return -1;
+	}
+
+	/*
+	 * enable interrupts in PMU so that interrupts are
+	 * propagated when ioc bit for a particular set
+	 * command is set
+	 */
+
+	pmu_irq_enable();
+
+	/* Configure the sub systems for pmu2 */
+
+	pmu_write_ssc(pm_ssc);
+
+	/*
+	 * Send the set config command for pmu its configured
+	 * for mode CM_IMMEDIATE & hence with No Trigger
+	 */
+
+	command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
+	command.pmu2_params.d_param.cfg_delay = 0;
+	command.pmu2_params.d_param.rsvd = 0;
+
+	/* construct the command to send SET_CFG to particular PMU */
+	command.pmu2_params.d_param.cmd = SET_CFG_CMD;
+	command.pmu2_params.d_param.ioc = 0;
+	command.pmu2_params.d_param.mode_id = 0;
+	command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
+
+	/* write the value of PM_CMD into particular PMU */
+	pr_debug("pmu command being written %x\n",
+		command.pmu_pm_set_cfg_cmd_value);
+
+	pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
+
+	return 0;
+}
+
+/*
+ * pmu_min_lss_pci_req - lowest PCI state requested by the devices in @ids
+ * @ids: zero-terminated list of PCI device ids
+ * @pci_state: starting value for the minimum
+ *
+ * Returns the smaller of @pci_state and every listed device's
+ * latest_request; ids that are not in mrst_devs[] are skipped.
+ */
+static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
+{
+	u16 existing_request;
+	int i;
+
+	for (i = 0; ids[i]; ++i) {
+		struct mrst_device *mrst_dev;
+
+		mrst_dev = pci_id_2_mrst_dev(ids[i]);
+		if (unlikely(!mrst_dev))
+			continue;
+
+		existing_request = mrst_dev->latest_request;
+		if (existing_request < pci_state)
+			pci_state = existing_request;
+	}
+	return pci_state;
+}
+
+/**
+ * pmu_pci_set_power_state - Callback function is used by all the PCI devices
+ * for a platform specific device power on/shutdown.
+ *
+ * Returns 0 on success or when nothing needs to change, -ENODEV for a
+ * device that is not in mrst_devs[], and a negative error when the PMU is
+ * busy or does not complete the command in time.
+ */
+
+int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
+{
+	u32 old_sss, new_sss;
+	int status = 0;
+	struct mrst_device *mrst_dev;
+
+	pmu_set_power_state_entry++;
+
+	BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
+	BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
+
+	mrst_dev = pci_id_2_mrst_dev(pdev->device);
+	if (unlikely(!mrst_dev))
+		return -ENODEV;
+
+	mrst_dev->pci_state_counts[pci_state]++;	/* count invocations */
+
+	/* PMU driver calls self as part of PCI initialization, ignore */
+	if (pdev->device == PCI_DEV_ID_MRST_PMU)
+		return 0;
+
+	BUG_ON(!pmu_reg); /* SW bug if called before initialized */
+
+	spin_lock(&mrst_pmu_power_state_lock);
+
+	if (pdev->d3_delay) {
+		dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
+			pdev->d3_delay);
+		pdev->d3_delay = 0;
+	}
+	/*
+	 * If Lincroft graphics, simply remember state
+	 */
+	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
+		&& !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
+		if (pci_state == PCI_D0)
+			graphics_is_off = 0;
+		else
+			graphics_is_off = 1;
+		goto ret;
+	}
+
+	if (!mrst_dev->lss)
+		goto ret; /* device with no LSS */
+
+	if (mrst_dev->latest_request == pci_state)
+		goto ret; /* no change */
+
+	mrst_dev->latest_request = pci_state; /* record latest request */
+
+	/*
+	 * LSS9 and LSS10 contain multiple PCI devices.
+	 * Use the lowest numbered (highest power) state in the LSS
+	 */
+	if (mrst_dev->lss == 9)
+		pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
+	else if (mrst_dev->lss == 10)
+		pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
+
+	status = pmu_wait_ready();
+	if (status)
+		goto ret;
+
+	old_sss = pmu_read_sss();
+	new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
+	new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
+			mrst_dev->lss);
+
+	if (new_sss == old_sss)
+		goto ret; /* nothing to do */
+
+	pmu_set_power_state_send_cmd++;
+
+	status = pmu_issue_command(new_sss);
+
+	if (unlikely(status != 0)) {
+		dev_err(&pdev->dev, "Failed to Issue a PM command\n");
+		goto ret;
+	}
+
+	/* report a completion timeout to the caller, as pmu_wait_ready() does */
+	status = pmu_wait_done();
+	if (status)
+		goto ret;
+
+	lss_s0i3_enabled =
+	((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
+ret:
+	spin_unlock(&mrst_pmu_power_state_lock);
+	return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+/* Name of a 2-bit LSS D-state; @state must be in the range 0..3. */
+static inline const char *d0ix_name(int state)
+{
+	return d0ix_names[(int) state];
+}
+
+/*
+ * seq_file show routine behind the "mrst_pmu" debugfs file: dumps the SSS
+ * targets and current value, per-PCI-device state, and the debug counters.
+ */
+static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
+{
+	struct pci_dev *pdev = NULL;
+	u32 cur_pmsss;
+	int lss;
+
+	seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
+
+	cur_pmsss = pmu_read_sss();
+
+	seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
+
+	seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
+	seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
+
+	if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
+		seq_printf(s, "cpu0 is only cpu online\n");
+	else
+		seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
+
+	seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
+
+
+	for_each_pci_dev(pdev) {
+		int pos;
+		u16 pmcsr;
+		struct mrst_device *mrst_dev;
+		int i;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+
+		seq_printf(s, "%s %04x/%04X %-16.16s ",
+			dev_name(&pdev->dev),
+			pdev->vendor, pdev->device,
+			dev_driver_string(&pdev->dev));
+
+		if (unlikely(!mrst_dev)) {
+			seq_printf(s, " UNKNOWN\n");
+			continue;
+		}
+
+		if (mrst_dev->lss)
+			seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
+				d0ix_name(((cur_pmsss >>
+					(mrst_dev->lss * 2)) & 0x3)));
+		else
+			seq_printf(s, " ");
+
+		/* PCI PM config space setting */
+		pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+		if (pos != 0) {
+			pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+		seq_printf(s, "PCI-%-4s",
+			pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
+		} else {
+			seq_printf(s, " ");
+		}
+
+		seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
+		for (i = 0; i <= PCI_D3cold; ++i)
+			seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
+
+		if (mrst_dev->lss) {
+			unsigned int lssmask;
+
+			lssmask = SSMSK(D0i3, mrst_dev->lss);
+
+			/* LSS is part of the S0i3 target but not there yet */
+			if ((lssmask & S0I3_SSS_TARGET) &&
+				((lssmask & cur_pmsss) !=
+					(lssmask & S0I3_SSS_TARGET)))
+				seq_printf(s, "[BLOCKS s0i3]");
+		}
+
+		seq_printf(s, "\n");
+	}
+	seq_printf(s, "Wake Counters:\n");
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss)
+		seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
+
+	seq_printf(s, "Interrupt Counters:\n");
+	seq_printf(s,
+		"INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
+		"INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n"
+		"INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
+		"INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n",
+		pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
+		pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
+		pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
+		pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
+
+	seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n",
+			pmu_wait_ready_calls);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n",
+			pmu_wait_ready_udelays);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n",
+			pmu_wait_ready_udelays_max);
+	seq_printf(s, "mrst_pmu_wait_done_calls %8d\n",
+			pmu_wait_done_calls);
+	seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n",
+			pmu_wait_done_udelays);
+	seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n",
+			pmu_wait_done_udelays_max);
+	seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n",
+			pmu_set_power_state_entry);
+	seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n",
+			pmu_set_power_state_send_cmd);
+	seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
+
+	return 0;
+}
+
+static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, debug_mrst_pmu_show, NULL);
+}
+
+static const struct file_operations devices_state_operations = {
+	.open		= debug_mrst_pmu_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif	/* DEBUG_FS */
+
+/*
+ * Validate SCU PCI shim PCI vendor capability byte
+ * against LSS hard-coded in mrst_devs[] above.
+ * DEBUG only.
+ */
+static void pmu_scu_firmware_debug(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	for_each_pci_dev(pdev) {
+		struct mrst_device *mrst_dev;
+		u8 pci_config_lss;
+		int pos;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+		if (unlikely(!mrst_dev)) {
+			printk(KERN_ERR FW_BUG "pmu: Unknown "
+				"PCI device 0x%04X\n", pdev->device);
+			continue;
+		}
+
+		if (mrst_dev->lss == 0)
+			continue;	 /* no LSS in our table */
+
+		pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+		if (!pos) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"missing PCI Vendor Capability\n",
+				pdev->device);
+			continue;
+		}
+		pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
+		if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"invalid PCI Vendor Capability 0x%x "
+				" expected LSS 0x%X\n",
+				pdev->device, pci_config_lss, mrst_dev->lss);
+			continue;
+		}
+		pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
+
+		if (mrst_dev->lss == pci_config_lss)
+			continue;
+
+		printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
+			pdev->device, pci_config_lss, mrst_dev->lss);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+/* /sys/kernel/debug/mrst_pmu, kept so that pmu_remove() can delete it */
+static struct dentry *mrst_pmu_debugfs;
+#endif
+
+/**
+ * pmu_probe - map the PMU registers, hook its interrupt and enable ACG
+ * @pdev: the PMU PCI device
+ * @pci_id: matching entry of pmu_pci_ids (unused)
+ *
+ * Returns 0 on success or a negative error; on failure everything that
+ * was acquired has been released again and pmu_reg is NULL.
+ */
+static int __devinit pmu_probe(struct pci_dev *pdev,
+				   const struct pci_device_id *pci_id)
+{
+	int ret;
+	struct mrst_pmu_reg *pmu;
+
+	/* Init the device */
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "Unable to Enable PCI device\n");
+		return ret;
+	}
+
+	ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
+		goto out_err1;
+	}
+
+	/* Map the memory of PMU reg base */
+	pmu = pci_iomap(pdev, 0, 0);
+	if (!pmu) {
+		dev_err(&pdev->dev, "Unable to map the PMU address space\n");
+		ret = -ENOMEM;
+		goto out_err2;
+	}
+
+	/* the lock must be usable before pmu_reg is published */
+	spin_lock_init(&mrst_pmu_power_state_lock);
+
+	pmu_reg = pmu;	/* success */
+
+	ret = request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL);
+	if (ret) {
+		dev_err(&pdev->dev, "Registering isr has failed\n");
+		goto out_err3;
+	}
+
+#ifdef CONFIG_DEBUG_FS
+	/*
+	 * /sys/kernel/debug/mrst_pmu
+	 * Created only once nothing below can fail, because the show
+	 * routine reads PMU registers through pmu_reg.
+	 */
+	mrst_pmu_debugfs = debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
+				NULL, NULL, &devices_state_operations);
+#endif
+
+	pmu_scu_firmware_debug();
+
+	pmu_write_wkc(S0I3_WAKE_SOURCES);	/* Enable S0i3 wakeup sources */
+
+	pmu_wait_ready();
+
+	pmu_write_ssc(D0I1_ACG_SSS_TARGET);	/* Enable Auto-Clock_Gating */
+	pmu_write_cmd(0x201);
+
+	/* Enable the hardware interrupt */
+	pmu_irq_enable();
+	return 0;
+
+out_err3:
+	/* request_irq() failed, so there is no IRQ to free on this path */
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+out_err2:
+	pci_release_regions(pdev);
+out_err1:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+/* pmu_remove - undo pmu_probe() */
+static void __devexit pmu_remove(struct pci_dev *pdev)
+{
+	dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
+
+#ifdef CONFIG_DEBUG_FS
+	/* the show routine dereferences pmu_reg, so remove the file first */
+	debugfs_remove(mrst_pmu_debugfs);
+	mrst_pmu_debugfs = NULL;
+#endif
+
+	/* Freeing up the irq */
+	free_irq(pdev->irq, NULL);
+
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+
+	/* disable the current PCI device */
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
+	{ PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
+
+static struct pci_driver driver = {
+	.name = MRST_PMU_DRV_NAME,
+	.id_table = pmu_pci_ids,
+	.probe = pmu_probe,
+	.remove = __devexit_p(pmu_remove),
+};
+
+/**
+ * pmu_pci_register - register the PMU driver as PCI device
+ */
+static int __init pmu_pci_register(void)
+{
+	return pci_register_driver(&driver);
+}
+
+/* Register and probe via fs_initcall() to precede device_initcall() */
+fs_initcall(pmu_pci_register);
+
+static void __exit mid_pci_cleanup(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+static int ia_major;
+static int ia_minor;
+
+/*
+ * SFI table callback: split the second table entry into ia_major (low 16
+ * bits) and ia_minor (high 16 bits) and log them.  Always returns 0.
+ */
+static int pmu_sfi_parse_oem(struct sfi_table_header *table)
+{
+	struct sfi_table_simple *sb;
+
+	sb = (struct sfi_table_simple *)table;
+	ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
+	ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
+	printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
+		ia_major, ia_minor);
+
+	return 0;
+}
+
+/*
+ * scu_fw_check - decide at late_initcall time whether S0i3 may be used
+ *
+ * Does nothing when the PMU driver did not probe.  Returns -1 when the IA
+ * firmware is older than 6005.1525; otherwise returns 0 and sets
+ * mrst_pmu_s0i3_enable when the SCU firmware is C0.49 or newer.
+ *
+ * Versions are compared as (major, minor) pairs: the minor number only
+ * matters when the major numbers are equal.
+ */
+static int __init scu_fw_check(void)
+{
+	int ret;
+	u32 fw_version;
+
+	if (!pmu_reg)
+		return 0;	/* this driver didn't probe-out */
+
+	sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
+
+	if (ia_major < 0x6005 ||
+	    (ia_major == 0x6005 && ia_minor < 0x1525)) {
+		WARN(1, "mrst_pmu: IA FW version too old\n");
+		return -1;
+	}
+
+	ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
+					&fw_version, 1);
+
+	if (ret) {
+		WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
+	} else {
+		int scu_major = (fw_version >> 8) & 0xFF;
+		int scu_minor = (fw_version >> 0) & 0xFF;
+
+		printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
+
+		if (scu_major > 0xC0 ||
+		    (scu_major == 0xC0 && scu_minor >= 0x49)) {
+			printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
+			mrst_pmu_s0i3_enable = true;
+		} else {
+			WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
+					scu_major, scu_minor);
+		}
+	}
+	return 0;
+}
+late_initcall(scu_fw_check);
+module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644
index 0000000..bfbfe64
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.h
@@ -0,0 +1,234 @@
+/*
+ * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _MRST_PMU_H_
+#define _MRST_PMU_H_
+
+#define PCI_DEV_ID_MRST_PMU		0x0810
+#define MRST_PMU_DRV_NAME		"mrst_pmu"
+#define PCI_SUB_CLASS_MASK		0xFF00
+
+/* fields of the byte read from the PCI vendor capability (offset +4) */
+#define PCI_VENDOR_CAP_LOG_ID_MASK	0x7F
+#define PCI_VENDOR_CAP_LOG_SS_MASK	0x80
+
+/* SSS value with the D0i1 bit pattern in LSS 0..10 and LSS 12 */
+#define SUB_SYS_ALL_D0I1	0x01155555
+#define S0I3_WAKE_SOURCES	0x00001FFF
+
+/* value written to PM_CMD (via mrst_s0i3_entry()) to request S0i3 */
+#define PM_S0I3_COMMAND					\
+	((0 << 31) |	/* Reserved */			\
+	(0 << 30) |	/* Core must be idle */		\
+	(0xc2 << 22) |	/* ACK C6 trigger */		\
+	(3 << 19) |	/* Trigger on DMI message */	\
+	(3 << 16) |	/* Enter S0i3 */		\
+	(0 << 13) |	/* Numeric mode ID (sw) */	\
+	(3 << 9) |	/* Trigger mode */		\
+	(0 << 8) |	/* Do not interrupt */		\
+	(1 << 0))	/* Set configuration */
+
+/*
+ * Logical subsystem (LSS) numbers.  Several devices share LSS 9 and
+ * LSS 10, so those names are aliases for the same number.
+ */
+#define	LSS_DMI		0
+#define	LSS_SD_HC0	1
+#define	LSS_SD_HC1	2
+#define	LSS_NAND	3
+#define	LSS_IMAGING	4
+#define	LSS_SECURITY	5
+#define	LSS_DISPLAY	6
+#define	LSS_USB_HC	7
+#define	LSS_USB_OTG	8
+#define	LSS_AUDIO	9
+#define	LSS_AUDIO_LPE	9
+#define	LSS_AUDIO_SSP	9
+#define	LSS_I2C0	10
+#define	LSS_I2C1	10
+#define	LSS_I2C2	10
+#define	LSS_KBD		10
+#define	LSS_SPI0	10
+#define	LSS_SPI1	10
+#define	LSS_SPI2	10
+#define	LSS_GPIO	10
+#define	LSS_SRAM	11	/* used by SCU, do not touch */
+#define	LSS_SD_HC2	12
+/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
+#define MRST_NUM_LSS	13
+
+/* NOTE(review): evaluates its arguments twice; no user is visible here */
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+/* shift a 2-bit D-state value into the SSS field of subsystem 'lss' */
+#define	SSMSK(mask, lss) ((mask) << ((lss) * 2))
+#define	D0	0
+#define	D0i1	1
+#define	D0i2	2
+#define	D0i3	3
+
+/* SSS value that must be reached before S0i3 is allowed */
+#define	S0I3_SSS_TARGET	(		\
+	SSMSK(D0i1, LSS_DMI) |		\
+	SSMSK(D0i3, LSS_SD_HC0) |	\
+	SSMSK(D0i3, LSS_SD_HC1) |	\
+	SSMSK(D0i3, LSS_NAND) |		\
+	SSMSK(D0i3, LSS_SD_HC2) |	\
+	SSMSK(D0i3, LSS_IMAGING) |	\
+	SSMSK(D0i3, LSS_SECURITY) |	\
+	SSMSK(D0i3, LSS_DISPLAY) |	\
+	SSMSK(D0i3, LSS_USB_HC) |	\
+	SSMSK(D0i3, LSS_USB_OTG) |	\
+	SSMSK(D0i3, LSS_AUDIO) |	\
+	SSMSK(D0i1, LSS_I2C0))
+
+/*
+ * D0i1 on Langwell is Autonomous Clock Gating (ACG).
+ * Enable ACG on every LSS except camera and audio
+ */
+#define D0I1_ACG_SSS_TARGET	 \
+	(SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
+
+/* values for the cfg_mode field of a set-config command */
+enum cm_mode {
+	CM_NOP,			/* ignore the config mode value */
+	CM_IMMEDIATE,
+	CM_DELAY,
+	CM_TRIGGER,
+	CM_INVALID
+};
+
+/* values for the sys_state field of a set-config command */
+enum sys_state {
+	SYS_STATE_S0I0,
+	SYS_STATE_S0I1,
+	SYS_STATE_S0I2,
+	SYS_STATE_S0I3,
+	SYS_STATE_S3,
+	SYS_STATE_S5
+};
+
+#define	SET_CFG_CMD	1
+
+/* interrupt causes, as found in the 'cause' field of PM_ICS */
+enum int_status {
+	INT_SPURIOUS = 0,
+	INT_CMD_DONE = 1,
+	INT_CMD_ERR  = 2,
+	INT_WAKE_RX  = 3,
+	INT_SS_ERROR = 4,
+	INT_S0IX_MISS = 5,
+	INT_NO_ACKC6 = 6,
+	INT_INVALID = 7,
+};
+
+/* PMU register interface */
+/* pmu_reg is set by pmu_probe() and is NULL while the PMU is not mapped */
+static struct mrst_pmu_reg {
+	u32 pm_sts;		/* 0x00 */
+	u32 pm_cmd;		/* 0x04 */
+	u32 pm_ics;		/* 0x08 */
+	u32 _resv1;		/* 0x0C */
+	u32 pm_wkc[2];		/* 0x10 */
+	u32 pm_wks[2];		/* 0x18 */
+	u32 pm_ssc[4];		/* 0x20 */
+	u32 pm_sss[4];		/* 0x30 */
+	u32 pm_wssc[4];		/* 0x40 */
+	u32 pm_c3c4;		/* 0x50 */
+	u32 pm_c5c6;		/* 0x54 */
+	u32 pm_msi_disable;	/* 0x58 */
+} *pmu_reg;
+
+/*
+ * Register accessors.  All of them dereference pmu_reg without checking
+ * it; the array registers are accessed through element [0] only.
+ */
+static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
+static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
+static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
+static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
+
+static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
+static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
+static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
+static inline void pmu_write_wssc(u32 arg)
+					{ writel(arg, &pmu_reg->pm_wssc[0]); }
+
+static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
+static inline u32 pmu_msi_is_disabled(void)
+				{ return readl(&pmu_reg->pm_msi_disable); }
+
+/* bit-field view of the PM_ICS register */
+union pmu_pm_ics {
+	struct {
+		u32 cause:8;
+		u32 enable:1;
+		u32 pending:1;
+		u32 reserved:22;
+	} bits;
+	u32 value;
+};
+
+/* read-modify-write of PM_ICS that sets the 'enable' bit */
+static inline void pmu_irq_enable(void)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();
+	pmu_ics.bits.enable = 1;
+	pmu_write_ics(pmu_ics.value);
+}
+
+/* bit-field view of the PM_STS register */
+union pmu_pm_status {
+	struct {
+		u32 pmu_rev:8;
+		u32 pmu_busy:1;
+		u32 mode_id:4;
+		u32 Reserved:19;
+	} pmu_status_parts;
+	u32 pmu_status_value;
+};
+
+/* returns the pmu_busy bit of PM_STS (0 or 1) */
+static inline int pmu_read_busy_status(void)
+{
+	union pmu_pm_status result;
+
+	result.pmu_status_value = pmu_read_sts();
+
+	return result.pmu_status_parts.pmu_busy;
+}
+
+/* pmu set config parameters */
+struct cfg_delay_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_delay:8;
+	u32 rsvd:5;
+};
+
+struct cfg_trig_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_trig_type:3;
+	u32 cfg_trig_val:8;
+	u32 cmbi:1;
+	u32 rsvd1:1;
+};
+
+/* a set-config command, either as bit-fields or as the raw PM_CMD value */
+union pmu_pm_set_cfg_cmd_t {
+	union {
+		struct cfg_delay_param_t d_param;
+		struct cfg_trig_param_t t_param;
+	} pmu2_params;
+	u32 pmu_pm_set_cfg_cmd_value;
+};
+
+/* without FUTURE_PATCH the S0i3 entry routine is a stub that returns -1 */
+#ifdef FUTURE_PATCH
+extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
+#else
+static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
+#endif
+#endif
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 8b9940e..7cce722 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -161,13 +161,13 @@ restart:
 	if (inbuf && inlen) {
 		/* write data to EC */
 		for (i = 0; i < inlen; i++) {
+			pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
+			outb(inbuf[i], 0x68);
 			if (wait_on_ibf(0x6c, 0)) {
 				printk(KERN_ERR "olpc-ec: timeout waiting for"
 						" EC accept data!\n");
 				goto err;
 			}
-			pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
-			outb(inbuf[i], 0x68);
 		}
 	}
 	if (outbuf && outlen) {
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1b979c1..01f5e3b 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -9,6 +9,7 @@
__PAGE_ALIGNED_DATA vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: + .align PAGE_SIZE /* extra data here leaks to userspace. */ .previous diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S index e2800af..e354bce 100644 --- a/arch/x86/vdso/vdso32/sysenter.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -43,7 +43,7 @@ __kernel_vsyscall: .space 7,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel + int $0x80 /* 16: System call normal return point is here! */ VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 45e94ac..add2c2d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -15,7 +15,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ grant-table.o suspend.o platform-pci-unplug.o \ p2m.o -obj-$(CONFIG_FUNCTION_TRACER) += trace.o +obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 974a528..2d69617 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type); unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); -unsigned int machine_to_phys_order; -EXPORT_SYMBOL(machine_to_phys_order); +unsigned long machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -951,6 +951,10 @@ static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", }; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f987bde..20a6142 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1713,15 +1713,19 @@ static void __init xen_map_identity_early(pmd_t 
*pmd, unsigned long max_pfn) void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; - unsigned long machine_to_phys_nr_ents; if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr_ents = mapping.max_mfn + 1; + machine_to_phys_nr = mapping.max_mfn + 1; } else { - machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; } - machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); +#ifdef CONFIG_X86_32 + if ((machine_to_phys_mapping + machine_to_phys_nr) + < machine_to_phys_mapping) + machine_to_phys_nr = (unsigned long *)NULL + - machine_to_phys_mapping; +#endif } #ifdef CONFIG_X86_64 @@ -1916,6 +1920,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + case VVAR_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1956,7 +1961,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. 
*/ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || + idx == VVAR_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 60aeeb5..c3b8d44 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/pm.h> #include <linux/memblock.h> +#include <linux/cpuidle.h> #include <asm/elf.h> #include <asm/vdso.h> @@ -92,8 +93,6 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, if (end <= start) return 0; - printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", - start, end); for(pfn = start; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -106,14 +105,14 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", - start, end, ret); + WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } - printk(KERN_CONT "%ld pages freed\n", len); + printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", + start, end, len); return len; } @@ -139,7 +138,7 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, if (last_end < max_addr) released += xen_release_chunk(last_end, max_addr); - printk(KERN_INFO "released %ld pages of unused memory\n", released); + printk(KERN_INFO "released %lu pages of unused memory\n", released); return released; } @@ -185,6 +184,19 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, PFN_UP(start_pci), PFN_DOWN(last)); return identity; } + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + ret 
= HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + return min(max_pages, MAX_DOMAIN_PAGES); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -293,6 +305,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + extra_limit = xen_get_max_pages(); + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* @@ -426,7 +444,7 @@ void __init xen_arch_setup(void) #ifdef CONFIG_X86_32 boot_cpu_data.hlt_works_ok = 1; #endif - pm_idle = default_idle; + disable_cpuidle(); boot_option_idle_override = IDLE_HALT; fiddle_vdso(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b4533a8..d4fc6d4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,6 +32,7 @@ #include <xen/page.h> #include <xen/events.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "mmu.h" @@ -207,6 +208,15 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) unsigned cpu; unsigned int i; + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? 
+ "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } xen_init_lock_cpu(0); smp_store_cpu_info(0); @@ -521,8 +531,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); - if (!xen_have_vector_callback) - return; xen_init_lock_cpu(0); xen_init_spinlocks(); } @@ -546,6 +554,8 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { + if (!xen_have_vector_callback) + return; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.cpu_up = xen_hvm_cpu_up; diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 734beba..520022d 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -1,4 +1,5 @@ #include <linux/ftrace.h> +#include <xen/interface/xen.h> #define N(x) [__HYPERVISOR_##x] = "("#x")" static const char *xen_hypercall_names[] = { diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 22a2093..b040b0e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -113,11 +113,13 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack |