From 48ffc70b675aa7798a52a2e92e20f6cce9140b3d Mon Sep 17 00:00:00 2001 From: Alok N Kataria Date: Wed, 18 Feb 2009 12:33:55 -0800 Subject: x86, vmi: TSC going backwards check in vmi clocksource Impact: fix time warps under vmware Similar to the check for TSC going backwards in the TSC clocksource, we also need this check for VMI clocksource. Signed-off-by: Alok N Kataria Cc: Zachary Amsden Signed-off-by: Ingo Molnar Cc: stable@kernel.org diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index c4c1f9e..bde106c 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -283,10 +283,13 @@ void __devinit vmi_time_ap_init(void) #endif /** vmi clocksource */ +static struct clocksource clocksource_vmi; static cycle_t read_real_cycles(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); + cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); + return ret >= clocksource_vmi.cycle_last ? + ret : clocksource_vmi.cycle_last; } static struct clocksource clocksource_vmi = { -- cgit v0.10.2 From 07a66d7c53a538e1a9759954a82bb6c07365eff9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 08:04:13 +0100 Subject: x86: use the right protections for split-up pagetables Steven Rostedt found a bug in where in his modified kernel ftrace was unable to modify the kernel text, due to the PMD itself having been marked read-only as well in split_large_page(). The fix, suggested by Linus, is to not try to 'clone' the reference protection of a huge-page, but to use the standard (and permissive) page protection bits of KERNPG_TABLE. The 'cloning' makes sense for the ptes but it's a confused and incorrect concept at the page table level - because the pagetable entry is a set of all ptes and hence cannot 'clone' any single protection attribute - the ptes can be any mixture of protections. With the permissive KERNPG_TABLE, even if the pte protections get changed after this point (due to ftrace doing code-patching or other similar activities like kprobes), the resulting combined protections will still be correct and the pte's restrictive (or permissive) protections will control it. Also update the comment. This bug was there for a long time but has not caused visible problems before as it needs a rather large read-only area to trigger. Steve possibly hacked his kernel with some really large arrays or so. Anyway, the bug is definitely worth fixing. [ Huang Ying also experienced problems in this area when writing the EFI code, but the real bug in split_large_page() was not realized back then. ] Reported-by: Steven Rostedt Reported-by: Huang Ying Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8ca0d85..7be47d1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -508,18 +508,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) #endif /* - * Install the new, split up pagetable. Important details here: + * Install the new, split up pagetable. * - * On Intel the NX bit of all levels must be cleared to make a - * page executable. See section 4.13.2 of Intel 64 and IA-32 - * Architectures Software Developer's Manual). - * - * Mark the entry present. The current mapping might be - * set to not present, which we preserved above. + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: */ - ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); - pgprot_val(ref_prot) |= _PAGE_PRESENT; - __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); + __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); base = NULL; out_unlock: -- cgit v0.10.2 From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:35:06 -0500 Subject: ftrace: allow archs to preform pre and post process for code modification This patch creates the weak functions: ftrace_arch_code_modify_prepare and ftrace_arch_code_modify_post_process that are called before and after the stop machine is called to modify the kernel text. If the arch needs to do pre or post processing, it only needs to define these functions. [ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_* over using ftrace_arch_modify_* ] Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b..fdb2a89 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write, /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +int ftrace_arch_code_modify_prepare(void); +int ftrace_arch_code_modify_post_process(void); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913d..72316d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) return 1; } +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + static int __ftrace_modify_code(void *data) { int *command = data; @@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; -- cgit v0.10.2 From 16239630974516a8879a3695ee9b4dc661f79f96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 17:57:30 -0500 Subject: ftrace, x86: make kernel text writable only for conversions Impact: keep kernel text read only Because dynamic ftrace converts the calls to mcount into and out of nops at run time, we needed to always keep the kernel text writable. But this defeats the point of CONFIG_DEBUG_RODATA. This patch converts the kernel code to writable before ftrace modifies the text, and converts it back to read only afterward. The kernel text is converted to read/write, stop_machine is called to modify the code, then the kernel text is converted back to read only. The original version used SYSTEM_STATE to determine when it was OK or not to change the code to rw or ro. Andrew Morton pointed out that using SYSTEM_STATE is a bad idea since there is no guarantee to what its state will actually be. Instead, I moved the check into the set_kernel_text_* functions themselves, and use a local variable to determine when it is OK to change the kernel text RW permissions. [ Update: Ingo Molnar suggested moving the prototypes to cacheflush.h ] Reviewed-by: Andrew Morton Signed-off-by: Steven Rostedt diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f84665..6145063 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -104,6 +104,11 @@ void clflush_cache_range(void *addr, unsigned int size); #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); +#else +static inline void set_kernel_text_rw(void) { } +static inline void set_kernel_text_ro(void) { } #endif #ifdef CONFIG_DEBUG_RODATA_TEST diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 231bdd3..77857d4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,18 @@ #ifdef CONFIG_DYNAMIC_FTRACE +int ftrace_arch_code_modify_prepare(void) +{ + set_kernel_text_rw(); + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + set_kernel_text_ro(); + return 0; +} + union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef050..3eb2ed1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1155,17 +1155,47 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); + kernel_set_to_readonly = 1; + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", start, start+size); @@ -1174,7 +1204,6 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif -#endif /* CONFIG_DYNAMIC_FTRACE */ start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b4..63fdc53 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -986,21 +986,48 @@ void free_initmem(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -#ifdef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ - start = rodata_start; -#endif - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* * The rodata section (but not the kernel text!) should also be * not-executable. -- cgit v0.10.2 From 90c7ac49aa819feb9433b5310089fca6399881c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:32:57 -0500 Subject: ftrace: immediately stop code modification if failure is detected Impact: fix to prevent NMI lockup If the page fault handler produces a WARN_ON in the modifying of text, and the system is setup to have a high frequency of NMIs, we can lock up the system on a failure to modify code. The modifying of code with NMIs allows all NMIs to modify the code if it is about to run. This prevents a modifier on one CPU from modifying code running in NMI context on another CPU. The modifying is done through stop_machine, so only NMIs must be considered. But if the write causes the page fault handler to produce a warning, the print can slow it down enough that as soon as it is done it will take another NMI before going back to the process context. The new NMI will perform the write again causing another print and this will hang the box. This patch turns off the writing as soon as a failure is detected and does not wait for it to be turned off by the process context. This will keep NMIs from getting stuck in this back and forth of print outs. Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 77857d4..c56d738 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -124,6 +124,10 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); + + /* if we fail, then kill any new writers */ + if (mod_code_status) + mod_code_write = 0; } void ftrace_nmi_enter(void) -- cgit v0.10.2 From 4377245aa93b65b6597e4b7bb460fb9abc48b56b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:41:27 -0500 Subject: ftrace: break out modify loop immediately on detection of error Impact: added precaution on failure detection Break out of the modifying loop as soon as a failure is detected. This is just an added precaution found by code review and was not found by any bug chasing. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72316d9..11ad796 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -561,8 +561,11 @@ static void ftrace_replace_code(int enable) if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } else + } else { ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; + } } } } -- cgit v0.10.2