From 3bd323a1da42525317e2ce6c93b97b5ba653bc9d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Feb 2009 14:52:00 -0800 Subject: x86 setup: a20: early timeout for a nonexistent keyboard controller When probing the keyboard controller to enable A20, if we get FF back (which is *possible* as a valid status word, but is extremely unlikely) then bail after much fewer iterations than we otherwise would, and abort the attempt to access the KBC. This hopefully should make it work a lot better for embedded platforms which don't have a KBC and where the BIOS doesn't implement INT 15h AX=2401h (and doesn't boot with A20 already enabled.) If this works, it will be the one remaining use of CONFIG_X86_ELAN as anything other than a processor type optimization option. Signed-off-by: H. Peter Anvin diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 4063d63..fba8e9c 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -15,16 +16,23 @@ #include "boot.h" #define MAX_8042_LOOPS 100000 +#define MAX_8042_FF 32 static int empty_8042(void) { u8 status; int loops = MAX_8042_LOOPS; + int ffs = MAX_8042_FF; while (loops--) { io_delay(); status = inb(0x64); + if (status == 0xff) { + /* FF is a plausible, but very unlikely status */ + if (!--ffs) + return -1; /* Assume no KBC present */ + } if (status & 1) { /* Read and discard input data */ io_delay(); @@ -118,44 +126,43 @@ static void enable_a20_fast(void) int enable_a20(void) { -#if defined(CONFIG_X86_ELAN) - /* Elan croaks if we try to touch the KBC */ - enable_a20_fast(); - while (!a20_test_long()) - ; - return 0; -#elif defined(CONFIG_X86_VOYAGER) +#ifdef CONFIG_X86_VOYAGER /* On Voyager, a20_test() is unsafe? */ enable_a20_kbc(); return 0; #else int loops = A20_ENABLE_LOOPS; - while (loops--) { - /* First, check to see if A20 is already enabled - (legacy free, etc.) */ - if (a20_test_short()) - return 0; - - /* Next, try the BIOS (INT 0x15, AX=0x2401) */ - enable_a20_bios(); - if (a20_test_short()) - return 0; - - /* Try enabling A20 through the keyboard controller */ - empty_8042(); - if (a20_test_short()) - return 0; /* BIOS worked, but with delayed reaction */ - - enable_a20_kbc(); - if (a20_test_long()) - return 0; - - /* Finally, try enabling the "fast A20 gate" */ - enable_a20_fast(); - if (a20_test_long()) - return 0; - } - - return -1; + int kbc_err; + + while (loops--) { + /* First, check to see if A20 is already enabled + (legacy free, etc.) */ + if (a20_test_short()) + return 0; + + /* Next, try the BIOS (INT 0x15, AX=0x2401) */ + enable_a20_bios(); + if (a20_test_short()) + return 0; + + /* Try enabling A20 through the keyboard controller */ + kbc_err = empty_8042(); + + if (a20_test_short()) + return 0; /* BIOS worked, but with delayed reaction */ + + if (!kbc_err) { + enable_a20_kbc(); + if (a20_test_long()) + return 0; + } + + /* Finally, try enabling the "fast A20 gate" */ + enable_a20_fast(); + if (a20_test_long()) + return 0; + } + + return -1; #endif } -- cgit v0.10.2 From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a5df5f..5a4c23d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - if (unlikely(child->bts)) { - ds_release_bts(child->bts); - child->bts = NULL; - - ptrace_bts_free_buffer(child); - } + /* + * Ptrace_detach() races with ptrace_untrace() in case + * the child dies and is reaped by another thread. + * + * We only do the memory accounting at this point and + * leave the buffer deallocation and the bts tracer + * release to ptrace_bts_untrace() which will be called + * later on with tasklist_lock held. + */ + release_locked_buffer(child->bts_buffer, child->bts_size); } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98..3d7fb44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); +extern void release_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 028ec48..2b57f7e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } -- cgit v0.10.2 From 4f06b0436b2ddbd3b67b10e77098a6862787b3eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 09:32:19 -0800 Subject: x86/cpa: make sure cpa is safe to call in lazy mmu mode Impact: fix race leading to crash under KVM and Xen The CPA code may be called while we're in lazy mmu update mode - for example, when using DEBUG_PAGE_ALLOC and doing a slab allocation in an interrupt handler which interrupted a lazy mmu update. In this case, the in-memory pagetable state may be out of date due to pending queued updates. We need to flush any pending updates before inspecting the page table. Similarly, we must explicitly flush any modifications CPA may have made (which comes down to flushing queued operations when flushing the TLB). Signed-off-by: Jeremy Fitzhardinge Acked-by: Marcelo Tosatti Cc: Stable Kernel Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 84ba748..f664bc1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -576,6 +576,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) else address = *cpa->vaddr; + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + */ + arch_flush_lazy_mmu_mode(); + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -854,6 +861,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); + /* + * If we've been called with lazy mmu updates enabled, then + * make sure that everything gets flushed out before we + * return. + */ + arch_flush_lazy_mmu_mode(); + out: return ret; } -- cgit v0.10.2 From be03d9e8022030c16abf534e33e185bfc3d40eef Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 11 Feb 2009 11:20:23 -0800 Subject: x86, pat: fix warn_on_once() while mapping 0-1MB range with /dev/mem Jeff Mahoney reported: > With Suse's hwinfo tool, on -tip: > WARNING: at arch/x86/mm/pat.c:637 reserve_pfn_range+0x5b/0x26d() reserve_pfn_range() is not tracking the memory range below 1MB as non-RAM and as such is inconsistent with similar checks in reserve_memtype() and free_memtype() Rename the pagerange_is_ram() to pat_pagerange_is_ram() and add the "track legacy 1MB region as non RAM" condition. And also, fix reserve_pfn_range() to return -EINVAL, when the pfn range is RAM. This is to be consistent with this API design. Reported-and-tested-by: Jeff Mahoney Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index e9873a2..7765791 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -57,7 +57,6 @@ typedef struct { pgdval_t pgd; } pgd_t; typedef struct { pgprotval_t pgprot; } pgprot_t; extern int page_is_ram(unsigned long pagenr); -extern int pagerange_is_ram(unsigned long start, unsigned long end); extern int devmem_is_allowed(unsigned long pagenr); extern void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af750ab..f45d5e2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -134,25 +134,6 @@ int page_is_ram(unsigned long pagenr) return 0; } -int pagerange_is_ram(unsigned long start, unsigned long end) -{ - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; - - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - if (page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; - } - - return ram_page; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7b61036..aebbf67 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -211,6 +211,33 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) static struct memtype *cached_entry; static u64 cached_start; +static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -336,20 +363,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) *new_type = actual_type; - /* - * For legacy reasons, some parts of the physical address range in the - * legacy 1MB region is treated as non-RAM (even when listed as RAM in - * the e820 tables). So we will track the memory attributes of this - * legacy 1MB region using the linear memtype_list always. - */ - if (end >= ISA_END_ADDRESS) { - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) - return -EINVAL; - } + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, + new_type); + else if (is_range_ram < 0) + return -EINVAL; new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -446,19 +465,11 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; - /* - * For legacy reasons, some parts of the physical address range in the - * legacy 1MB region is treated as non-RAM (even when listed as RAM in - * the e820 tables). So we will track the memory attributes of this - * legacy 1MB region using the linear memtype_list always. - */ - if (end >= ISA_END_ADDRESS) { - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) - return -EINVAL; - } + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { @@ -626,17 +637,13 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); - is_ram = pagerange_is_ram(paddr, paddr + size); + is_ram = pat_pagerange_is_ram(paddr, paddr + size); - if (is_ram != 0) { - /* - * For mapping RAM pages, drivers need to call - * set_memory_[uc|wc|wb] directly, for reserve and free, before - * setting up the PTE. - */ - WARN_ON_ONCE(1); - return 0; - } + /* + * reserve_pfn_range() doesn't support RAM pages. + */ + if (is_ram != 0) + return -EINVAL; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) @@ -693,7 +700,7 @@ static void free_pfn_range(u64 paddr, unsigned long size) { int is_ram; - is_ram = pagerange_is_ram(paddr, paddr + size); + is_ram = pat_pagerange_is_ram(paddr, paddr + size); if (is_ram == 0) free_memtype(paddr, paddr + size); } -- cgit v0.10.2 From d85cf93da66977dbc645352be1b2084a659d8a0b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 10:02:56 -0800 Subject: x86/paravirt: make arch_flush_lazy_mmu/cpu disable preemption Impact: avoid access to percpu vars in preempible context They are intended to be used whenever there's the possibility that there's some stale state which is going to be overwritten with a queued update, or to force a state change when we may be in lazy mode. Either way, we could end up calling it with preemption enabled, so wrap the functions in their own little preempt-disable section so they can be safely called in any context (though preemption should never be enabled if we're actually in a lazy state). (Move out of line to avoid #include dependencies.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ba3e2ff..a660ece 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1352,14 +1352,7 @@ static inline void arch_leave_lazy_cpu_mode(void) PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); } -static inline void arch_flush_lazy_cpu_mode(void) -{ - if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) { - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } -} - +void arch_flush_lazy_cpu_mode(void); #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) @@ -1372,13 +1365,7 @@ static inline void arch_leave_lazy_mmu_mode(void) PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); } -static inline void arch_flush_lazy_mmu_mode(void) -{ - if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } -} +void arch_flush_lazy_mmu_mode(void); static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, unsigned long phys, pgprot_t flags) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e4c8fb6..dcba6c5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,6 +268,30 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void arch_flush_lazy_mmu_mode(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + +void arch_flush_lazy_cpu_mode(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { + arch_leave_lazy_cpu_mode(); + arch_enter_lazy_cpu_mode(); + } + + preempt_enable(); +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, -- cgit v0.10.2 From 34b0900d323122113683685b200aae9f9b75e63b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Feb 2009 21:30:48 +0100 Subject: x86: warn if arch_flush_lazy_mmu_cpu is called in preemptible context Impact: Catch cases where lazy MMU state is active in a preemtible context arch_flush_lazy_mmu_cpu() has been changed to disable preemption so the checks in enter/leave will never trigger. Put the preemtible() check into arch_flush_lazy_mmu_cpu() to catch such cases. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index dcba6c5..c6520a4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -273,6 +273,7 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } @@ -285,6 +286,7 @@ void arch_flush_lazy_cpu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { + WARN_ON(preempt_count() == 1); arch_leave_lazy_cpu_mode(); arch_enter_lazy_cpu_mode(); } -- cgit v0.10.2 From 7ad9de6ac83bd825996d2de98c92e0f425c31050 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Feb 2009 21:16:09 +0100 Subject: x86: CPA avoid repeated lazy mmu flush Impact: Flush the lazy MMU only once Pending mmu updates only need to be flushed once to bring the in-memory pagetable state up to date. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f664bc1..8ca0d85 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -575,14 +575,6 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; - - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -819,6 +811,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + */ + arch_flush_lazy_mmu_mode(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; -- cgit v0.10.2 From b13e24644c138d0ddbc451403c30a96b09bfd556 Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 12 Feb 2009 18:48:53 -0800 Subject: x86, hpet: fix for LS21 + HPET = boot hang Between 2.6.23 and 2.6.24-rc1 a change was made that broke IBM LS21 systems that had the HPET enabled in the BIOS, resulting in boot hangs for x86_64. Specifically commit b8ce33590687888ebb900d09557b8807c4539022, which merges the i386 and x86_64 HPET code. Prior to this commit, when we setup the HPET timers in x86_64, we did the following: hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); However after the i386/x86_64 HPET merge, we do the following: cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); However on LS21s with HPET enabled in the BIOS, the HPET_T0_CFG register boots with Level triggered interrupts (HPET_TN_LEVEL) enabled. This causes the periodic interrupt to be not so periodic, and that results in the boot time hang I reported earlier in the delay calibration. My fix: Always disable HPET_TN_LEVEL when setting up periodic mode. Signed-off-by: John Stultz Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 64d5ad0..5c8da2c 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -269,6 +269,8 @@ static void hpet_set_mode(enum clock_event_mode mode, now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); + /* Make sure we use edge triggered interrupts */ + cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); -- cgit v0.10.2