From c4add2e537e6f60048dce8dc518254e7e605301d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 30 Mar 2007 10:33:11 -0600 Subject: [IA64] rename ioremap variables to match i386 No functional change, just use the same names as i386. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 4280c07..1bc0c17 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -14,13 +14,13 @@ #include static inline void __iomem * -__ioremap (unsigned long offset, unsigned long size) +__ioremap (unsigned long phys_addr, unsigned long size) { - return (void __iomem *) (__IA64_UNCACHED_OFFSET | offset); + return (void __iomem *) (__IA64_UNCACHED_OFFSET | phys_addr); } void __iomem * -ioremap (unsigned long offset, unsigned long size) +ioremap (unsigned long phys_addr, unsigned long size) { u64 attr; unsigned long gran_base, gran_size; @@ -30,31 +30,31 @@ ioremap (unsigned long offset, unsigned long size) * as the rest of the kernel. For more details, see * Documentation/ia64/aliasing.txt. */ - attr = kern_mem_attribute(offset, size); + attr = kern_mem_attribute(phys_addr, size); if (attr & EFI_MEMORY_WB) - return (void __iomem *) phys_to_virt(offset); + return (void __iomem *) phys_to_virt(phys_addr); else if (attr & EFI_MEMORY_UC) - return __ioremap(offset, size); + return __ioremap(phys_addr, size); /* * Some chipsets don't support UC access to memory. If * WB is supported for the whole granule, we prefer that. 
*/ - gran_base = GRANULEROUNDDOWN(offset); - gran_size = GRANULEROUNDUP(offset + size) - gran_base; + gran_base = GRANULEROUNDDOWN(phys_addr); + gran_size = GRANULEROUNDUP(phys_addr + size) - gran_base; if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB) - return (void __iomem *) phys_to_virt(offset); + return (void __iomem *) phys_to_virt(phys_addr); - return __ioremap(offset, size); + return __ioremap(phys_addr, size); } EXPORT_SYMBOL(ioremap); void __iomem * -ioremap_nocache (unsigned long offset, unsigned long size) +ioremap_nocache (unsigned long phys_addr, unsigned long size) { - if (kern_mem_attribute(offset, size) & EFI_MEMORY_WB) + if (kern_mem_attribute(phys_addr, size) & EFI_MEMORY_WB) return NULL; - return __ioremap(offset, size); + return __ioremap(phys_addr, size); } EXPORT_SYMBOL(ioremap_nocache); -- cgit v0.10.2 From 9b50ffb0c0281bc5a08ccd56ae9bb84296c28f38 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 30 Mar 2007 10:34:05 -0600 Subject: [IA64] make ioremap avoid unsupported attributes Example memory map (from HP sx1000 with VGA enabled): 0x00000 - 0x9FFFF supports only WB (cacheable) access 0xA0000 - 0xBFFFF supports only UC (uncacheable) access 0xC0000 - 0xFFFFF supports only WB (cacheable) access pci_read_rom() indirectly uses ioremap(0xC0000) to read the shadow VGA option ROM. ioremap() used to default to a 16MB or 64MB UC kernel identity mapping, which would cause an MCA when reading 0xC0000 since only WB is supported there. X reads the option ROM to initialize devices. A smaller test case is: # echo 1 > /sys/bus/pci/devices/0000:aa:03.0/rom # cp /sys/bus/pci/devices/0000:aa:03.0/rom x To avoid this, we can use the same ioremap_page_range() strategy that most architectures use for all ioremaps. These page table mappings come out of the vmalloc area. On ia64, these are in region 5 (0xA... addresses) and typically use 16KB or 64KB mappings instead of 16MB or 64MB mappings. 
The smaller mappings give more flexibility to use the correct attributes. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 1bc0c17..2a14062 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -1,5 +1,5 @@ /* - * (c) Copyright 2006 Hewlett-Packard Development Company, L.P. + * (c) Copyright 2006, 2007 Hewlett-Packard Development Company, L.P. * Bjorn Helgaas * * This program is free software; you can redistribute it and/or modify @@ -10,11 +10,13 @@ #include #include #include +#include +#include #include #include static inline void __iomem * -__ioremap (unsigned long phys_addr, unsigned long size) +__ioremap (unsigned long phys_addr) { return (void __iomem *) (__IA64_UNCACHED_OFFSET | phys_addr); } @@ -22,8 +24,13 @@ __ioremap (unsigned long phys_addr, unsigned long size) void __iomem * ioremap (unsigned long phys_addr, unsigned long size) { + void __iomem *addr; + struct vm_struct *area; + unsigned long offset; + pgprot_t prot; u64 attr; unsigned long gran_base, gran_size; + unsigned long page_base; /* * For things in kern_memmap, we must use the same attribute @@ -34,7 +41,7 @@ ioremap (unsigned long phys_addr, unsigned long size) if (attr & EFI_MEMORY_WB) return (void __iomem *) phys_to_virt(phys_addr); else if (attr & EFI_MEMORY_UC) - return __ioremap(phys_addr, size); + return __ioremap(phys_addr); /* * Some chipsets don't support UC access to memory. If @@ -45,7 +52,42 @@ ioremap (unsigned long phys_addr, unsigned long size) if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB) return (void __iomem *) phys_to_virt(phys_addr); - return __ioremap(phys_addr, size); + /* + * WB is not supported for the whole granule, so we can't use + * the region 7 identity mapping. If we can safely cover the + * area with kernel page table mappings, we can use those + * instead. 
+ */ + page_base = phys_addr & PAGE_MASK; + size = PAGE_ALIGN(phys_addr + size) - page_base; + if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) { + prot = PAGE_KERNEL; + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (ioremap_page_range((unsigned long) addr, + (unsigned long) addr + size, phys_addr, prot)) { + vunmap((void __force *) addr); + return NULL; + } + + return (void __iomem *) (offset + (char __iomem *)addr); + } + + return __ioremap(phys_addr); } EXPORT_SYMBOL(ioremap); @@ -55,6 +97,14 @@ ioremap_nocache (unsigned long phys_addr, unsigned long size) if (kern_mem_attribute(phys_addr, size) & EFI_MEMORY_WB) return NULL; - return __ioremap(phys_addr, size); + return __ioremap(phys_addr); } EXPORT_SYMBOL(ioremap_nocache); + +void +iounmap (volatile void __iomem *addr) +{ + if (REGION_NUMBER(addr) == RGN_GATE) + vunmap((void *) ((unsigned long) addr & PAGE_MASK)); +} +EXPORT_SYMBOL(iounmap); diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h index 6311e16..eb17a86 100644 --- a/include/asm-ia64/io.h +++ b/include/asm-ia64/io.h @@ -421,11 +421,7 @@ __writeq (unsigned long val, volatile void __iomem *addr) extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); - -static inline void -iounmap (volatile void __iomem *addr) -{ -} +extern void iounmap (volatile void __iomem *addr); /* Use normal IO mappings for DMI */ #define dmi_ioremap ioremap -- cgit v0.10.2 From 2cb22e23a5fcbcac2de49493aa57c7694028a06a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 30 Mar 2007 10:34:44 -0600 Subject: [IA64] allow WB /sys/.../legacy_mem mmaps Allow cacheable mmaps of legacy_mem if WB access is supported for the region. 
The "legacy_mem" file often contains a shadow option ROM, and some versions of X depend on this. Tim Yamin reported that this change fixes X on a Dell PowerEdge 3250. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 0e83f3b..9f63589 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -659,8 +659,6 @@ pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma) return -EINVAL; prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size, vma->vm_page_prot); - if (pgprot_val(prot) != pgprot_val(pgprot_noncached(vma->vm_page_prot))) - return -EINVAL; addr = pci_get_legacy_mem(bus); if (IS_ERR(addr)) -- cgit v0.10.2 From 6d40fc514c9ea886dc18ddd20043a411816b63d1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 30 Mar 2007 10:35:43 -0600 Subject: [IA64] fail mmaps that span areas with incompatible attributes Example memory map (from HP sx1000 with VGA enabled): 0x00000 - 0x9FFFF supports only WB (cacheable) access 0xA0000 - 0xBFFFF supports only UC (uncacheable) access 0xC0000 - 0xFFFFF supports only WB (cacheable) access Some versions of X map the entire 0x00000-0xFFFFF area at once. With the example above, this mmap must fail because there's no memory attribute that's safe for the entire area. Prior to this patch, we performed the mmap with a UC mapping. When X accessed the WB memory at 0xC0000, it caused an MCA. The crash can happen when mapping 0xC0000 from either /dev/mem or a /sys/.../legacy_mem file. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f45f91d..78d29b7 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -660,6 +660,29 @@ efi_memory_descriptor (unsigned long phys_addr) return NULL; } +static int +efi_memmap_intersects (unsigned long phys_addr, unsigned long size) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + unsigned long end; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + end = phys_addr + size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (md->phys_addr < end && efi_md_end(md) > phys_addr) + return 1; + } + return 0; +} + u32 efi_mem_type (unsigned long phys_addr) { @@ -766,11 +789,28 @@ valid_phys_addr_range (unsigned long phys_addr, unsigned long size) int valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size) { + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; + + attr = efi_mem_attribute(phys_addr, size); + /* - * MMIO regions are often missing from the EFI memory map. - * We must allow mmap of them for programs like X, so we - * currently can't do any useful validation. + * /dev/mem mmap uses normal user pages, so we don't need the entire + * granule, but the entire region we're mapping must support the same + * attribute. */ + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; + + /* + * Intel firmware doesn't tell us about all the MMIO regions, so + * in general we have to allow mmap requests. But if EFI *does* + * tell us about anything inside this region, we should deny it. + * The user can always map a smaller region to avoid the overlap. 
+ */ + if (efi_memmap_intersects(phys_addr, size)) + return 0; + return 1; } -- cgit v0.10.2 From ddd83eff58888928115b3e225a46d3c686e64594 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 30 Mar 2007 10:39:42 -0600 Subject: [IA64] update memory attribute aliasing documentation & test cases Updates documentation and adds some test cases. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c new file mode 100644 index 0000000..3153167 --- /dev/null +++ b/Documentation/ia64/aliasing-test.c @@ -0,0 +1,247 @@ +/* + * Exercise /dev/mem mmap cases that have been troublesome in the past + * + * (c) Copyright 2007 Hewlett-Packard Development Company, L.P. + * Bjorn Helgaas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sum; + +int map_mem(char *path, off_t offset, size_t length, int touch) +{ + int fd, rc; + void *addr; + int *c; + + fd = open(path, O_RDWR); + if (fd == -1) { + perror(path); + return -1; + } + + addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset); + if (addr == MAP_FAILED) + return 1; + + if (touch) { + c = (int *) addr; + while (c < (int *) (offset + length)) + sum += *c++; + } + + rc = munmap(addr, length); + if (rc == -1) { + perror("munmap"); + return -1; + } + + close(fd); + return 0; +} + +int scan_sysfs(char *path, char *file, off_t offset, size_t length, int touch) +{ + struct dirent **namelist; + char *name, *path2; + int i, n, r, rc, result = 0; + struct stat buf; + + n = scandir(path, &namelist, 0, alphasort); + if (n < 0) { + perror("scandir"); + return -1; + } + + for (i = 0; i < n; i++) { + name = namelist[i]->d_name; + + if (fnmatch(".", name, 0) == 0) + goto skip; + if 
(fnmatch("..", name, 0) == 0) + goto skip; + + path2 = malloc(strlen(path) + strlen(name) + 3); + strcpy(path2, path); + strcat(path2, "/"); + strcat(path2, name); + + if (fnmatch(file, name, 0) == 0) { + rc = map_mem(path2, offset, length, touch); + if (rc == 0) + fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, offset, offset + length, touch ? "readable" : "mappable"); + else if (rc > 0) + fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, offset, offset + length); + else { + fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, offset, offset + length); + return rc; + } + } else { + r = lstat(path2, &buf); + if (r == 0 && S_ISDIR(buf.st_mode)) { + rc = scan_sysfs(path2, file, offset, length, touch); + if (rc < 0) + return rc; + } + } + + result |= rc; + free(path2); + +skip: + free(namelist[i]); + } + free(namelist); + return rc; +} + +char buf[1024]; + +int read_rom(char *path) +{ + int fd, rc; + size_t size = 0; + + fd = open(path, O_RDWR); + if (fd == -1) { + perror(path); + return -1; + } + + rc = write(fd, "1", 2); + if (rc <= 0) { + perror("write"); + return -1; + } + + do { + rc = read(fd, buf, sizeof(buf)); + if (rc > 0) + size += rc; + } while (rc > 0); + + close(fd); + return size; +} + +int scan_rom(char *path, char *file) +{ + struct dirent **namelist; + char *name, *path2; + int i, n, r, rc, result = 0; + struct stat buf; + + n = scandir(path, &namelist, 0, alphasort); + if (n < 0) { + perror("scandir"); + return -1; + } + + for (i = 0; i < n; i++) { + name = namelist[i]->d_name; + + if (fnmatch(".", name, 0) == 0) + goto skip; + if (fnmatch("..", name, 0) == 0) + goto skip; + + path2 = malloc(strlen(path) + strlen(name) + 3); + strcpy(path2, path); + strcat(path2, "/"); + strcat(path2, name); + + if (fnmatch(file, name, 0) == 0) { + rc = read_rom(path2); + + /* + * It's OK if the ROM is unreadable. Maybe there + * is no ROM, or some other error ocurred. The + * important thing is that no MCA happened. 
+ */ + if (rc > 0) + fprintf(stderr, "PASS: %s read %ld bytes\n", path2, rc); + else { + fprintf(stderr, "PASS: %s not readable\n", path2); + return rc; + } + } else { + r = lstat(path2, &buf); + if (r == 0 && S_ISDIR(buf.st_mode)) { + rc = scan_rom(path2, file); + if (rc < 0) + return rc; + } + } + + result |= rc; + free(path2); + +skip: + free(namelist[i]); + } + free(namelist); + return rc; +} + +main() +{ + int rc; + + if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n"); + + /* + * It's not safe to blindly read the VGA frame buffer. If you know + * how to poke the card the right way, it should respond, but it's + * not safe in general. Many machines, e.g., Intel chipsets, cover + * up a non-responding card by just returning -1, but others will + * report the failure as a machine check. + */ + if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0) + fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n"); + + if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0) + fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n"); + + /* + * Often you can map all the individual pieces above (0-0xA0000, + * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole + * thing at once. This is because the individual pieces use different + * attributes, and there's no single attribute supported over the + * whole region. 
+ */ + rc = map_mem("/dev/mem", 0, 1024*1024, 0); + if (rc == 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n"); + else if (rc > 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n"); + + scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1); + scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0); + scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1); + scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0); + + scan_rom("/sys/devices", "rom"); +} diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt index 38f9a52..9a431a7 100644 --- a/Documentation/ia64/aliasing.txt +++ b/Documentation/ia64/aliasing.txt @@ -112,16 +112,6 @@ POTENTIAL ATTRIBUTE ALIASING CASES The /dev/mem mmap constraints apply. - However, since this is for mapping legacy MMIO space, WB access - does not make sense. This matters on machines without legacy - VGA support: these machines may have WB memory for the entire - first megabyte (or even the entire first granule). - - On these machines, we could mmap legacy_mem as WB, which would - be safe in terms of attribute aliasing, but X has no way of - knowing that it is accessing regular memory, not a frame buffer, - so the kernel should fail the mmap rather than doing it with WB. - read/write of /dev/mem This uses copy_from_user(), which implicitly uses a kernel @@ -138,14 +128,20 @@ POTENTIAL ATTRIBUTE ALIASING CASES ioremap() - This returns a kernel identity mapping for use inside the - kernel. + This returns a mapping for use inside the kernel. If the region is in kern_memmap, we should use the attribute - specified there. Otherwise, if the EFI memory map reports that - the entire granule supports WB, we should use that (granules - that are partially reserved or occupied by firmware do not appear - in kern_memmap). Otherwise, we should use a UC mapping. + specified there. 
+ + If the EFI memory map reports that the entire granule supports + WB, we should use that (granules that are partially reserved + or occupied by firmware do not appear in kern_memmap). + + If the granule contains non-WB memory, but we can cover the + region safely with kernel page table mappings, we can use + ioremap_page_range() as most other architectures do. + + Failing all of the above, we have to fall back to a UC mapping. PAST PROBLEM CASES @@ -158,7 +154,7 @@ PAST PROBLEM CASES succeed. It may create either WB or UC user mappings, depending on whether the region is in kern_memmap or the EFI memory map. - mmap of 0x0-0xA0000 /dev/mem by "hwinfo" on HP sx1000 with VGA enabled + mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled See https://bugzilla.novell.com/show_bug.cgi?id=140858. @@ -171,28 +167,25 @@ PAST PROBLEM CASES so it is safe to use WB mappings. The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000, - which will use a granule-sized UC mapping covering 0-0xFFFFF. This - granule covers some WB-only memory, but since UC is non-speculative, - the processor will never generate an uncacheable reference to the - WB-only areas unless the driver explicitly touches them. + which uses a granule-sized UC mapping. This granule will cover some + WB-only memory, but since UC is non-speculative, the processor will + never generate an uncacheable reference to the WB-only areas unless + the driver explicitly touches them. mmap of 0x0-0xFFFFF legacy_mem by "X" - If the EFI memory map reports this entire range as WB, there - is no VGA MMIO hole, and the mmap should fail or be done with - a WB mapping. + If the EFI memory map reports that the entire range supports the + same attributes, we can allow the mmap (and we will prefer WB if + supported, as is the case with HP sx[12]000 machines with VGA + disabled). 
- There's no easy way for X to determine whether the 0xA0000-0xBFFFF - region is a frame buffer or just memory, so I think it's best to - just fail this mmap request rather than using a WB mapping. As - far as I know, there's no need to map legacy_mem with WB - mappings. + If EFI reports the range as partly WB and partly UC (as on sx[12]000 + machines with VGA enabled), we must fail the mmap because there's no + safe attribute to use. - Otherwise, a UC mapping of the entire region is probably safe. - The VGA hole means the region will not be in kern_memmap. The - HP sx1000 chipset doesn't support UC access to the memory surrounding - the VGA hole, but X doesn't need that area anyway and should not - reference it. + If EFI reports some of the range but not all (as on Intel firmware + that doesn't report the VGA frame buffer at all), we should fail the + mmap and force the user to map just the specific region of interest. mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled @@ -202,6 +195,16 @@ PAST PROBLEM CASES This is a special case of the previous case, and the mmap should fail for the same reason as above. + read of /sys/devices/.../rom + + For VGA devices, this may cause an ioremap() of 0xC0000. This + used to be done with a UC mapping, because the VGA frame buffer + at 0xA0000 prevents use of a WB granule. The UC mapping causes + an MCA on HP sx[12]000 chipsets. + + We should use WB page table mappings to avoid covering the VGA + frame buffer. + NOTES [1] SDM rev 2.2, vol 2, sec 4.4.1. -- cgit v0.10.2