From 65fe935dd2387a4faf15314c73f5e6d31ef0217e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Jun 2016 15:10:02 -0700 Subject: x86/KASLR, x86/power: Remove x86 hibernation restrictions With the following fix: 70595b479ce1 ("x86/power/64: Fix crash whan the hibernation code passes control to the image kernel") ... there is no longer a problem with hibernation resuming a KASLR-booted kernel image, so remove the restriction. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jonathan Corbet Cc: Len Brown Cc: Linus Torvalds Cc: Linux PM list Cc: Logan Gunthorpe Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160613221002.GA29719@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c9..fa8c6d4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1803,12 +1803,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. js= [HW,JOY] Analog joystick See Documentation/input/joystick.txt. - kaslr/nokaslr [X86] - Enable/disable kernel and module base offset ASLR - (Address Space Layout Randomization) if built into - the kernel. When CONFIG_HIBERNATION is selected, - kASLR is disabled by default. When kASLR is enabled, - hibernation will be disabled. + nokaslr [KNL] + When CONFIG_RANDOMIZE_BASE is set, this disables + kernel and module base offset ASLR (Address Space + Layout Randomization). keepinitrd [HW,ARM] diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index cfeb025..dff4217 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -471,17 +471,10 @@ unsigned char *choose_random_location(unsigned long input, unsigned long choice = output; unsigned long random_addr; -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); - goto out; - } -#else if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); goto out; } -#endif boot_params->hdr.loadflags |= KASLR_FLAG; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254..9021387 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1154,11 +1154,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init kaslr_nohibernate_setup(char *str) -{ - return nohibernate_setup(str); -} - static int __init page_poison_nohibernate_setup(char *str) { #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1177,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup); __setup("page_poison=", page_poison_nohibernate_setup); -- cgit v0.10.2 From 98f78525371b55ccd1c480207ce10296c72fa340 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 May 2016 15:45:30 -0700 Subject: x86/boot: Refuse to build with data relocations The compressed kernel is built with -fPIC/-fPIE so that it can run in any location a bootloader happens to put it. However, since ELF relocation processing is not happening (and all the relocation information has already been stripped at link time), none of the code can use data relocations (e.g. 
static assignments of pointers). This is already noted in a warning
comment at the top of misc.c, but this adds an explicit check for the
condition during the linking stage to block any such bugs from appearing.

If this was in place with the earlier bug in pagetable.c, the build
would fail like this:

  ...
    CC      arch/x86/boot/compressed/pagetable.o
    DATAREL arch/x86/boot/compressed/vmlinux
  error: arch/x86/boot/compressed/pagetable.o has data relocations!
  make[2]: *** [arch/x86/boot/compressed/vmlinux] Error 1
  ...

A clean build shows:

  ...
    CC      arch/x86/boot/compressed/pagetable.o
    DATAREL arch/x86/boot/compressed/vmlinux
    LD      arch/x86/boot/compressed/vmlinux
  ...

Suggested-by: Ingo Molnar
Signed-off-by: Kees Cook
Cc: Andrew Morton
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: Baoquan He
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H. Peter Anvin
Cc: H.J. Lu
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Yinghai Lu
Link: http://lkml.kernel.org/r/1464216334-17200-2-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f135688..536ccfc 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -85,7 +85,25 @@ vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
 	$(objtree)/drivers/firmware/efi/libstub/lib.a
 vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
 
+# The compressed kernel is built with -fPIC/-fPIE so that a boot loader
+# can place it anywhere in memory and it will still run. However, since
+# it is executed as-is without any ELF relocation processing performed
+# (and has already had all relocation sections stripped from the binary),
+# none of the code can use data relocations (e.g. static assignments of
+# pointer values), since they will be meaningless at runtime. This check
+# will refuse to link the vmlinux if any of these relocations are found.
+quiet_cmd_check_data_rel = DATAREL $@
+define cmd_check_data_rel
+	for obj in $(filter %.o,$^); do \
+		readelf -S $$obj | grep -qF .rel.local && { \
+			echo "error: $$obj has data relocations!" >&2; \
+			exit 1; \
+		} || true; \
+	done
+endef
+
 $(obj)/vmlinux: $(vmlinux-objs-y) FORCE
+	$(call if_changed,check_data_rel)
 	$(call if_changed,ld)
 
 OBJCOPYFLAGS_vmlinux.bin := -R .comment -S

-- cgit v0.10.2
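For illustration, here is a minimal (hypothetical) example of the kind of
construct this check rejects -- a statically initialized pointer requires
a data relocation, while an array initializer does not:

	/* Illustrative only; not part of the patch above. */
	static const char msg[] = "hello";	/* fine: bytes are emitted in place */
	static const char *p = msg;		/* data relocation: the address of
						 * 'msg' would need a load-time
						 * fixup that never happens here */

Because the compressed kernel strips its relocation sections and never
processes relocations, 'p' would hold a link-time address that is wrong
wherever the bootloader actually places the image; assigning the pointer
at run time instead avoids the problem.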
From 11fdf97a3cd1a5a27625f820ceb74e1caba4fd26 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Wed, 25 May 2016 15:45:31 -0700
Subject: x86/KASLR: Clarify identity map interface

This extracts the call to prepare_level4() into a top-level function
that the user of the pagetable.c interface must call to initialize the
new page tables. For clarity and to match the "finalize" function, it
has been renamed to initialize_identity_maps(). This function also
gains the initialization of mapping_info so we don't have to do it
each time in add_identity_map().

Additionally add copyright notice to the top, to make it clear that the
bulk of the pagetable.c code was written by Yinghai, and that I just
added bugs later. :)

Signed-off-by: Kees Cook
Cc: Andrew Morton
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: Baoquan He
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H. Peter Anvin
Cc: H.J. Lu
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Yinghai Lu
Link: http://lkml.kernel.org/r/1464216334-17200-3-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index dff4217..54037c9 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -478,6 +478,9 @@ unsigned char *choose_random_location(unsigned long input,
 
 	boot_params->hdr.loadflags |= KASLR_FLAG;
 
+	/* Prepare to add new identity pagetables on demand. */
+	initialize_identity_maps();
+
 	/* Record the various known unsafe memory ranges. */
 	mem_avoid_init(input, input_size, output);
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b6fec1f..09c4ddd 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -85,10 +85,13 @@ unsigned char *choose_random_location(unsigned long input_ptr,
 #endif
 
 #ifdef CONFIG_X86_64
+void initialize_identity_maps(void);
 void add_identity_map(unsigned long start, unsigned long size);
 void finalize_identity_maps(void);
 extern unsigned char _pgtable[];
 #else
+static inline void initialize_identity_maps(void)
+{ }
 static inline void add_identity_map(unsigned long start, unsigned long size)
 { }
 static inline void finalize_identity_maps(void)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 34b95df..6e31a6a 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -2,6 +2,9 @@
  * This code is used on x86_64 to create page table identity mappings on
  * demand by building up a new set of page tables (or appending to the
  * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016  Yinghai Lu
+ * Copyright (C)      2016  Kees Cook
  */
 
 /*
@@ -59,9 +62,21 @@ static struct alloc_pgt_data pgt_data;
 /* The top level page table entry pointer. */
 static unsigned long level4p;
 
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info = {
+	.pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+};
+
 /* Locates and clears a region for a new top level page table. */
-static void prepare_level4(void)
+void initialize_identity_maps(void)
 {
+	/* Init mapping_info with run-time function/buffer pointers. */
+	mapping_info.alloc_pgt_page = alloc_pgt_page;
+	mapping_info.context = &pgt_data;
+
 	/*
 	 * It should be impossible for this not to already be true,
 	 * but since calling this a second time would rewind the other
@@ -96,17 +111,8 @@
  */
 void add_identity_map(unsigned long start, unsigned long size)
 {
-	struct x86_mapping_info mapping_info = {
-		.alloc_pgt_page	= alloc_pgt_page,
-		.context	= &pgt_data,
-		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
-	};
 	unsigned long end = start + size;
 
-	/* Make sure we have a top level page table ready to use. */
-	if (!level4p)
-		prepare_level4();
-
 	/* Align boundary to 2M. */
 	start = round_down(start, PMD_SIZE);
 	end = round_up(end, PMD_SIZE);

-- cgit v0.10.2
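For reference, the boot-time call sequence implied by the clarified
interface is roughly the following (sketch only, x86_64 case; 32-bit
gets the empty inline stubs from misc.h):

	initialize_identity_maps();		/* set up mapping_info and claim
						 * a top-level page table */
	add_identity_map(random_addr, size);	/* map each region KASLR uses */
	finalize_identity_maps();		/* actually load the new tables */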
From 8391c73c96f28d4e8c40fd401fd0c9c04391b44a Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Wed, 25 May 2016 15:45:32 -0700
Subject: x86/KASLR: Randomize virtual address separately

The current KASLR implementation randomizes the physical and virtual
addresses of the kernel together (both are offset by the same amount).
It calculates the delta of the physical address where vmlinux was
linked to load and where it is finally loaded. If the delta is not
equal to 0 (i.e. the kernel was relocated), relocation handling needs
to be done.

On 64-bit, this patch randomizes both the physical address where the
kernel is decompressed and the virtual address where the kernel text
is mapped and will execute from. We now have two values being chosen,
so the function arguments are reorganized to pass by pointer so they
can be directly updated. Since relocation handling only depends on the
virtual address, we must check the virtual delta, not the physical
delta, for processing kernel relocations. This also populates the page
table for the new virtual address range. 32-bit does not support a
separate virtual address, so it continues to use the physical offset
for its virtual offset.

This also updates the sanity checks done on the resulting kernel
addresses, since they are potentially separate now.

[kees: rewrote changelog, limited virtual split to 64-bit only, update checks]
[kees: fix CONFIG_RANDOMIZE_BASE=n boot failure]
Signed-off-by: Baoquan He
Signed-off-by: Kees Cook
Cc: Andrew Morton
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H. Peter Anvin
Cc: H.J. Lu
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Yinghai Lu
Link: http://lkml.kernel.org/r/1464216334-17200-4-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54037c9..5550546 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -463,17 +463,20 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
  * Since this function examines addresses much more numerically,
  * it takes the input and output pointers as 'unsigned long'.
  */
-unsigned char *choose_random_location(unsigned long input,
-				      unsigned long input_size,
-				      unsigned long output,
-				      unsigned long output_size)
+void choose_random_location(unsigned long input,
+			    unsigned long input_size,
+			    unsigned long *output,
+			    unsigned long output_size,
+			    unsigned long *virt_addr)
 {
-	unsigned long choice = output;
 	unsigned long random_addr;
 
+	/* By default, keep output position unchanged. */
+	*virt_addr = *output;
+
 	if (cmdline_find_option_bool("nokaslr")) {
 		warn("KASLR disabled: 'nokaslr' on cmdline.");
-		goto out;
+		return;
 	}
 
 	boot_params->hdr.loadflags |= KASLR_FLAG;
@@ -482,25 +485,25 @@ unsigned char *choose_random_location(unsigned long input,
 	initialize_identity_maps();
 
 	/* Record the various known unsafe memory ranges. */
-	mem_avoid_init(input, input_size, output);
+	mem_avoid_init(input, input_size, *output);
 
 	/* Walk e820 and find a random address. */
-	random_addr = find_random_phys_addr(output, output_size);
+	random_addr = find_random_phys_addr(*output, output_size);
 	if (!random_addr) {
 		warn("KASLR disabled: could not find suitable E820 region!");
-		goto out;
+	} else {
+		/* Update the new physical address location. */
+		if (*output != random_addr) {
+			add_identity_map(random_addr, output_size);
+			*output = random_addr;
+		}
 	}
 
-	/* Always enforce the minimum. */
-	if (random_addr < choice)
-		goto out;
-
-	choice = random_addr;
-
-	add_identity_map(choice, output_size);
-
 	/* This actually loads the identity pagetable on x86_64. */
 	finalize_identity_maps();
-out:
-	return (unsigned char *)choice;
+
+	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR.
*/ + if (IS_ENABLED(CONFIG_X86_64)) + random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); + *virt_addr = random_addr; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f14db4e..b3c5a5f0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -170,7 +170,8 @@ void __puthex(unsigned long value) } #if CONFIG_X86_NEED_RELOCS -static void handle_relocations(void *output, unsigned long output_len) +static void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) { int *reloc; unsigned long delta, map, ptr; @@ -182,11 +183,6 @@ static void handle_relocations(void *output, unsigned long output_len) * and where it was actually loaded. */ delta = min_addr - LOAD_PHYSICAL_ADDR; - if (!delta) { - debug_putstr("No relocation needed... "); - return; - } - debug_putstr("Performing relocations... "); /* * The kernel contains a table of relocation addresses. Those @@ -198,6 +194,20 @@ static void handle_relocations(void *output, unsigned long output_len) map = delta - __START_KERNEL_map; /* + * 32-bit always performs relocations. 64-bit relocations are only + * needed if KASLR has chosen a different starting address offset + * from __START_KERNEL_map. + */ + if (IS_ENABLED(CONFIG_X86_64)) + delta = virt_addr - LOAD_PHYSICAL_ADDR; + + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* * Process relocations: 32 bit relocations first then 64 bit after. * Three sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel @@ -250,7 +260,8 @@ static void handle_relocations(void *output, unsigned long output_len) #endif } #else -static inline void handle_relocations(void *output, unsigned long output_len) +static inline void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) { } #endif @@ -327,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned long output_len) { const unsigned long kernel_total_size = VO__end - VO__text; - unsigned char *output_orig = output; + unsigned long virt_addr = (unsigned long)output; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -366,13 +377,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_random_location((unsigned long)input_data, input_len, - (unsigned long)output, - max(output_len, kernel_total_size)); + choose_random_location((unsigned long)input_data, input_len, + (unsigned long *)&output, + max(output_len, kernel_total_size), + &virt_addr); /* Validate memory location choices. 
 */
 	if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
-		error("Destination address inappropriately aligned");
+		error("Destination physical address inappropriately aligned");
+	if (virt_addr & (MIN_KERNEL_ALIGN - 1))
+		error("Destination virtual address inappropriately aligned");
 #ifdef CONFIG_X86_64
 	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
@@ -382,19 +396,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 #endif
 #ifndef CONFIG_RELOCATABLE
 	if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
-		error("Wrong destination address");
+		error("Destination address does not match LOAD_PHYSICAL_ADDR");
+	if ((unsigned long)output != virt_addr)
+		error("Destination virtual address changed when not relocatable");
 #endif
 
 	debug_putstr("\nDecompressing Linux... ");
 	__decompress(input_data, input_len, NULL, NULL, output, output_len,
 			NULL, error);
 	parse_elf(output);
-	/*
-	 * 32-bit always performs relocations. 64-bit relocations are only
-	 * needed if kASLR has chosen a different load address.
-	 */
-	if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig)
-		handle_relocations(output, output_len);
+	handle_relocations(output, output_len, virt_addr);
 	debug_putstr("done.\nBooting the kernel.\n");
 	return output;
 }

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 09c4ddd..1c8355e 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -67,20 +67,22 @@ int cmdline_find_option_bool(const char *option);
 
 #if CONFIG_RANDOMIZE_BASE
 /* kaslr.c */
-unsigned char *choose_random_location(unsigned long input_ptr,
-				      unsigned long input_size,
-				      unsigned long output_ptr,
-				      unsigned long output_size);
+void choose_random_location(unsigned long input,
+			    unsigned long input_size,
+			    unsigned long *output,
+			    unsigned long output_size,
+			    unsigned long *virt_addr);
 /* cpuflags.c */
 bool has_cpuflag(int flag);
 #else
-static inline
-unsigned char *choose_random_location(unsigned long input_ptr,
-				      unsigned long input_size,
-				      unsigned long output_ptr,
-				      unsigned long output_size)
+static inline void choose_random_location(unsigned long input,
+					  unsigned long input_size,
+					  unsigned long *output,
+					  unsigned long output_size,
+					  unsigned long *virt_addr)
 {
-	return (unsigned char *)output_ptr;
+	/* No change from existing output location. */
+	*virt_addr = *output;
 }
 #endif

-- cgit v0.10.2

From ed9f007ee68478f6a50ec9971ade25a0129a5c0e Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Wed, 25 May 2016 15:45:33 -0700
Subject: x86/KASLR: Extend kernel image physical address randomization to addresses larger than 4G

We want the physical address to be randomized anywhere between 16MB
and the top of physical memory (up to 64TB). This patch exchanges the
prior slots[] array for the new slot_areas[] array, and lifts the
limitation of KERNEL_IMAGE_SIZE on the physical address offset for
64-bit. As before, process_e820_entry() walks memory and populates
slot_areas[], splitting on any detected mem_avoid collisions. Finally,
since the slots[] array and its associated functions are no longer
needed, they are removed.

Based on earlier patches by Baoquan He.

Originally-from: Baoquan He
Signed-off-by: Kees Cook
Cc: Andrew Morton
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: Baoquan He
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H. Peter Anvin
Cc: H.J.
Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1464216334-17200-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a7b885..770ae52 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1934,21 +1934,26 @@ config RANDOMIZE_BASE attempts relying on knowledge of the location of kernel code internals. - The kernel physical and virtual address can be randomized - from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that - using RANDOMIZE_BASE reduces the memory space available to - kernel modules from 1.5GB to 1GB.) + On 64-bit, the kernel physical and virtual addresses are + randomized separately. The physical address will be anywhere + between 16MB and the top of physical memory (up to 64TB). The + virtual address will be randomized from 16MB up to 1GB (9 bits + of entropy). Note that this also reduces the memory space + available to kernel modules from 1.5GB to 1GB. + + On 32-bit, the kernel physical and virtual addresses are + randomized together. They will be randomized from 16MB up to + 512MB (8 bits of entropy). Entropy is generated using the RDRAND instruction if it is supported. If RDTSC is supported, its value is mixed into the entropy pool as well. If neither RDRAND nor RDTSC are - supported, then entropy is read from the i8254 timer. - - Since the kernel is built using 2GB addressing, and - PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of - entropy is theoretically possible. Currently, with the - default value for PHYSICAL_ALIGN and due to page table - layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + supported, then entropy is read from the i8254 timer. The + usable entropy is limited by the kernel being built using + 2GB addressing, and that PHYSICAL_ALIGN must be at a + minimum of 2MB. As a result, only 10 bits of entropy are + theoretically possible, but the implementations are further + limited due to memory layouts. If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot time. To enable it, boot with "kaslr" on the kernel command diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 5550546..36e2811 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -132,17 +132,6 @@ enum mem_avoid_index { static struct mem_vector mem_avoid[MEM_AVOID_MAX]; -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) { /* Item one is entirely before item two. */ @@ -319,8 +308,6 @@ static bool mem_avoid_overlap(struct mem_vector *img, return is_overlapping; } -static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; - struct slot_area { unsigned long addr; int num; @@ -351,36 +338,44 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - static unsigned long slots_fetch_random(void) { + unsigned long slot; + int i; + /* Handle case of no slots stored. 
*/ if (slot_max == 0) return 0; - return slots[get_random_long("Physical") % slot_max]; + slot = get_random_long("Physical") % slot_max; + + for (i = 0; i < slot_area_index; i++) { + if (slot >= slot_areas[i].num) { + slot -= slot_areas[i].num; + continue; + } + return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; + } + + if (i == slot_area_index) + debug_putstr("slots_fetch_random() failed!?\n"); + return 0; } static void process_e820_entry(struct e820entry *entry, unsigned long minimum, unsigned long image_size) { - struct mem_vector region, img, overlap; + struct mem_vector region, overlap; + struct slot_area slot_area; + unsigned long start_orig; /* Skip non-RAM entries. */ if (entry->type != E820_RAM) return; - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= KERNEL_IMAGE_SIZE) + /* On 32-bit, ignore entries entirely above our maximum. */ + if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ @@ -390,31 +385,55 @@ static void process_e820_entry(struct e820entry *entry, region.start = entry->addr; region.size = entry->size; - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; + /* Give up if slot area array is full. */ + while (slot_area_index < MAX_SLOT_AREA) { + start_orig = region.start; - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; + /* Potentially raise address to meet alignment needs. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; + /* Did we raise the address above this e820 region? */ + if (region.start > entry->addr + entry->size) + return; - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > KERNEL_IMAGE_SIZE) - region.size = KERNEL_IMAGE_SIZE - region.start; + /* Reduce size by any delta from the original address. */ + region.size -= region.start - start_orig; - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img, &overlap)) - continue; - slots_append(img.start); + /* On 32-bit, reduce region size to fit within max size. */ + if (IS_ENABLED(CONFIG_X86_32) && + region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Return if region can't contain decompressed kernel */ + if (region.size < image_size) + return; + + /* If nothing overlaps, store the region and return. */ + if (!mem_avoid_overlap(®ion, &overlap)) { + store_slot_info(®ion, image_size); + return; + } + + /* Store beginning of region if holds at least image_size. */ + if (overlap.start > region.start + image_size) { + struct mem_vector beginning; + + beginning.start = region.start; + beginning.size = overlap.start - region.start; + store_slot_info(&beginning, image_size); + } + + /* Return if overlap extends to or past end of region. */ + if (overlap.start + overlap.size >= region.start + region.size) + return; + + /* Clip off the overlapping region and start over. 
*/ + region.size -= overlap.start - region.start + overlap.size; + region.start = overlap.start + overlap.size; } } @@ -431,6 +450,10 @@ static unsigned long find_random_phys_addr(unsigned long minimum, for (i = 0; i < boot_params->e820_entries; i++) { process_e820_entry(&boot_params->e820_map[i], minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + break; + } } return slots_fetch_random(); -- cgit v0.10.2 From e066cc47776a89bbdaf4184c0e75f7d389f9ab48 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 May 2016 15:45:34 -0700 Subject: x86/KASLR: Allow randomization below the load address Currently the kernel image physical address randomization's lower boundary is the original kernel load address. For bootloaders that load kernels into very high memory (e.g. kexec), this means randomization takes place in a very small window at the top of memory, ignoring the large region of physical memory below the load address. Since mem_avoid[] is already correctly tracking the regions that must be avoided, this patch changes the minimum address to whatever is less: 512M (to conservatively avoid unknown things in lower memory) or the load address. Now, for example, if the kernel is loaded at 8G, [512M, 8G) will be added to the list of possible physical memory positions. Signed-off-by: Yinghai Lu [ Rewrote the changelog, refactored the code to use min(). ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464216334-17200-6-git-send-email-keescook@chromium.org [ Edited the changelog some more, plus the code comment as well. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 36e2811..749c9e0 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -492,7 +492,7 @@ void choose_random_location(unsigned long input, unsigned long output_size, unsigned long *virt_addr) { - unsigned long random_addr; + unsigned long random_addr, min_addr; /* By default, keep output position unchanged. */ *virt_addr = *output; @@ -510,8 +510,15 @@ void choose_random_location(unsigned long input, /* Record the various known unsafe memory ranges. */ mem_avoid_init(input, input_size, *output); + /* + * Low end of the randomization range should be the + * smaller of 512M or the initial kernel image + * location: + */ + min_addr = min(*output, 512UL << 20); + /* Walk e820 and find a random address. */ - random_addr = find_random_phys_addr(*output, output_size); + random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("KASLR disabled: could not find suitable E820 region!"); } else { -- cgit v0.10.2 From dbf984d825935f61965bcfacfd8e8dfdaf3e8051 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 25 Jun 2016 13:24:57 +0200 Subject: x86/boot/64: Add forgotten end of function marker Add secondary_startup_64()'s ENDPROC() marker. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160625112457.16930-1-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5df831e..c7920ba 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -299,6 +299,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +ENDPROC(secondary_startup_64) #include "verify_cpu.S" -- cgit v0.10.2 From 6daa2ec0b3e3808c55329d12de3c157cf38b17b0 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 1 Jul 2016 15:34:40 +0800 Subject: x86/KASLR: Fix boot crash with certain memory configurations Ye Xiaolong reported this boot crash: | | XZ-compressed data is corrupt | | -- System halted | Fix the bug in mem_avoid_overlap() of finding the earliest overlap. Reported-and-tested-by: Ye Xiaolong Signed-off-by: Baoquan He Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 749c9e0..010ea16 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -285,6 +285,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, if (mem_overlaps(img, &mem_avoid[i]) && mem_avoid[i].start < earliest) { *overlap = mem_avoid[i]; + earliest = overlap->start; is_overlapping = true; } } @@ -299,6 +300,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { *overlap = avoid; + earliest = overlap->start; is_overlapping = true; } -- cgit v0.10.2 From d899a7d146a2ed8a7e6c2f61bcd232908bcbaabc Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:46:58 -0700 Subject: x86/mm: Refactor KASLR entropy functions Move the KASLR entropy functions into arch/x86/lib to be used in early kernel boot for KASLR memory randomization. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. 
Shutemov
Cc: Linus Torvalds
Cc: Lv Zheng
Cc: Mark Salter
Cc: Martin Schwidefsky
Cc: Matt Fleming
Cc: Peter Zijlstra
Cc: Stephen Smalley
Cc: Thomas Gleixner
Cc: Toshi Kani
Cc: Xiao Guangrong
Cc: Yinghai Lu
Cc: kernel-hardening@lists.openwall.com
Cc: linux-doc@vger.kernel.org
Link: http://lkml.kernel.org/r/1466556426-32664-2-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 010ea16..a66854d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -12,10 +12,6 @@
 #include "misc.h"
 #include "error.h"
 
-#include <asm/msr.h>
-#include <asm/archrandom.h>
-#include <asm/e820.h>
-
 #include <generated/compile.h>
 #include <linux/module.h>
 #include <linux/uts.h>
@@ -26,26 +22,6 @@
 static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
 		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
 
-#define I8254_PORT_CONTROL	0x43
-#define I8254_PORT_COUNTER0	0x40
-#define I8254_CMD_READBACK	0xC0
-#define I8254_SELECT_COUNTER0	0x02
-#define I8254_STATUS_NOTREADY	0x40
-static inline u16 i8254(void)
-{
-	u16 status, timer;
-
-	do {
-		outb(I8254_PORT_CONTROL,
-		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
-		status = inb(I8254_PORT_COUNTER0);
-		timer = inb(I8254_PORT_COUNTER0);
-		timer |= inb(I8254_PORT_COUNTER0) << 8;
-	} while (status & I8254_STATUS_NOTREADY);
-
-	return timer;
-}
-
 static unsigned long rotate_xor(unsigned long hash, const void *area,
 				size_t size)
 {
@@ -62,7 +38,7 @@ static unsigned long rotate_xor(unsigned long hash, const void *area,
 }
 
 /* Attempt to create a simple but unpredictable starting entropy. */
-static unsigned long get_random_boot(void)
+static unsigned long get_boot_seed(void)
 {
 	unsigned long hash = 0;
 
@@ -72,50 +48,8 @@ static unsigned long get_random_boot(void)
 	return hash;
 }
 
-static unsigned long get_random_long(const char *purpose)
-{
-#ifdef CONFIG_X86_64
-	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
-#else
-	const unsigned long mix_const = 0x3f39e593UL;
-#endif
-	unsigned long raw, random = get_random_boot();
-	bool use_i8254 = true;
-
-	debug_putstr(purpose);
-	debug_putstr(" KASLR using");
-
-	if (has_cpuflag(X86_FEATURE_RDRAND)) {
-		debug_putstr(" RDRAND");
-		if (rdrand_long(&raw)) {
-			random ^= raw;
-			use_i8254 = false;
-		}
-	}
-
-	if (has_cpuflag(X86_FEATURE_TSC)) {
-		debug_putstr(" RDTSC");
-		raw = rdtsc();
-
-		random ^= raw;
-		use_i8254 = false;
-	}
-
-	if (use_i8254) {
-		debug_putstr(" i8254");
-		random ^= i8254();
-	}
-
-	/* Circular multiply for better bit diffusion */
-	asm("mul %3"
-	    : "=a" (random), "=d" (raw)
-	    : "a" (random), "rm" (mix_const));
-	random += raw;
-
-	debug_putstr("...\n");
-
-	return random;
-}
+#define KASLR_COMPRESSED_BOOT
+#include "../../lib/kaslr.c"
 
 struct mem_vector {
 	unsigned long start;
@@ -349,7 +283,7 @@ static unsigned long slots_fetch_random(void)
 	if (slot_max == 0)
 		return 0;
 
-	slot = get_random_long("Physical") % slot_max;
+	slot = kaslr_get_random_long("Physical") % slot_max;
 
 	for (i = 0; i < slot_area_index; i++) {
 		if (slot >= slot_areas[i].num) {
@@ -479,7 +413,7 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
 	slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
 		 CONFIG_PHYSICAL_ALIGN + 1;
 
-	random_addr = get_random_long("Virtual") % slots;
+	random_addr = kaslr_get_random_long("Virtual") % slots;
 
 	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
 }

diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
new file mode 100644
index 0000000..5547438
--- /dev/null
+++ b/arch/x86/include/asm/kaslr.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_KASLR_H_
+#define _ASM_KASLR_H_
+
+unsigned long kaslr_get_random_long(const char *purpose);
+
+#endif

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a5767..cfa6d07 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,6 +24,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o

diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
new file mode 100644
index 0000000..f7dfeda
--- /dev/null
+++ b/arch/x86/lib/kaslr.c
@@ -0,0 +1,90 @@
+/*
+ * Entropy functions used on early boot for KASLR base and memory
+ * randomization. The base randomization is done in the compressed
+ * kernel and memory randomization is done early when the regular
+ * kernel starts. This file is included in the compressed kernel and
+ * normally linked in the regular.
+ */
+#include <asm/kaslr.h>
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+
+/*
+ * When built for the regular kernel, several functions need to be stubbed out
+ * or changed to their regular kernel equivalent.
+ */
+#ifndef KASLR_COMPRESSED_BOOT
+#include <asm/cpufeature.h>
+#include <asm/setup.h>
+
+#define debug_putstr(v) early_printk(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#define get_boot_seed() kaslr_offset()
+#endif
+
+#define I8254_PORT_CONTROL	0x43
+#define I8254_PORT_COUNTER0	0x40
+#define I8254_CMD_READBACK	0xC0
+#define I8254_SELECT_COUNTER0	0x02
+#define I8254_STATUS_NOTREADY	0x40
+static inline u16 i8254(void)
+{
+	u16 status, timer;
+
+	do {
+		outb(I8254_PORT_CONTROL,
+		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+		status = inb(I8254_PORT_COUNTER0);
+		timer = inb(I8254_PORT_COUNTER0);
+		timer |= inb(I8254_PORT_COUNTER0) << 8;
+	} while (status & I8254_STATUS_NOTREADY);
+
+	return timer;
+}
+
+unsigned long kaslr_get_random_long(const char *purpose)
+{
+#ifdef CONFIG_X86_64
+	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+	const unsigned long mix_const = 0x3f39e593UL;
+#endif
+	unsigned long raw, random = get_boot_seed();
+	bool use_i8254 = true;
+
+	debug_putstr(purpose);
+	debug_putstr(" KASLR using");
+
+	if (has_cpuflag(X86_FEATURE_RDRAND)) {
+		debug_putstr(" RDRAND");
+		if (rdrand_long(&raw)) {
+			random ^= raw;
+			use_i8254 = false;
+		}
+	}
+
+	if (has_cpuflag(X86_FEATURE_TSC)) {
+		debug_putstr(" RDTSC");
+		raw = rdtsc();
+
+		random ^= raw;
+		use_i8254 = false;
+	}
+
+	if (use_i8254) {
+		debug_putstr(" i8254");
+		random ^= i8254();
+	}
+
+	/* Circular multiply for better bit diffusion */
+	asm("mul %3"
+	    : "=a" (random), "=d" (raw)
+	    : "a" (random), "rm" (mix_const));
+	random += raw;
+
+	debug_putstr("...\n");
+
+	return random;
+}

-- cgit v0.10.2
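Both existing call sites reduce the returned bits to whatever range they
need; a usage sketch based on the call sites above:

	/* Pick one of slot_max equally likely physical slots. */
	slot = kaslr_get_random_long("Physical") % slot_max;

The "purpose" string only feeds the boot-time debug output (e.g.
"Physical KASLR using RDRAND RDTSC...").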
From 59b3d0206d74a700069e49160e8194b2ca93b703 Mon Sep 17 00:00:00 2001
From: Thomas Garnier
Date: Tue, 21 Jun 2016 17:46:59 -0700
Subject: x86/mm: Update physical mapping variable names

Change the variable names in kernel_physical_mapping_init() and related
functions to correctly reflect physical and virtual memory addresses.
Also add comments on each function to describe usage and alignment
constraints.

Signed-off-by: Thomas Garnier
Signed-off-by: Kees Cook
Cc: Alexander Kuleshov
Cc: Alexander Popov
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Aneesh Kumar K.V
Cc: Baoquan He
Cc: Boris Ostrovsky
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Christian Borntraeger
Cc: Dan Williams
Cc: Dave Hansen
Cc: Dave Young
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H.
Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bce2e5d..6714712 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -328,22 +328,30 @@ void __init cleanup_highmap(void) } } +/* + * Create PTE level page table mapping for physical addresses. + * It returns the last physical address mapped. + */ static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, +phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pgprot_t prot) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + pte_t *pte; int i; - pte_t *pte = pte_page + pte_index(addr); + pte = pte_page + pte_index(paddr); + i = pte_index(paddr); - for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { - next = (addr & PAGE_MASK) + PAGE_SIZE; - if (addr >= end) { + for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) { + paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && - !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PAGE_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PAGE_MASK, paddr_next, + E820_RESERVED_KERN)) set_pte(pte, __pte(0)); continue; } @@ -361,37 +369,44 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, } if (0) - printk(" pte=%p addr=%lx pte=%016lx\n", - pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, + pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); - last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } update_page_count(PG_LEVEL_4K, pages); - return last_map_addr; + return paddr_last; } +/* + * Create PMD level page table mapping for physical addresses. The virtual + * and physical address have to be aligned at this level. + * It returns the last physical address mapped. 
+ */ static unsigned long __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, +phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, pgprot_t prot) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; - int i = pmd_index(address); + int i = pmd_index(paddr); - for (; i < PTRS_PER_PMD; i++, address = next) { - pmd_t *pmd = pmd_page + pmd_index(address); + for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) { + pmd_t *pmd = pmd_page + pmd_index(paddr); pte_t *pte; pgprot_t new_prot = prot; - next = (address & PMD_MASK) + PMD_SIZE; - if (address >= end) { + paddr_next = (paddr & PMD_MASK) + PMD_SIZE; + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && - !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PMD_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PMD_MASK, paddr_next, + E820_RESERVED_KERN)) set_pmd(pmd, __pmd(0)); continue; } @@ -400,8 +415,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); - last_map_addr = phys_pte_init(pte, address, - end, prot); + paddr_last = phys_pte_init(pte, paddr, + paddr_end, prot); spin_unlock(&init_mm.page_table_lock); continue; } @@ -420,7 +435,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (page_size_mask & (1 << PG_LEVEL_2M)) { if (!after_bootmem) pages++; - last_map_addr = next; + paddr_last = paddr_next; continue; } new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -430,42 +445,49 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); - last_map_addr = next; + paddr_last = paddr_next; continue; } pte = alloc_low_page(); - last_map_addr = phys_pte_init(pte, address, end, new_prot); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); - return last_map_addr; + return paddr_last; } +/* + * Create PUD level page table mapping for physical addresses. The virtual + * and physical address have to be aligned at this level. + * It returns the last physical address mapped. 
+ */ static unsigned long __meminit -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, - unsigned long page_size_mask) +phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) { - unsigned long pages = 0, next; - unsigned long last_map_addr = end; - int i = pud_index(addr); + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + int i = pud_index(paddr); - for (; i < PTRS_PER_PUD; i++, addr = next) { - pud_t *pud = pud_page + pud_index(addr); + for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud = pud_page + pud_index(paddr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - next = (addr & PUD_MASK) + PUD_SIZE; - if (addr >= end) { + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && - !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + !e820_any_mapped(paddr & PUD_MASK, paddr_next, + E820_RAM) && + !e820_any_mapped(paddr & PUD_MASK, paddr_next, + E820_RESERVED_KERN)) set_pud(pud, __pud(0)); continue; } @@ -473,8 +495,10 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { pmd = pmd_offset(pud, 0); - last_map_addr = phys_pmd_init(pmd, addr, end, - page_size_mask, prot); + paddr_last = phys_pmd_init(pmd, paddr, + paddr_end, + page_size_mask, + prot); __flush_tlb_all(); continue; } @@ -493,7 +517,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (page_size_mask & (1 << PG_LEVEL_1G)) { if (!after_bootmem) pages++; - last_map_addr = next; + paddr_last = paddr_next; continue; } prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -503,16 +527,16 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); - last_map_addr = next; + paddr_last = paddr_next; continue; } pmd = alloc_low_page(); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, - prot); + paddr_last = phys_pmd_init(pmd, paddr, paddr_end, + page_size_mask, prot); spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, pmd); @@ -522,38 +546,44 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, pages); - return last_map_addr; + return paddr_last; } +/* + * Create page table mapping for the physical memory for specific physical + * addresses. The virtual and physical addresses have to be aligned on PUD level + * down. It returns the last physical address mapped. 
+ */ unsigned long __meminit -kernel_physical_mapping_init(unsigned long start, - unsigned long end, +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, unsigned long page_size_mask) { bool pgd_changed = false; - unsigned long next, last_map_addr = end; - unsigned long addr; + unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); - addr = start; + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr_start); + vaddr_end = (unsigned long)__va(paddr_end); + vaddr_start = vaddr; - for (; start < end; start = next) { - pgd_t *pgd = pgd_offset_k(start); + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + pgd_t *pgd = pgd_offset_k(vaddr); pud_t *pud; - next = (start & PGDIR_MASK) + PGDIR_SIZE; + vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); - last_map_addr = phys_pud_init(pud, __pa(start), - __pa(end), page_size_mask); + paddr_last = phys_pud_init(pud, __pa(vaddr), + __pa(vaddr_end), + page_size_mask); continue; } pud = alloc_low_page(); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), - page_size_mask); + paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), + page_size_mask); spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, pud); @@ -562,11 +592,11 @@ kernel_physical_mapping_init(unsigned long start, } if (pgd_changed) - sync_global_pgds(addr, end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1, 0); __flush_tlb_all(); - return last_map_addr; + return paddr_last; } #ifndef CONFIG_NUMA -- cgit v0.10.2 From faa379332f3cb3375db1849e27386f8bc9b97da4 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:00 -0700 Subject: x86/mm: Add PUD VA support for physical mapping Minor change that allows early boot physical mapping of PUD level virtual addresses. The current implementation expects the virtual address to be PUD aligned. For KASLR memory randomization, we need to be able to randomize the offset used on the PUD table. It has no impact on current usage. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6714712..7bf1ddb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -465,7 +465,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, /* * Create PUD level page table mapping for physical addresses. The virtual - * and physical address have to be aligned at this level. + * and physical address do not have to be aligned at this level. KASLR can + * randomize virtual addresses up to this level. * It returns the last physical address mapped. 
 */
 static unsigned long __meminit
@@ -474,14 +475,18 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 {
 	unsigned long pages = 0, paddr_next;
 	unsigned long paddr_last = paddr_end;
-	int i = pud_index(paddr);
+	unsigned long vaddr = (unsigned long)__va(paddr);
+	int i = pud_index(vaddr);
 
 	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
-		pud_t *pud = pud_page + pud_index(paddr);
+		pud_t *pud;
 		pmd_t *pmd;
 		pgprot_t prot = PAGE_KERNEL;
 
+		vaddr = (unsigned long)__va(paddr);
+		pud = pud_page + pud_index(vaddr);
 		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
 		if (paddr >= paddr_end) {
 			if (!after_bootmem &&
 			    !e820_any_mapped(paddr & PUD_MASK, paddr_next,
@@ -551,7 +556,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 
 /*
  * Create page table mapping for the physical memory for specific physical
- * addresses. The virtual and physical addresses have to be aligned on PUD level
+ * addresses. The virtual and physical addresses have to be aligned on PMD level
  * down. It returns the last physical address mapped.
  */
 unsigned long __meminit

-- cgit v0.10.2
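The key detail above is that the PUD index is now derived from the
virtual address rather than the physical one, i.e. (a condensed view of
the resulting logic, not additional code):

	vaddr = (unsigned long)__va(paddr);
	pud = pud_page + pud_index(vaddr);	/* was: pud_index(paddr) */

With the default, PGD-aligned __PAGE_OFFSET the two indexes are equal,
which is why this has no impact on current usage; once KASLR randomizes
the base at PUD granularity they can differ.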
From b234e8a09003af108d3573f0369e25c080676b14 Mon Sep 17 00:00:00 2001
From: Thomas Garnier
Date: Tue, 21 Jun 2016 17:47:01 -0700
Subject: x86/mm: Separate variable for trampoline PGD

Use a separate global variable to define the trampoline PGD used to
start other processors. This change will allow KASLR memory
randomization to change the trampoline PGD to be correctly aligned
with physical memory.

Signed-off-by: Thomas Garnier
Signed-off-by: Kees Cook
Cc: Alexander Kuleshov
Cc: Alexander Popov
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Aneesh Kumar K.V
Cc: Baoquan He
Cc: Boris Ostrovsky
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Christian Borntraeger
Cc: Dan Williams
Cc: Dave Hansen
Cc: Dave Young
Cc: Denys Vlasenko
Cc: Dmitry Vyukov
Cc: H. Peter Anvin
Cc: Jan Beulich
Cc: Joerg Roedel
Cc: Jonathan Corbet
Cc: Josh Poimboeuf
Cc: Juergen Gross
Cc: Kirill A. Shutemov
Cc: Linus Torvalds
Cc: Lv Zheng
Cc: Mark Salter
Cc: Martin Schwidefsky
Cc: Matt Fleming
Cc: Peter Zijlstra
Cc: Stephen Smalley
Cc: Thomas Gleixner
Cc: Toshi Kani
Cc: Xiao Guangrong
Cc: Yinghai Lu
Cc: kernel-hardening@lists.openwall.com
Cc: linux-doc@vger.kernel.org
Link: http://lkml.kernel.org/r/1466556426-32664-5-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1a27396..d455bef 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -729,6 +729,18 @@ extern int direct_gbpages;
 void init_mem_mapping(void);
 void early_alloc_pgt_buf(void);
 
+#ifdef CONFIG_X86_64
+/* Realmode trampoline initialization. */
+extern pgd_t trampoline_pgd_entry;
+static inline void __meminit init_trampoline(void)
+{
+	/* Default trampoline pgd value */
+	trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+}
+#else
+static inline void init_trampoline(void) { }
+#endif
+
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
 {

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 372aad2..4252acd 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -590,6 +590,9 @@ void __init init_mem_mapping(void)
 	/* the ISA range is always mapped regardless of memory holes */
 	init_memory_mapping(0, ISA_END_ADDRESS);
 
+	/* Init the trampoline, possibly with KASLR memory offset */
+	init_trampoline();
+
 	/*
 	 * If the allocation is in bottom-up direction, we setup direct mapping
 	 * in bottom-up, otherwise we setup direct mapping in top-down.

diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 0b7a63d..705e3ff 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,6 +8,9 @@
 struct real_mode_header *real_mode_header;
 u32 *trampoline_cr4_features;
 
+/* Hold the pgd entry used on booting additional CPUs */
+pgd_t trampoline_pgd_entry;
+
 void __init reserve_real_mode(void)
 {
 	phys_addr_t mem;
@@ -84,7 +87,7 @@ void __init setup_real_mode(void)
 	*trampoline_cr4_features = __read_cr4();
 
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
-	trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
 	trampoline_pgd[511] = init_level4_pgt[511].pgd;
 #endif
 }

-- cgit v0.10.2

From 0483e1fa6e09d4948272680f691dccb1edb9677f Mon Sep 17 00:00:00 2001
From: Thomas Garnier
Date: Tue, 21 Jun 2016 17:47:02 -0700
Subject: x86/mm: Implement ASLR for kernel memory regions

Randomizes the virtual address space of kernel memory regions for
x86_64. This first patch adds the infrastructure and does not
randomize any region. The following patches will randomize the
physical memory mapping, vmalloc and vmemmap regions.

This security feature mitigates exploits relying on predictable kernel
addresses. These addresses can be used to disclose the kernel module
base addresses or corrupt specific structures to elevate privileges
bypassing the current implementation of KASLR. This feature can be
enabled with the CONFIG_RANDOMIZE_MEMORY option.

The order of each memory region is not changed. The feature looks at
the available space for the regions based on different configuration
options and randomizes the base and space between each. The size of
the physical memory mapping is the available physical memory. No
performance impact was detected while testing the feature.

Entropy is generated using the KASLR early boot functions now shared
in the lib directory (originally written by Kees Cook). Randomization
is done on PGD & PUD page table levels to increase possible addresses.
The physical memory mapping code was adapted to support PUD level
virtual addresses. This implementation on the best configuration
provides 30,000 possible virtual addresses on average for each memory
region. An additional low memory page is used to ensure each CPU can
start with a PGD aligned virtual address (for realmode).

x86/dump_pagetable was updated to correctly display each region.

Updated documentation on x86_64 memory layout accordingly.
Performance data, after all patches in the series: Kernbench shows almost no difference (+/- less than 1%): Before: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.63 (1.2695) User Time 1034.89 (1.18115) System Time 87.056 (0.456416) Percent CPU 1092.9 (13.892) Context Switches 199805 (3455.33) Sleeps 97907.8 (900.636) After: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.489 (1.10636) User Time 1034.86 (1.36053) System Time 87.764 (0.49345) Percent CPU 1095 (12.7715) Context Switches 199036 (4298.1) Sleeps 97681.6 (1031.11) Hackbench shows 0% difference on average (hackbench 90 repeated 10 times): attempt,before,after 1,0.076,0.069 2,0.072,0.069 3,0.066,0.066 4,0.066,0.068 5,0.066,0.067 6,0.066,0.069 7,0.067,0.066 8,0.063,0.067 9,0.067,0.065 10,0.068,0.071 average,0.0677,0.0677 Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5aa7383..8c7dd59 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -39,4 +39,8 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. +Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all +physical memory, vmalloc/ioremap space and virtual memory map are randomized. +Their order is preserved but their base will be offset early at boot time. + -Andi Kleen, Jul 2004 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 930fe88..9719b8e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1993,6 +1993,23 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config RANDOMIZE_MEMORY + bool "Randomize the kernel memory sections" + depends on X86_64 + depends on RANDOMIZE_BASE + default RANDOMIZE_BASE + ---help--- + Randomizes the base virtual address of kernel memory sections + (physical memory mapping, vmalloc & vmemmap). This security feature + makes exploits relying on predictable memory locations less reliable. + + The order of allocations remains unchanged. Entropy is generated in + the same way as RANDOMIZE_BASE. The current implementation in the optimal + configuration has on average 30,000 different possible virtual + addresses for each memory section. + + If unsure, say N.
+ config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 5547438..683c9d7 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -3,4 +3,10 @@ unsigned long kaslr_get_random_long(const char *purpose); +#ifdef CONFIG_RANDOMIZE_MEMORY +void kernel_randomize_memory(void); +#else +static inline void kernel_randomize_memory(void) { } +#endif /* CONFIG_RANDOMIZE_MEMORY */ + #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d455bef..5472682 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -732,11 +732,16 @@ void early_alloc_pgt_buf(void); #ifdef CONFIG_X86_64 /* Realmode trampoline initialization. */ extern pgd_t trampoline_pgd_entry; -static inline void __meminit init_trampoline(void) +static inline void __meminit init_trampoline_default(void) { /* Default trampoline pgd value */ trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; } +# ifdef CONFIG_RANDOMIZE_MEMORY +void __meminit init_trampoline(void); +# else +# define init_trampoline init_trampoline_default +# endif #else static inline void init_trampoline(void) { } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4e7b39..a261658 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -942,6 +943,8 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + kernel_randomize_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 62c0043..96d2b84 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -37,4 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 99bfb19..9a17250 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -72,9 +72,9 @@ static struct addr_marker address_markers[] = { { 0, "User Space" }, #ifdef CONFIG_X86_64 { 0x8000000000000000UL, "Kernel Space" }, - { PAGE_OFFSET, "Low Kernel Mapping" }, - { VMALLOC_START, "vmalloc() Area" }, - { VMEMMAP_START, "Vmemmap" }, + { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/* VMEMMAP_START */, "Vmemmap" }, # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif @@ -434,8 +434,16 @@ void ptdump_walk_pgd_level_checkwx(void) static int __init pt_dump_init(void) { + /* + * Various markers are not compile-time constants, so assign them + * here. 
+ */ +#ifdef CONFIG_X86_64 + address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#endif #ifdef CONFIG_X86_32 - /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4252acd..cc82830 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,6 +17,7 @@ #include #include /* for MAX_DMA_PFN */ #include +#include /* * We need to define the tracepoints somewhere, and tlb.c diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c new file mode 100644 index 0000000..d5380a4 --- /dev/null +++ b/arch/x86/mm/kaslr.c @@ -0,0 +1,152 @@ +/* + * This file implements KASLR memory randomization for x86_64. It randomizes + * the virtual address space of kernel memory regions (physical memory + * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates + * exploits relying on predictable kernel addresses. + * + * Entropy is generated using the KASLR early boot functions now shared in + * the lib directory (originally written by Kees Cook). Randomization is + * done on PGD & PUD page table levels to increase possible addresses. The + * physical memory mapping code was adapted to support PUD level virtual + * addresses. This implementation on the best configuration provides 30,000 + * possible virtual addresses on average for each memory region. An additional + * low memory page is used to ensure each CPU can start with a PGD aligned + * virtual address (for realmode). + * + * The order of each memory region is not changed. The feature looks at + * the available space for the regions based on different configuration + * options and randomizes the base and space between each. The size of the + * physical memory mapping is the available physical memory. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "mm_internal.h" + +#define TB_SHIFT 40 + +/* + * Virtual address start and end range for randomization. The end changes based + * on configuration to have the highest amount of space for randomization. + * It increases the possible random position for each randomized region. + * + * You need to add an if/def entry if you introduce a new memory region + * compatible with KASLR. Your entry must be in logical order with memory + * layout. For example, ESPFIX is before EFI because its virtual address is + * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * ensure that this order is correct and won't be changed. + */ +static const unsigned long vaddr_start; +static const unsigned long vaddr_end; + +/* + * Memory regions randomized by KASLR (except modules that use a separate logic + * earlier during boot). The list is ordered based on virtual addresses. This + * order is kept after randomization. + */ +static __initdata struct kaslr_memory_region { + unsigned long *base; + unsigned long size_tb; +} kaslr_regions[] = { +}; + +/* Get size in bytes used by the memory region */ +static inline unsigned long get_padding(struct kaslr_memory_region *region) +{ + return (region->size_tb << TB_SHIFT); +} + +/* + * Apply no randomization if KASLR was disabled at boot or if KASAN + * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
+ */ +static inline bool kaslr_memory_enabled(void) +{ + return kaslr_enabled() && !config_enabled(CONFIG_KASAN); +} + +/* Initialize base and padding for each memory region randomized with KASLR */ +void __init kernel_randomize_memory(void) +{ + size_t i; + unsigned long vaddr = vaddr_start; + unsigned long rand; + struct rnd_state rand_state; + unsigned long remain_entropy; + + if (!kaslr_memory_enabled()) + return; + + /* Calculate entropy available between regions */ + remain_entropy = vaddr_end - vaddr_start; + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) + remain_entropy -= get_padding(&kaslr_regions[i]); + + prandom_seed_state(&rand_state, kaslr_get_random_long("Memory")); + + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) { + unsigned long entropy; + + /* + * Select a random virtual address using the extra entropy + * available. + */ + entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); + prandom_bytes_state(&rand_state, &rand, sizeof(rand)); + entropy = (rand % (entropy + 1)) & PUD_MASK; + vaddr += entropy; + *kaslr_regions[i].base = vaddr; + + /* + * Jump the region and add a minimum padding based on + * randomization alignment. + */ + vaddr += get_padding(&kaslr_regions[i]); + vaddr = round_up(vaddr + 1, PUD_SIZE); + remain_entropy -= entropy; + } +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. + */ +void __meminit init_trampoline(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + pud_t *pud_page, *pud_page_tramp; + int i; + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + pud_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + pud_page = (pud_t *) pgd_page_vaddr(*pgd); + + for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud, *pud_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + pud_tramp = pud_page_tramp + pud_index(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + *pud_tramp = *pud; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); +} -- cgit v0.10.2 From 021182e52fe01c1f7b126f97fd6ba048dc4234fd Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:03 -0700 Subject: x86/mm: Enable KASLR for physical mapping memory regions Add the physical mapping to the list of randomized memory regions. The physical memory mapping holds most allocations from boot and heap allocators. Knowing the base address and physical memory size, an attacker can deduce the PDE virtual address for the vDSO memory page. This attack was demonstrated at CanSecWest 2016, in the following presentation: "Getting Physical: Extreme Abuse of Intel Based Paged Systems": https://github.com/n3k/CansecWest2016_Getting_Physical_Extreme_Abuse_of_Intel_Based_Paging_Systems/blob/master/Presentation/CanSec2016_Presentation.pdf (See second part of the presentation). The exploits used against Linux worked successfully against 4.6+ but fail with KASLR memory enabled: https://github.com/n3k/CansecWest2016_Getting_Physical_Extreme_Abuse_of_Intel_Based_Paging_Systems/tree/master/Demos/Linux/exploits Similar research was done at Google, leading to this patch proposal. Variants exist to overwrite /proc or /sys object ACLs, leading to elevation of privileges. These variants were tested against 4.6+.
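To see why the direct mapping is such a convenient target, recall that __va() is a plain offset from PAGE_OFFSET; a minimal userspace sketch (all addresses below are made-up example values, not real kernel bases):

    #include <stdio.h>

    int main(void)
    {
        unsigned long fixed_base = 0xffff880000000000UL; /* old fixed __PAGE_OFFSET */
        unsigned long rand_base  = 0xffff9a4000000000UL; /* made-up randomized base */
        unsigned long paddr      = 0x1234000UL;          /* some physical page */

        /* In the direct mapping, __va(paddr) is simply paddr + PAGE_OFFSET: */
        printf("fixed base:      0x%lx\n", fixed_base + paddr);
        printf("randomized base: 0x%lx\n", rand_base + paddr);
        return 0;
    }

With a fixed base, the first address is fully predictable from the physical address alone; randomizing page_offset_base removes exactly that shortcut.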
The page offset used by the compressed kernel retains the static value since it is not yet randomized during this boot stage. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 6e31a6a..56589d0 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -20,6 +20,9 @@ /* These actually do the work of building the kernel identity maps. */ #include #include +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE #include "../../mm/ident_map.c" /* Used by pgtable.h asm code to force instruction serialization. */ diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 683c9d7..62b1b81 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -4,6 +4,8 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY +extern unsigned long page_offset_base; + void kernel_randomize_memory(void); #else static inline void kernel_randomize_memory(void) { } diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d5c2f8b..9215e05 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H +#ifndef __ASSEMBLY__ +#include +#endif + #ifdef CONFIG_KASAN #define KASAN_STACK_ORDER 1 #else @@ -32,7 +36,12 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. */ -#define __PAGE_OFFSET _AC(0xffff880000000000, UL) +#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) +#ifdef CONFIG_RANDOMIZE_MEMORY +#define __PAGE_OFFSET page_offset_base +#else +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#endif /* CONFIG_RANDOMIZE_MEMORY */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index c7920ba..9f8efc9 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index d5380a4..609ecf2 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -43,8 +43,12 @@ * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to * ensure that this order is correct and won't be changed. 
*/ -static const unsigned long vaddr_start; -static const unsigned long vaddr_end; +static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; +static const unsigned long vaddr_end = VMALLOC_START; + +/* Default values */ +unsigned long page_offset_base = __PAGE_OFFSET_BASE; +EXPORT_SYMBOL(page_offset_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -55,6 +59,7 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { + { &page_offset_base, 64/* Maximum */ }, }; /* Get size in bytes used by the memory region */ @@ -77,13 +82,20 @@ void __init kernel_randomize_memory(void) { size_t i; unsigned long vaddr = vaddr_start; - unsigned long rand; + unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; if (!kaslr_memory_enabled()) return; + BUG_ON(kaslr_regions[0].base != &page_offset_base); + memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT); + + /* Adapt physical memory region size based on available memory */ + if (memory_tb < kaslr_regions[0].size_tb) + kaslr_regions[0].size_tb = memory_tb; + /* Calculate entropy available between regions */ remain_entropy = vaddr_end - vaddr_start; for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) -- cgit v0.10.2 From a95ae27c2ee1cba5f4f6b9dea43ffe88252e79b1 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:04 -0700 Subject: x86/mm: Enable KASLR for vmalloc memory regions Add vmalloc to the list of randomized memory regions. The vmalloc memory region contains the allocations made through the vmalloc() API. The allocations are done sequentially to prevent fragmentation and each allocation address can easily be deduced, especially from boot. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-8-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 62b1b81..2674ee3 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -5,6 +5,7 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY extern unsigned long page_offset_base; +extern unsigned long vmalloc_base; void kernel_randomize_memory(void); #else diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index e6844df..6fdef9e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include +#include /* * These are used to make use of C type-checking.. @@ -53,10 +54,16 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map.
*/ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc90000000000, UL) -#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_SIZE_TB _AC(32, UL) +#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#ifdef CONFIG_RANDOMIZE_MEMORY +#define VMALLOC_START vmalloc_base +#else +#define VMALLOC_START __VMALLOC_BASE +#endif /* CONFIG_RANDOMIZE_MEMORY */ +#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 609ecf2..c939cfe 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -44,11 +44,13 @@ * ensure that this order is correct and won't be changed. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMALLOC_START; +static const unsigned long vaddr_end = VMEMMAP_START; /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base = __VMALLOC_BASE; +EXPORT_SYMBOL(vmalloc_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -60,6 +62,7 @@ static __initdata struct kaslr_memory_region { unsigned long size_tb; } kaslr_regions[] = { { &page_offset_base, 64/* Maximum */ }, + { &vmalloc_base, VMALLOC_SIZE_TB }, }; /* Get size in bytes used by the memory region */ -- cgit v0.10.2 From 90397a41779645d3abba5599f6bb538fdcab9339 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:06 -0700 Subject: x86/mm: Add memory hotplug support for KASLR memory randomization Add a new option (CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING) to define the padding used for the physical memory mapping section when KASLR memory is enabled. It ensures there is enough virtual address space when CONFIG_MEMORY_HOTPLUG is used. The default value is 10 terabytes. If CONFIG_MEMORY_HOTPLUG is not used, no space is reserved, increasing the entropy available. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-10-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9719b8e..703413f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2010,6 +2010,21 @@ config RANDOMIZE_MEMORY If unsure, say N.
+config RANDOMIZE_MEMORY_PHYSICAL_PADDING + hex "Physical memory mapping padding" if EXPERT + depends on RANDOMIZE_MEMORY + default "0xa" if MEMORY_HOTPLUG + default "0x0" + range 0x1 0x40 if MEMORY_HOTPLUG + range 0x0 0x40 + ---help--- + Define the padding in terabytes added to the existing physical + memory size during kernel memory randomization. It is useful + for memory hotplug support but reduces the entropy available for + address randomization. + + If unsure, leave at the default value. + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index c939cfe..26dccd6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -92,8 +92,13 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; + /* + * Adapt the physical memory mapping size to the available memory and + * add padding if needed (especially for memory hotplug support). + */ BUG_ON(kaslr_regions[0].base != &page_offset_base); - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT); + memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt physical memory region size based on available memory */ if (memory_tb < kaslr_regions[0].size_tb) -- cgit v0.10.2 From 4ff5308744f5858e4e49e56a0445e2f8b73e47e0 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 15 Jun 2016 12:05:45 -0700 Subject: x86/mm: Do not reference phys addr beyond kernel The new physical address randomized KASLR implementation can cause the kernel to be aligned close to the end of physical memory. In this case, _brk_end aligned to PMD will go beyond what is expected to be safe and hit the assert in __phys_addr_symbol(): VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); Instead, perform an inclusive range check to avoid incorrectly triggering the assert: kernel BUG at arch/x86/mm/physaddr.c:38! invalid opcode: 0000 [#1] SMP ... RIP: 0010:[] __phys_addr_symbol+0x41/0x50 ... Call Trace: [] cpa_process_alias+0xa9/0x210 [] ? do_raw_spin_unlock+0xc1/0x100 [] __change_page_attr_set_clr+0x8cf/0xbd0 [] ? vm_unmap_aliases+0x7d/0x210 [] change_page_attr_set_clr+0x18c/0x4e0 [] set_memory_4k+0x2c/0x40 [] check_bugs+0x28/0x2a [] start_kernel+0x49d/0x4b9 [] ? early_idt_handler_array+0x120/0x120 [] x86_64_start_reservations+0x29/0x2b [] x86_64_start_kernel+0x143/0x152 Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Wilson Cc: Christian Borntraeger Cc: Denys Vlasenko Cc: Dexuan Cui Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Sai Praneeth Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/20160615190545.GA26071@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7a1f7bb..379b511 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -101,7 +101,8 @@ static inline unsigned long highmap_start_pfn(void) static inline unsigned long highmap_end_pfn(void) { - return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + /* Do not reference physical address outside the kernel.
*/ + return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } #endif @@ -112,6 +113,12 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + /* * Flushing functions */ @@ -1316,7 +1323,8 @@ static int cpa_process_alias(struct cpa_data *cpa) * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && - within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + within_inclusive(cpa->pfn, highmap_start_pfn(), + highmap_end_pfn())) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; -- cgit v0.10.2 From edce21216a8887bf06ba85ee49a00695e44c4341 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Jul 2016 09:53:52 +0200 Subject: x86/boot: Reorganize and clean up the BIOS area reservation code So the reserve_ebda_region() code has accumulated a number of problems over the years that make it really difficult to read and understand: - The calculation of 'lowmem' and 'ebda_addr' is an unnecessarily interleaved mess of first lowmem, then ebda_addr, then lowmem tweaks... - 'lowmem' here means 'super low mem' - i.e. 16-bit addressable memory. In other parts of the x86 code 'lowmem' means 32-bit addressable memory... This makes it super confusing to read. - It does not help at all that we have various memory range markers, half of which are 'start of range', half of which are 'end of range' - but this crucial property is not obvious in the naming at all ... gave me a headache trying to understand all this. - Also, the 'ebda_addr' name sucks: it highlights that it's an address (which is obvious, all values here are addresses!), while it does not highlight that it's the _start_ of the EBDA region ... - 'BIOS_LOWMEM_KILOBYTES' says a lot of things, except that this is the only value that is a pointer to a value, not a memory range address! - The function name itself is a misnomer: it says 'reserve_ebda_region()' while its main purpose is to reserve all the firmware ROM typically between 640K and 1MB, while the 'EBDA' part is only a small part of that ... - Likewise, the paravirt quirk flag name 'ebda_search' is misleading as well: this too should be about whether to reserve firmware areas in the paravirt case. - In fact thinking about this as 'end of RAM' is confusing: what this function *really* wants to reserve is firmware data and code areas! Once the thinking is inverted from a mixed 'ram' and 'reserved firmware area' notion to a pure 'reserved area' notion everything becomes a lot clearer. To improve all this rewrite the whole code (without changing the logic): - Firstly invert the naming from 'lowmem end' to 'BIOS reserved area start' and propagate this concept through all the variable names and constants. BIOS_RAM_SIZE_KB_PTR // was: BIOS_LOWMEM_KILOBYTES BIOS_START_MIN // was: INSANE_CUTOFF ebda_start // was: ebda_addr bios_start // was: lowmem BIOS_START_MAX // was: LOWMEM_CAP - Then clean up the name of the function itself by renaming it to reserve_bios_regions() and renaming the ::ebda_search paravirt flag to ::reserve_bios_regions. - Fix up all the comments (fix typos), harmonize and simplify their formulation and remove comments that become unnecessary due to the much better naming all around. 
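The filtering described above can be restated as one small pure function over the two firmware-provided values. The sketch below is only an illustration of the renamed scheme (the helper name and sample inputs are made up; the constants are the ones introduced in the patch below), showing every value as a "start of the reserved BIOS area" rather than an "end of RAM":

    #include <stdio.h>

    #define BIOS_START_MIN 0x20000U  /* below 128K: value is nonsense */
    #define BIOS_START_MAX 0x9f000U  /* 640K: absolute reservation cap */

    static unsigned int bios_area_start(unsigned int ram_size_kb,
                                        unsigned int ebda_start)
    {
        unsigned int bios_start = ram_size_kb << 10;

        /* Nonsense values are treated as "no information": */
        if (bios_start < BIOS_START_MIN)
            bios_start = BIOS_START_MAX;
        if (ebda_start < BIOS_START_MIN)
            ebda_start = BIOS_START_MAX;

        /* Reserve from the lower of the two marks, capped at 640K: */
        bios_start = bios_start < ebda_start ? bios_start : ebda_start;
        return bios_start < BIOS_START_MAX ? bios_start : BIOS_START_MAX;
    }

    int main(void)
    {
        /* 636K of base RAM reported, EBDA at 632K: reserve from 632K up. */
        printf("reserve 0x%x..0x100000\n", bios_area_start(636, 0x9e000));
        return 0;
    }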
Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 2b00c77..4b7b8e7 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,7 +17,7 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -void reserve_ebda_region(void); +void reserve_bios_regions(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4dcdf74..c519c05 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -168,14 +168,14 @@ struct x86_legacy_devices { * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present - * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * @reserve_bios_regions: it's safe to search for the EBDA signature in the hardware's * low RAM * @devices: legacy x86 devices, refer to struct x86_legacy_devices * documentation for further details. */ struct x86_legacy_features { int rtc; - int ebda_search; + int reserve_bios_regions; struct x86_legacy_devices devices; }; diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index afe65df..6219eef 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -6,66 +6,104 @@ #include /* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. 
+ * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. */ -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ +#define BIOS_RAM_SIZE_KB_PTR 0x413 -void __init reserve_ebda_region(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) { - unsigned int lowmem, ebda_addr; + unsigned int bios_start, ebda_start; /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. */ - if (!x86_platform.legacy.ebda_search) + if (!x86_platform.legacy.reserve_bios_regions) return; - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). + * Quirk: some old Dells seem to have a 4k EBDA without + * reporting so in their BIOS RAM size value, so just + * consider the memory above 640K to be off limits + * (bugzilla 2990). 
+ * + * We detect this case by filtering for nonsensical EBDA + * addresses below 128K, where we can assume that they + * are bogus and bump it up to a fixed 640K value: */ + if (ebda_start < BIOS_START_MIN) + ebda_start = BIOS_START_MAX; - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; + /* + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: + */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K: + */ + if (bios_start < BIOS_START_MIN) + bios_start = BIOS_START_MAX; - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + /* + * Use the lower of the bios_start and ebda_start + * as the starting point, but don't allow it to + * go beyond 640K: + */ + bios_start = min(bios_start, ebda_start); + bios_start = min(bios_start, BIOS_START_MAX); - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d784bb5..2dda0bc 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -26,7 +26,7 @@ static void __init i386_default_early_setup(void) x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - reserve_ebda_region(); + reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b72fb0b..99d48e7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,7 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_ebda_region(); + reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b2f8a33..24a5030 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,12 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; - x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: - x86_platform.legacy.ebda_search = 1; + x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: -- cgit v0.10.2 From 30f027398b329c75c8f23a3c13be240b50866fdc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jul 2016 14:16:51 -0700 Subject: x86/boot: Clarify what x86_legacy_features.reserve_bios_regions does It doesn't just control probing for the EBDA -- it controls whether we detect and reserve the <1MB BIOS regions in general. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Mario Limonciello Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/55bd591115498440d461857a7b64f349a5d911f3.1469135598.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c519c05..66c15a0 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -168,8 +168,9 @@ struct x86_legacy_devices { * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present - * @reserve_bios_regions: it's safe to search for the EBDA signature in the hardware's - * low RAM + * @reserve_bios_regions: boot code will search for the EBDA address and the + * start of the 640k - 1M BIOS region. If false, the platform must + * ensure that its memory map correctly reserves sub-1MB regions as needed. * @devices: legacy x86 devices, refer to struct x86_legacy_devices * documentation for further details. */ -- cgit v0.10.2 From 6a79296cb15d947bcb4558011fe066e5d8252b35 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jul 2016 14:16:52 -0700 Subject: x86/boot: Simplify EBDA-vs-BIOS reservation logic Both the intent and the effect of reserve_bios_regions() are simple: reserve the range from the apparent BIOS start (suitably filtered) through 1MB and, if the EBDA start address is sensible, extend that reservation downward to cover the EBDA as well. The code is overcomplicated, though, and contains head-scratchers like: if (ebda_start < BIOS_START_MIN) ebda_start = BIOS_START_MAX; That snippet is trying to say "if ebda_start < BIOS_START_MIN, ignore it". Simplify it: reorder the code so that it makes sense. This should have no functional effect under any circumstances. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Mario Limonciello Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/ef89c0c761be20ead8bd9a3275743e6259b6092a.1469135598.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index 6219eef..4312f8a 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -65,22 +65,6 @@ void __init reserve_bios_regions(void) if (!x86_platform.legacy.reserve_bios_regions) return; - /* Get the start address of the EBDA page: */ - ebda_start = get_bios_ebda(); - - /* - * Quirk: some old Dells seem to have a 4k EBDA without - * reporting so in their BIOS RAM size value, so just - * consider the memory above 640K to be off limits - * (bugzilla 2990). - * - * We detect this case by filtering for nonsensical EBDA - * addresses below 128K, where we can assume that they - * are bogus and bump it up to a fixed 640K value: - */ - if (ebda_start < BIOS_START_MIN) - ebda_start = BIOS_START_MAX; - /* * BIOS RAM size is encoded in kilobytes, convert it * to bytes to get a first guess at where the BIOS @@ -91,18 +75,22 @@ void __init reserve_bios_regions(void) /* * If bios_start is less than 128K, assume it is bogus - * and bump it up to 640K: + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it.
*/ - if (bios_start < BIOS_START_MIN) + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) bios_start = BIOS_START_MAX; + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); + /* - * Use the lower of the bios_start and ebda_start - * as the starting point, but don't allow it to - * go beyond 640K: + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. */ - bios_start = min(bios_start, ebda_start); - bios_start = min(bios_start, BIOS_START_MAX); + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; /* Reserve all memory between bios_start and the 1MB mark: */ memblock_reserve(bios_start, 0x100000 - bios_start); -- cgit v0.10.2
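As a final illustration, the simplified ordering can be restated as a small pure function and exercised with hypothetical firmware-reported values; this is a userspace sketch of the behavior the message claims (the helper name and inputs are made up, the constants come from ebda.c):

    #include <stdio.h>

    #define BIOS_START_MIN 0x20000U
    #define BIOS_START_MAX 0x9f000U

    static unsigned int reserve_start(unsigned int bios_start,
                                      unsigned int ebda_start)
    {
        /* Distrust anything outside the sane 128K..640K window: */
        if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
            bios_start = BIOS_START_MAX;

        /* A sane EBDA below the BIOS area extends the reservation down: */
        if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
            bios_start = ebda_start;

        return bios_start;
    }

    int main(void)
    {
        /* EBDA at 632K, BIOS area at 636K: reservation starts at 632K. */
        printf("0x%x\n", reserve_start(0x9f000, 0x9e000));
        /* Bogus EBDA pointer below 128K is ignored: start stays at 636K. */
        printf("0x%x\n", reserve_start(0x9f000, 0x1000));
        return 0;
    }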