From 9b238748cb6e9fadab0e761f6d30ba311b4ac470 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:10 -0700 Subject: x86/KASLR: Rename aslr.c to kaslr.c In order to avoid confusion over what this file provides, rename it to kaslr.c since it is used exclusively for the kernel ASLR, not userspace ASLR. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb2..542c92f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -62,7 +62,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c deleted file mode 100644 index 6a9b96b..0000000 --- a/arch/x86/boot/compressed/aslr.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "misc.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - -static unsigned long rotate_xor(unsigned long hash, const void *area, - size_t size) -{ - size_t i; - unsigned long *ptr = (unsigned long *)area; - - for (i = 0; i < size / sizeof(hash); i++) { - /* Rotate by odd number of bits and XOR. */ - hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); - hash ^= ptr[i]; - } - - return hash; -} - -/* Attempt to create a simple but unpredictable starting entropy. */ -static unsigned long get_random_boot(void) -{ - unsigned long hash = 0; - - hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); - - return hash; -} - -static unsigned long get_random_long(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr("KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} - -struct mem_vector { - unsigned long start; - unsigned long size; -}; - -#define MEM_AVOID_MAX 5 -static struct mem_vector mem_avoid[MEM_AVOID_MAX]; - -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - -static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) -{ - /* Item one is entirely before item two. */ - if (one->start + one->size <= two->start) - return false; - /* Item one is entirely after item two. */ - if (one->start >= two->start + two->size) - return false; - return true; -} - -static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) -{ - u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; - char *ptr; - - /* - * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). - */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; - - /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; - - /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; - /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) - ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; - - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; -} - -/* Does this memory vector overlap a known avoided area? */ -static bool mem_avoid_overlap(struct mem_vector *img) -{ - int i; - struct setup_data *ptr; - - for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; - } - - /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; - while (ptr) { - struct mem_vector avoid; - - avoid.start = (unsigned long)ptr; - avoid.size = sizeof(*ptr) + ptr->len; - - if (mem_overlaps(img, &avoid)) - return true; - - ptr = (struct setup_data *)(unsigned long)ptr->next; - } - - return false; -} - -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; -static unsigned long slot_max; - -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - -static unsigned long slots_fetch_random(void) -{ - /* Handle case of no slots stored. */ - if (slot_max == 0) - return 0; - - return slots[get_random_long() % slot_max]; -} - -static void process_e820_entry(struct e820entry *entry, - unsigned long minimum, - unsigned long image_size) -{ - struct mem_vector region, img; - - /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) - return; - - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) - return; - - region.start = entry->addr; - region.size = entry->size; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; - - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; - - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; - - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); - } -} - -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) -{ - int i; - unsigned long addr; - - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); - } - - return slots_fetch_random(); -} - -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_size) -{ - unsigned long choice = (unsigned long)output; - unsigned long random; - -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); - goto out; - } -#else - if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); - goto out; - } -#endif - - boot_params->hdr.loadflags |= KASLR_FLAG; - - /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); - - /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { - debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; - } - - /* Always enforce the minimum. */ - if (random < choice) - goto out; - - choice = random; -out: - return (unsigned char *)choice; -} diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 0000000..6a9b96b --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,339 @@ +#include "misc.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. */ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); + + return hash; +} + +static unsigned long get_random_long(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr("KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} + +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +#define MEM_AVOID_MAX 5 +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output, unsigned long output_size) +{ + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + unsigned long unsafe, unsafe_len; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression (see calculations at top of misc.c). + */ + unsafe_len = (output_size >> 12) + 32768 + 18; + unsafe = (unsigned long)input + input_size - unsafe_len; + mem_avoid[0].start = unsafe; + mem_avoid[0].size = unsafe_len; + + /* Avoid initrd. */ + initrd_start = (u64)real_mode->ext_ramdisk_image << 32; + initrd_start |= real_mode->hdr.ramdisk_image; + initrd_size = (u64)real_mode->ext_ramdisk_size << 32; + initrd_size |= real_mode->hdr.ramdisk_size; + mem_avoid[1].start = initrd_start; + mem_avoid[1].size = initrd_size; + + /* Avoid kernel command line. */ + cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line |= real_mode->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. */ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[2].start = cmd_line; + mem_avoid[2].size = cmd_line_size; + + /* Avoid heap memory. */ + mem_avoid[3].start = (unsigned long)free_mem_ptr; + mem_avoid[3].size = BOOT_HEAP_SIZE; + + /* Avoid stack memory. */ + mem_avoid[4].start = (unsigned long)free_mem_end_ptr; + mem_avoid[4].size = BOOT_STACK_SIZE; +} + +/* Does this memory vector overlap a known avoided area? */ +static bool mem_avoid_overlap(struct mem_vector *img) +{ + int i; + struct setup_data *ptr; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i])) + return true; + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid)) + return true; + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return false; +} + +static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / + CONFIG_PHYSICAL_ALIGN]; +static unsigned long slot_max; + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / + CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long() % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_addr(unsigned long minimum, + unsigned long size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < real_mode->e820_entries; i++) { + process_e820_entry(&real_mode->e820_map[i], minimum, size); + } + + return slots_fetch_random(); +} + +unsigned char *choose_kernel_location(struct boot_params *boot_params, + unsigned char *input, + unsigned long input_size, + unsigned char *output, + unsigned long output_size) +{ + unsigned long choice = (unsigned long)output; + unsigned long random; + +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + debug_putstr("KASLR disabled by default...\n"); + goto out; + } +#else + if (cmdline_find_option_bool("nokaslr")) { + debug_putstr("KASLR disabled by cmdline...\n"); + goto out; + } +#endif + + boot_params->hdr.loadflags |= KASLR_FLAG; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init((unsigned long)input, input_size, + (unsigned long)output, output_size); + + /* Walk e820 and find a random address. */ + random = find_random_addr(choice, output_size); + if (!random) { + debug_putstr("KASLR could not find suitable E820 region...\n"); + goto out; + } + + /* Always enforce the minimum. */ + if (random < choice) + goto out; + + choice = random; +out: + return (unsigned char *)choice; +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3..a8c4e08 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -66,7 +66,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ +/* kaslr.c */ unsigned char *choose_kernel_location(struct boot_params *boot_params, unsigned char *input, unsigned long input_size, -- cgit v0.10.2 From 206f25a8319b312b9983953a308b0e38e1943c1c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 Apr 2016 09:42:11 -0700 Subject: x86/KASLR: Remove unneeded boot_params argument Since the boot_params can be found using the real_mode global variable, there is no need to pass around a pointer to it. This slightly simplifies the choose_kernel_location function and its callers. [kees: rewrote changelog, tracked file rename] Signed-off-by: Yinghai Lu Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460997735-24785-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 6a9b96b..622aa88 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -295,8 +295,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -316,7 +315,7 @@ unsigned char *choose_kernel_location(struct boot_params *boot_params, } #endif - boot_params->hdr.loadflags |= KASLR_FLAG; + real_mode->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac17..f35ad9e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -428,7 +428,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, + output = choose_kernel_location(input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a8c4e08..22346d5 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,8 +67,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -76,8 +75,7 @@ unsigned char *choose_kernel_location(struct boot_params *boot_params, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) -- cgit v0.10.2 From 6655e0aaf768c39a62eea739c453b9db1e841cfb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:12 -0700 Subject: x86/boot: Rename "real_mode" to "boot_params" The non-compressed boot code uses the (much more obvious) name "boot_params" for the global pointer to the x86 boot parameters. The compressed kernel loader code, though, was using the legacy name "real_mode". There is no need to have a different name, and changing it improves readability. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e303..73ccf63 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 622aa88..a51ec84 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -55,7 +55,7 @@ static unsigned long get_random_boot(void) unsigned long hash = 0; hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); return hash; } @@ -152,16 +152,16 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[0].size = unsafe_len; /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; mem_avoid[1].start = initrd_start; mem_avoid[1].size = initrd_size; /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; /* Calculate size of cmd_line. */ ptr = (char *)(unsigned long)cmd_line; for (cmd_line_size = 0; ptr[cmd_line_size++]; ) @@ -190,7 +190,7 @@ static bool mem_avoid_overlap(struct mem_vector *img) } /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; while (ptr) { struct mem_vector avoid; @@ -288,8 +288,8 @@ static unsigned long find_random_addr(unsigned long minimum, minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, size); } return slots_fetch_random(); @@ -315,7 +315,7 @@ unsigned char *choose_kernel_location(unsigned char *input, } #endif - real_mode->hdr.loadflags |= KASLR_FLAG; + boot_params->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f35ad9e..462dfbf 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -114,7 +114,7 @@ static void error(char *m); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -184,12 +184,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +210,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -392,14 +392,15 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, { unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,8 +408,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); debug_putstr("early console in decompress_kernel\n"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 22346d5..1f750a51 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) -- cgit v0.10.2 From c04028813221c2d39a4f368586795ac4466d311c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:13 -0700 Subject: x86/boot: Clarify purpose of functions in misc.c The function "decompress_kernel" now performs many more duties, so this patch renames it to "extract_kernel" and updates callers and comments. Additionally the file header comment for misc.c is improved to actually describe what is contained. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064..26dd9df 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -233,9 +233,9 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ + /* push arguments for extract_kernel: */ pushl $z_run_size /* size of kernel with .bss and .brk */ pushl $z_output_len /* decompressed length, end of relocs */ leal z_extract_offset_negative(%ebx), %ebp @@ -246,11 +246,11 @@ relocated: leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ + call extract_kernel /* returns kernel location in %eax */ addl $28, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a1..d43c30e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -408,7 +408,7 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ @@ -419,7 +419,7 @@ relocated: movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ + call extract_kernel /* returns kernel location in %rax */ popq %r9 popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 462dfbf..0d69e80 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. + * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -383,7 +385,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, @@ -412,7 +414,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; -- cgit v0.10.2 From 7de828dfe607013546ece7ce25aa9839e8f93a66 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:14 -0700 Subject: x86/KASLR: Clarify purpose of kaslr.c The name "choose_kernel_location" isn't specific enough, and doesn't describe the primary thing it does: choosing a random location. This patch renames it to "choose_random_location", and clarifies the what routines are contained in the kaslr.c source file. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a51ec84..9e03190 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,14 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ #include "misc.h" #include @@ -295,7 +306,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0d69e80..ad8c01a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -431,7 +431,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_random_location(input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 1f750a51..9887e0d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,7 +67,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -75,7 +75,7 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) -- cgit v0.10.2 From 9016875df408fc5db6a94a3c5f5f5503c916cf81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:15 -0700 Subject: x86/KASLR: Rename "random" to "random_addr" The variable "random" is also the name of a libc function. It's better coding style to avoid overloading such things, so rename it to the more accurate "random_addr". Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9e03190..9c29e78 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -312,7 +312,7 @@ unsigned char *choose_random_location(unsigned char *input, unsigned long output_size) { unsigned long choice = (unsigned long)output; - unsigned long random; + unsigned long random_addr; #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { @@ -333,17 +333,17 @@ unsigned char *choose_random_location(unsigned char *input, (unsigned long)output, output_size); /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { + random_addr = find_random_addr(choice, output_size); + if (!random_addr) { debug_putstr("KASLR could not find suitable E820 region...\n"); goto out; } /* Always enforce the minimum. */ - if (random < choice) + if (random_addr < choice) goto out; - choice = random; + choice = random_addr; out: return (unsigned char *)choice; } -- cgit v0.10.2 From 4252db10559fc3d1efc1e43613254fdd220b014b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 20 Apr 2016 13:55:42 -0700 Subject: x86/KASLR: Update description for decompressor worst case size The comment that describes the analysis for the size of the decompressor code only took gzip into account (there are currently 6 other decompressors that could be used). The actual z_extract_offset calculation in code was already handling the correct maximum size, but this documentation hadn't been updated. This updates the documentation, fixes several typos, moves the comment to header.S, updates references, and adds a note at the end of the decompressor include list to remind us about updating the comment in the future. (Instead of moving the comment to mkpiggy.c, where the calculation is currently happening, it is being moved to header.S because the calculations in mkpiggy.c will be removed in favor of header.S calculations in a following patch, and it seemed like overkill to move the giant comment twice, especially when there's already reference to z_extract_offset in header.S.) Signed-off-by: Baoquan He [ Rewrote changelog, cleaned up comment style, moved comments around. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9c29e78..7d86c5d 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -155,7 +155,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). + * decompression (see calculations in ../header.S). */ unsafe_len = (output_size >> 12) + 32768 + 18; unsafe = (unsigned long)input + input_size - unsafe_len; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index ad8c01a..e96829b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,90 +14,13 @@ #include "misc.h" #include "../string.h" -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ - /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ #define STATIC static #undef memcpy @@ -148,6 +71,10 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9e..fd85b9e 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,6 +440,94 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. +# Adding the decompressor_size is necessary as it musht live after all +# of the data as well. Last I measured the decompressor is about 14K. +# 10K of actual data and 4K of bss. +# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# Note that this calculation, which results in z_extract_offset (below), +# is currently generated in compressed/mkpiggy.c + #define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -- cgit v0.10.2 From e8581e3d67788b6b29d055fa42c6cb5b258fee64 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 20 Apr 2016 13:55:43 -0700 Subject: x86/KASLR: Drop CONFIG_RANDOMIZE_BASE_MAX_OFFSET Currently CONFIG_RANDOMIZE_BASE_MAX_OFFSET is used to limit the maximum offset for kernel randomization. This limit doesn't need to be a CONFIG since it is tied completely to KERNEL_IMAGE_SIZE, and will make no sense once physical and virtual offsets are randomized separately. This patch removes CONFIG_RANDOMIZE_BASE_MAX_OFFSET and consolidates the Kconfig help text. [kees: rewrote changelog, dropped KERNEL_IMAGE_SIZE_DEFAULT, rewrote help] Signed-off-by: Baoquan He Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605..5892d54 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1932,54 +1932,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. + + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 7d86c5d..3ad71a0 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -217,15 +217,13 @@ static bool mem_avoid_overlap(struct mem_vector *img) return false; } -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; static unsigned long slot_max; static void slots_append(unsigned long addr) { /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) return; slots[slot_max++] = addr; @@ -251,7 +249,7 @@ static void process_e820_entry(struct e820entry *entry, return; /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + if (entry->addr >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ @@ -276,8 +274,8 @@ static void process_e820_entry(struct e820entry *entry, region.size -= region.start - entry->addr; /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; /* Walk each aligned slot and check for avoided areas. */ for (img.start = region.start, img.size = image_size ; diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0..d5c2f8b 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9..f2ee42d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); -- cgit v0.10.2 From 1f208de37d10bb9067f3b061d281363be0cd1805 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:44 -0700 Subject: x86/boot: Clean up things used by decompressors This rearranges the pieces needed to include the decompressor code in misc.c. It wasn't obvious why things were there, so a comment was added and definitions consolidated. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e96829b..0381e25 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -21,19 +21,19 @@ * it is not safe to place pointers in static structures. */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) - +/* Functions used by the included decompressor code below. */ static void error(char *m); /* -- cgit v0.10.2 From bf0118dbba9542ceb5d33d4a86830a6c88b0bbf6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:45 -0700 Subject: x86/boot: Make memcpy() handle overlaps Two uses of memcpy() (screen scrolling and ELF parsing) were handling overlapping memory areas. While there were no explicitly noticed bugs here (yet), it is best to fix this so that the copying will always be safe. Instead of making a new memmove() function that might collide with other memmove() definitions in the decompressors, this just makes the compressed boot code's copy of memcpy() overlap-safe. Suggested-by: Lasse Collin Reported-by: Yinghai Lu Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1461185746-8017-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0381e25..eacc855 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -301,9 +301,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memcpy(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788b..1e10e40 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,7 @@ #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +39,21 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +/* + * This memcpy is overlap safe (i.e. it is memmove without conflicting + * with other definitions of memmove from the various decompressors. + */ +void *memcpy(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} -- cgit v0.10.2 From 0f8ede1b8c4cb845c53072d7e49d71ca24a61ced Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:46 -0700 Subject: x86/KASLR: Warn when KASLR is disabled If KASLR is built in but not available at run-time (either due to the current conflict with hibernation, command-line request, or e820 parsing failures), announce the state explicitly. To support this, a new "warn" function is created, based on the existing "error" function. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 3ad71a0..8741a6d 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -314,12 +314,12 @@ unsigned char *choose_random_location(unsigned char *input, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); + warn("KASLR disabled: 'nokaslr' on cmdline."); goto out; } #endif @@ -333,7 +333,7 @@ unsigned char *choose_random_location(unsigned char *input, /* Walk e820 and find a random address. */ random_addr = find_random_addr(choice, output_size); if (!random_addr) { - debug_putstr("KASLR could not find suitable E820 region...\n"); + warn("KASLR disabled: could not find suitable E820 region!"); goto out; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index eacc855..c57d785 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -166,11 +166,17 @@ void __puthex(unsigned long value) } } -static void error(char *x) +void warn(char *m) { error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); + error_putstr(m); + error_putstr("\n\n"); +} + +static void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); while (1) asm("hlt"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9887e0d..e75f6cf 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -35,6 +35,7 @@ extern memptr free_mem_end_ptr; extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); +void warn(char *m); #define error_putstr(__x) __putstr(__x) #define error_puthex(__x) __puthex(__x) -- cgit v0.10.2 From 18c78a96239749fc4aaee84ca1eb5a8e81f2601d Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:31 -0700 Subject: x86/boot: Enumerate documentation for the x86 hardware_subarch Although hardware_subarch has been in place since the x86 boot protocol 2.07 it hasn't been used much. Enumerate current possible values to avoid misuses and help with semantics later at boot time should this be used further. These enums should only ever be used by architecture x86 code, and all that code should be well contained and compartamentalized, clarify that as well. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-2-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 3292543..c18ce67 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. + * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... + */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, -- cgit v0.10.2 From ea1794812410e92c537c839bedeb2d2b2f87c80d Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:32 -0700 Subject: x86/xen: Use X86_SUBARCH_XEN for PV guest boots The use of subarch should have no current effect on Xen PV guests, as such this should have no current functional effects. Signed-off-by: Luis R. Rodriguez Reviewed-by: David Vrabel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-3-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c..61f4d9f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1670,6 +1670,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); -- cgit v0.10.2 From 907bb655797427cc6498045d6977e77f8363fff7 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:33 -0700 Subject: tools/lguest: Make lguest launcher use X86_SUBARCH_LGUEST explicitly Be explicit and make use of X86_SUBARCH_LGUEST directly. Signed-off-by: Luis R. Rodriguez Acked-by: Rusty Russell Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-4-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 80159e6..ff0aa580 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3351,8 +3351,8 @@ int main(int argc, char *argv[]) /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; + /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; -- cgit v0.10.2 From 8d152e7a5c7537b18b4e9e0eb96f549b016636dc Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:34 -0700 Subject: x86/rtc: Replace paravirt rtc check with platform legacy quirk We have 4 types of x86 platforms that disable RTC: * Intel MID * Lguest - uses paravirt * Xen dom-U - uses paravirt * x86 on legacy systems annotated with an ACPI legacy flag We can consolidate all of these into a platform specific legacy quirk set early in boot through i386_start_kernel() and through x86_64_start_reservations(). This deals with the RTC quirks which we can rely on through the hardware subarch, the ACPI check can be dealt with separately. For Xen things are bit more complex given that the @X86_SUBARCH_XEN x86_hardware_subarch is shared on for Xen which uses the PV path for both domU and dom0. Since the semantics for differentiating between the two are Xen specific we provide a platform helper to help override default legacy features -- x86_platform.set_legacy_features(). Use of this helper is highly discouraged, its only purpose should be to account for the lack of semantics available within your given x86_hardware_subarch. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +70 +62 +62 +43 Only 8 bytes overhead total, as the main increase in size is all removed via __init. Suggested-by: Ingo Molnar Signed-off-by: Luis R. Rodriguez Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-5-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abc..f9ed8a7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -209,6 +209,7 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8..6c7a4a1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -20,12 +20,6 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326..6acc1b2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,14 +70,9 @@ struct pv_info { #endif int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9264476..0c70c7d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -474,7 +474,6 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2..8bb8c1a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,6 +142,15 @@ struct x86_cpuinit_ops { struct timespec; /** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + */ +struct x86_legacy_features { + int rtc; +}; + +/** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. @@ -152,6 +161,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +182,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +205,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd2..b81b22e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += head.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3..d784bb5 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d..b72fb0b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 0000000..021a5f9 --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,21 @@ +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d06..62c48da 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -188,10 +189,6 @@ static __init int add_rtc_cmos(void) if (of_have_populated_dt()) return 0; - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - #ifdef CONFIG_ACPI if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { /* This warning can likely go away again in a year or two. */ @@ -200,7 +197,7 @@ static __init int add_rtc_cmos(void) } #endif - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3a..f5497ee 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,7 +1414,6 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 61f4d9f..752029d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1193,7 +1193,6 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1506,6 +1505,11 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1527,8 +1531,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1688,6 +1690,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; -- cgit v0.10.2 From 088a8ef8207f19aadbade0971af21ad89fdc3815 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:35 -0700 Subject: x86/ACPI: Move ACPI_FADT_NO_CMOS_RTC check to ACPI boot code This moves the ACPI specific check into the ACPI boot code, it also takes advantage of the x86_platform.legacy.rtc which is checked for already on the RTC initialization code. This lets us remove the nasty #ifdefery and consolidate the checks to use only one toggle to disable the RTC init code. The works as RTC is initialized by device_initcall(add_rtc_cmos), this will run late in boot on start_kernel() during rest_init(), acpi_parse_fadt() gets called earlier during setup_arch(). Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-6-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef..8c9c2bd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -913,6 +913,10 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 62c48da..ff4f418 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -189,14 +189,6 @@ static __init int add_rtc_cmos(void) if (of_have_populated_dt()) return 0; -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - if (!x86_platform.legacy.rtc) return -ENODEV; -- cgit v0.10.2 From 1330e3bc544a1951d81b7f3c7d4cecf77d906f67 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:36 -0700 Subject: x86/init: Use a platform legacy quirk for EBDA This replaces the paravirt_enabled() check with a proper x86 legacy platform quirk. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +39 +35 +35 +25 That's a 4 byte total overhead, the rest is all cleared out upon init as its all __init text. v2: document 0-day vmlinux size impact Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-7-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8bb8c1a..89d9d57 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,12 @@ struct timespec; * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM */ struct x86_legacy_features { int rtc; + int ebda_search; }; /** diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 992f442..afe65df 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 021a5f9..01b1597 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,8 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: case X86_SUBARCH_INTEL_MID: -- cgit v0.10.2 From 46504590321dc62a11065f8d00e1b12037c37018 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:37 -0700 Subject: tools/lguest: Force disable tboot and APM The paravirt_enabled() check is going away, the area tossed to the kernel on lguest is not zeroed out, so ensure lguest force disables tboot and APM just in case the kernel file being read might have this set for whatever reason. Signed-off-by: Luis R. Rodriguez Acked-by: Rusty Russell Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-8-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index ff0aa580..d9836c5 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3357,6 +3357,12 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; + /* We don't support tboot: */ + boot->tboot_addr = 0; + + /* Ensure this is 0 to prevent APM from loading: */ + boot->apm_bios_info.version = 0; + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); -- cgit v0.10.2 From 8bc55f805697ec2a69c6a576fac8ee36ea9772bb Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:38 -0700 Subject: x86/apm32: Remove paravirt_enabled() use There is already a check for apm_info.bios == 0, the apm_info.bios is set from the boot_params.apm_bios_info. Both Xen and lguest, which are also the only ones that set paravirt_enabled to true, never set the apm_bios.info. The Xen folks are sure force disable to 0 is not needed because apm_info lives in .bss, we recently forced disabled this on lguest, and on the Xen side just to be sure Boris zeroed out the .bss for PV guests through commit 04b6b4a56884327c1648 ("xen/x86: Zero out .bss for PV guests"). With this care taken into consideration the paravirt_enabled() check is simply not needed anymore. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-9-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f18..c7364bd 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 +2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } -- cgit v0.10.2 From 44ecf0ef907fe45510566d308d670aa5823a4dd5 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:39 -0700 Subject: x86/tboot: Remove paravirt_enabled() use There is already a check for boot_params.tboot_addr prior to paravirt_enabled(). Both Xen and lguest, which are also the only ones that set paravirt_enabled to true, never set the boot_params.tboot_addr. The Xen folks are sure a force disable to 0 is not needed, we recently forced disabled this on lguest. With this in place this check is no longer needed. Xen folks are sure force disable to 0 is not needed because apm_info lives in .bss, we recently forced disabled this on lguest, and on the Xen side just to be sure Boris zeroed out the .bss for PV guests through commit 04b6b4a56884327c1648 ("xen/x86: Zero out .bss for PV guests"). With this care taken into consideration the paravirt_enabled() check is simply not needed anymore. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-10-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f..9b0185f 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); -- cgit v0.10.2 From fa392794ed8329379f3f637da7c3c2f078309a77 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:40 -0700 Subject: x86/cpu/intel: Remove not needed paravirt_enabled() use for F00F work around The X86_BUG_F00F work around is responsible for fixing up the error generated on attempted F00F exploitation from an OOPS to a SIGILL. There is no reason why this code should not be allowed to run on PV guest on a F00F-affected CPU -- it would simply never trigger. The pv_enabled() check was there only to avoid printing the f00f workaround, so removing the check is purely a cosmetic change. Suggested-by: Andy Lutomirski Signed-off-by: Luis R. Rodriguez Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-11-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1f7fdb9..016b3d9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -233,7 +233,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); -- cgit v0.10.2 From 80dfd83dfab6e49a31ab8fc484a801aef1c567bd Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:41 -0700 Subject: x86, drivers/pnpbios: Replace paravirt_enabled() check with legacy device check Since we are removing paravirt_enabled() replace it with a logical equivalent. Even though PNPBIOS is x86 specific we add an arch-specific type call, which can be implemented by any architecture to show how other legacy attribute devices can later be also checked for with other ACPI legacy attribute flags. This implicates the first ACPI 5.2.9.3 IA-PC Boot Architecture ACPI_FADT_LEGACY_DEVICES flag device, and shows how to add more. The reason pnpbios gets a defined structure and as such uses a different approach than the RTC legacy quirk is that ACPI has a respective RTC flag, while pnpbios does not. We fold the pnpbios quirk under ACPI_FADT_LEGACY_DEVICES ACPI flag use case, and use a struct of possible devices to enable future extensions of this. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +32 +28 +28 +28 That's 4 byte overhead total, the rest is cleared out on init as its all __init text. v2: split out subarch handlng on switch to make it easier later to add other subarchs. The 'fall-through' switch handling can be confusing and we'll remove it later when we add handling for X86_SUBARCH_CE4100. v3: document vmlinux size impact as per 0-day, and also explain why pnpbios is treated differently than the RTC legacy feature. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-12-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 89d9d57..4dcdf74 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,15 +142,41 @@ struct x86_cpuinit_ops { struct timespec; /** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus. The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + +/** * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present * @ebda_search: it's safe to search for the EBDA signature in the hardware's * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. */ struct x86_legacy_features { int rtc; int ebda_search; + struct x86_legacy_devices devices; }; /** diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 01b1597..ab64382 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -8,6 +8,7 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: @@ -15,6 +16,9 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; case X86_SUBARCH_INTEL_MID: x86_platform.legacy.rtc = 0; break; @@ -23,3 +27,10 @@ void __init x86_early_init_platform_quirks(void) if (x86_platform.set_legacy_features) x86_platform.set_legacy_features(); } + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b..81603d9 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 5df733b..2588ca6 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex; #ifdef CONFIG_PNPBIOS extern struct pnp_protocol pnpbios_protocol; +extern bool arch_pnpbios_disabled(void); #define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol)) #else #define pnp_device_is_pnpbios(dev) 0 +#define arch_pnpbios_disabled() false #endif #ifdef CONFIG_PNPACPI -- cgit v0.10.2 From 7a17b82ccd6671a4bb436df52eedeff906b02735 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:42 -0700 Subject: x86/ACPI: Parse ACPI_FADT_LEGACY_DEVICES ACPI 5.2.9.3 IA-PC Boot Architecture flag ACPI_FADT_LEGACY_DEVICES can be used to determine if a system has legacy devices LPC or ISA devices. The x86 platform already has a struct which lists known associated legacy devices, we start off careful only by disabling root devices we should not regress with. The struct and device list can be expanded with time to cover more root legacy components. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-13-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c9c2bd..c9a06e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -913,6 +913,11 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { pr_debug("ACPI: not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; -- cgit v0.10.2 From f2d85299b7f11f73cc0a294e396cdae114e75787 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:43 -0700 Subject: x86/init: Rename EBDA code file This makes it clearer what this is. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-14-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f9ed8a7..6fce7f0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,7 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b81b22e..9abf855 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head_$(BITS).o extra-y += head$(BITS).o -extra-y += head.o +extra-y += ebda.o extra-y += platform-quirks.o extra-y += vmlinux.lds diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c new file mode 100644 index 0000000..afe65df --- /dev/null +++ b/arch/x86/kernel/ebda.c @@ -0,0 +1,71 @@ +#include +#include +#include + +#include +#include + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + * + * This functions is deliberately very conservative. Losing + * memory in the bottom megabyte is rarely a problem, as long + * as we have enough memory to install the trampoline. Using + * memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem. + */ + +#define BIOS_LOWMEM_KILOBYTES 0x413 +#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ +#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ + +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* + * To determine the position of the EBDA and the + * end of conventional memory, we need to look at + * the BIOS data area. In a paravirtual environment + * that area is absent. We'll just have to assume + * that the paravirt case can handle memory setup + * correctly, without our help. + */ + if (!x86_platform.legacy.ebda_search) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* + * Note: some old Dells seem to need 4k EBDA without + * reporting so, so just consider the memory above 0x9f000 + * to be off limits (bugzilla 2990). + */ + + /* If the EBDA address is below 128K, assume it is bogus */ + if (ebda_addr < INSANE_CUTOFF) + ebda_addr = LOWMEM_CAP; + + /* If lowmem is less than 128K, assume it is bogus */ + if (lowmem < INSANE_CUTOFF) + lowmem = LOWMEM_CAP; + + /* Use the lower of the lowmem and EBDA markers as the cutoff */ + lowmem = min(lowmem, ebda_addr); + lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + + /* reserve all memory between lowmem and the 1MB mark */ + memblock_reserve(lowmem, 0x100000 - lowmem); +} diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c deleted file mode 100644 index afe65df..0000000 --- a/arch/x86/kernel/head.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include - -#include -#include - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. - */ - -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ - -void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. - */ - if (!x86_platform.legacy.ebda_search) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). - */ - - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; - - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; - - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ - - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); -} -- cgit v0.10.2 From 867fe800b4c423bce46e66ccb2ce91bebbd5afc6 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:44 -0700 Subject: x86/paravirt: Remove paravirt_enabled() Now that all previous paravirt_enabled() uses were replaced with proper x86 semantics by the previous patches we can remove the unused paravirt_enabled() mechanism. Signed-off-by: Luis R. Rodriguez Acked-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-15-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6c7a4a1..dff26bc 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,11 +15,6 @@ #include #include -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6acc1b2..7fedf24 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,7 +69,6 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0c70c7d..8d326e8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -473,7 +473,6 @@ static inline unsigned long current_top_of_stack(void) #include #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8079508..c66546f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28..71a2d8a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f5497ee..3847e73 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,8 +1408,6 @@ __init void lguest_init(void) { /* We're under lguest. */ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 752029d..5fc20a1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1187,7 +1187,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 -- cgit v0.10.2 From f6935b7bfbf8345bea05f73dc48ce81b70f016e0 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:45 -0700 Subject: x86/init: Disable pnpbios for X86_SUBARCH_INTEL_MID As per hpa Intel MID platforms can also disable pnpbios: ttp://lkml.kernel.org/r/5702B5C2.7070101@zytor.com As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() -8 -8 -8 -8 Suggested-by: H. Peter Anvin Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-16-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index ab64382..8539194 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,10 +16,8 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: - x86_platform.legacy.devices.pnpbios = 0; - x86_platform.legacy.rtc = 0; - break; case X86_SUBARCH_INTEL_MID: + x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; } -- cgit v0.10.2 From a50b22a7a1e60c48ca26cada362076b54823c501 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:46 -0700 Subject: x86/init: Disable pnpbios and rtc for X86_SUBARCH_CE4100 As per hpa CE4100 platforms can also disable pnpbios: http://lkml.kernel.org/r/5702B5C2.7070101@zytor.com Then Sebastian also recently noted that CE4100 also disables RTC probe, to do that Sebastian had long ago added the RTC of_have_populated_dt() check, he noted that it was meant to skip the RTC probe on all OF platforms but as of now, CE4100 was the only x86 DT using this. We can just fold this requirement into the platform quirk then. This now means that all of these match platform quirks for pnpbios and RTC preferences: * X86_SUBARCH_XEN * X86_SUBARCH_LGUEST * X86_SUBARCH_INTEL_MID * X86_SUBARCH_CE4100 Also see: http://lkml.kernel.org/r/570B52EA.60300@linutronix.de Suggested-by: H. Peter Anvin Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-17-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 8539194..b2f8a33 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -17,6 +17,7 @@ void __init x86_early_init_platform_quirks(void) case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index ff4f418..eceaa08 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -186,9 +186,6 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - if (!x86_platform.legacy.rtc) return -ENODEV; -- cgit v0.10.2 From 81b785f3e4114ed74fceb48a54e7de2f797a2ba1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Apr 2016 14:46:06 -0700 Subject: x86/boot: Rename overlapping memcpy() to memmove() Instead of having non-standard memcpy() behavior, explicitly call the new function memmove(), make it available to the decompressors, and switch the two overlap cases (screen scrolling and ELF parsing) to use memmove(). Additionally documents the purpose of compressed/string.c. Suggested-by: Lasse Collin Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20160426214606.GA5758@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c57d785..6dde6cc 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -32,9 +32,11 @@ #undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove /* Functions used by the included decompressor code below. */ static void error(char *m); +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time @@ -80,7 +82,7 @@ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -307,7 +309,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, output + phdr->p_offset, phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 1e10e40..2befeca 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,14 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ #include "../string.c" #ifdef CONFIG_X86_32 -void *__memcpy(void *dest, const void *src, size_t n) +void *memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +22,7 @@ void *__memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *__memcpy(void *dest, const void *src, size_t n) +void *memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -40,17 +47,13 @@ void *memset(void *s, int c, size_t n) return s; } -/* - * This memcpy is overlap safe (i.e. it is memmove without conflicting - * with other definitions of memmove from the various decompressors. - */ -void *memcpy(void *dest, const void *src, size_t n) +void *memmove(void *dest, const void *src, size_t n) { unsigned char *d = dest; const unsigned char *s = src; if (d <= s || d - s >= n) - return __memcpy(dest, src, n); + return memcpy(dest, src, n); while (n-- > 0) d[n] = s[n]; -- cgit v0.10.2 From 6f9af75faa1df61e1ee5bea8a787a90605bb528d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 28 Apr 2016 17:09:03 -0700 Subject: x86/KASLR: Handle kernel relocations above 2G correctly When processing the relocation table, the offset used to calculate the relocation is an 'int'. This is sufficient for calculating the physical address of the relocs entry on 32-bit systems and on 64-bit systems when the relocation is under 2G. To handle relocations above 2G (seen in situations like kexec, netboot, etc), this offset needs to be calculated using a 'long' to avoid wrapping and miscalculating the relocation. Signed-off-by: Baoquan He [ Rewrote the changelog. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1461888548-32439-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 6dde6cc..4514514 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -232,7 +232,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. */ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; -- cgit v0.10.2 From 974f221c84b05b1dc2f5ea50dc16d2a9d1e95eda Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Apr 2016 17:09:04 -0700 Subject: x86/boot: Move compressed kernel to the end of the decompression buffer This change makes later calculations about where the kernel is located easier to reason about. To better understand this change, we must first clarify what 'VO' and 'ZO' are. These values were introduced in commits by hpa: 77d1a4999502 ("x86, boot: make symbols from the main vmlinux available") 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new bzImage fields") Specifically: All names prefixed with 'VO_': - relate to the uncompressed kernel image - the size of the VO image is: VO__end-VO__text ("VO_INIT_SIZE" define) All names prefixed with 'ZO_': - relate to the bootable compressed kernel image (boot/compressed/vmlinux), which is composed of the following memory areas: - head text - compressed kernel (VO image and relocs table) - decompressor code - the size of the ZO image is: ZO__end - ZO_startup_32 ("ZO_INIT_SIZE" define, though see below) The 'INIT_SIZE' value is used to find the larger of the two image sizes: #define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE # define INIT_SIZE ZO_INIT_SIZE #else # define INIT_SIZE VO_INIT_SIZE #endif The current code uses extract_offset to decide where to position the copied ZO (i.e. ZO starts at extract_offset). (This is why ZO_INIT_SIZE currently includes the extract_offset.) Why does z_extract_offset exist? It's needed because we are trying to minimize the amount of RAM used for the whole act of creating an uncompressed, executable, properly relocation-linked kernel image in system memory. We do this so that kernels can be booted on even very small systems. To achieve the goal of minimal memory consumption we have implemented an in-place decompression strategy: instead of cleanly separating the VO and ZO images and also allocating some memory for the decompression code's runtime needs, we instead create this elaborate layout of memory buffers where the output (decompressed) stream, as it progresses, overlaps with and destroys the input (compressed) stream. This can only be done safely if the ZO image is placed to the end of the VO range, plus a certain amount of safety distance to make sure that when the last bytes of the VO range are decompressed, the compressed stream pointer is safely beyond the end of the VO range. z_extract_offset is calculated in arch/x86/boot/compressed/mkpiggy.c during the build process, at a point when we know the exact compressed and uncompressed size of the kernel images and can calculate this safe minimum offset value. (Note that the mkpiggy.c calculation is not perfect, because we don't know the decompressor used at that stage, so the z_extract_offset calculation is necessarily imprecise and is mostly based on gzip internals - we'll improve that in the next patch.) When INIT_SIZE is bigger than VO_INIT_SIZE (uncommon but possible), the copied ZO occupies the memory from extract_offset to the end of decompression buffer. It overlaps with the soon-to-be-uncompressed kernel like this: |-----compressed kernel image------| V V 0 extract_offset +INIT_SIZE |-----------|---------------|-------------------------|--------| | | | | VO__text startup_32 of ZO VO__end ZO__end ^ ^ |-------uncompressed kernel image---------| When INIT_SIZE is equal to VO_INIT_SIZE (likely) there's still space left from end of ZO to the end of decompressing buffer, like below. |-compressed kernel image-| V V 0 extract_offset +INIT_SIZE |-----------|---------------|-------------------------|--------| | | | | VO__text startup_32 of ZO ZO__end VO__end ^ ^ |------------uncompressed kernel image-------------| To simplify calculations and avoid special cases, it is cleaner to always place the compressed kernel image in memory so that ZO__end is at the end of the decompression buffer, instead of placing t at the start of extract_offset as is currently done. This patch adds BP_init_size (which is the INIT_SIZE as passed in from the boot_params) into asm-offsets.c to make it visible to the assembly code. Then when moving the ZO, it calculates the starting position of the copied ZO (via BP_init_size and the ZO run size) so that the VO__end will be at the end of the decompression buffer. To make the position calculation safe, the end of ZO is page aligned (and a comment is added to the existing VO alignment for good measure). Signed-off-by: Yinghai Lu [ Rewrote changelog and comments. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1461888548-32439-3-git-send-email-keescook@chromium.org [ Rewrote the changelog some more. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 26dd9df..8a95e2f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -176,7 +176,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -238,8 +240,13 @@ relocated: /* push arguments for extract_kernel: */ pushl $z_run_size /* size of kernel with .bss and .brk */ pushl $z_output_len /* decompressed length, end of relocs */ - leal z_extract_offset_negative(%ebx), %ebp + + movl BP_init_size(%esi), %eax + subl $_end, %eax + movl %ebx, %ebp + subl %eax, %ebp pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d43c30e..09cdc0c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -110,7 +110,9 @@ ENTRY(startup_32) 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* * Prepare for entering 64 bit mode @@ -338,7 +340,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $_end, %ebx + addq %rbp, %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 4514514..4b4605e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -318,6 +318,23 @@ static void parse_elf(void *output) free(phdrs); } +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index d8222f2..b980046 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -85,9 +85,6 @@ int main(int argc, char *argv[]) printf("z_output_len = %lu\n", (unsigned long)olen); printf(".globl z_extract_offset\n"); printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); printf(".globl z_run_size\n"); printf("z_run_size = %lu\n", run_size); diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c..e24e0a0 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -70,5 +70,6 @@ SECTIONS _epgtable = . ; } #endif + . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c04246..674134e 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -80,6 +80,7 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); OFFSET(BP_code32_start, boot_params, hdr.code32_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c941f8..9297a00 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -334,7 +334,7 @@ SECTIONS __brk_limit = .; } - . = ALIGN(PAGE_SIZE); + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; STABS_DEBUG -- cgit v0.10.2 From d607251ba9acc0b5faeaa08818f60d041dd19472 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Apr 2016 17:09:05 -0700 Subject: x86/boot: Calculate decompression size during boot not build Currently z_extract_offset is calculated in boot/compressed/mkpiggy.c. This doesn't work well because mkpiggy.c doesn't know the details of the decompressor in use. As a result, it can only make an estimation, which has risks: - output + output_len (VO) could be much bigger than input + input_len (ZO). In this case, the decompressed kernel plus relocs could overwrite the decompression code while it is running. - The head code of ZO could be bigger than z_extract_offset. In this case an overwrite could happen when the head code is running to move ZO to the end of buffer. Though currently the size of the head code is very small it's still a potential risk. Since there is no rule to limit the size of the head code of ZO, it runs the risk of suddenly becoming a (hard to find) bug. Instead, this moves the z_extract_offset calculation into header.S, and makes adjustments to be sure that the above two cases can never happen, and further corrects the comments describing the calculations. Since we have (in the previous patch) made ZO always be located against the end of decompression buffer, z_extract_offset is only used here to calculate an appropriate buffer size (INIT_SIZE), and is not longer used elsewhere. As such, it can be removed from voffset.h. Additionally clean up #if/#else #define to improve readability. Signed-off-by: Yinghai Lu Signed-off-by: Baoquan He [ Rewrote the changelog and comments. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1461888548-32439-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index b1ef9e4..942f7dab 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -95,7 +95,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index b980046..f095ed9 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -18,11 +18,10 @@ * * H. Peter Anvin * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. + * */ #include @@ -35,7 +34,6 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; unsigned long run_size; FILE *f = NULL; int retval = 1; @@ -67,15 +65,6 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ run_size = atoi(argv[2]); printf(".section \".rodata..compressed\",\"a\",@progbits\n"); @@ -83,8 +72,6 @@ int main(int argc, char *argv[]) printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); printf(".globl z_run_size\n"); printf("z_run_size = %lu\n", run_size); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index fd85b9e..3dd5be3 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -508,13 +508,10 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # To avoid problems with the compressed data's meta information an extra 18 # bytes are needed. Leading to the formula: # -# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 # # Adding 8 bytes per 32K is a bit excessive but much easier to calculate. # Adding 32768 instead of 32767 just makes for round numbers. -# Adding the decompressor_size is necessary as it musht live after all -# of the data as well. Last I measured the decompressor is about 14K. -# 10K of actual data and 4K of bss. # # Above analysis is for decompressing gzip compressed kernel only. Up to # now 6 different decompressor are supported all together. And among them @@ -524,17 +521,35 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the description in lib/decompressor_xxx.c for specific information. # # extra_bytes = (uncompressed_size >> 12) + 65536 + 128 -# -# Note that this calculation, which results in z_extract_offset (below), -# is currently generated in compressed/mkpiggy.c -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#if ZO_z_output_len > ZO_z_input_len +# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ + ZO_z_input_len) +#else +# define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * The extract_offset has to be bigger than ZO head section. Otherwise when + * the head code is running to move ZO to the end of the buffer, it will + * overwrite the head code itself. + */ +#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset +# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095) +#else +# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095) +#endif + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -#define INIT_SIZE ZO_INIT_SIZE +# define INIT_SIZE ZO_INIT_SIZE #else -#define INIT_SIZE VO_INIT_SIZE +# define INIT_SIZE VO_INIT_SIZE #endif + init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c -- cgit v0.10.2 From 67b6662559f7f77bcbd3ac67d09aaac11785f3c1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Apr 2016 17:09:06 -0700 Subject: x86/boot: Fix "run_size" calculation Currently, the "run_size" variable holds the total kernel size (size of code plus brk and bss) and is calculated via the shell script arch/x86/tools/calc_run_size.sh. It gets the file offset and mem size of the .bss and .brk sections from the vmlinux, and adds them as follows: run_size = $(( $offsetA + $sizeA + $sizeB )) However, this is not correct (it is too large). To illustrate, here's a walk-through of the script's calculation, compared to the correct way to find it. First, offsetA is found as the starting address of the first .bss or .brk section seen in the ELF file. The sizeA and sizeB values are the respective section sizes. [bhe@x1 linux]$ objdump -h vmlinux vmlinux: file format elf64-x86-64 Sections: Idx Name Size VMA LMA File off Algn 27 .bss 00170000 ffffffff81ec8000 0000000001ec8000 012c8000 2**12 ALLOC 28 .brk 00027000 ffffffff82038000 0000000002038000 012c8000 2**0 ALLOC Here, offsetA is 0x012c8000, with sizeA at 0x00170000 and sizeB at 0x00027000. The resulting run_size is 0x145f000: 0x012c8000 + 0x00170000 + 0x00027000 = 0x145f000 However, if we instead examine the ELF LOAD program headers, we see a different picture. [bhe@x1 linux]$ readelf -l vmlinux Elf file type is EXEC (Executable file) Entry point 0x1000000 There are 5 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000200000 0xffffffff81000000 0x0000000001000000 0x0000000000b5e000 0x0000000000b5e000 R E 200000 LOAD 0x0000000000e00000 0xffffffff81c00000 0x0000000001c00000 0x0000000000145000 0x0000000000145000 RW 200000 LOAD 0x0000000001000000 0x0000000000000000 0x0000000001d45000 0x0000000000018158 0x0000000000018158 RW 200000 LOAD 0x000000000115e000 0xffffffff81d5e000 0x0000000001d5e000 0x000000000016a000 0x0000000000301000 RWE 200000 NOTE 0x000000000099bcac 0xffffffff8179bcac 0x000000000179bcac 0x00000000000001bc 0x00000000000001bc 4 Section to Segment mapping: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .tracedata __ksymtab __ksymtab_gpl __ksymtab_strings __init_rodata __param __modver 01 .data .vvar 02 .data..percpu 03 .init.text .init.data .x86_cpu_dev.init .parainstructions .altinstructions .altinstr_replacement .iommu_table .apicdrivers .exit.text .smp_locks .bss .brk 04 .notes As mentioned, run_size needs to be the size of the running kernel including .bss and .brk. We can see from the Section/Segment mapping above that .bss and .brk are included in segment 03 (which corresponds to the final LOAD program header). To find the run_size, we calculate the end of the LOAD segment from its PhysAddr start (0x0000000001d5e000) and its MemSiz (0x0000000000301000), minus the physical load address of the kernel (the first LOAD segment's PhysAddr: 0x0000000001000000). The resulting run_size is 0x105f000: 0x0000000001d5e000 + 0x0000000000301000 - 0x0000000001000000 = 0x105f000 So, from this we can see that the existing run_size calculation is 0x400000 too high. And, as it turns out, the correct run_size is actually equal to VO_end - VO_text, which is certainly easier to calculate. _end: 0xffffffff8205f000 _text:0xffffffff81000000 0xffffffff8205f000 - 0xffffffff81000000 = 0x105f000 As a result, run_size is a simple constant, so we don't need to pass it around; we already have voffset.h for such things. We can share voffset.h between misc.c and header.S instead of getting run_size in other ways. This patch moves voffset.h creation code to boot/compressed/Makefile, and switches misc.c to use the VO_end - VO_text calculation for run_size. Dependence before: boot/header.S ==> boot/voffset.h ==> vmlinux boot/header.S ==> compressed/vmlinux ==> compressed/misc.c Dependence after: boot/header.S ==> compressed/vmlinux ==> compressed/misc.c ==> boot/voffset.h ==> vmlinux Signed-off-by: Yinghai Lu Signed-off-by: Baoquan He [ Rewrote the changelog. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Triplett Cc: Junjie Mao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: lasse.collin@tukaani.org Fixes: e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd") Link: http://lkml.kernel.org/r/1461888548-32439-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 942f7dab..700a9c6 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,15 +86,6 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ @@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 542c92f..4599780 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,6 +57,18 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h + +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +$(obj)/misc.o: $(obj)/../voffset.h + vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o \ $(obj)/piggy.o $(obj)/cpuflags.o diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 4b4605e9..cda93d1 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -13,6 +13,7 @@ #include "misc.h" #include "../string.h" +#include "../voffset.h" /* * WARNING!! @@ -363,6 +364,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + run_size = VO__end - VO__text; + console_init(); debug_putstr("early console in extract_kernel\n"); -- cgit v0.10.2 From 4d2d542482205d3df1a0852751f5b004cc6390cc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Apr 2016 17:09:07 -0700 Subject: x86/KASLR: Clean up unused code from old 'run_size' and rename it to 'kernel_total_size' Since 'run_size' is now calculated in misc.c, the old script and associated argument passing is no longer needed. This patch removes them, and renames 'run_size' to the more descriptive 'kernel_total_size'. Signed-off-by: Yinghai Lu Signed-off-by: Baoquan He [ Rewrote the changelog, renamed 'run_size' to 'kernel_total_size' ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Triplett Cc: Junjie Mao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1461888548-32439-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4599780..adef26d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,10 +121,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 8a95e2f..1038524 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -238,7 +238,6 @@ relocated: * Do the extraction, and jump to the new kernel.. */ /* push arguments for extract_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ pushl $z_output_len /* decompressed length, end of relocs */ movl BP_init_size(%esi), %eax @@ -254,7 +253,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call extract_kernel /* returns kernel location in %eax */ - addl $28, %esp + addl $24, %esp /* * Jump to the extracted kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 09cdc0c..7c04700 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -415,8 +415,6 @@ relocated: * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ - movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ - pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ @@ -424,7 +422,6 @@ relocated: movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ call extract_kernel /* returns kernel location in %rax */ - popq %r9 popq %rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index cda93d1..bee6238 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -340,9 +340,9 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + const unsigned long kernel_total_size = VO__end - VO__text; unsigned char *output_orig = output; /* Retain x86 boot parameters pointer passed from startup_32/64. */ @@ -364,8 +364,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; - run_size = VO__end - VO__text; - console_init(); debug_putstr("early console in extract_kernel\n"); @@ -377,7 +375,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); - debug_putaddr(run_size); + debug_putaddr(kernel_total_size); /* * The memory hole needed for the kernel is the larger of either @@ -385,8 +383,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * entire decompressed kernel plus .bss and .brk sections. */ output = choose_random_location(input_data, input_len, output, - output_len > run_size ? output_len - : run_size); + max(output_len, kernel_total_size)); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index f095ed9..72bad2c 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -34,13 +34,11 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); goto bail; } @@ -65,15 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - run_size = atoi(argv[2]); - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_run_size\n"); - printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh deleted file mode 100644 index 1a4c17b..0000000 --- a/arch/x86/tools/calc_run_size.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | sh calc_run_size.sh - -NUM='\([0-9a-fA-F]*[ \t]*\)' -OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') -if [ -z "$OUT" ] ; then - echo "Never found .bss or .brk file offset" >&2 - exit 1 -fi - -OUT=$(echo ${OUT# }) -sizeA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -sizeB=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetB=$(printf "%d" 0x${OUT%% *}) - -run_size=$(( $offsetA + $sizeA + $sizeB )) - -# BFD linker shows the same file offset in ELF. -if [ "$offsetA" -ne "$offsetB" ] ; then - # Gold linker shows them as consecutive. - endB=$(( $offsetB + $sizeB )) - if [ "$endB" != "$run_size" ] ; then - printf "sizeA: 0x%x\n" $sizeA >&2 - printf "offsetA: 0x%x\n" $offsetA >&2 - printf "sizeB: 0x%x\n" $sizeB >&2 - printf "offsetB: 0x%x\n" $offsetB >&2 - echo ".bss and .brk are non-contiguous" >&2 - exit 1 - fi -fi - -printf "%d\n" $run_size -exit 0 -- cgit v0.10.2 From 4abf061bf87bbd856c8d60199b2fba8b8f9b9fd6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Apr 2016 17:09:08 -0700 Subject: x86/boot: Correctly bounds-check relocations Relocation handling performs bounds checking on the resulting calculated addresses. The existing code uses output_len (VO size plus relocs size) as the max address. This is not right since the max_addr check should stop at the end of VO and exclude bss, brk, etc, which follows. The valid range should be VO [_text, __bss_start] in the loaded physical address space. This patch adds an export for __bss_start in voffset.h and uses it to set the correct limit for max_addr. Signed-off-by: Yinghai Lu [ Rewrote the changelog. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1461888548-32439-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index adef26d..75f2233 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,7 +57,7 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bee6238..8f0253d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -191,7 +191,7 @@ static void handle_relocations(void *output, unsigned long output_len) int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load -- cgit v0.10.2 From dc425a6e140bca99bdb4823e9909c9d9b8ba36b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 2 May 2016 15:51:00 -0700 Subject: x86/boot: Extract error reporting functions Currently to use warn(), a caller would need to include misc.h. However, this means they would get the (unavailable during compressed boot) gcc built-in memcpy family of functions. But since string.c is defining these memcpy functions for use by misc.c, we end up in a weird circular dependency. To break this loop, move the error reporting functions outside of misc.c with their own header so that they can be independently included by other sources. Since the screen-writing routines use memmove(), keep the low-level *_putstr() functions in misc.c. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Lasse Collin Cc: Linus Torvalds Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1462229461-3370-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 75f2233..77ce3a0 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -70,7 +70,7 @@ $(obj)/../voffset.h: vmlinux FORCE $(obj)/misc.o: $(obj)/../voffset.h vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 0000000..6248740 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,22 @@ +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" + +void warn(char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 0000000..2e59dac --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,7 @@ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +void warn(char *m); +void error(char *m); + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8741a6d..f1818d9 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -10,6 +10,7 @@ * */ #include "misc.h" +#include "error.h" #include #include diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8f0253d..9536d77 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -12,6 +12,7 @@ */ #include "misc.h" +#include "error.h" #include "../string.h" #include "../voffset.h" @@ -36,7 +37,6 @@ #define memmove memmove /* Functions used by the included decompressor code below. */ -static void error(char *m); void *memmove(void *dest, const void *src, size_t n); /* @@ -169,22 +169,6 @@ void __puthex(unsigned long value) } } -void warn(char *m) -{ - error_putstr("\n\n"); - error_putstr(m); - error_putstr("\n\n"); -} - -static void error(char *m) -{ - warn(m); - error_putstr(" -- System halted"); - - while (1) - asm("hlt"); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index e75f6cf..9887e0d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -35,7 +35,6 @@ extern memptr free_mem_end_ptr; extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); -void warn(char *m); #define error_putstr(__x) __putstr(__x) #define error_puthex(__x) __puthex(__x) diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 2befeca..faa4dc7 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -5,6 +5,8 @@ * trust the gcc built-in implementations as they may do unexpected things * (e.g. FPU ops) in the minimal decompression stub execution environment. */ +#include "error.h" + #include "../string.c" #ifdef CONFIG_X86_32 -- cgit v0.10.2 From 00ec2c37031eb1b1feda006c84748d126dc2ef27 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 2 May 2016 15:51:01 -0700 Subject: x86/boot: Warn on future overlapping memcpy() use If an overlapping memcpy() is ever attempted, we should at least report it, in case it might lead to problems, so it could be changed to a memmove() call instead. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Lasse Collin Cc: Linus Torvalds Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1462229461-3370-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index faa4dc7..cea140c 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -10,7 +10,7 @@ #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -24,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -55,10 +55,20 @@ void *memmove(void *dest, const void *src, size_t n) const unsigned char *s = src; if (d <= s || d - s >= n) - return memcpy(dest, src, n); + return __memcpy(dest, src, n); while (n-- > 0) d[n] = s[n]; return dest; } + +/* Detect and warn about potential overlaps, but handle them with memmove. */ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return __memcpy(dest, src, n); +} -- cgit v0.10.2 From 2bc1cd39fa9f659956b25e500422e700a6cd4ec3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 May 2016 15:13:46 -0700 Subject: x86/boot: Clean up pointer casting Currently extract_kernel() defines the input and output buffer pointers as "unsigned char *" since that's effectively what they are. It passes these to the decompressor routine and to the ELF parser, which both logically deal with buffer pointers too. There is some casting ("unsigned long") done to validate the numerical value of the pointers, but it is relatively limited. However, choose_random_location() operates almost exclusively on the numerical representation of these pointers, so it ended up carrying a lot of "unsigned long" casts. With the future physical/virtual split these casts were going to multiply, so this attempts to solve the problem by doing all the casting in choose_random_location()'s entry and return instead of through-out the code. Adjusts argument names to be more meaningful, and changes one us of "choice" to "output" to make the future physical/virtual split more clear (i.e. "choice" should be strictly a function return value and not used as an intermediate). Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462486436-3707-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f1818d9..2072d82 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -305,12 +305,21 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_random_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input_ptr, unsigned long input_size, - unsigned char *output, + unsigned char *output_ptr, unsigned long output_size) { - unsigned long choice = (unsigned long)output; + /* + * The caller of choose_random_location() uses unsigned char * for + * buffer pointers since it performs decompression, elf parsing, etc. + * Since this code examines addresses much more numerically, + * unsigned long is used internally here. Instead of sprinkling + * more casts into extract_kernel, do them here and at return. + */ + unsigned long input = (unsigned long)input_ptr; + unsigned long output = (unsigned long)output_ptr; + unsigned long choice = output; unsigned long random_addr; #ifdef CONFIG_HIBERNATION @@ -328,11 +337,10 @@ unsigned char *choose_random_location(unsigned char *input, boot_params->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); + mem_avoid_init(input, input_size, output, output_size); /* Walk e820 and find a random address. */ - random_addr = find_random_addr(choice, output_size); + random_addr = find_random_addr(output, output_size); if (!random_addr) { warn("KASLR disabled: could not find suitable E820 region!"); goto out; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9887e0d..1f23d02 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,20 +67,20 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_random_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input_ptr, unsigned long input_size, - unsigned char *output, + unsigned char *output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_random_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input_ptr, unsigned long input_size, - unsigned char *output, + unsigned char *output_ptr, unsigned long output_size) { - return output; + return output_ptr; } #endif -- cgit v0.10.2 From 9dc1969c24eff8b7d7a9a565d1047b624921ba06 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 5 May 2016 15:13:47 -0700 Subject: x86/KASLR: Consolidate mem_avoid[] entries The mem_avoid[] array is used to track positions that should be avoided (like the compressed kernel, decompression code, etc) when selecting a memory position for the randomly relocated kernel. Since ZO is now at the end of the decompression buffer and the decompression code (and its heap and stack) are at the front, we can safely consolidate the decompression entry, the heap entry, and the stack entry. The boot_params memory, however, could be elsewhere, so it should be explicitly included. Signed-off-by: Yinghai Lu Signed-off-by: Baoquan He [ Rwrote changelog, cleaned up code comments. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462486436-3707-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 2072d82..6392f00 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -121,7 +121,7 @@ struct mem_vector { unsigned long size; }; -#define MEM_AVOID_MAX 5 +#define MEM_AVOID_MAX 4 static struct mem_vector mem_avoid[MEM_AVOID_MAX]; static bool mem_contains(struct mem_vector *region, struct mem_vector *item) @@ -146,22 +146,71 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } +/* + * In theroy, KASLR can put the kernel anywhere in area of [16M, 64T). The + * mem_avoid array is used to store the ranges that need to be avoided when + * KASLR searches for a an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. + * + * How to calculate the unsafe areas is detailed here, and is informed by + * the decompression calculations in header.S, and the diagram in misc.c. + * + * The compressed vmlinux (ZO) plus relocs and the run space of ZO can't be + * overwritten by decompression output. + * + * ZO sits against the end of the decompression buffer, so we can calculate + * where text, data, bss, etc of ZO are positioned. + * + * The follow are already enforced by the code: + * - init_size >= kernel_total_size + * - input + input_len >= output + output_len + * - kernel_total_size could be >= or < output_len + * + * From this, we can make several observations, illustrated by a diagram: + * - init_size >= kernel_total_size + * - input + input_len > output + output_len + * - kernel_total_size >= output_len + * + * 0 output input input+input_len output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|------------------|----|------------|----------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_len output+kernel_total_size + * + * [output, output+init_size) is for the buffer for decompressing the + * compressed kernel (ZO). + * + * [output, output+kernel_total_size) is for the uncompressed kernel (VO) + * and its bss, brk, etc. + * [output, output+output_len) is VO plus relocs + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the copied ZO. + * [input, input+input_len) is the copied compressed (VO (vmlinux after + * objcopy) plus relocs), not the ZO. + * + * [input+input_len, output+init_size) is [_text, _end) for ZO. That was the + * first range in mem_avoid, which included ZO's heap and stack. Also + * [input, input+input_size) need be put in mem_avoid array, but since it + * is adjacent to the first entry, they can be merged. This is how we get + * the first entry in mem_avoid[]. + */ static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) + unsigned long output) { + unsigned long init_size = boot_params->hdr.init_size; u64 initrd_start, initrd_size; u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; char *ptr; /* * Avoid the region that is unsafe to overlap during - * decompression (see calculations in ../header.S). + * decompression. */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; + mem_avoid[0].start = input; + mem_avoid[0].size = (output + init_size) - input; /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; @@ -181,13 +230,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[2].start = cmd_line; mem_avoid[2].size = cmd_line_size; - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; + /* Avoid params */ + mem_avoid[3].start = (unsigned long)boot_params; + mem_avoid[3].size = sizeof(*boot_params); } /* Does this memory vector overlap a known avoided area? */ @@ -337,7 +382,7 @@ unsigned char *choose_random_location(unsigned char *input_ptr, boot_params->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. */ - mem_avoid_init(input, input_size, output, output_size); + mem_avoid_init(input, input_size, output); /* Walk e820 and find a random address. */ random_addr = find_random_addr(output, output_size); -- cgit v0.10.2 From 549f90db68c9f8e21a40ec21c8047441984e7164 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 6 May 2016 13:50:15 +0200 Subject: x86/boot: Simplify pointer casting in choose_random_location() Pass them down as 'unsigned long' directly and get rid of more casting and assignments. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: bhe@redhat.com Cc: dyoung@redhat.com Cc: linux-tip-commits@vger.kernel.org Cc: luto@kernel.org Cc: vgoyal@redhat.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/20160506115015.GI24044@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 6392f00..ff12277 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -350,20 +350,15 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_random_location(unsigned char *input_ptr, +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +unsigned char *choose_random_location(unsigned long input, unsigned long input_size, - unsigned char *output_ptr, + unsigned long output, unsigned long output_size) { - /* - * The caller of choose_random_location() uses unsigned char * for - * buffer pointers since it performs decompression, elf parsing, etc. - * Since this code examines addresses much more numerically, - * unsigned long is used internally here. Instead of sprinkling - * more casts into extract_kernel, do them here and at return. - */ - unsigned long input = (unsigned long)input_ptr; - unsigned long output = (unsigned long)output_ptr; unsigned long choice = output; unsigned long random_addr; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9536d77..f14db4e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -366,7 +366,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_random_location(input_data, input_len, output, + output = choose_random_location((unsigned long)input_data, input_len, + (unsigned long)output, max(output_len, kernel_total_size)); /* Validate memory location choices. */ diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 1f23d02..0112005a 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,20 +67,20 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_random_location(unsigned char *input_ptr, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output_ptr, + unsigned long output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_random_location(unsigned char *input_ptr, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output_ptr, + unsigned long output_ptr, unsigned long output_size) { - return output_ptr; + return (unsigned char *)output_ptr; } #endif -- cgit v0.10.2 From ed09acde44e301b5c13755ab84821fa44b188b5e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 May 2016 12:44:59 -0700 Subject: x86/KASLR: Improve comments around the mem_avoid[] logic This attempts to improve the comments that describe how the memory range used for decompression is avoided. Additionally uses an enum instead of raw numbers for the mem_avoid[] indexing. Suggested-by: Borislav Petkov Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20160506194459.GA16480@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index ff12277..8ef1186 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -121,7 +121,14 @@ struct mem_vector { unsigned long size; }; -#define MEM_AVOID_MAX 4 +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MAX, +}; + static struct mem_vector mem_avoid[MEM_AVOID_MAX]; static bool mem_contains(struct mem_vector *region, struct mem_vector *item) @@ -147,55 +154,78 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) } /* - * In theroy, KASLR can put the kernel anywhere in area of [16M, 64T). The - * mem_avoid array is used to store the ranges that need to be avoided when - * KASLR searches for a an appropriate random address. We must avoid any + * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any * regions that are unsafe to overlap with during decompression, and other - * things like the initrd, cmdline and boot_params. + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. The are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. + * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. + * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size * - * How to calculate the unsafe areas is detailed here, and is informed by - * the decompression calculations in header.S, and the diagram in misc.c. + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) * - * The compressed vmlinux (ZO) plus relocs and the run space of ZO can't be - * overwritten by decompression output. + * The above conditions can be illustrated by a diagram: * - * ZO sits against the end of the decompression buffer, so we can calculate - * where text, data, bss, etc of ZO are positioned. + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size * - * The follow are already enforced by the code: - * - init_size >= kernel_total_size - * - input + input_len >= output + output_len - * - kernel_total_size could be >= or < output_len + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. * - * From this, we can make several observations, illustrated by a diagram: - * - init_size >= kernel_total_size - * - input + input_len > output + output_len - * - kernel_total_size >= output_len + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). * - * 0 output input input+input_len output+init_size - * | | | | | - * | | | | | - * |-----|--------|--------|------------------|----|------------|----------| - * | | | - * | | | - * output+init_size-ZO_INIT_SIZE output+output_len output+kernel_total_size + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. * - * [output, output+init_size) is for the buffer for decompressing the - * compressed kernel (ZO). + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) * - * [output, output+kernel_total_size) is for the uncompressed kernel (VO) - * and its bss, brk, etc. - * [output, output+output_len) is VO plus relocs + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. * - * [output+init_size-ZO_INIT_SIZE, output+init_size) is the copied ZO. - * [input, input+input_len) is the copied compressed (VO (vmlinux after - * objcopy) plus relocs), not the ZO. + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. * - * [input+input_len, output+init_size) is [_text, _end) for ZO. That was the - * first range in mem_avoid, which included ZO's heap and stack. Also - * [input, input+input_size) need be put in mem_avoid array, but since it - * is adjacent to the first entry, they can be merged. This is how we get - * the first entry in mem_avoid[]. + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. */ static void mem_avoid_init(unsigned long input, unsigned long input_size, unsigned long output) @@ -209,16 +239,16 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, * Avoid the region that is unsafe to overlap during * decompression. */ - mem_avoid[0].start = input; - mem_avoid[0].size = (output + init_size) - input; + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; initrd_start |= boot_params->hdr.ramdisk_image; initrd_size = (u64)boot_params->ext_ramdisk_size << 32; initrd_size |= boot_params->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; /* Avoid kernel command line. */ cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; @@ -227,12 +257,12 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ptr = (char *)(unsigned long)cmd_line; for (cmd_line_size = 0; ptr[cmd_line_size++]; ) ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; - /* Avoid params */ - mem_avoid[3].start = (unsigned long)boot_params; - mem_avoid[3].size = sizeof(*boot_params); + /* Avoid boot parameters. */ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); } /* Does this memory vector overlap a known avoided area? */ -- cgit v0.10.2 From 8665e6ff2107204f981ba8f9ee37085a003fc9e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 May 2016 15:01:33 -0700 Subject: x86/boot: Clean up indenting for asm/boot.h Before adding more defines to asm/boot.h, this cleans up the existing indenting for readability. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462572095-11754-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6b8d6e8..52e6ca6 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -12,29 +12,27 @@ /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 -#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) +# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) -#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +# error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif #ifdef CONFIG_KERNEL_BZIP2 -#define BOOT_HEAP_SIZE 0x400000 +# define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ - -#define BOOT_HEAP_SIZE 0x10000 - -#endif /* !CONFIG_KERNEL_BZIP2 */ +# define BOOT_HEAP_SIZE 0x10000 +#endif #ifdef CONFIG_X86_64 -#define BOOT_STACK_SIZE 0x4000 -#else -#define BOOT_STACK_SIZE 0x1000 +# define BOOT_STACK_SIZE 0x4000 +#else /* !CONFIG_X86_64 */ +# define BOOT_STACK_SIZE 0x1000 #endif #endif /* _ASM_X86_BOOT_H */ -- cgit v0.10.2 From cf4fb15b3110df070fe9829a1ef38fef8316fb90 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 6 May 2016 15:01:34 -0700 Subject: x86/boot: Split out kernel_ident_mapping_init() In order to support on-demand page table creation when moving the kernel for KASLR, we need to use kernel_ident_mapping_init() in the decompression code. This splits it out into its own file for use outside of init_64.c. Additionally, checking for __pa/__va defines is added since they need to be overridden in the decompression code. [kees: rewrote changelog] Signed-off-by: Yinghai Lu Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462572095-11754-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde3..cf8f619 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 0000000..751ca92 --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,74 @@ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 214afda..65cfbee 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,79 +58,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - - for (; addr < end; addr = next) { - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; - - if (pud_present(*pud)) { - pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); - continue; - } - pmd = (pmd_t *)info->alloc_pgt_page(info->context); - if (!pmd) - return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - - return 0; -} - -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; - - for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; - pud_t *pud; - - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - continue; - } - - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) - return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - - return 0; -} +#include "ident_map.c" /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the -- cgit v0.10.2 From 3a94707d7a7bb1eb82acae5fbc035247dd1ba8a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 May 2016 15:01:35 -0700 Subject: x86/KASLR: Build identity mappings on demand Currently KASLR only supports relocation in a small physical range (from 16M to 1G), due to using the initial kernel page table identity mapping. To support ranges above this, we need to have an identity mapping for the desired memory range before we can decompress (and later run) the kernel. 32-bit kernels already have the needed identity mapping. This patch adds identity mappings for the needed memory ranges on 64-bit kernels. This happens in two possible boot paths: If loaded via startup_32(), we need to set up the needed identity map. If loaded from a 64-bit bootloader, the bootloader will have already set up an identity mapping, and we'll start via the compressed kernel's startup_64(). In this case, the bootloader's page tables need to be avoided while selecting the new uncompressed kernel location. If not, the decompressor could overwrite them during decompression. To accomplish this, we could walk the pagetable and find every page that is used, and add them to mem_avoid, but this needs extra code and will require increasing the size of the mem_avoid array. Instead, we can create a new set of page tables for our own identity mapping instead. The pages for the new page table will come from the _pagetable section of the compressed kernel, which means they are already contained by in mem_avoid array. To do this, we reuse the code from the uncompressed kernel's identity mapping routines. The _pgtable will be shared by both the 32-bit and 64-bit paths to reduce init_size, as now the compressed kernel's _rodata to _end will contribute to init_size. To handle the possible mappings, we need to increase the existing page table buffer size: When booting via startup_64(), we need to cover the old VO, params, cmdline and uncompressed kernel. In an extreme case we could have them all beyond the 512G boundary, which needs (2+2)*4 pages with 2M mappings. And we'll need 2 for first 2M for VGA RAM. One more is needed for level4. This gets us to 19 pages total. When booting via startup_32(), KASLR could move the uncompressed kernel above 4G, so we need to create extra identity mappings, which should only need (2+2) pages at most when it is beyond the 512G boundary. So 19 pages is sufficient for this case as well. The resulting BOOT_*PGT_SIZE defines use the "_SIZE" suffix on their names to maintain logical consistency with the existing BOOT_HEAP_SIZE and BOOT_STACK_SIZE defines. This patch is based on earlier patches from Yinghai Lu and Baoquan He. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462572095-11754-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 77ce3a0..cfdd8c3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -75,6 +75,9 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 7c04700..0d80a7a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -134,7 +134,7 @@ ENTRY(startup_32) /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ @@ -486,4 +486,4 @@ boot_stack_end: .section ".pgtable","a",@nobits .balign 4096 pgtable: - .fill 6*4096, 1, 0 + .fill BOOT_PGT_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8ef1186..f82975b 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -241,6 +241,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, */ mem_avoid[MEM_AVOID_ZO_RANGE].start = input; mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, + mem_avoid[MEM_AVOID_ZO_RANGE].size); /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; @@ -249,6 +251,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, initrd_size |= boot_params->hdr.ramdisk_size; mem_avoid[MEM_AVOID_INITRD].start = initrd_start; mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ /* Avoid kernel command line. */ cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; @@ -259,10 +262,21 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ; mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); /* Avoid boot parameters. */ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, + mem_avoid[MEM_AVOID_BOOTPARAMS].size); + + /* We don't need to set a mapping for setup_data. */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* Make sure video RAM can be used. */ + add_identity_map(0, PMD_SIZE); +#endif } /* Does this memory vector overlap a known avoided area? */ @@ -421,6 +435,9 @@ unsigned char *choose_random_location(unsigned long input, goto out; choice = random_addr; + + add_identity_map(choice, output_size); + finalize_identity_maps(); out: return (unsigned char *)choice; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0112005a..b6fec1f 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -84,6 +84,17 @@ unsigned char *choose_random_location(unsigned long input_ptr, } #endif +#ifdef CONFIG_X86_64 +void add_identity_map(unsigned long start, unsigned long size); +void finalize_identity_maps(void); +extern unsigned char _pgtable[]; +#else +static inline void add_identity_map(unsigned long start, unsigned long size) +{ } +static inline void finalize_identity_maps(void) +{ } +#endif + #ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ extern int early_serial_base; diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c new file mode 100644 index 0000000..3c99051 --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.c @@ -0,0 +1,135 @@ +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include +#include +#include "../../mm/ident_map.c" + +/* Used by pgtable.h asm code to force instruction serialization. */ +unsigned long __force_order; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long level4p; + +/* Locates and clears a region for a new top level page table. */ +static void prepare_level4(void) +{ + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + */ + level4p = read_cr3(); + if (level4p == (unsigned long)_pgtable) { + debug_putstr("booted via startup_32()\n"); + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + debug_putstr("booted via startup_64()\n"); + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + level4p = (unsigned long)alloc_pgt_page(&pgt_data); + } +} + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Since this never changes, there's no reason to repeatedly fill it + * in on the stack when calling add_identity_map(). + */ +static struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, +}; + +/* + * Adds the specified range to what will become the new identity mappings. + * Once all ranges have been added, the new mapping is activated by calling + * finalize_identity_maps() below. + */ +void add_identity_map(unsigned long start, unsigned long size) +{ + unsigned long end = start + size; + + /* Make sure we have a top level page table ready to use. */ + if (!level4p) + prepare_level4(); + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + start, end); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(level4p); +} diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 52e6ca6..abd06b1 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -31,6 +31,25 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 + +# define BOOT_INIT_PGT_SIZE (6*4096) +# ifdef CONFIG_RANDOMIZE_BASE +/* + * Assuming all cross the 512GB boundary: + * 1 page for level4 + * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel + * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). + * Total is 19 pages. + */ +# ifdef CONFIG_X86_VERBOSE_BOOTUP +# define BOOT_PGT_SIZE (19*4096) +# else /* !CONFIG_X86_VERBOSE_BOOTUP */ +# define BOOT_PGT_SIZE (17*4096) +# endif +# else /* !CONFIG_RANDOMIZE_BASE */ +# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE +# endif + #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 #endif -- cgit v0.10.2 From 36a39ac967a548154a0fe44d71cb0063fa05010f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 7 May 2016 11:59:40 +0200 Subject: x86/boot: Comment what finalize_identity_maps() does So it is not really obvious that finalize_identity_maps() doesn't do any finalization but it *actually* writes CR3 with the ident PGD. Comment that at the call site. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: bhe@redhat.com Cc: dyoung@redhat.com Cc: jkosina@suse.cz Cc: linux-tip-commits@vger.kernel.org Cc: luto@kernel.org Cc: vgoyal@redhat.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/20160507100541.GA24613@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f82975b..f5a138c 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -437,6 +437,8 @@ unsigned char *choose_random_location(unsigned long input, choice = random_addr; add_identity_map(choice, output_size); + + /* This actually loads the identity pagetable on x86_64. */ finalize_identity_maps(); out: return (unsigned char *)choice; -- cgit v0.10.2 From 434a6c9f90f7ab5ade619455df01ef5ebea533ee Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 May 2016 13:22:04 -0700 Subject: x86/KASLR: Initialize mapping_info every time As it turns out, mapping_info DOES need to be initialized every time, because pgt_data address could be changed during kernel relocation. So it can not be build time assigned. Without this, page tables were not being corrected updated, which could cause reboots when a physical address beyond 2G was chosen. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 3c99051..34b95df 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -90,23 +90,17 @@ static void prepare_level4(void) } /* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Since this never changes, there's no reason to repeatedly fill it - * in on the stack when calling add_identity_map(). - */ -static struct x86_mapping_info mapping_info = { - .alloc_pgt_page = alloc_pgt_page, - .context = &pgt_data, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, -}; - -/* * Adds the specified range to what will become the new identity mappings. * Once all ranges have been added, the new mapping is activated by calling * finalize_identity_maps() below. */ void add_identity_map(unsigned long start, unsigned long size) { + struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; unsigned long end = start + size; /* Make sure we have a top level page table ready to use. */ -- cgit v0.10.2 From cb18ef0da259db611fbf52806592fde5f469ae67 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 May 2016 13:22:05 -0700 Subject: x86/boot: Add missing file header comments There were some files with missing header comments. Since they are included from both compressed and regular kernels, make note of that. Also corrects a typo in the mem_avoid comments. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f5a138c..f15d7b8 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -163,7 +163,7 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) * memory ranges lead to really hard to debug boot failures. * * The initrd, cmdline, and boot_params are trivial to identify for - * avoiding. The are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and * MEM_AVOID_BOOTPARAMS respectively below. * * What is not obvious how to avoid is the range of memory that is used diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 45a0768..f0b8d6d 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,7 @@ +/* + * Serial port routines for use during early boot reporting. This code is + * included from both the compressed kernel and the regular kernel. + */ #include "boot.h" #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 751ca92..ec21796 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -1,3 +1,7 @@ +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. + */ static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, unsigned long addr, unsigned long end) @@ -10,6 +14,7 @@ static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, set_pmd(pmd, __pmd(addr | pmd_flag)); } } + static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, unsigned long addr, unsigned long end) { -- cgit v0.10.2 From c401cf1524153f9c2ede7ab8ece403513925770a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 9 May 2016 13:22:06 -0700 Subject: x86/KASLR: Add 'struct slot_area' to manage random_addr slots In order to support KASLR moving the kernel anywhere in physical memory (which could be up to 64TB), we need to handle counting the potential randomization locations in a more efficient manner. In the worst case with 64TB, there could be roughly 32 * 1024 * 1024 randomization slots if CONFIG_PHYSICAL_ALIGN is 0x1000000. Currently the starting address of candidate positions is stored into the slots[] array, one at a time. This method would cost too much memory and it's also very inefficient to get and save the slot information into the slot array one by one. This patch introduces 'struct slot_area' to manage each contiguous region of randomization slots. Each slot_area will contain the starting address and how many available slots are in this area. As with the original code, the slot_areas[] will avoid the mem_avoid[] regions. Since setup_data is a linked list, it could contain an unknown number of memory regions to be avoided, which could cause us to fragment the contiguous memory that the slot_area array is tracking. In normal operation this level of fragmentation will be extremely rare, but we choose a suitably large value (100) for the array. If setup_data forces the slot_area array to become highly fragmented and there are more slots available beyond the first 100 found, the rest will be ignored for KASLR selection. The function store_slot_info() is used to calculate the number of slots available in the passed-in memory region and stores it into slot_areas[] after adjusting for alignment and size requirements. Signed-off-by: Baoquan He [ Rewrote changelog, squashed with new functions. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f15d7b8..81edf99 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -308,8 +308,37 @@ static bool mem_avoid_overlap(struct mem_vector *img) } static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + static unsigned long slot_max; +static unsigned long slot_area_index; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + static void slots_append(unsigned long addr) { /* Overflowing the slots list should be impossible. */ -- cgit v0.10.2 From 06486d6c97cebc2433a40a979f3849cd68184de9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 May 2016 13:22:07 -0700 Subject: x86/KASLR: Return earliest overlap when avoiding regions In preparation for being able to detect where to split up contiguous memory regions that overlap with memory regions to avoid, we need to pass back what the earliest overlapping region was. This modifies the overlap checker to return that information. Based on a separate mem_min_overlap() implementation by Baoquan He. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 81edf99..e55ebcb 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -279,15 +279,24 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, #endif } -/* Does this memory vector overlap a known avoided area? */ -static bool mem_avoid_overlap(struct mem_vector *img) +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) { int i; struct setup_data *ptr; + unsigned long earliest = img->start + img->size; + bool is_overlapping = false; for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + is_overlapping = true; + } } /* Avoid all entries in the setup_data linked list. */ @@ -298,13 +307,15 @@ static bool mem_avoid_overlap(struct mem_vector *img) avoid.start = (unsigned long)ptr; avoid.size = sizeof(*ptr) + ptr->len; - if (mem_overlaps(img, &avoid)) - return true; + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + is_overlapping = true; + } ptr = (struct setup_data *)(unsigned long)ptr->next; } - return false; + return is_overlapping; } static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; @@ -361,7 +372,7 @@ static void process_e820_entry(struct e820entry *entry, unsigned long minimum, unsigned long image_size) { - struct mem_vector region, img; + struct mem_vector region, img, overlap; /* Skip non-RAM entries. */ if (entry->type != E820_RAM) @@ -400,7 +411,7 @@ static void process_e820_entry(struct e820entry *entry, for (img.start = region.start, img.size = image_size ; mem_contains(®ion, &img) ; img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) + if (mem_avoid_overlap(&img, &overlap)) continue; slots_append(img.start); } -- cgit v0.10.2 From 071a74930e60d1fa51207d71f00a35b4f9d4d179 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 9 May 2016 13:22:08 -0700 Subject: x86/KASLR: Add virtual address choosing function To support randomizing the kernel virtual address separately from the physical address, this patch adds find_random_virt_addr() to choose a slot anywhere between LOAD_PHYSICAL_ADDR and KERNEL_IMAGE_SIZE. Since this address is virtual, not physical, we can place the kernel anywhere in this region, as long as it is aligned and (in the case of kernel being larger than the slot size) placed with enough room to load the entire kernel image. For clarity and readability, find_random_addr() is renamed to find_random_phys_addr() and has "size" renamed to "image_size" to match find_random_virt_addr(). Signed-off-by: Baoquan He [ Rewrote changelog, refactored slot calculation for readability. ] [ Renamed find_random_phys_addr() and size argument. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index e55ebcb..016a4f4 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -417,8 +417,8 @@ static void process_e820_entry(struct e820entry *entry, } } -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) { int i; unsigned long addr; @@ -428,12 +428,36 @@ static unsigned long find_random_addr(unsigned long minimum, /* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_map[i], minimum, size); + process_e820_entry(&boot_params->e820_map[i], minimum, + image_size); } return slots_fetch_random(); } +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + /* Align image_size for easy slot calculations. */ + image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random_addr = get_random_long() % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + /* * Since this function examines addresses much more numerically, * it takes the input and output pointers as 'unsigned long'. @@ -464,7 +488,7 @@ unsigned char *choose_random_location(unsigned long input, mem_avoid_init(input, input_size, output); /* Walk e820 and find a random address. */ - random_addr = find_random_addr(output, output_size); + random_addr = find_random_phys_addr(output, output_size); if (!random_addr) { warn("KASLR disabled: could not find suitable E820 region!"); goto out; -- cgit v0.10.2 From d2d3462f9f08da364c8fbd41e8e32229d610d49d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 May 2016 13:22:09 -0700 Subject: x86/KASLR: Clarify purpose of each get_random_long() KASLR will be calling get_random_long() twice, but the debug output won't distinguishing between them. This patch adds a report on when it is fetching the physical vs virtual address. With this, once the virtual offset is separate, the report changes from: KASLR using RDTSC... KASLR using RDTSC... into: Physical KASLR using RDTSC... Virtual KASLR using RDTSC... Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: lasse.collin@tukaani.org Link: http://lkml.kernel.org/r/1462825332-10505-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 016a4f4..cfeb025 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -72,7 +72,7 @@ static unsigned long get_random_boot(void) return hash; } -static unsigned long get_random_long(void) +static unsigned long get_random_long(const char *purpose) { #ifdef CONFIG_X86_64 const unsigned long mix_const = 0x5d6008cbf3848dd3UL; @@ -82,7 +82,8 @@ static unsigned long get_random_long(void) unsigned long raw, random = get_random_boot(); bool use_i8254 = true; - debug_putstr("KASLR using"); + debug_putstr(purpose); + debug_putstr(" KASLR using"); if (has_cpuflag(X86_FEATURE_RDRAND)) { debug_putstr(" RDRAND"); @@ -365,7 +366,7 @@ static unsigned long slots_fetch_random(void) if (slot_max == 0) return 0; - return slots[get_random_long() % slot_max]; + return slots[get_random_long("Physical") % slot_max]; } static void process_e820_entry(struct e820entry *entry, @@ -453,7 +454,7 @@ static unsigned long find_random_virt_addr(unsigned long minimum, slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN + 1; - random_addr = get_random_long() % slots; + random_addr = get_random_long("Virtual") % slots; return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; } -- cgit v0.10.2