From afda0f94483f46a4caddb529b8f95e0aaf015de6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 May 2012 23:19:36 +0900 Subject: perf build-id: Fix filename size calculation The filename is a pointer variable so the sizeof(filename) will return length of a pointer. Fix it by using 'size'. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1335881976-3282-1-git-send-email-namhyung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4c7c2d7..c0b70c6 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -296,7 +296,7 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, if (mkdir_p(filename, 0755)) goto out_free; - snprintf(filename + len, sizeof(filename) - len, "/%s", sbuild_id); + snprintf(filename + len, size - len, "/%s", sbuild_id); if (access(filename, F_OK)) { if (is_kallsyms) { -- cgit v0.10.2 From 5622c07b4741e0afd7607bce6e850b76eeb23210 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 27 Apr 2012 14:45:38 +0200 Subject: perf stat: Fix case where guest/host monitoring is not supported by kernel By default, perf stat sets exclude_guest = 1. But when you run perf on a kernel which does not support host/guest filtering, then you get an error saying the event in unsupported. This comes from the fact that when the perf_event_attr struct passed by the user is larger than the one known to the kernel there is safety check which ensures that all unknown bits are zero. But here, exclude_guest is 1 (part of the unknown bits) and thus the perf_event_open() syscall return EINVAL. To my surprise, running perf record on the same kernel did not exhibit the problem. The reason is that perf record handles the problem by catching the error and retrying with guest/host excludes set to zero. For some reason, this was not done with perf stat. This patch fixes this problem. Signed-off-by: Stephane Eranian Cc: Gleb Natapov Cc: Ingo Molnar Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Robert Richter Link: http://lkml.kernel.org/r/20120427124538.GA7230@quad Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c941bb6..4532a78 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -283,6 +283,8 @@ static int create_perf_stat_counter(struct perf_evsel *evsel, { struct perf_event_attr *attr = &evsel->attr; struct xyarray *group_fd = NULL; + bool exclude_guest_missing = false; + int ret; if (group && evsel != first) group_fd = first->fd; @@ -293,16 +295,39 @@ static int create_perf_stat_counter(struct perf_evsel *evsel, attr->inherit = !no_inherit; - if (system_wide) - return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, +retry: + if (exclude_guest_missing) + evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; + + if (system_wide) { + ret = perf_evsel__open_per_cpu(evsel, evsel_list->cpus, group, group_fd); + if (ret) + goto check_ret; + return 0; + } + if (!target_pid && !target_tid && (!group || evsel == first)) { attr->disabled = 1; attr->enable_on_exec = 1; } - return perf_evsel__open_per_thread(evsel, evsel_list->threads, - group, group_fd); + ret = perf_evsel__open_per_thread(evsel, evsel_list->threads, + group, group_fd); + if (!ret) + return 0; + /* fall through */ +check_ret: + if (ret && errno == EINVAL) { + if (!exclude_guest_missing && + (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { + pr_debug("Old kernel, cannot exclude " + "guest or host samples.\n"); + exclude_guest_missing = true; + goto retry; + } + } + return ret; } /* -- cgit v0.10.2 From 09c0211c0bb0e40231e6ee9a35041d467ed72f16 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 May 2012 11:32:54 -0700 Subject: perf: Turn off compiler warnings for flex and bison generated files We don't know what types of warnings different versions of flex and bison combined with different versions of gcc is going to generate, so just punt and don't warn about anything. This fixes the build of perf for me on an openSUSE 12.1 system. Signed-off-by: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20120504183254.GA11154@kroah.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 9bf3fc7..92271d3 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -774,10 +774,10 @@ $(OUTPUT)perf.o perf.spec \ # over the general rule for .o $(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -Iutil/ -Wno-redundant-decls -Wno-switch-default -Wno-unused-function $< + $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -Iutil/ -w $< $(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -Iutil/ -Wno-redundant-decls -Wno-switch-default -Wno-unused-function $< + $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -Iutil/ -w $< $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $< -- cgit v0.10.2 From 7164b3f5e5461b58baa82e82544ca5af71af287b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 16 Apr 2012 14:12:00 +0530 Subject: x86/microcode: Ensure that module is only loaded on supported Intel CPUs Exit early when there's no support for a particular CPU family. Signed-off-by: Srivatsa S. Bhat Cc: Greg Kroah-Hartman Cc: Andreas Herrmann Cc: Kay Sievers Cc: Dave Jones Cc: tigran@aivazian.fsnet.co.uk Cc: Borislav Petkov Link: http://lkml.kernel.org/r/4F8BDB58.6070007@linux.vnet.ibm.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3ca42d0..0327e2b 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - pr_err("CPU%d not a capable Intel processor\n", cpu_num); - return -1; - } - csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { @@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + return µcode_intel_ops; } -- cgit v0.10.2 From b2a3477727c1d43b92a3e4223f10ad5bf639df06 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 8 May 2012 15:19:29 +0100 Subject: x86: Fix section annotation of acpi_map_cpu2node() Commit 943bc7e110f2 ("x86: Fix section warnings") added __cpuinitdata here, while for functions __cpuinit should be used. Signed-off-by: Jan Beulich Cc: Link: http://lkml.kernel.org/r/4FA947910200007800082470@nat28.tlf.novell.com Signed-off-by: Ingo Molnar Cc: Steffen Persvold diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a415b1f..7c439fe 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; -- cgit v0.10.2 From 30b4e9eb783d94e9f5d503b15eb31720679ae1c7 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 9 May 2012 12:38:28 +0200 Subject: sched: Fix KVM and ia64 boot crash due to sched_groups circular linked list assumption If we have one cpu that failed to boot and boot cpu gave up on waiting for it and then another cpu is being booted, kernel might crash with following OOPS: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] __bitmap_weight+0x30/0x80 Call Trace: [] build_sched_domains+0x7b6/0xa50 The crash happens in init_sched_groups_power() that expects sched_groups to be circular linked list. However it is not always true, since sched_groups preallocated in __sdt_alloc are initialized in build_sched_groups and it may exit early if (cpu != cpumask_first(sched_domain_span(sd))) return 0; without initializing sd->groups->next field. Fix bug by initializing next field right after sched_group was allocated. Also-Reported-by: Jiang Liu Signed-off-by: Igor Mammedov Cc: a.p.zijlstra@chello.nl Cc: pjt@google.com Cc: seto.hidetoshi@jp.fujitsu.com Link: http://lkml.kernel.org/r/1336559908-32533-1-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533a68..e5212ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6382,6 +6382,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sg) return -ENOMEM; + sg->next = sg; + *per_cpu_ptr(sdd->sg, j) = sg; sgp = kzalloc_node(sizeof(struct sched_group_power), -- cgit v0.10.2 From 20d23aaa31da295378abff4272716e3dfc72baf0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 8 May 2012 09:29:16 -0600 Subject: perf stat: handle ENXIO error for perf_event_open perf stat on PPC currently fails to run: $ perf stat -- sleep 1 Error: open_counter returned with 6 (No such device or address). /bin/dmesg may provide additional information. Fatal: Not all events could be opened. The problem is that until 2.6.37 (behavior changed with commit b0a873e) perf on PPC returns ENXIO when hw_perf_event_init() fails. With this patch we get the expected behavior: $ perf stat -v -- sleep 1 cycles event is not supported by the kernel. stalled-cycles-frontend event is not supported by the kernel. stalled-cycles-backend event is not supported by the kernel. instructions event is not supported by the kernel. branches event is not supported by the kernel. branch-misses event is not supported by the kernel. ... Signed-off-by: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1336490956-57145-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 4532a78..1e5e9b2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -488,8 +488,13 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter, first) < 0) { + /* + * PPC returns ENXIO for HW counters until 2.6.37 + * (behavior changed with commit b0a873e). + */ if (errno == EINVAL || errno == ENOSYS || - errno == ENOENT || errno == EOPNOTSUPP) { + errno == ENOENT || errno == EOPNOTSUPP || + errno == ENXIO) { if (verbose) ui__warning("%s event is not supported by the kernel.\n", event_name(counter)); -- cgit v0.10.2 From 9b63776fa3ca96c4ecda76f6fa947b7b0add66ac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 May 2012 15:55:43 -0400 Subject: tracing: Do not enable function event with enable With the adding of function tracing event to perf, it caused a side effect that produces the following warning when enabling all events in ftrace: # echo 1 > /sys/kernel/debug/tracing/events/enable [console] event trace: Could not enable event function This is because when enabling all events via the debugfs system it ignores events that do not have a ->reg() function assigned. This was to skip over the ftrace internal events (as they are not TRACE_EVENTs). But as the ftrace function event now has a ->reg() function attached to it for use with perf, it is no longer ignored. Worse yet, this ->reg() function is being called when it should not be. It returns an error and causes the above warning to be printed. By adding a new event_call flag (TRACE_EVENT_FL_IGNORE_ENABLE) and have all ftrace internel event structures have it set, setting the events/enable will no longe try to incorrectly enable the function event and does not warn. Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5f3f3be..176a939 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -179,6 +179,7 @@ enum { TRACE_EVENT_FL_RECORDED_CMD_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, + TRACE_EVENT_FL_IGNORE_ENABLE_BIT, }; enum { @@ -187,6 +188,7 @@ enum { TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), + TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93a..29111da 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (!call->name || !call->class || !call->class->reg) continue; + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + if (match && strcmp(match, call->name) != 0 && strcmp(match, call->class->system) != 0) @@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->reg) + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) trace_create_file("enable", 0644, call->dir, call, enable); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8..e039906 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; -- cgit v0.10.2 From eab21585580344c8533d5e7bc4edea27ad1e268f Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 11 May 2012 11:24:41 +0530 Subject: dmaengine: pl330: dont complete descriptor for cyclic dma the cookie updates completed the cyclic dma descriptor wrongly. This caused the BUG_ON to be hit as submit is called for completed descriptor Fix this by not marking the cyclic descriptor as complete Signed-off-by: Vinod Koul Tested-by: Thomas Abraham diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 2ee6e23..fa3fb21 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -2322,7 +2322,8 @@ static void pl330_tasklet(unsigned long data) /* Pick up ripe tomatoes */ list_for_each_entry_safe(desc, _dt, &pch->work_list, node) if (desc->status == DONE) { - dma_cookie_complete(&desc->txd); + if (pch->cyclic) + dma_cookie_complete(&desc->txd); list_move_tail(&desc->node, &list); } -- cgit v0.10.2 From d88a440eddf288c967daed827147e1121c2ab6b5 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 16 Apr 2012 21:27:54 +0200 Subject: dac960: Remove unused variables from DAC960_CreateProcEntries() The variables 'StatusProcEntry' and 'UserCommandProcEntry' are assigned to once and then never used. This patch gets rid of the variables. While I was there I also fixed the indentation of the function to use tabs rather than spaces for the lines that did not already do so. Signed-off-by: Jesper Juhl Signed-off-by: Jens Axboe diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 8db9089..9a13e88 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6580,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = { static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) { - struct proc_dir_entry *StatusProcEntry; struct proc_dir_entry *ControllerProcEntry; - struct proc_dir_entry *UserCommandProcEntry; if (DAC960_ProcDirectoryEntry == NULL) { - DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); - StatusProcEntry = proc_create("status", 0, - DAC960_ProcDirectoryEntry, - &dac960_proc_fops); + DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); + proc_create("status", 0, DAC960_ProcDirectoryEntry, + &dac960_proc_fops); } - sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); - ControllerProcEntry = proc_mkdir(Controller->ControllerName, - DAC960_ProcDirectoryEntry); - proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); - proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); - UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); - Controller->ControllerProcEntry = ControllerProcEntry; + sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); + ControllerProcEntry = proc_mkdir(Controller->ControllerName, + DAC960_ProcDirectoryEntry); + proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); + proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); + proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); + Controller->ControllerProcEntry = ControllerProcEntry; } -- cgit v0.10.2 From a09ba13eefb155a00d8d50008a0c0a2406985ddd Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 16 Apr 2012 21:27:55 +0200 Subject: mtip32xx: release the semaphore on an error path Release the semaphore in an error path in mtip_hw_get_scatterlist(). This fixes the smatch warning inconsistent returns. Reported-by: Dan Carpenter Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 00f9fc9..304000c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2510,8 +2510,10 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, up(&dd->port->cmd_slot); return NULL; } - if (unlikely(*tag < 0)) + if (unlikely(*tag < 0)) { + up(&dd->port->cmd_slot); return NULL; + } return dd->port->commands[*tag].sg; } -- cgit v0.10.2 From 080399aaaf3531f5b8761ec0ac30ff98891e8686 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 11 May 2012 16:34:10 +0200 Subject: block: don't mark buffers beyond end of disk as mapped Hi, We have a bug report open where a squashfs image mounted on ppc64 would exhibit errors due to trying to read beyond the end of the disk. It can easily be reproduced by doing the following: [root@ibm-p750e-02-lp3 ~]# ls -l install.img -rw-r--r-- 1 root root 142032896 Apr 30 16:46 install.img [root@ibm-p750e-02-lp3 ~]# mount -o loop ./install.img /mnt/test [root@ibm-p750e-02-lp3 ~]# dd if=/dev/loop0 of=/dev/null dd: reading `/dev/loop0': Input/output error 277376+0 records in 277376+0 records out 142016512 bytes (142 MB) copied, 0.9465 s, 150 MB/s In dmesg, you'll find the following: squashfs: version 4.0 (2009/01/31) Phillip Lougher [ 43.106012] attempt to access beyond end of device [ 43.106029] loop0: rw=0, want=277410, limit=277408 [ 43.106039] Buffer I/O error on device loop0, logical block 138704 [ 43.106053] attempt to access beyond end of device [ 43.106057] loop0: rw=0, want=277412, limit=277408 [ 43.106061] Buffer I/O error on device loop0, logical block 138705 [ 43.106066] attempt to access beyond end of device [ 43.106070] loop0: rw=0, want=277414, limit=277408 [ 43.106073] Buffer I/O error on device loop0, logical block 138706 [ 43.106078] attempt to access beyond end of device [ 43.106081] loop0: rw=0, want=277416, limit=277408 [ 43.106085] Buffer I/O error on device loop0, logical block 138707 [ 43.106089] attempt to access beyond end of device [ 43.106093] loop0: rw=0, want=277418, limit=277408 [ 43.106096] Buffer I/O error on device loop0, logical block 138708 [ 43.106101] attempt to access beyond end of device [ 43.106104] loop0: rw=0, want=277420, limit=277408 [ 43.106108] Buffer I/O error on device loop0, logical block 138709 [ 43.106112] attempt to access beyond end of device [ 43.106116] loop0: rw=0, want=277422, limit=277408 [ 43.106120] Buffer I/O error on device loop0, logical block 138710 [ 43.106124] attempt to access beyond end of device [ 43.106128] loop0: rw=0, want=277424, limit=277408 [ 43.106131] Buffer I/O error on device loop0, logical block 138711 [ 43.106135] attempt to access beyond end of device [ 43.106139] loop0: rw=0, want=277426, limit=277408 [ 43.106143] Buffer I/O error on device loop0, logical block 138712 [ 43.106147] attempt to access beyond end of device [ 43.106151] loop0: rw=0, want=277428, limit=277408 [ 43.106154] Buffer I/O error on device loop0, logical block 138713 [ 43.106158] attempt to access beyond end of device [ 43.106162] loop0: rw=0, want=277430, limit=277408 [ 43.106166] attempt to access beyond end of device [ 43.106169] loop0: rw=0, want=277432, limit=277408 ... [ 43.106307] attempt to access beyond end of device [ 43.106311] loop0: rw=0, want=277470, limit=2774 Squashfs manages to read in the end block(s) of the disk during the mount operation. Then, when dd reads the block device, it leads to block_read_full_page being called with buffers that are beyond end of disk, but are marked as mapped. Thus, it would end up submitting read I/O against them, resulting in the errors mentioned above. I fixed the problem by modifying init_page_buffers to only set the buffer mapped if it fell inside of i_size. Cheers, Jeff Signed-off-by: Jeff Moyer Acked-by: Nick Piggin -- Changes from v1->v2: re-used max_block, as suggested by Nick Piggin. Signed-off-by: Jens Axboe diff --git a/fs/block_dev.c b/fs/block_dev.c index e08f6a20..ba11c30 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,7 +70,7 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -static sector_t max_block(struct block_device *bdev) +sector_t blkdev_max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); loff_t sz = i_size_read(bdev->bd_inode); @@ -163,7 +163,7 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= max_block(I_BDEV(inode))) { + if (iblock >= blkdev_max_block(I_BDEV(inode))) { if (create) return -EIO; @@ -185,7 +185,7 @@ static int blkdev_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - sector_t end_block = max_block(I_BDEV(inode)); + sector_t end_block = blkdev_max_block(I_BDEV(inode)); unsigned long max_blocks = bh->b_size >> inode->i_blkbits; if ((iblock + max_blocks) > end_block) { diff --git a/fs/buffer.c b/fs/buffer.c index 351e18e..ad5938c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -921,6 +921,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); do { if (!buffer_mapped(bh)) { @@ -929,7 +930,8 @@ init_page_buffers(struct page *page, struct block_device *bdev, bh->b_blocknr = block; if (uptodate) set_buffer_uptodate(bh); - set_buffer_mapped(bh); + if (block < end_block) + set_buffer_mapped(bh); } block++; bh = bh->b_this_page; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8de6755..25c40b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2051,6 +2051,7 @@ extern void unregister_blkdev(unsigned int, const char *); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); +extern sector_t blkdev_max_block(struct block_device *bdev); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern void invalidate_bdev(struct block_device *); -- cgit v0.10.2 From f908ee9463b09ddd05e1c1a0111132212dc05fac Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 11 May 2012 16:36:44 +0200 Subject: bio allocation failure due to bio_get_nr_vecs() The number of bio_get_nr_vecs() is passed down via bio_alloc() to bvec_alloc_bs(), which fails the bio allocation if nr_iovecs > BIO_MAX_PAGES. For the underlying caller this causes an unexpected bio allocation failure. Limiting to queue_max_segments() is not sufficient, as max_segments also might be very large. bvec_alloc_bs(gfp_mask, nr_iovecs, ) => NULL when nr_iovecs > BIO_MAX_PAGES bio_alloc_bioset(gfp_mask, nr_iovecs, ...) bio_alloc(GFP_NOIO, nvecs) xfs_alloc_ioend_bio() Signed-off-by: Bernd Schubert Cc: stable@kernel.org Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index e453924..84da885 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -505,9 +505,14 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - return min_t(unsigned, + int nr_pages; + + nr_pages = min_t(unsigned, queue_max_segments(q), queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + + return min_t(unsigned, nr_pages, BIO_MAX_PAGES); + } EXPORT_SYMBOL(bio_get_nr_vecs); -- cgit v0.10.2 From 998de4acb2ba188d20768d1065658377a2e7d29b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 May 2012 17:42:37 +0100 Subject: ARM: 7417/1: vfp: ensure preemption is disabled when enabling VFP access The vfp_enable function enables access to the VFP co-processor register space (cp10 and cp11) on the current CPU and must be called with preemption disabled. Unfortunately, the vfp_init late initcall does not disable preemption and can lead to an oops during boot if thread migration occurs at the wrong time and we end up attempting to access the FPSID on a CPU with VFP access disabled. This patch fixes the initcall to call vfp_enable from a non-preemptible context on each CPU and adds a BUG_ON(preemptible) to ensure that any similar problems are easily spotted in the future. Cc: stable@vger.kernel.org Reported-by: Hyungwoo Yang Signed-off-by: Hyungwoo Yang Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index bc683b8..c5767b5 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -432,7 +433,10 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) static void vfp_enable(void *unused) { - u32 access = get_copro_access(); + u32 access; + + BUG_ON(preemptible()); + access = get_copro_access(); /* * Enable full access to VFP (cp10 and cp11) @@ -657,7 +661,7 @@ static int __init vfp_init(void) unsigned int cpu_arch = cpu_architecture(); if (cpu_arch >= CPU_ARCH_ARMv6) - vfp_enable(NULL); + on_each_cpu(vfp_enable, NULL, 1); /* * First check that there is a VFP that we can use. @@ -678,8 +682,6 @@ static int __init vfp_init(void) } else { hotcpu_notifier(vfp_hotplug, 0); - smp_call_function(vfp_enable, NULL, 1); - VFP_arch = (vfpsid & FPSID_ARCH_MASK) >> FPSID_ARCH_BIT; /* Extract the architecture version */ printk("implementor %02x architecture %d part %02x variant %x rev %x\n", (vfpsid & FPSID_IMPLEMENTER_MASK) >> FPSID_IMPLEMENTER_BIT, -- cgit v0.10.2 From 85fd0bc95bc76d129db0d0114c40665745bbba07 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 14 May 2012 08:29:23 +0200 Subject: Fix blkdev.h build errors when BLOCK=n I see builds failing with: CC [M] drivers/mmc/host/dw_mmc.o In file included from drivers/mmc/host/dw_mmc.c:15: include/linux/blkdev.h:1404: warning: 'struct task_struct' declared inside parameter list include/linux/blkdev.h:1404: warning: its scope is only this definition or declaration, which is probably not what you want include/linux/blkdev.h:1408: warning: 'struct task_struct' declared inside parameter list include/linux/blkdev.h:1413: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'blk_needs_flush_plug' make[4]: *** [drivers/mmc/host/dw_mmc.o] Error 1 This is because dw_mmc.c includes linux/blkdev.h as the very first file, and when CONFIG_BLOCK=n, blkdev.h omits all includes. As it requires linux/sched.h even when CONFIG_BLOCK=n, move this out of the #ifdef. Signed-off-by: Russell King Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2aa2466..4d4ac24 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1,9 +1,10 @@ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H +#include + #ifdef CONFIG_BLOCK -#include #include #include #include -- cgit v0.10.2 From 97e7abc5e04bea6a5db8d54ad34407bb46e85403 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Mon, 14 May 2012 13:27:09 +0300 Subject: ARM: tegra: Fix flow controller accesses flowctrl_write_cpu_csr uses the cpu halt offsets and vice versa. This patch fixes this bug. Reported-by: Dan Willemsen Signed-off-by: Peter De Schrijver [swarren: This problem was introduced in v3.4-rc1, in commit 26fe681 "ARM: tegra: functions to access the flowcontroller", when this file was first added] Signed-off-by: Stephen Warren diff --git a/arch/arm/mach-tegra/flowctrl.c b/arch/arm/mach-tegra/flowctrl.c index fef66a7..f07488e 100644 --- a/arch/arm/mach-tegra/flowctrl.c +++ b/arch/arm/mach-tegra/flowctrl.c @@ -53,10 +53,10 @@ static void flowctrl_update(u8 offset, u32 value) void flowctrl_write_cpu_csr(unsigned int cpuid, u32 value) { - return flowctrl_update(flowctrl_offset_halt_cpu[cpuid], value); + return flowctrl_update(flowctrl_offset_cpu_csr[cpuid], value); } void flowctrl_write_cpu_halt(unsigned int cpuid, u32 value) { - return flowctrl_update(flowctrl_offset_cpu_csr[cpuid], value); + return flowctrl_update(flowctrl_offset_halt_cpu[cpuid], value); } -- cgit v0.10.2 From c3709e6734daa4d9b37fe31592ebb0eb57bae1bb Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 14 May 2012 18:41:09 +0100 Subject: x86, kvm: KVM paravirt kernels don't check for CPUID being unavailable We set cpuid_level to -1 if there is no CPUID instruction (only possible on i386). Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20120514174059.30236.1064.stgit@bluebook Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=12122 Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 734c376..183922e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -170,6 +170,9 @@ static inline int kvm_para_available(void) unsigned int eax, ebx, ecx, edx; char signature[13]; + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); memcpy(signature + 0, &ebx, 4); memcpy(signature + 4, &ecx, 4); -- cgit v0.10.2 From dad1743e5993f19b3d7e7bd0fb35dc45b5326626 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 14 May 2012 15:07:48 -0700 Subject: x86/mce: Only restart instruction after machine check recovery if it is safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Section 15.3.1.2 of the software developer manual has this to say about the RIPV bit in the IA32_MCG_STATUS register: RIPV (restart IP valid) flag, bit 0 — Indicates (when set) that program execution can be restarted reliably at the instruction pointed to by the instruction pointer pushed on the stack when the machine-check exception is generated. When clear, the program cannot be reliably restarted at the pushed instruction pointer. We need to save the state of this bit in do_machine_check() and use it in mce_notify_process() to force a signal; even if memory_failure() says it made a complete recovery ... e.g. replaced a clean LRU page. Acked-by: Borislav Petkov Signed-off-by: Tony Luck diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09..11c9166 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -945,9 +945,10 @@ struct mce_info { atomic_t inuse; struct task_struct *t; __u64 paddr; + int restartable; } mce_info[MCE_INFO_MAX]; -static void mce_save_info(__u64 addr) +static void mce_save_info(__u64 addr, int c) { struct mce_info *mi; @@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr) if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { mi->t = current; mi->paddr = addr; + mi->restartable = c; return; } } @@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { /* schedule action before return to userland */ - mce_save_info(m.addr); + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); set_thread_flag(TIF_MCE_NOTIFY); } else if (kill_it) { force_sig(SIGBUS, current); @@ -1179,7 +1181,13 @@ void mce_notify_process(void) pr_err("Uncorrected hardware memory error in user-access at %llx", mi->paddr); - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || + mi->restartable == 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } -- cgit v0.10.2 From edc318d9fea6854df752ec8c645b960b0d5a1d23 Mon Sep 17 00:00:00 2001 From: Bernhard Kohl Date: Sun, 13 May 2012 23:39:37 +0200 Subject: target: Fix SPC-2 RELEASE bug for multi-session iSCSI client setups This patch addresses a bug in a special case for target core SPC-2 RELEASE logic where the same physical client (eg: iSCSI InitiatorName) with differing iSCSI session identifiers (ISID) is allowed to incorrectly release the same client's SPC-2 reservation from the non reservation holding path. Note this bug is specific to iscsi-target w/ SPC-2 reservations, and with the default enforce_pr_isids=1 device attr setting in target-core controls if a InitiatorName + different ISID reservations are handled the same as a single iSCSI client entity. Signed-off-by: Bernhard Kohl Cc: Signed-off-by: Nicholas Bellinger diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 86f0c3b..c3148b1 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -220,6 +220,9 @@ int target_scsi2_reservation_release(struct se_task *task) if (dev->dev_reserved_node_acl != sess->se_node_acl) goto out_unlock; + if (dev->dev_res_bin_isid != sess->sess_bin_isid) + goto out_unlock; + dev->dev_reserved_node_acl = NULL; dev->dev_flags &= ~DF_SPC2_RESERVATIONS; if (dev->dev_flags & DF_SPC2_RESERVATIONS_WITH_ISID) { -- cgit v0.10.2 From 1ebfefcf37a6e308266a8d786e8cfea0a454058c Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Sat, 12 May 2012 09:45:08 +0100 Subject: crypto: mv_cesa requires on CRYPTO_HASH to build Without CRYPTO_HASH being selected, mv_cesa has a lot of hooks into undefined exports. ---- MODPOST 81 modules Kernel: arch/arm/boot/Image is ready AS arch/arm/boot/compressed/head.o GZIP arch/arm/boot/compressed/piggy.gzip CC arch/arm/boot/compressed/misc.o CC arch/arm/boot/compressed/decompress.o ERROR: "crypto_ahash_type" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_shash_final" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_register_ahash" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_unregister_ahash" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_shash_update" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_shash_digest" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_shash_setkey" [drivers/crypto/mv_cesa.ko] undefined! ERROR: "crypto_alloc_shash" [drivers/crypto/mv_cesa.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 make: *** Waiting for unfinished jobs.... ---- Signed-off-by: Alexander Clouter Signed-off-by: Jason Cooper Cc: stable@vger.kernel.org diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index ab9abb4..dd414d9 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -164,6 +164,7 @@ config CRYPTO_DEV_MV_CESA select CRYPTO_ALGAPI select CRYPTO_AES select CRYPTO_BLKCIPHER2 + select CRYPTO_HASH help This driver allows you to utilize the Cryptographic Engines and Security Accelerator (CESA) which can be found on the Marvell Orion -- cgit v0.10.2 From d41160524d8d8ee721296e5c7e15f1ed727df7eb Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 11 May 2012 11:48:21 +0530 Subject: dmaengine: fix cyclic dma usage for cyclic dma, dont makr the descriptor as complte. Fix the remaining users of cyclic dma which do so Signed-off-by: Vinod Koul Tested-by: Nicolas Ferre Acked-by: Nicolas Ferre Tested-by: Mika Westerberg diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 445fdf8..bf0d7e4 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -245,7 +245,9 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc) dev_vdbg(chan2dev(&atchan->chan_common), "descriptor %u complete\n", txd->cookie); - dma_cookie_complete(txd); + /* mark the descriptor as complete for non cyclic cases only */ + if (!atc_chan_is_cyclic(atchan)) + dma_cookie_complete(txd); /* move children to free_list */ list_splice_init(&desc->tx_list, &atchan->free_list); diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index e6f133b..f6e9b57 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -703,7 +703,9 @@ static void ep93xx_dma_tasklet(unsigned long data) desc = ep93xx_dma_get_active(edmac); if (desc) { if (desc->complete) { - dma_cookie_complete(&desc->txd); + /* mark descriptor complete for non cyclic case only */ + if (!test_bit(EP93XX_DMA_IS_CYCLIC, &edmac->flags)) + dma_cookie_complete(&desc->txd); list_splice_init(&edmac->active, &list); } callback = desc->txd.callback; -- cgit v0.10.2 From 05c69d298c96703741cac9a5cbbf6c53bd55a6e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 May 2012 08:22:04 +0200 Subject: block: fix buffer overflow when printing partition UUIDs 6d1d8050b4bc8 "block, partition: add partition_meta_info to hd_struct" added part_unpack_uuid() which assumes that the passed in buffer has enough space for sprintfing "%pU" - 37 characters including '\0'. Unfortunately, b5af921ec0233 "init: add support for root devices specified by partition UUID" supplied 33 bytes buffer to the function leading to the following panic with stackprotector enabled. Kernel panic - not syncing: stack-protector: Kernel stack corrupted in: ffffffff81b14c7e [] panic+0xba/0x1c6 [] ? printk_all_partitions+0x259/0x26xb [] __stack_chk_fail+0x1b/0x20 [] printk_all_paritions+0x259/0x26xb [] mount_block_root+0x1bc/0x27f [] mount_root+0x57/0x5b [] prepare_namespace+0x13d/0x176 [] ? release_tgcred.isra.4+0x330/0x30 [] kernel_init+0x155/0x15a [] ? schedule_tail+0x27/0xb0 [] kernel_thread_helper+0x5/0x10 [] ? start_kernel+0x3c5/0x3c5 [] ? gs_change+0x13/0x13 Increase the buffer size, remove the dangerous part_unpack_uuid() and use snprintf() directly from printk_all_partitions(). Signed-off-by: Tejun Heo Reported-by: Szymon Gruszczynski Cc: Will Drewry Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index df9816e..9cf5583 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -743,7 +743,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; - u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; + char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5]; /* * Don't show empty devices or things that have been @@ -762,14 +762,16 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - uuid[0] = 0; + uuid_buf[0] = '\0'; if (part->info) - part_unpack_uuid(part->info->uuid, uuid); + snprintf(uuid_buf, sizeof(uuid_buf), "%pU", + part->info->uuid); printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf), uuid); + disk_name(disk, part->partno, name_buf), + uuid_buf); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e61d319..017a7fb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -222,12 +222,6 @@ static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) } } -static inline char *part_unpack_uuid(const u8 *uuid, char *out) -{ - sprintf(out, "%pU", uuid); - return out; -} - static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) -- cgit v0.10.2 From 5e185581d7c46ddd33cd9c01106d1fc86efb9376 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 15 May 2012 11:04:19 +0100 Subject: [PARISC] fix PA1.1 oops on boot All PA1.1 systems have been oopsing on boot since commit f311847c2fcebd81912e2f0caf8a461dec28db41 Author: James Bottomley Date: Wed Dec 22 10:22:11 2010 -0600 parisc: flush pages through tmpalias space because a PA2.0 instruction was accidentally introduced into the PA1.1 TLB insertion interruption path when it was consolidated with the do_alias macro. Fix the do_alias macro only to use PA2.0 instructions if compiled for 64 bit. Cc: stable@vger.kernel.org #2.6.39+ Signed-off-by: James Bottomley diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 6f05944..5350342 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -581,7 +581,11 @@ */ cmpiclr,= 0x01,\tmp,%r0 ldi (_PAGE_DIRTY|_PAGE_READ|_PAGE_WRITE),\prot +#ifdef CONFIG_64BIT depd,z \prot,8,7,\prot +#else + depw,z \prot,8,7,\prot +#endif /* * OK, it is in the temp alias region, check whether "from" or "to". * Check "subtle" note in pacache.S re: r23/r26. -- cgit v0.10.2 From 207f583d7179f707f402c36a7bda5ca1fd03ad5b Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Wed, 16 May 2012 10:14:52 +0100 Subject: [PARISC] fix crash in flush_icache_page_asm on PA1.1 As pointed out by serveral people, PA1.1 only has a type 26 instruction meaning that the space register must be explicitly encoded. Not giving an explicit space means that the compiler uses the type 24 version which is PA2.0 only resulting in an illegal instruction crash. This regression was caused by commit f311847c2fcebd81912e2f0caf8a461dec28db41 Author: James Bottomley Date: Wed Dec 22 10:22:11 2010 -0600 parisc: flush pages through tmpalias space Reported-by: Helge Deller Signed-off-by: John David Anglin Cc: stable@vger.kernel.org #2.6.39+ Signed-off-by: James Bottomley diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 93ff3d9..5d7218a 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -692,7 +692,7 @@ ENTRY(flush_icache_page_asm) /* Purge any old translation */ - pitlb (%sr0,%r28) + pitlb (%sr4,%r28) ldil L%icache_stride, %r1 ldw R%icache_stride(%r1), %r1 @@ -706,27 +706,29 @@ ENTRY(flush_icache_page_asm) sub %r25, %r1, %r25 -1: fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) - fic,m %r1(%r28) + /* fic only has the type 26 form on PA1.1, requiring an + * explicit space specification, so use %sr4 */ +1: fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) + fic,m %r1(%sr4,%r28) cmpb,COND(<<) %r28, %r25,1b - fic,m %r1(%r28) + fic,m %r1(%sr4,%r28) sync bv %r0(%r2) - pitlb (%sr0,%r25) + pitlb (%sr4,%r25) .exit .procend -- cgit v0.10.2 From b3cb8674811d1851bbf1486a73d62b90c119b994 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 16 May 2012 11:10:27 +0100 Subject: [PARISC] fix panic on prefetch(NULL) on PA7300LC Due to an errata, the PA7300LC generates a TLB miss interruption even on the prefetch instruction. This means that prefetch(NULL), which is supposed to be a nop on linux actually generates a NULL deref fault. Fix this by testing the address of prefetch against NULL before doing the prefetch. Cc: stable@vger.kernel.org Signed-off-by: James Bottomley diff --git a/arch/parisc/include/asm/prefetch.h b/arch/parisc/include/asm/prefetch.h index c5edc60..1ee7c82 100644 --- a/arch/parisc/include/asm/prefetch.h +++ b/arch/parisc/include/asm/prefetch.h @@ -21,7 +21,12 @@ #define ARCH_HAS_PREFETCH static inline void prefetch(const void *addr) { - __asm__("ldw 0(%0), %%r0" : : "r" (addr)); + __asm__( +#ifndef CONFIG_PA20 + /* Need to avoid prefetch of NULL on PA7300LC */ + " extrw,u,= %0,31,32,%%r0\n" +#endif + " ldw 0(%0), %%r0" : : "r" (addr)); } /* LDD is a PA2.0 addition. */ -- cgit v0.10.2 From 9b61a4d1b2064dbd0c9e61754305ac852170509f Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 16 May 2012 15:19:20 +0100 Subject: ARM: prevent VM_GROWSDOWN mmaps extending below FIRST_USER_ADDRESS Cc: Reported-by: Al Viro Signed-off-by: Russell King diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index f074675..5bb4835 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -247,7 +247,9 @@ good_area: return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); check_stack: - if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) + /* Don't allow expansion below FIRST_USER_ADDRESS */ + if (vma->vm_flags & VM_GROWSDOWN && + addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr)) goto good_area; out: return fault; -- cgit v0.10.2 From 6508fdbf40a92fd7c19d32780ea33ce8e8362b93 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 17 May 2012 10:08:45 +1000 Subject: md/raid10: set dev_sectors properly when resizing devices in array. raid10 stores dev_sectors in 'conf' separately from the one in 'mddev' because it can have a very significant effect on block addressing and so need to be updated carefully. However raid10_resize isn't updating it at all! To update it correctly, we need to make sure it is a proper multiple of the chunksize taking various details of the layout in to account. This calculation is currently done in setup_conf. So split it out from there and call it from raid10_resize as well. Then set conf->dev_sectors properly. Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c8dbb84..3e7b154 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3164,12 +3164,40 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) return size << conf->chunk_shift; } +static void calc_sectors(struct r10conf *conf, sector_t size) +{ + /* Calculate the number of sectors-per-device that will + * actually be used, and set conf->dev_sectors and + * conf->stride + */ + + size = size >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * conf->raid_disks; + sector_div(size, conf->near_copies); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" */ + size = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. + */ + size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); + + conf->dev_sectors = size << conf->chunk_shift; + + if (conf->far_offset) + conf->stride = 1 << conf->chunk_shift; + else { + sector_div(size, conf->near_copies); + conf->stride = size << conf->chunk_shift; + } +} static struct r10conf *setup_conf(struct mddev *mddev) { struct r10conf *conf = NULL; int nc, fc, fo; - sector_t stride, size; int err = -EINVAL; if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || @@ -3219,28 +3247,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf->r10bio_pool) goto out; - size = mddev->dev_sectors >> conf->chunk_shift; - sector_div(size, fc); - size = size * conf->raid_disks; - sector_div(size, nc); - /* 'size' is now the number of chunks in the array */ - /* calculate "used chunks per device" in 'stride' */ - stride = size * conf->copies; - - /* We need to round up when dividing by raid_disks to - * get the stride size. - */ - stride += conf->raid_disks - 1; - sector_div(stride, conf->raid_disks); - - conf->dev_sectors = stride << conf->chunk_shift; - - if (fo) - stride = 1; - else - sector_div(stride, fc); - conf->stride = stride << conf->chunk_shift; - + calc_sectors(conf, mddev->dev_sectors); spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -3468,7 +3475,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->recovery_cp = oldsize; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->dev_sectors = sectors; + calc_sectors(conf, sectors); + mddev->dev_sectors = conf->dev_sectors; mddev->resync_max_sectors = size; return 0; } -- cgit v0.10.2 From 0d9f4f135eb6dea06bdcb7065b1e4ff78274a5e9 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 16 May 2012 04:06:14 -0500 Subject: MD: Add del_timer_sync to mddev_suspend (fix nasty panic) Use del_timer_sync to remove timer before mddev_suspend finishes. We don't want a timer going off after an mddev_suspend is called. This is especially true with device-mapper, since it can call the destructor function immediately following a suspend. This results in the removal (kfree) of the structures upon which the timer depends - resulting in a very ugly panic. Therefore, we add a del_timer_sync to mddev_suspend to prevent this. Cc: stable@vger.kernel.org Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 477eb2e..01233d8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -391,6 +391,8 @@ void mddev_suspend(struct mddev *mddev) synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + + del_timer_sync(&mddev->safemode_timer); } EXPORT_SYMBOL_GPL(mddev_suspend); -- cgit v0.10.2 From ad3b8a83933e83e5a2a08956211343bda269e778 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 17 May 2012 11:28:55 +0800 Subject: ARM: PRIMA2: fix irq domain size and IRQ mask of internal interrupt controller the old codes will cause 3.4 kernel warning as irq domain size is wrong: ------------[ cut here ]------------ WARNING: at kernel/irq/irqdomain.c:74 irq_domain_legacy_revmap+0x24/0x48() Modules linked in: [] (unwind_backtrace+0x0/0xf8) from [] (warn_slowpath_common+0x54/0x64) [] (warn_slowpath_common+0x54/0x64) from [] (warn_slowpath_null+0x1c/0x24) [] (warn_slowpath_null+0x1c/0x24) from [] (irq_domain_legacy_revmap+0x24/0x48) [] (irq_domain_legacy_revmap+0x24/0x48) from [] (irq_create_mapping+0x20/0x120) [] (irq_create_mapping+0x20/0x120) from [] (irq_create_of_mapping+0x7c/0xf0) [] (irq_create_of_mapping+0x7c/0xf0) from [] (irq_of_parse_and_map+0x2c/0x34) [] (irq_of_parse_and_map+0x2c/0x34) from [] (of_irq_to_resource+0x18/0x74) [] (of_irq_to_resource+0x18/0x74) from [] (of_irq_count+0x24/0x34) [] (of_irq_count+0x24/0x34) from [] (of_device_alloc+0x58/0x158) [] (of_device_alloc+0x58/0x158) from [] (of_platform_device_create_pdata+0x3c/0x80) [] (of_platform_device_create_pdata+0x3c/0x80) from [] (of_platform_bus_create+0xc8/0x190) [] (of_platform_bus_create+0xc8/0x190) from [] (of_platform_bus_create+0x12c/0x190) ---[ end trace 1b75b31a2719ed32 ]--- Signed-off-by: Barry Song Signed-off-by: Olof Johansson diff --git a/arch/arm/mach-prima2/irq.c b/arch/arm/mach-prima2/irq.c index 37c2de9..a7b9415 100644 --- a/arch/arm/mach-prima2/irq.c +++ b/arch/arm/mach-prima2/irq.c @@ -42,7 +42,8 @@ sirfsoc_alloc_gc(void __iomem *base, unsigned int irq_start, unsigned int num) static __init void sirfsoc_irq_init(void) { sirfsoc_alloc_gc(sirfsoc_intc_base, 0, 32); - sirfsoc_alloc_gc(sirfsoc_intc_base + 4, 32, SIRFSOC_INTENAL_IRQ_END - 32); + sirfsoc_alloc_gc(sirfsoc_intc_base + 4, 32, + SIRFSOC_INTENAL_IRQ_END + 1 - 32); writel_relaxed(0, sirfsoc_intc_base + SIRFSOC_INT_RISC_LEVEL0); writel_relaxed(0, sirfsoc_intc_base + SIRFSOC_INT_RISC_LEVEL1); @@ -68,7 +69,8 @@ void __init sirfsoc_of_irq_init(void) if (!sirfsoc_intc_base) panic("unable to map intc cpu registers\n"); - irq_domain_add_legacy(np, 32, 0, 0, &irq_domain_simple_ops, NULL); + irq_domain_add_legacy(np, SIRFSOC_INTENAL_IRQ_END + 1, 0, 0, + &irq_domain_simple_ops, NULL); of_node_put(np); -- cgit v0.10.2 From fa8b66ccd2d200b64496cfedcce90bf54fe7d6e9 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 25 Apr 2012 14:40:39 +0530 Subject: virtio: console: tell host of open ports after resume from s3/s4 If a port was open before going into one of the sleep states, the port can continue normal operation after restore. However, the host has to be told that the guest side of the connection is open to restore pre-suspend state. This wasn't noticed so far due to a bug in qemu that was fixed recently (which marked the guest-side connection as always open). CC: stable@vger.kernel.org # Only for 3.3 Signed-off-by: Amit Shah Signed-off-by: Michael S. Tsirkin diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ddf86b6..cdf2f54 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1895,6 +1895,13 @@ static int virtcons_restore(struct virtio_device *vdev) /* Get port open/close status on the host */ send_control_msg(port, VIRTIO_CONSOLE_PORT_READY, 1); + + /* + * If a port was open at the time of suspending, we + * have to let the host know that it's still open. + */ + if (port->guest_connected) + send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 1); } return 0; } -- cgit v0.10.2 From b8ae0eb320b38415da94a41f75e9f99e0aaff06c Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 27 Apr 2012 00:45:56 +0530 Subject: virtio: balloon: let host know of updated balloon size before module removal When the balloon module is removed, we deflate the balloon, reclaiming all the pages that were given to the host. However, we don't update the config values for the new balloon size, resulting in the host showing outdated balloon values. The size update is done after each leak and fill operation, only the module removal case was left out. Signed-off-by: Amit Shah Signed-off-by: Michael S. Tsirkin diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index c2d05a8..8807fe5 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -390,6 +390,7 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev) /* There might be pages left in the balloon: free them. */ while (vb->num_pages) leak_balloon(vb, vb->num_pages); + update_balloon_size(vb); /* Now we reset the device so we can clean up the queues. */ vdev->config->reset(vdev); -- cgit v0.10.2 From ec13ee80145ccb95b00e6e610044bbd94a170051 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 16 May 2012 10:57:12 +0300 Subject: virtio_net: invoke softirqs after __napi_schedule __napi_schedule might raise softirq but nothing causes do_softirq to trigger, so it does not in fact run. As a result, the error message "NOHZ: local_softirq_pending 08" sometimes occurs during boot of a KVM guest when the network service is started and we are oom: ... Bringing up loopback interface: [ OK ] Bringing up interface eth0: Determining IP information for eth0...NOHZ: local_softirq_pending 08 done. [ OK ] ... Further, receive queue processing might get delayed indefinitely until some interrupt triggers: virtio_net expected napi to be run immediately. One way to cause do_softirq to be executed is by invoking local_bh_enable(). As __napi_schedule is normally called from bh or irq context, this seems to make sense: disable bh before __napi_schedule and enable afterwards. In fact it's a very complicated way of calling do_softirq(), and works since this function is only used when we are not in interrupt context. It's not hot at all, in any ideal scenario. Reported-by: Ulrich Obergfell Tested-by: Ulrich Obergfell Signed-off-by: Michael S. Tsirkin Acked-by: Rusty Russell diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index af8acc8..cbefe67 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -492,7 +492,9 @@ static void virtnet_napi_enable(struct virtnet_info *vi) * We synchronize against interrupts via NAPI_STATE_SCHED */ if (napi_schedule_prep(&vi->napi)) { virtqueue_disable_cb(vi->rvq); + local_bh_disable(); __napi_schedule(&vi->napi); + local_bh_enable(); } } -- cgit v0.10.2 From 1a3abcf41f13666d4ed241c8cc7f48bd38e7b543 Mon Sep 17 00:00:00 2001 From: Vitaly Andrianov Date: Tue, 15 May 2012 15:01:16 +0100 Subject: ARM: 7418/1: LPAE: fix access flag setup in mem_type_table A zero value for prot_sect in the memory types table implies that section mappings should never be created for the memory type in question. This is checked for in alloc_init_section(). With LPAE, we set a bit to mask access flag faults for kernel mappings. This breaks the aforementioned (!prot_sect) check in alloc_init_section(). This patch fixes this bug by first checking for a non-zero prot_sect before setting the PMD_SECT_AF flag. Signed-off-by: Vitaly Andrianov Acked-by: Catalin Marinas Signed-off-by: Russell King diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 2c7cf2f..aa78de8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -489,7 +489,8 @@ static void __init build_mem_type_table(void) */ for (i = 0; i < ARRAY_SIZE(mem_types); i++) { mem_types[i].prot_pte |= PTE_EXT_AF; - mem_types[i].prot_sect |= PMD_SECT_AF; + if (mem_types[i].prot_sect) + mem_types[i].prot_sect |= PMD_SECT_AF; } kern_pgprot |= PTE_EXT_AF; vecs_pgprot |= PTE_EXT_AF; -- cgit v0.10.2 From 56cb248428ead13a6b423ed3f3cf9e4aa01244b1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 May 2012 15:51:54 +0100 Subject: ARM: 7419/1: vfp: fix VFP flushing regression on sigreturn path Commit ff9a184c ("ARM: 7400/1: vfp: clear fpscr length and stride bits on entry to sig handler") flushes the VFP state prior to entering a signal handler so that a VFP operation inside the handler will trap and force a restore of ABI-compliant registers. Reflushing and disabling VFP on the sigreturn path is predicated on the saved thread state indicating that VFP was used by the handler -- however for SMP platforms this is only set on context-switch, making the check unreliable and causing VFP register corruption in userspace since the register values are not necessarily those restored from the sigframe. This patch unconditionally flushes the VFP state after a signal handler. Since we already perform the flush before the handler and the flushing itself happens lazily, the redundant flush when VFP is not used by the handler is essentially a nop. Reported-by: Jon Medhurst Signed-off-by: Jon Medhurst Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index c5767b5..b0197b2 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -577,12 +577,6 @@ int vfp_preserve_user_clear_hwstate(struct user_vfp __user *ufp, * entry. */ hwstate->fpscr &= ~(FPSCR_LENGTH_MASK | FPSCR_STRIDE_MASK); - - /* - * Disable VFP in the hwstate so that we can detect if it gets - * used. - */ - hwstate->fpexc &= ~FPEXC_EN; return 0; } @@ -595,12 +589,8 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp, unsigned long fpexc; int err = 0; - /* - * If VFP has been used, then disable it to avoid corrupting - * the new thread state. - */ - if (hwstate->fpexc & FPEXC_EN) - vfp_flush_hwstate(thread); + /* Disable VFP to avoid corrupting the new thread state. */ + vfp_flush_hwstate(thread); /* * Copy the floating point registers. There can be unused -- cgit v0.10.2 From cd9323fd68aee3c1c6b5b21e5746c9d1b586fb58 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 16 May 2012 16:05:26 -0700 Subject: target: Fix bug in handling of FILEIO + block_device resize ops This patch fixes a bug in the handling of FILEIO w/ underlying block_device resize operations where the original fd_dev->fd_dev_size was incorrectly being used in fd_get_blocks() for READ_CAPACITY response payloads. This patch avoids using fd_dev->fd_dev_size for FILEIO devices with an underlying block_device, and instead changes fd_get_blocks() to get the sector count directly from i_size_read() as recommended by hch. Reported-by: Christoph Hellwig Cc: Signed-off-by: Nicholas Bellinger diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7ed58e2..f286955 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -169,6 +169,7 @@ static struct se_device *fd_create_virtdevice( inode = file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { struct request_queue *q; + unsigned long long dev_size; /* * Setup the local scope queue_limits from struct request_queue->limits * to pass into transport_add_device_to_core_hba() as struct se_dev_limits. @@ -183,13 +184,12 @@ static struct se_device *fd_create_virtdevice( * one (1) logical sector from underlying struct block_device */ fd_dev->fd_block_size = bdev_logical_block_size(inode->i_bdev); - fd_dev->fd_dev_size = (i_size_read(file->f_mapping->host) - + dev_size = (i_size_read(file->f_mapping->host) - fd_dev->fd_block_size); pr_debug("FILEIO: Using size: %llu bytes from struct" " block_device blocks: %llu logical_block_size: %d\n", - fd_dev->fd_dev_size, - div_u64(fd_dev->fd_dev_size, fd_dev->fd_block_size), + dev_size, div_u64(dev_size, fd_dev->fd_block_size), fd_dev->fd_block_size); } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { @@ -605,10 +605,20 @@ static u32 fd_get_device_type(struct se_device *dev) static sector_t fd_get_blocks(struct se_device *dev) { struct fd_dev *fd_dev = dev->dev_ptr; - unsigned long long blocks_long = div_u64(fd_dev->fd_dev_size, - dev->se_sub_dev->se_dev_attrib.block_size); + struct file *f = fd_dev->fd_file; + struct inode *i = f->f_mapping->host; + unsigned long long dev_size; + /* + * When using a file that references an underlying struct block_device, + * ensure dev_size is always based on the current inode size in order + * to handle underlying block_device resize operations. + */ + if (S_ISBLK(i->i_mode)) + dev_size = (i_size_read(i) - fd_dev->fd_block_size); + else + dev_size = fd_dev->fd_dev_size; - return blocks_long; + return div_u64(dev_size, dev->se_sub_dev->se_dev_attrib.block_size); } static struct se_subsystem_api fileio_template = { -- cgit v0.10.2 From bad115cfe5b509043b684d3a007ab54b80090aa1 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 17 May 2012 11:14:14 +0000 Subject: tcp: do_tcp_sendpages() must try to push data out on oom conditions Since recent changes on TCP splicing (starting with commits 2f533844 "tcp: allow splice() to build full TSO packets" and 35f9c09f "tcp: tcp_sendpages() should call tcp_push() once"), I started seeing massive stalls when forwarding traffic between two sockets using splice() when pipe buffers were larger than socket buffers. Latest changes (net: netdev_alloc_skb() use build_skb()) made the problem even more apparent. The reason seems to be that if do_tcp_sendpages() fails on out of memory condition without being able to send at least one byte, tcp_push() is not called and the buffers cannot be flushed. After applying the attached patch, I cannot reproduce the stalls at all and the data rate it perfectly stable and steady under any condition which previously caused the problem to be permanent. The issue seems to have been there since before the kernel migrated to git, which makes me think that the stalls I occasionally experienced with tux during stress-tests years ago were probably related to the same issue. This issue was first encountered on 3.0.31 and 3.2.17, so please backport to -stable. Signed-off-by: Willy Tarreau Acked-by: Eric Dumazet Cc: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1272a88..6589e11 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -851,8 +851,7 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; -- cgit v0.10.2 From 8ce6909f77ba1b7bcdea65cc2388fd1742b6d669 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Thu, 17 May 2012 01:04:50 +0000 Subject: e1000: Prevent reset task killing itself. Killing reset task while adapter is resetting causes deadlock. Only kill reset task if adapter is not resetting. Ref bug #43132 on bugzilla.kernel.org CC: stable@vger.kernel.org Signed-off-by: Tushar Dave Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 37caa88..8d8908d 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -493,7 +493,11 @@ out: static void e1000_down_and_stop(struct e1000_adapter *adapter) { set_bit(__E1000_DOWN, &adapter->flags); - cancel_work_sync(&adapter->reset_task); + + /* Only kill reset task if adapter is not resetting */ + if (!test_bit(__E1000_RESETTING, &adapter->flags)) + cancel_work_sync(&adapter->reset_task); + cancel_delayed_work_sync(&adapter->watchdog_task); cancel_delayed_work_sync(&adapter->phy_info_task); cancel_delayed_work_sync(&adapter->fifo_stall_task); -- cgit v0.10.2 From 5c7dd710f691d1b44c39e32d2f05b4286ff51f99 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 18 May 2012 00:39:35 +0200 Subject: ACPI / PCI / PM: Fix device PM regression related to D3hot/D3cold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1cc0c998fdf2 ("ACPI: Fix D3hot v D3cold confusion") introduced a bug in __acpi_bus_set_power() and changed the behavior of acpi_pci_set_power_state() in such a way that it generally doesn't work as expected if PCI_D3hot is passed to it as the second argument. First off, if ACPI_STATE_D3 (equal to ACPI_STATE_D3_COLD) is passed to __acpi_bus_set_power() and the explicit_set flag is set for the D3cold state, the function will try to execute AML method called "_PS4", which doesn't exist. Fix this by adding a check to ensure that the name of the AML method to execute for transitions to ACPI_STATE_D3_COLD is correct in __acpi_bus_set_power(). Also make sure that the explicit_set flag for ACPI_STATE_D3_COLD will be set if _PS3 is present and modify acpi_power_transition() to avoid accessing power resources for ACPI_STATE_D3_COLD, because they don't exist. Second, if PCI_D3hot is passed to acpi_pci_set_power_state() as the target state, the function will request a transition to ACPI_STATE_D3_HOT instead of ACPI_STATE_D3. However, ACPI_STATE_D3_HOT is now only marked as supported if the _PR3 AML method is defined for the given device, which is rare. This causes problems to happen on systems where devices were successfully put into ACPI D3 by pci_set_power_state(PCI_D3hot) which doesn't work now. In particular, some unused graphics adapters are not turned off as a result. To fix this issue restore the old behavior of acpi_pci_set_power_state(), which is to request a transition to ACPI_STATE_D3 (equal to ACPI_STATE_D3_COLD) if either PCI_D3hot or PCI_D3cold is passed to it as the argument. This approach is not ideal, because generally power should not be removed from devices if PCI_D3hot is the target power state, but since this behavior is relied on, we have no choice but to restore it at the moment and spend more time on designing a better solution in the future. References: https://bugzilla.kernel.org/show_bug.cgi?id=43228 Reported-by: rocko Reported-by: Cristian Rodríguez Reported-and-tested-by: Peter Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 3263b68..3188da3 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -250,6 +250,10 @@ static int __acpi_bus_set_power(struct acpi_device *device, int state) return -ENODEV; } + /* For D3cold we should execute _PS3, not _PS4. */ + if (state == ACPI_STATE_D3_COLD) + object_name[3] = '3'; + /* * Transition Power * ---------------- diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 330bb4d..0500f71 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -660,7 +660,7 @@ int acpi_power_on_resources(struct acpi_device *device, int state) int acpi_power_transition(struct acpi_device *device, int state) { - int result; + int result = 0; if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3_COLD)) return -EINVAL; @@ -679,8 +679,11 @@ int acpi_power_transition(struct acpi_device *device, int state) * (e.g. so the device doesn't lose power while transitioning). Then, * we dereference all power resources used in the current list. */ - result = acpi_power_on_list(&device->power.states[state].resources); - if (!result) + if (state < ACPI_STATE_D3_COLD) + result = acpi_power_on_list( + &device->power.states[state].resources); + + if (!result && device->power.state < ACPI_STATE_D3_COLD) acpi_power_off_list( &device->power.states[device->power.state].resources); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 7417267..85cbfdc 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -908,6 +908,10 @@ static int acpi_bus_get_power_flags(struct acpi_device *device) device->power.states[ACPI_STATE_D3].flags.valid = 1; device->power.states[ACPI_STATE_D3].power = 0; + /* Set D3cold's explicit_set flag if _PS3 exists. */ + if (device->power.states[ACPI_STATE_D3_HOT].flags.explicit_set) + device->power.states[ACPI_STATE_D3_COLD].flags.explicit_set = 1; + acpi_bus_init_power(device); return 0; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 1929c0c..61e2fef 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -223,7 +223,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) [PCI_D0] = ACPI_STATE_D0, [PCI_D1] = ACPI_STATE_D1, [PCI_D2] = ACPI_STATE_D2, - [PCI_D3hot] = ACPI_STATE_D3_HOT, + [PCI_D3hot] = ACPI_STATE_D3, [PCI_D3cold] = ACPI_STATE_D3 }; int error = -EINVAL; -- cgit v0.10.2 From c0a5f4a05af588a0f9951f8d24e2564b09501918 Mon Sep 17 00:00:00 2001 From: Rajkumar Kasirajan Date: Thu, 17 May 2012 17:03:24 -0700 Subject: drivers/rtc/rtc-pl031.c: configure correct wday for 2000-01-01 The reset date of the ST Micro version of PL031 is 2000-01-01. The correct weekday for 2000-01-01 is saturday, but pl031 is initialized to sunday. This may lead to alarm malfunction, so configure the correct wday if RTC_DR indicates reset. Signed-off-by: Rajkumar Kasirajan Signed-off-by: Linus Walleij Cc: Mattias Wallin Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index 684ef4b..f027c06 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -312,6 +312,7 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) int ret; struct pl031_local *ldata; struct rtc_class_ops *ops = id->data; + unsigned long time; ret = amba_request_regions(adev, NULL); if (ret) @@ -343,6 +344,23 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) writel(readl(ldata->base + RTC_CR) | RTC_CR_CWEN, ldata->base + RTC_CR); + /* + * On ST PL031 variants, the RTC reset value does not provide correct + * weekday for 2000-01-01. Correct the erroneous sunday to saturday. + */ + if (ldata->hw_designer == AMBA_VENDOR_ST) { + if (readl(ldata->base + RTC_YDR) == 0x2000) { + time = readl(ldata->base + RTC_DR); + if ((time & + (RTC_MON_MASK | RTC_MDAY_MASK | RTC_WDAY_MASK)) + == 0x02120000) { + time = time | (0x7 << RTC_WDAY_SHIFT); + writel(0x2000, ldata->base + RTC_YLR); + writel(time, ldata->base + RTC_LR); + } + } + } + ldata->rtc = rtc_device_register("pl031", &adev->dev, ops, THIS_MODULE); if (IS_ERR(ldata->rtc)) { -- cgit v0.10.2 From eb94cd96e05d6c65a07937e66a04ea265c1b767d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 17 May 2012 17:03:25 -0700 Subject: fs, proc: fix ABBA deadlock in case of execution attempt of map_files/ entries map_files/ entries are never supposed to be executed, still curious minds might try to run them, which leads to the following deadlock ====================================================== [ INFO: possible circular locking dependency detected ] 3.4.0-rc4-24406-g841e6a6 #121 Not tainted ------------------------------------------------------- bash/1556 is trying to acquire lock: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: do_lookup+0x267/0x2b1 but task is already holding lock: (&sig->cred_guard_mutex){+.+.+.}, at: prepare_bprm_creds+0x2d/0x69 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sig->cred_guard_mutex){+.+.+.}: validate_chain+0x444/0x4f4 __lock_acquire+0x387/0x3f8 lock_acquire+0x12b/0x158 __mutex_lock_common+0x56/0x3a9 mutex_lock_killable_nested+0x40/0x45 lock_trace+0x24/0x59 proc_map_files_lookup+0x5a/0x165 __lookup_hash+0x52/0x73 do_lookup+0x276/0x2b1 walk_component+0x3d/0x114 do_last+0xfc/0x540 path_openat+0xd3/0x306 do_filp_open+0x3d/0x89 do_sys_open+0x74/0x106 sys_open+0x21/0x23 tracesys+0xdd/0xe2 -> #0 (&sb->s_type->i_mutex_key#8){+.+.+.}: check_prev_add+0x6a/0x1ef validate_chain+0x444/0x4f4 __lock_acquire+0x387/0x3f8 lock_acquire+0x12b/0x158 __mutex_lock_common+0x56/0x3a9 mutex_lock_nested+0x40/0x45 do_lookup+0x267/0x2b1 walk_component+0x3d/0x114 link_path_walk+0x1f9/0x48f path_openat+0xb6/0x306 do_filp_open+0x3d/0x89 open_exec+0x25/0xa0 do_execve_common+0xea/0x2f9 do_execve+0x43/0x45 sys_execve+0x43/0x5a stub_execve+0x6c/0xc0 This is because prepare_bprm_creds grabs task->signal->cred_guard_mutex and when do_lookup happens we try to grab task->signal->cred_guard_mutex again in lock_trace. Fix it using plain ptrace_may_access() helper in proc_map_files_lookup() and in proc_map_files_readdir() instead of lock_trace(), the caller must be CAP_SYS_ADMIN granted anyway. Signed-off-by: Cyrill Gorcunov Reported-by: Sasha Levin Cc: Konstantin Khlebnikov Cc: Pavel Emelyanov Cc: Dave Jones Cc: Vasiliy Kulikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280..8e139c9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2177,16 +2177,16 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = ERR_PTR(-EACCES); - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) - goto out_unlock; + goto out_put_task; mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); @@ -2198,8 +2198,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, out_no_vma: up_read(&mm->mmap_sem); mmput(mm); -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: @@ -2233,7 +2231,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; ret = -EACCES; - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; ret = 0; @@ -2241,12 +2239,12 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; default: { @@ -2257,7 +2255,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); nr_files = 0; @@ -2287,7 +2285,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) flex_array_free(fa); up_read(&mm->mmap_sem); mmput(mm); - goto out_unlock; + goto out_put_task; } for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { @@ -2332,8 +2330,6 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) } } -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: -- cgit v0.10.2 From 02e1a9cd1ea99b67a668f13b61fdf5d42115db0a Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 17 May 2012 17:03:26 -0700 Subject: slub: missing test for partial pages flush work in flush_all() I found some kernel messages such as: SLUB raid5-md127: kmem_cache_destroy called for cache that still has objects. Pid: 6143, comm: mdadm Tainted: G O 3.4.0-rc6+ #75 Call Trace: kmem_cache_destroy+0x328/0x400 free_conf+0x2d/0xf0 [raid456] stop+0x41/0x60 [raid456] md_stop+0x1a/0x60 [md_mod] do_md_stop+0x74/0x470 [md_mod] md_ioctl+0xff/0x11f0 [md_mod] blkdev_ioctl+0xd8/0x7a0 block_ioctl+0x3b/0x40 do_vfs_ioctl+0x96/0x560 sys_ioctl+0x91/0xa0 system_call_fastpath+0x16/0x1b Then using kmemleak I found these messages: unreferenced object 0xffff8800b6db7380 (size 112): comm "mdadm", pid 5783, jiffies 4294810749 (age 90.589s) hex dump (first 32 bytes): 01 01 db b6 ad 4e ad de ff ff ff ff ff ff ff ff .....N.......... ff ff ff ff ff ff ff ff 98 40 4a 82 ff ff ff ff .........@J..... backtrace: kmemleak_alloc+0x21/0x50 kmem_cache_alloc+0xeb/0x1b0 kmem_cache_open+0x2f1/0x430 kmem_cache_create+0x158/0x320 setup_conf+0x649/0x770 [raid456] run+0x68b/0x840 [raid456] md_run+0x529/0x940 [md_mod] do_md_run+0x18/0xc0 [md_mod] md_ioctl+0xba8/0x11f0 [md_mod] blkdev_ioctl+0xd8/0x7a0 block_ioctl+0x3b/0x40 do_vfs_ioctl+0x96/0x560 sys_ioctl+0x91/0xa0 system_call_fastpath+0x16/0x1b This bug was introduced by commit a8364d5555b ("slub: only IPI CPUs that have per cpu obj to flush"), which did not include checks for per cpu partial pages being present on a cpu. Signed-off-by: majianpeng Cc: Gilad Ben-Yossef Acked-by: Christoph Lameter Cc: Pekka Enberg Tested-by: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index ffe13fd..80848cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2040,7 +2040,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return !!(c->page); + return c->page || c->partial; } static void flush_all(struct kmem_cache *s) -- cgit v0.10.2 From 93c2d656c7120e29de8df5bc17bb2a97664104e9 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 17 May 2012 17:03:26 -0700 Subject: frv: delete incorrect task prototypes causing compile fail Commit 41101809a865 ("fork: Provide weak arch_release_[task_struct| thread_info] functions") in -tip highlights a problem in the frv arch, where it has needles prototypes for alloc_task_struct_node and free_task_struct. This now shows up as: kernel/fork.c:120:66: error: static declaration of 'alloc_task_struct_node' follows non-static declaration kernel/fork.c:127:51: error: static declaration of 'free_task_struct' follows non-static declaration since that commit turned them into real functions. Since arch/frv does does not define define __HAVE_ARCH_TASK_STRUCT_ALLOCATOR (i.e. it just uses the generic ones) it shouldn't list these at all. Signed-off-by: Paul Gortmaker Cc: David Howells Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 81c2e27..9b1a92b 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -135,10 +135,6 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.frame0->pc) #define KSTK_ESP(tsk) ((tsk)->thread.frame0->sp) -/* Allocation and freeing of basic task resources. */ -extern struct task_struct *alloc_task_struct_node(int node); -extern void free_task_struct(struct task_struct *p); - #define cpu_relax() barrier() /* data cache prefetch */ -- cgit v0.10.2 From e6d9668e119af44ae5bcd5f1197174531458afe3 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 18 May 2012 13:33:24 -0400 Subject: tilegx: enable SYSCALL_WRAPPERS support Some discussion with the glibc mailing lists revealed that this was necessary for 64-bit platforms with MIPS-like sign-extension rules for 32-bit values. The original symptom was that passing (uid_t)-1 to setreuid() was failing in programs linked -pthread because of the "setxid" mechanism for passing setxid-type function arguments to the syscall code. SYSCALL_WRAPPERS handles ensuring that all syscall arguments end up with proper sign-extension and is thus the appropriate fix for this problem. On other platforms (s390, powerpc, sparc64, and mips) this was fixed in 2.6.28.6. The general issue is tracked as CVE-2009-0029. Cc: Signed-off-by: Chris Metcalf diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 96033e2..74239dd 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -11,6 +11,7 @@ config TILE select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select HAVE_SYSCALL_WRAPPERS if TILEGX select SYS_HYPERVISOR select ARCH_HAVE_NMI_SAFE_CMPXCHG -- cgit v0.10.2 From d4b1133558e0d417342d5d2c49e4c35b428ff20d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2012 23:52:26 +0000 Subject: pktgen: fix module unload for good commit c57b5468406 (pktgen: fix crash at module unload) did a very poor job with list primitives. 1) list_splice() arguments were in the wrong order 2) list_splice(list, head) has undefined behavior if head is not initialized. 3) We should use the list_splice_init() variant to clear pktgen_threads list. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 77a5998..b81369b 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3755,13 +3755,13 @@ static void __exit pg_cleanup(void) { struct pktgen_thread *t; struct list_head *q, *n; - struct list_head list; + LIST_HEAD(list); /* Stop all interfaces & threads */ pktgen_exiting = true; mutex_lock(&pktgen_thread_lock); - list_splice(&list, &pktgen_threads); + list_splice_init(&pktgen_threads, &list); mutex_unlock(&pktgen_thread_lock); list_for_each_safe(q, n, &list) { -- cgit v0.10.2 From 30a08bf2d31d275c6fc71dd1811342777e95c831 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 May 2012 11:32:15 -0700 Subject: proc: move fd symlink i_mode calculations into tid_fd_revalidate() Instead of doing the i_mode calculations at proc_fd_instantiate() time, move them into tid_fd_revalidate(), which is where the other inode state (notably uid/gid information) is updated too. Otherwise we'll end up with stale i_mode information if an fd is re-used while the dentry still hangs around. Not that anything really *cares* (symlink permissions don't really matter), but Tetsuo Handa noticed that the owner read/write bits don't always match the state of the readability of the file descriptor, and we _used_ to get this right a long time ago in a galaxy far, far away. Besides, aside from fixing an ugly detail (that has apparently been this way since commit 61a28784028e: "proc: Remove the hard coded inode numbers" in 2006), this removes more lines of code than it adds. And it just makes sense to update i_mode in the same place we update i_uid/gid. Al Viro correctly points out that we could just do the inode fill in the inode iops ->getattr() function instead. However, that does require somewhat slightly more invasive changes, and adds yet *another* lookup of the file descriptor. We need to do the revalidate() for other reasons anyway, and have the file descriptor handy, so we might as well fill in the information at this point. Reported-by: Tetsuo Handa Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280..7d6ad98 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1799,10 +1799,15 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) if (task) { files = get_files_struct(task); if (files) { + struct file *file; rcu_read_lock(); - if (fcheck_files(files, fd)) { + file = fcheck_files(files, fd); + if (file) { + unsigned i_mode, f_mode = file->f_mode; + rcu_read_unlock(); put_files_struct(files); + if (task_dumpable(task)) { rcu_read_lock(); cred = __task_cred(task); @@ -1813,7 +1818,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_uid = 0; inode->i_gid = 0; } - inode->i_mode &= ~(S_ISUID | S_ISGID); + + i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1837,8 +1849,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { unsigned fd = *(const unsigned *)ptr; - struct file *file; - struct files_struct *files; struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); @@ -1848,25 +1858,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, goto out; ei = PROC_I(inode); ei->fd = fd; - files = get_files_struct(task); - if (!files) - goto out_iput; - inode->i_mode = S_IFLNK; - - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (!file) - goto out_unlock; - if (file->f_mode & FMODE_READ) - inode->i_mode |= S_IRUSR | S_IXUSR; - if (file->f_mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR | S_IXUSR; - spin_unlock(&files->file_lock); - put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1879,12 +1870,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, out: return error; -out_unlock: - spin_unlock(&files->file_lock); - put_files_struct(files); -out_iput: - iput(inode); - goto out; } static struct dentry *proc_lookupfd_common(struct inode *dir, -- cgit v0.10.2 From b0d634d5683f0b186b242ce6a4f3b041edb8b956 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 19 May 2012 09:01:13 +1000 Subject: md/raid10: fix transcription error in calc_sectors conversion. The old code was sector_div(stride, fc); the new code was sector_dir(size, conf->near_copies); 'size' is right (the stride various wasn't really needed), but 'fc' means 'far_copies', and that is an important difference. Signed-off-by: NeilBrown diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3e7b154..3f91c2e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3189,7 +3189,7 @@ static void calc_sectors(struct r10conf *conf, sector_t size) if (conf->far_offset) conf->stride = 1 << conf->chunk_shift; else { - sector_div(size, conf->near_copies); + sector_div(size, conf->far_copies); conf->stride = size << conf->chunk_shift; } } -- cgit v0.10.2 From f402693d06f32e746c6153e459c5fb064fa02741 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 19 May 2012 01:01:01 +0100 Subject: dm thin: fix table output when pool target disables discard passdown internally When the thin pool target clears the discard_passdown parameter internally, it incorrectly changes the table line reported to userspace. This breaks dumb string comparisons on these table lines in generic userspace device-mapper library code and leads to tables being reloaded repeatedly when nothing is actually meant to be changing. This patch corrects this by no longer changing the table line when discard passdown was disabled. We can still tell when discard passdown is overridden by looking for the message "Discard unsupported by data device (sdX): Disabling discard passdown." This automatic detection is also moved from the 'load' to the 'resume' so that it is re-evaluated should the properties of underlying devices change. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Signed-off-by: Alasdair G Kergon diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2fd87b5..eb3d138 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1632,6 +1632,21 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) pool->low_water_blocks = pt->low_water_blocks; pool->pf = pt->pf; + /* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not; otherwise + * -EOPNOTSUPP will be returned. + */ + if (pt->pf.discard_passdown) { + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + if (!q || !blk_queue_discard(q)) { + char buf[BDEVNAME_SIZE]; + DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", + bdevname(pt->data_dev->bdev, buf)); + pool->pf.discard_passdown = 0; + } + } + return 0; } @@ -1988,19 +2003,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_flags_changed; } - /* - * If discard_passdown was enabled verify that the data device - * supports discards. Disable discard_passdown if not; otherwise - * -EOPNOTSUPP will be returned. - */ - if (pf.discard_passdown) { - struct request_queue *q = bdev_get_queue(data_dev->bdev); - if (!q || !blk_queue_discard(q)) { - DMWARN("Discard unsupported by data device: Disabling discard passdown."); - pf.discard_passdown = 0; - } - } - pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; @@ -2385,7 +2387,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, (unsigned long long)pt->low_water_blocks); count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + - !pool->pf.discard_passdown; + !pt->pf.discard_passdown; DMEMIT("%u ", count); if (!pool->pf.zero_new_blocks) @@ -2394,7 +2396,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, if (!pool->pf.discard_enabled) DMEMIT("ignore_discard "); - if (!pool->pf.discard_passdown) + if (!pt->pf.discard_passdown) DMEMIT("no_discard_passdown "); break; -- cgit v0.10.2 From 6520fe5564acf07ade7b18a1272db1184835c487 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:24 +0300 Subject: x86, realmode: 16-bit real-mode code support for relocs tool A new option is added to the relocs tool called '--realmode'. This option causes the generation of 16-bit segment relocations and 32-bit linear relocations for the real-mode code. When the real-mode code is moved to the low-memory during kernel initialization, these relocation entries can be used to relocate the code properly. In the assembly code 16-bit segment relocations must be relative to the 'real_mode_seg' absolute symbol. Linear relocations must be relative to a symbol prefixed with 'pa_'. 16-bit segment relocation is used to load cs:ip in 16-bit code. Linear relocations are used in the 32-bit code for relocatable data references. They are declared in the linker script of the real-mode code. The relocs tool is moved to arch/x86/tools/relocs.c, and added new target archscripts that can be used to build scripts needed building an architecture. be compiled before building the arch/x86 tree. [ hpa: accelerating this because it detects invalid absolute relocations, a serious bug in binutils 2.22.52.0.x which currently produces bad kernels. ] Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-2-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: Jarkko Sakkinen Signed-off-by: H. Peter Anvin Cc: diff --git a/Makefile b/Makefile index 48bd1f5..db2bcf3 100644 --- a/Makefile +++ b/Makefile @@ -442,7 +442,7 @@ asm-generic: no-dot-config-targets := clean mrproper distclean \ cscope gtags TAGS tags help %docs check% coccicheck \ - include/linux/version.h headers_% archheaders \ + include/linux/version.h headers_% archheaders archscripts \ kernelversion %src-pkg config-targets := 0 @@ -979,7 +979,7 @@ prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \ include/config/auto.conf $(cmd_crmodverdir) -archprepare: archheaders prepare1 scripts_basic +archprepare: archheaders archscripts prepare1 scripts_basic prepare0: archprepare FORCE $(Q)$(MAKE) $(build)=. @@ -1049,8 +1049,11 @@ hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm) PHONY += archheaders archheaders: +PHONY += archscripts +archscripts: + PHONY += __headers -__headers: include/linux/version.h scripts_basic asm-generic archheaders FORCE +__headers: include/linux/version.h scripts_basic asm-generic archheaders archscripts FORCE $(Q)$(MAKE) $(build)=scripts build_unifdef PHONY += headers_install_all diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237..94e91e4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -134,6 +134,9 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +archscripts: + $(Q)$(MAKE) $(build)=arch/x86/tools relocs + ### # Syscall table generation diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fd55a2f..e398bb5 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -40,13 +40,12 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +targets += vmlinux.bin.all vmlinux.relocs -targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs - +CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c deleted file mode 100644 index fb7117a..0000000 --- a/arch/x86/boot/compressed/relocs.c +++ /dev/null @@ -1,678 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define USE_BSD -#include -#include -#include - -static void die(char *fmt, ...); - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -static Elf32_Ehdr ehdr; -static unsigned long reloc_count, reloc_idx; -static unsigned long *relocs; - -struct section { - Elf32_Shdr shdr; - struct section *link; - Elf32_Sym *symtab; - Elf32_Rel *reltab; - char *strtab; -}; -static struct section *secs; - -/* - * Following symbols have been audited. There values are constant and do - * not change if bzImage is loaded at a different physical address than - * the address for which it has been compiled. Don't warn user about - * absolute relocations present w.r.t these symbols. - */ -static const char abs_sym_regex[] = - "^(xen_irq_disable_direct_reloc$|" - "xen_save_fl_direct_reloc$|" - "VDSO|" - "__crc_)"; -static regex_t abs_sym_regex_c; -static int is_abs_reloc(const char *sym_name) -{ - return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); -} - -/* - * These symbols are known to be relative, even if the linker marks them - * as absolute (typically defined outside any section in the linker script.) - */ -static const char rel_sym_regex[] = - "^_end$"; -static regex_t rel_sym_regex_c; -static int is_rel_reloc(const char *sym_name) -{ - return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); -} - -static void regex_init(void) -{ - char errbuf[128]; - int err; - - err = regcomp(&abs_sym_regex_c, abs_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } - - err = regcomp(&rel_sym_regex_c, rel_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } -} - -static void die(char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - exit(1); -} - -static const char *sym_type(unsigned type) -{ - static const char *type_name[] = { -#define SYM_TYPE(X) [X] = #X - SYM_TYPE(STT_NOTYPE), - SYM_TYPE(STT_OBJECT), - SYM_TYPE(STT_FUNC), - SYM_TYPE(STT_SECTION), - SYM_TYPE(STT_FILE), - SYM_TYPE(STT_COMMON), - SYM_TYPE(STT_TLS), -#undef SYM_TYPE - }; - const char *name = "unknown sym type name"; - if (type < ARRAY_SIZE(type_name)) { - name = type_name[type]; - } - return name; -} - -static const char *sym_bind(unsigned bind) -{ - static const char *bind_name[] = { -#define SYM_BIND(X) [X] = #X - SYM_BIND(STB_LOCAL), - SYM_BIND(STB_GLOBAL), - SYM_BIND(STB_WEAK), -#undef SYM_BIND - }; - const char *name = "unknown sym bind name"; - if (bind < ARRAY_SIZE(bind_name)) { - name = bind_name[bind]; - } - return name; -} - -static const char *sym_visibility(unsigned visibility) -{ - static const char *visibility_name[] = { -#define SYM_VISIBILITY(X) [X] = #X - SYM_VISIBILITY(STV_DEFAULT), - SYM_VISIBILITY(STV_INTERNAL), - SYM_VISIBILITY(STV_HIDDEN), - SYM_VISIBILITY(STV_PROTECTED), -#undef SYM_VISIBILITY - }; - const char *name = "unknown sym visibility name"; - if (visibility < ARRAY_SIZE(visibility_name)) { - name = visibility_name[visibility]; - } - return name; -} - -static const char *rel_type(unsigned type) -{ - static const char *type_name[] = { -#define REL_TYPE(X) [X] = #X - REL_TYPE(R_386_NONE), - REL_TYPE(R_386_32), - REL_TYPE(R_386_PC32), - REL_TYPE(R_386_GOT32), - REL_TYPE(R_386_PLT32), - REL_TYPE(R_386_COPY), - REL_TYPE(R_386_GLOB_DAT), - REL_TYPE(R_386_JMP_SLOT), - REL_TYPE(R_386_RELATIVE), - REL_TYPE(R_386_GOTOFF), - REL_TYPE(R_386_GOTPC), -#undef REL_TYPE - }; - const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name) && type_name[type]) { - name = type_name[type]; - } - return name; -} - -static const char *sec_name(unsigned shndx) -{ - const char *sec_strtab; - const char *name; - sec_strtab = secs[ehdr.e_shstrndx].strtab; - name = ""; - if (shndx < ehdr.e_shnum) { - name = sec_strtab + secs[shndx].shdr.sh_name; - } - else if (shndx == SHN_ABS) { - name = "ABSOLUTE"; - } - else if (shndx == SHN_COMMON) { - name = "COMMON"; - } - return name; -} - -static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) -{ - const char *name; - name = ""; - if (sym->st_name) { - name = sym_strtab + sym->st_name; - } - else { - name = sec_name(secs[sym->st_shndx].shdr.sh_name); - } - return name; -} - - - -#if BYTE_ORDER == LITTLE_ENDIAN -#define le16_to_cpu(val) (val) -#define le32_to_cpu(val) (val) -#endif -#if BYTE_ORDER == BIG_ENDIAN -#define le16_to_cpu(val) bswap_16(val) -#define le32_to_cpu(val) bswap_32(val) -#endif - -static uint16_t elf16_to_cpu(uint16_t val) -{ - return le16_to_cpu(val); -} - -static uint32_t elf32_to_cpu(uint32_t val) -{ - return le32_to_cpu(val); -} - -static void read_ehdr(FILE *fp) -{ - if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { - die("Cannot read ELF header: %s\n", - strerror(errno)); - } - if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { - die("No ELF magic\n"); - } - if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { - die("Not a 32 bit executable\n"); - } - if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - die("Not a LSB ELF executable\n"); - } - if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { - die("Unknown ELF version\n"); - } - /* Convert the fields to native endian */ - ehdr.e_type = elf16_to_cpu(ehdr.e_type); - ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); - ehdr.e_version = elf32_to_cpu(ehdr.e_version); - ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); - ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); - ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); - ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); - ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); - ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); - ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); - ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); - ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); - ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); - - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { - die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != EM_386) { - die("Not for x86\n"); - } - if (ehdr.e_version != EV_CURRENT) { - die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { - die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { - die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { - die("Bad section header entry\n"); - } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { - die("String table index out of bounds\n"); - } -} - -static void read_shdrs(FILE *fp) -{ - int i; - Elf32_Shdr shdr; - - secs = calloc(ehdr.e_shnum, sizeof(struct section)); - if (!secs) { - die("Unable to allocate %d section headers\n", - ehdr.e_shnum); - } - if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - ehdr.e_shoff, strerror(errno)); - } - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (fread(&shdr, sizeof shdr, 1, fp) != 1) - die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); - sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); - sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); - sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); - sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); - sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); - sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); - sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); - sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); - sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); - sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) - sec->link = &secs[sec->shdr.sh_link]; - } - -} - -static void read_strtabs(FILE *fp) -{ - int i; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { - continue; - } - sec->strtab = malloc(sec->shdr.sh_size); - if (!sec->strtab) { - die("malloc of %d bytes for strtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - } -} - -static void read_symtabs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %d bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym = &sec->symtab[j]; - sym->st_name = elf32_to_cpu(sym->st_name); - sym->st_value = elf32_to_cpu(sym->st_value); - sym->st_size = elf32_to_cpu(sym->st_size); - sym->st_shndx = elf16_to_cpu(sym->st_shndx); - } - } -} - - -static void read_relocs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec->reltab = malloc(sec->shdr.sh_size); - if (!sec->reltab) { - die("malloc of %d bytes for relocs failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel = &sec->reltab[j]; - rel->r_offset = elf32_to_cpu(rel->r_offset); - rel->r_info = elf32_to_cpu(rel->r_info); - } - } -} - - -static void print_absolute_symbols(void) -{ - int i; - printf("Absolute symbols\n"); - printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - char *sym_strtab; - int j; - - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sym_strtab = sec->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym; - const char *name; - sym = &sec->symtab[j]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - printf("%5d %08x %5d %10s %10s %12s %s\n", - j, sym->st_value, sym->st_size, - sym_type(ELF32_ST_TYPE(sym->st_info)), - sym_bind(ELF32_ST_BIND(sym->st_info)), - sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), - name); - } - } - printf("\n"); -} - -static void print_absolute_relocs(void) -{ - int i, printed = 0; - - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - struct section *sec_applies, *sec_symtab; - char *sym_strtab; - Elf32_Sym *sh_symtab; - int j; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - const char *name; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - - /* Absolute symbols are not relocated if bzImage is - * loaded at a non-compiled address. Display a warning - * to user at compile time about the absolute - * relocations present. - * - * User need to audit the code to make sure - * some symbols which should have been section - * relative have not become absolute because of some - * linker optimization or wrong programming usage. - * - * Before warning check if this absolute symbol - * relocation is harmless. - */ - if (is_abs_reloc(name) || is_rel_reloc(name)) - continue; - - if (!printed) { - printf("WARNING: Absolute relocations" - " present\n"); - printf("Offset Info Type Sym.Value " - "Sym.Name\n"); - printed = 1; - } - - printf("%08x %08x %10s %08x %s\n", - rel->r_offset, - rel->r_info, - rel_type(ELF32_R_TYPE(rel->r_info)), - sym->st_value, - name); - } - } - - if (printed) - printf("\n"); -} - -static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) -{ - int i; - /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { - char *sym_strtab; - Elf32_Sym *sh_symtab; - struct section *sec_applies, *sec_symtab; - int j; - struct section *sec = &secs[i]; - - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - unsigned r_type; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - r_type = ELF32_R_TYPE(rel->r_info); - /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS && - !is_rel_reloc(sym_name(sym_strtab, sym))) { - continue; - } - switch (r_type) { - case R_386_NONE: - case R_386_PC32: - /* - * NONE can be ignored and and PC relative - * relocations don't need to be adjusted. - */ - break; - case R_386_32: - /* Visit relocations that need to be adjusted */ - visit(rel, sym); - break; - default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); - break; - } - } - } -} - -static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - reloc_count += 1; -} - -static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - /* Remember the address that needs to be adjusted. */ - relocs[reloc_idx++] = rel->r_offset; -} - -static int cmp_relocs(const void *va, const void *vb) -{ - const unsigned long *a, *b; - a = va; b = vb; - return (*a == *b)? 0 : (*a > *b)? 1 : -1; -} - -static void emit_relocs(int as_text) -{ - int i; - /* Count how many relocations I have and allocate space for them. */ - reloc_count = 0; - walk_relocs(count_reloc); - relocs = malloc(reloc_count * sizeof(relocs[0])); - if (!relocs) { - die("malloc of %d entries for relocs failed\n", - reloc_count); - } - /* Collect up the relocations */ - reloc_idx = 0; - walk_relocs(collect_reloc); - - /* Order the relocations for more efficient processing */ - qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); - - /* Print the relocations */ - if (as_text) { - /* Print the relocations in a form suitable that - * gas will like. - */ - printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - for (i = 0; i < reloc_count; i++) { - printf("\t .long 0x%08lx\n", relocs[i]); - } - printf("\n"); - } - else { - unsigned char buf[4]; - /* Print a stop */ - fwrite("\0\0\0\0", 4, 1, stdout); - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) { - put_unaligned_le32(relocs[i], buf); - fwrite(buf, 4, 1, stdout); - } - } -} - -static void usage(void) -{ - die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n"); -} - -int main(int argc, char **argv) -{ - int show_absolute_syms, show_absolute_relocs; - int as_text; - const char *fname; - FILE *fp; - int i; - - regex_init(); - - show_absolute_syms = 0; - show_absolute_relocs = 0; - as_text = 0; - fname = NULL; - for (i = 1; i < argc; i++) { - char *arg = argv[i]; - if (*arg == '-') { - if (strcmp(argv[1], "--abs-syms") == 0) { - show_absolute_syms = 1; - continue; - } - - if (strcmp(argv[1], "--abs-relocs") == 0) { - show_absolute_relocs = 1; - continue; - } - else if (strcmp(argv[1], "--text") == 0) { - as_text = 1; - continue; - } - } - else if (!fname) { - fname = arg; - continue; - } - usage(); - } - if (!fname) { - usage(); - } - fp = fopen(fname, "r"); - if (!fp) { - die("Cannot open %s: %s\n", - fname, strerror(errno)); - } - read_ehdr(fp); - read_shdrs(fp); - read_strtabs(fp); - read_symtabs(fp); - read_relocs(fp); - if (show_absolute_syms) { - print_absolute_symbols(); - return 0; - } - if (show_absolute_relocs) { - print_absolute_relocs(); - return 0; - } - emit_relocs(as_text); - return 0; -} diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore new file mode 100644 index 0000000..be0ed06 --- /dev/null +++ b/arch/x86/tools/.gitignore @@ -0,0 +1 @@ +relocs diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index d511aa9..733057b 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -36,3 +36,7 @@ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-y += relocs +relocs: $(obj)/relocs diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c new file mode 100644 index 0000000..0291470 --- /dev/null +++ b/arch/x86/tools/relocs.c @@ -0,0 +1,797 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define USE_BSD +#include +#include +#include + +static void die(char *fmt, ...); + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +static Elf32_Ehdr ehdr; +static unsigned long reloc_count, reloc_idx; +static unsigned long *relocs; +static unsigned long reloc16_count, reloc16_idx; +static unsigned long *relocs16; + +struct section { + Elf32_Shdr shdr; + struct section *link; + Elf32_Sym *symtab; + Elf32_Rel *reltab; + char *strtab; +}; +static struct section *secs; + +enum symtype { + S_ABS, + S_REL, + S_SEG, + S_LIN, + S_NSYMTYPES +}; + +static const char * const sym_regex_kernel[S_NSYMTYPES] = { +/* + * Following symbols have been audited. There values are constant and do + * not change if bzImage is loaded at a different physical address than + * the address for which it has been compiled. Don't warn user about + * absolute relocations present w.r.t these symbols. + */ + [S_ABS] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)", + +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^_end$", +}; + + +static const char * const sym_regex_realmode[S_NSYMTYPES] = { +/* + * These are 16-bit segment symbols when compiling 16-bit code. + */ + [S_SEG] = + "^real_mode_seg$", + +/* + * These are offsets belonging to segments, as opposed to linear addresses, + * when compiling 16-bit code. + */ + [S_LIN] = + "^pa_", +}; + +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; +static int is_reloc(enum symtype type, const char *sym_name) +{ + return sym_regex[type] && + !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); +} + +static void regex_init(int use_real_mode) +{ + char errbuf[128]; + int err; + int i; + + if (use_real_mode) + sym_regex = sym_regex_realmode; + else + sym_regex = sym_regex_kernel; + + for (i = 0; i < S_NSYMTYPES; i++) { + if (!sym_regex[i]) + continue; + + err = regcomp(&sym_regex_c[i], sym_regex[i], + REG_EXTENDED|REG_NOSUB); + + if (err) { + regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + die("%s", errbuf); + } + } +} + +static void die(char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static const char *sym_type(unsigned type) +{ + static const char *type_name[] = { +#define SYM_TYPE(X) [X] = #X + SYM_TYPE(STT_NOTYPE), + SYM_TYPE(STT_OBJECT), + SYM_TYPE(STT_FUNC), + SYM_TYPE(STT_SECTION), + SYM_TYPE(STT_FILE), + SYM_TYPE(STT_COMMON), + SYM_TYPE(STT_TLS), +#undef SYM_TYPE + }; + const char *name = "unknown sym type name"; + if (type < ARRAY_SIZE(type_name)) { + name = type_name[type]; + } + return name; +} + +static const char *sym_bind(unsigned bind) +{ + static const char *bind_name[] = { +#define SYM_BIND(X) [X] = #X + SYM_BIND(STB_LOCAL), + SYM_BIND(STB_GLOBAL), + SYM_BIND(STB_WEAK), +#undef SYM_BIND + }; + const char *name = "unknown sym bind name"; + if (bind < ARRAY_SIZE(bind_name)) { + name = bind_name[bind]; + } + return name; +} + +static const char *sym_visibility(unsigned visibility) +{ + static const char *visibility_name[] = { +#define SYM_VISIBILITY(X) [X] = #X + SYM_VISIBILITY(STV_DEFAULT), + SYM_VISIBILITY(STV_INTERNAL), + SYM_VISIBILITY(STV_HIDDEN), + SYM_VISIBILITY(STV_PROTECTED), +#undef SYM_VISIBILITY + }; + const char *name = "unknown sym visibility name"; + if (visibility < ARRAY_SIZE(visibility_name)) { + name = visibility_name[visibility]; + } + return name; +} + +static const char *rel_type(unsigned type) +{ + static const char *type_name[] = { +#define REL_TYPE(X) [X] = #X + REL_TYPE(R_386_NONE), + REL_TYPE(R_386_32), + REL_TYPE(R_386_PC32), + REL_TYPE(R_386_GOT32), + REL_TYPE(R_386_PLT32), + REL_TYPE(R_386_COPY), + REL_TYPE(R_386_GLOB_DAT), + REL_TYPE(R_386_JMP_SLOT), + REL_TYPE(R_386_RELATIVE), + REL_TYPE(R_386_GOTOFF), + REL_TYPE(R_386_GOTPC), + REL_TYPE(R_386_8), + REL_TYPE(R_386_PC8), + REL_TYPE(R_386_16), + REL_TYPE(R_386_PC16), +#undef REL_TYPE + }; + const char *name = "unknown type rel type name"; + if (type < ARRAY_SIZE(type_name) && type_name[type]) { + name = type_name[type]; + } + return name; +} + +static const char *sec_name(unsigned shndx) +{ + const char *sec_strtab; + const char *name; + sec_strtab = secs[ehdr.e_shstrndx].strtab; + name = ""; + if (shndx < ehdr.e_shnum) { + name = sec_strtab + secs[shndx].shdr.sh_name; + } + else if (shndx == SHN_ABS) { + name = "ABSOLUTE"; + } + else if (shndx == SHN_COMMON) { + name = "COMMON"; + } + return name; +} + +static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) +{ + const char *name; + name = ""; + if (sym->st_name) { + name = sym_strtab + sym->st_name; + } + else { + name = sec_name(sym->st_shndx); + } + return name; +} + + + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#endif +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + return le16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + return le32_to_cpu(val); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { + die("Cannot read ELF header: %s\n", + strerror(errno)); + } + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { + die("No ELF magic\n"); + } + if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { + die("Not a 32 bit executable\n"); + } + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + die("Not a LSB ELF executable\n"); + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + die("Unknown ELF version\n"); + } + /* Convert the fields to native endian */ + ehdr.e_type = elf16_to_cpu(ehdr.e_type); + ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); + ehdr.e_version = elf32_to_cpu(ehdr.e_version); + ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + die("Unsupported ELF header type\n"); + } + if (ehdr.e_machine != EM_386) { + die("Not for x86\n"); + } + if (ehdr.e_version != EV_CURRENT) { + die("Unknown ELF version\n"); + } + if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { + die("Bad Elf header size\n"); + } + if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { + die("Bad program header entry\n"); + } + if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { + die("Bad section header entry\n"); + } + if (ehdr.e_shstrndx >= ehdr.e_shnum) { + die("String table index out of bounds\n"); + } +} + +static void read_shdrs(FILE *fp) +{ + int i; + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); + if (!secs) { + die("Unable to allocate %d section headers\n", + ehdr.e_shnum); + } + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + ehdr.e_shoff, strerror(errno)); + } + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (fread(&shdr, sizeof shdr, 1, fp) != 1) + die("Cannot read ELF section headers %d/%d: %s\n", + i, ehdr.e_shnum, strerror(errno)); + sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); + if (sec->shdr.sh_link < ehdr.e_shnum) + sec->link = &secs[sec->shdr.sh_link]; + } + +} + +static void read_strtabs(FILE *fp) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_STRTAB) { + continue; + } + sec->strtab = malloc(sec->shdr.sh_size); + if (!sec->strtab) { + die("malloc of %d bytes for strtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + } +} + +static void read_symtabs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { + die("malloc of %d bytes for symtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym = &sec->symtab[j]; + sym->st_name = elf32_to_cpu(sym->st_name); + sym->st_value = elf32_to_cpu(sym->st_value); + sym->st_size = elf32_to_cpu(sym->st_size); + sym->st_shndx = elf16_to_cpu(sym->st_shndx); + } + } +} + + +static void read_relocs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) { + die("malloc of %d bytes for relocs failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf32_to_cpu(rel->r_offset); + rel->r_info = elf32_to_cpu(rel->r_info); + } + } +} + + +static void print_absolute_symbols(void) +{ + int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + char *sym_strtab; + int j; + + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym; + const char *name; + sym = &sec->symtab[j]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%5d %08x %5d %10s %10s %12s %s\n", + j, sym->st_value, sym->st_size, + sym_type(ELF32_ST_TYPE(sym->st_info)), + sym_bind(ELF32_ST_BIND(sym->st_info)), + sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), + name); + } + } + printf("\n"); +} + +static void print_absolute_relocs(void) +{ + int i, printed = 0; + + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + struct section *sec_applies, *sec_symtab; + char *sym_strtab; + Elf32_Sym *sh_symtab; + int j; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + const char *name; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + + /* Absolute symbols are not relocated if bzImage is + * loaded at a non-compiled address. Display a warning + * to user at compile time about the absolute + * relocations present. + * + * User need to audit the code to make sure + * some symbols which should have been section + * relative have not become absolute because of some + * linker optimization or wrong programming usage. + * + * Before warning check if this absolute symbol + * relocation is harmless. + */ + if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) + continue; + + if (!printed) { + printf("WARNING: Absolute relocations" + " present\n"); + printf("Offset Info Type Sym.Value " + "Sym.Name\n"); + printed = 1; + } + + printf("%08x %08x %10s %08x %s\n", + rel->r_offset, + rel->r_info, + rel_type(ELF32_R_TYPE(rel->r_info)), + sym->st_value, + name); + } + } + + if (printed) + printf("\n"); +} + +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), + int use_real_mode) +{ + int i; + /* Walk through the relocations */ + for (i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + struct section *sec_applies, *sec_symtab; + int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + unsigned r_type; + const char *symname; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + r_type = ELF32_R_TYPE(rel->r_info); + + switch (r_type) { + case R_386_NONE: + case R_386_PC32: + case R_386_PC16: + case R_386_PC8: + /* + * NONE can be ignored and and PC relative + * relocations don't need to be adjusted. + */ + break; + + case R_386_16: + symname = sym_name(sym_strtab, sym); + if (!use_real_mode) + goto bad; + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_SEG, symname)) + goto bad; + } else { + if (is_reloc(S_LIN, symname)) + goto bad; + else + break; + } + visit(rel, sym); + break; + + case R_386_32: + symname = sym_name(sym_strtab, sym); + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_REL, symname)) + goto bad; + } else { + if (use_real_mode && + !is_reloc(S_LIN, symname)) + break; + } + visit(rel, sym); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; + bad: + symname = sym_name(sym_strtab, sym); + die("Invalid %s relocation: %s\n", + rel_type(r_type), symname); + } + } + } +} + +static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + reloc16_count++; + else + reloc_count++; +} + +static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + /* Remember the address that needs to be adjusted. */ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + relocs16[reloc16_idx++] = rel->r_offset; + else + relocs[reloc_idx++] = rel->r_offset; +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const unsigned long *a, *b; + a = va; b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; +} + +static int write32(unsigned int v, FILE *f) +{ + unsigned char buf[4]; + + put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode) +{ + int i; + /* Count how many relocations I have and allocate space for them. */ + reloc_count = 0; + walk_relocs(count_reloc, use_real_mode); + relocs = malloc(reloc_count * sizeof(relocs[0])); + if (!relocs) { + die("malloc of %d entries for relocs failed\n", + reloc_count); + } + + relocs16 = malloc(reloc16_count * sizeof(relocs[0])); + if (!relocs16) { + die("malloc of %d entries for relocs16 failed\n", + reloc16_count); + } + /* Collect up the relocations */ + reloc_idx = 0; + walk_relocs(collect_reloc, use_real_mode); + + if (reloc16_count && !use_real_mode) + die("Segment relocations found but --realmode not specified\n"); + + /* Order the relocations for more efficient processing */ + qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); + + /* Print the relocations */ + if (as_text) { + /* Print the relocations in a form suitable that + * gas will like. + */ + printf(".section \".data.reloc\",\"a\"\n"); + printf(".balign 4\n"); + if (use_real_mode) { + printf("\t.long %lu\n", reloc16_count); + for (i = 0; i < reloc16_count; i++) + printf("\t.long 0x%08lx\n", relocs16[i]); + printf("\t.long %lu\n", reloc_count); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } else { + /* Print a stop */ + printf("\t.long 0x%08lx\n", (unsigned long)0); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } + + printf("\n"); + } + else { + if (use_real_mode) { + write32(reloc16_count, stdout); + for (i = 0; i < reloc16_count; i++) + write32(relocs16[i], stdout); + write32(reloc_count, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) + write32(relocs[i], stdout); + } else { + /* Print a stop */ + write32(0, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) { + write32(relocs[i], stdout); + } + } + } +} + +static void usage(void) +{ + die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); +} + +int main(int argc, char **argv) +{ + int show_absolute_syms, show_absolute_relocs; + int as_text, use_real_mode; + const char *fname; + FILE *fp; + int i; + + show_absolute_syms = 0; + show_absolute_relocs = 0; + as_text = 0; + use_real_mode = 0; + fname = NULL; + for (i = 1; i < argc; i++) { + char *arg = argv[i]; + if (*arg == '-') { + if (strcmp(arg, "--abs-syms") == 0) { + show_absolute_syms = 1; + continue; + } + if (strcmp(arg, "--abs-relocs") == 0) { + show_absolute_relocs = 1; + continue; + } + if (strcmp(arg, "--text") == 0) { + as_text = 1; + continue; + } + if (strcmp(arg, "--realmode") == 0) { + use_real_mode = 1; + continue; + } + } + else if (!fname) { + fname = arg; + continue; + } + usage(); + } + if (!fname) { + usage(); + } + regex_init(use_real_mode); + fp = fopen(fname, "r"); + if (!fp) { + die("Cannot open %s: %s\n", + fname, strerror(errno)); + } + read_ehdr(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); + read_relocs(fp); + if (show_absolute_syms) { + print_absolute_symbols(); + return 0; + } + if (show_absolute_relocs) { + print_absolute_relocs(); + return 0; + } + emit_relocs(as_text, use_real_mode); + return 0; +} diff --git a/scripts/Makefile b/scripts/Makefile index df7678f..3626666 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -8,6 +8,8 @@ # conmakehash: Create arrays for initializing the kernel console tables # docproc: Used in Documentation/DocBook +HOST_EXTRACFLAGS += -I$(srctree)/tools/include + hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash -- cgit v0.10.2 From a3e854d95a76862cd37937e0b0438f540536771a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 00:24:09 -0700 Subject: x86, relocs: Workaround for binutils 2.22.52.0.1 section bug GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from section-relative to absolute if they are in a section of zero length. This turns the symbols __init_begin and __init_end into absolute symbols. Let the relocs program know that those should be treated as relative symbols. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H.J. Lu Cc: Cc: Jarkko Sakkinen diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0291470..c3cfbb2 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,7 +56,11 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { * as absolute (typically defined outside any section in the linker script.) */ [S_REL] = - "^_end$", + "^(__init_(begin|end)|" + "__x86_cpu_dev_(start|end)|" + "(__parainstructions|__alt_instructions)(|_end)|" + "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "_end)$" }; -- cgit v0.10.2 From 24ab82bd9bf18f3efc69a131d73577940941e1b7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 09:52:01 -0700 Subject: x86, relocs: When printing an error, say relative or absolute When the relocs tool throws an error, let the error message say if it is an absolute or relative symbol. This should make it a lot more clear what action the programmer needs to take and should help us find the reason if additional symbol bugs show up. Signed-off-by: H. Peter Anvin Cc: diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index c3cfbb2..b43cfcd 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -563,10 +563,14 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), Elf32_Sym *sym; unsigned r_type; const char *symname; + int shn_abs; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); + shn_abs = sym->st_shndx == SHN_ABS; + switch (r_type) { case R_386_NONE: case R_386_PC32: @@ -582,7 +586,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), symname = sym_name(sym_strtab, sym); if (!use_real_mode) goto bad; - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_SEG, symname)) @@ -598,7 +602,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), case R_386_32: symname = sym_name(sym_strtab, sym); - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_REL, symname)) @@ -616,7 +620,8 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), break; bad: symname = sym_name(sym_strtab, sym); - die("Invalid %s relocation: %s\n", + die("Invalid %s %s relocation: %s\n", + shn_abs ? "absolute" : "relative", rel_type(r_type), symname); } } -- cgit v0.10.2 From 62ade86ab6c7e26409229ca45503cae97bf698cf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 May 2012 11:28:34 -0700 Subject: memcg,thp: fix res_counter:96 regression Occasionally, testing memcg's move_charge_at_immigrate on rc7 shows a flurry of hundreds of warnings at kernel/res_counter.c:96, where res_counter_uncharge_locked() does WARN_ON(counter->usage < val). The first trace of each flurry implicates __mem_cgroup_cancel_charge() of mc.precharge, and an audit of mc.precharge handling points to mem_cgroup_move_charge_pte_range()'s THP handling in commit 12724850e806 ("memcg: avoid THP split in task migration"). Checking !mc.precharge is good everywhere else, when a single page is to be charged; but here the "mc.precharge -= HPAGE_PMD_NR" likely to follow, is liable to result in underflow (a lot can change since the precharge was estimated). Simply check against HPAGE_PMD_NR: there's probably a better alternative, trying precharge for more, splitting if unsuccessful; but this one-liner is safer for now - no kernel/res_counter.c:96 warnings seen in 26 hours. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b659260..7685d4a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5481,7 +5481,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * part of thp split is not executed yet. */ if (pmd_trans_huge_lock(pmd, vma) == 1) { - if (!mc.precharge) { + if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(&vma->vm_mm->page_table_lock); return 0; } -- cgit v0.10.2 From 76e10d158efb6d4516018846f60c2ab5501900bc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 May 2012 15:29:13 -0700 Subject: Linux 3.4 diff --git a/Makefile b/Makefile index db2bcf3..a687963 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Saber-toothed Squirrel # *DOCUMENTATION* -- cgit v0.10.2