From f6d5b33125c4fa63c16f7f54c533338c9695d82c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 29 Mar 2011 18:00:27 -0700 Subject: RTC: Fix early irqs caused by calling rtc_set_alarm too early When we register an rtc device at boot, we read the alarm value in hardware and set the rtc device's aie_timer to that value. The initial method to do this was to simply call rtc_set_alarm() with the value read from hardware. However, this may cause problems as rtc_set_alarm may enable interupts, and the RTC alarm might fire, which can cause invalid pointer dereferencing since the RTC registration is not complete. This patch solves the issue by initializing the rtc_device.aie_timer y hand via rtc_initialize_alarm(). This avoids any calls to the RTC hardware which might enable interrupts too early. CC: Thomas Gleixner CC: Alessandro Zummo Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Signed-off-by: John Stultz diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 09b4437..3901386 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -171,7 +171,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, err = __rtc_read_alarm(rtc, &alrm); if (!err && !rtc_valid_tm(&alrm.time)) - rtc_set_alarm(rtc, &alrm); + rtc_initialize_alarm(rtc, &alrm); strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); dev_set_name(&rtc->dev, "rtc%d", id); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 8ec6b06..b2fea80 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -375,6 +375,32 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) } EXPORT_SYMBOL_GPL(rtc_set_alarm); +/* Called once per device from rtc_device_register */ +int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + + err = rtc_valid_tm(&alarm->time); + if (err != 0) + return err; + + err = mutex_lock_interruptible(&rtc->ops_lock); + if (err) + return err; + + rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); + rtc->aie_timer.period = ktime_set(0, 0); + if (alarm->enabled) { + rtc->aie_timer.enabled = 1; + timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node); + } + mutex_unlock(&rtc->ops_lock); + return err; +} +EXPORT_SYMBOL_GPL(rtc_initialize_alarm); + + + int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err = mutex_lock_interruptible(&rtc->ops_lock); diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 2ca7e8a..877ece4 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -228,6 +228,8 @@ extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); +extern int rtc_initialize_alarm(struct rtc_device *rtc, + struct rtc_wkalrm *alrm); extern void rtc_update_irq(struct rtc_device *rtc, unsigned long num, unsigned long events); -- cgit v0.10.2 From a54aba87bb8e90f9e39bcfe151717b86abbbdd79 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Thu, 24 Mar 2011 22:09:33 +0200 Subject: RTC: Fix s3c compile error due to missing s3c_rtc_setpie s3c_rtc_setpie was removed, and it resulted in compiler error: drivers/rtc/rtc-s3c.c: In function s3c_rtc_release drivers/rtc/rtc-s3c.c:339:2: error: implicit declaration of function s3c_rtc_setpie Fix it by removing s3c_rtc_release calls. [jstultz: An identical fix was also sent in by Jiri Pinkava ] CC: Thomas Gleixner CC: Alessandro Zummo Signed-off-by: Vasily Khoruzhick Signed-off-by: John Stultz diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 7149649..b3466c4 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -336,7 +336,6 @@ static void s3c_rtc_release(struct device *dev) /* do not clear AIE here, it may be needed for wake */ - s3c_rtc_setpie(dev, 0); free_irq(s3c_rtc_alarmno, rtc_dev); free_irq(s3c_rtc_tickno, rtc_dev); } @@ -408,7 +407,6 @@ static int __devexit s3c_rtc_remove(struct platform_device *dev) platform_set_drvdata(dev, NULL); rtc_device_unregister(rtc); - s3c_rtc_setpie(&dev->dev, 0); s3c_rtc_setaie(&dev->dev, 0); clk_disable(rtc_clk); -- cgit v0.10.2 From 8c122b96866580c99e44f3f07ac93a993d964ec3 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 18 Mar 2011 04:26:24 -0400 Subject: RTC: add missing "return 0" in new alarm func for rtc-bfin.c The new bfin_rtc_alarm_irq_enable function forgot to add a "return 0" to the end leading to the build warning: drivers/rtc/rtc-bfin.c: In function 'bfin_rtc_alarm_irq_enable': drivers/rtc/rtc-bfin.c:253: warning: control reaches end of non-void function CC: stable@kernel.org CC: Thomas Gleixner CC: Alessandro Zummo Signed-off-by: Mike Frysinger Signed-off-by: John Stultz diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index ca9cff8..f249644 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -250,6 +250,8 @@ static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) bfin_rtc_int_set_alarm(rtc); else bfin_rtc_int_clear(~(RTC_ISTAT_ALARM | RTC_ISTAT_ALARM_DAY)); + + return 0; } static int bfin_rtc_read_time(struct device *dev, struct rtc_time *tm) -- cgit v0.10.2 From 67c1b8c6aa354aad14aad85d36508fd73d1c6361 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 7 Apr 2011 09:39:44 +0800 Subject: RTC: rtc-mrst: follow on to the change of rtc_device_register() commit f44f7f96a20 (RTC: Initialize kernel state from RTC) will call rtc_read_alarm() inside rtc_device_register(), so rtc-mrst driver need to call dev_set_drvdata() before rtc_device_register() get called. Cc: Alessandro Zummo Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Feng Tang Signed-off-by: John Stultz diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index b86bc32..d2c397c 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -342,6 +342,8 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) mrst_rtc.irq = rtc_irq; mrst_rtc.iomem = iomem; + mrst_rtc.dev = dev; + dev_set_drvdata(dev, &mrst_rtc); mrst_rtc.rtc = rtc_device_register(driver_name, dev, &mrst_rtc_ops, THIS_MODULE); @@ -350,8 +352,6 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) goto cleanup0; } - mrst_rtc.dev = dev; - dev_set_drvdata(dev, &mrst_rtc); rename_region(iomem, dev_name(&mrst_rtc.rtc->dev)); spin_lock_irq(&rtc_lock); @@ -376,9 +376,10 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) return 0; cleanup1: - mrst_rtc.dev = NULL; rtc_device_unregister(mrst_rtc.rtc); cleanup0: + dev_set_drvdata(dev, NULL); + mrst_rtc.dev = NULL; release_region(iomem->start, iomem->end + 1 - iomem->start); dev_err(dev, "rtc-mrst: unable to initialise\n"); return retval; -- cgit v0.10.2 From 621d26567fd0c222f419e3b5ddf39e529e0fdcb3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Apr 2011 12:11:06 +0200 Subject: perf: Fix a build error with some GCC versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix this: util/cgroup.c: In function ‘open_cgroup’: util/cgroup.c:16:16: error: ‘saved_ptr’ may be used uninitialized in this function util/cgroup.c:16:16: note: ‘saved_ptr’ was declared here Apparently newer GCC (4.6) can figure out that this variable is properly initialized - but some versions of GCC (such as 4.5.2) need help. Signed-off-by: Eric Dumazet Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 9fea755..96bee5c 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -13,7 +13,7 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1]; - char *token, *saved_ptr; + char *token, *saved_ptr = NULL; int found = 0; fp = fopen("/proc/mounts", "r"); -- cgit v0.10.2 From e566b76ed30768140df8f0023904aed5a41244f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 6 Apr 2011 02:54:54 +0200 Subject: perf_event: Fix cgrp event scheduling bug in perf_enable_on_exec() There is a bug in perf_event_enable_on_exec() when cgroup events are active on a CPU: the cgroup events may be scheduled twice causing event state corruptions which eventually may lead to kernel panics. The reason is that the function needs to first schedule out the cgroup events, just like for the per-thread events. The cgroup event are scheduled back in automatically from the perf_event_context_sched_in() function. The patch also adds a WARN_ON_ONCE() is perf_cgroup_switch() to catch any bogus state. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110406005454.GA1062@quad Signed-off-by: Ingo Molnar diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27960f1..8e81a98 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); /* set cgrp before ctxsw in to * allow event_filter_match() to not * have to pass task around @@ -2423,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -2447,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); + /* + * Also calls ctxswin for cgroup events, if any: + */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); -- cgit v0.10.2 From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 7 Apr 2011 17:23:22 -0700 Subject: sched: Fix sched-domain avg_load calculation In function find_busiest_group(), the sched-domain avg_load isn't calculated at all if there is a group imbalance within the domain. This will cause erroneous imbalance calculation. The reason is that calculate_imbalance() sees sds->avg_load = 0 and it will dump entire sds->max_load into imbalance variable, which is used later on to migrate entire load from busiest CPU to the puller CPU. This has two really bad effect: 1. stampede of task migration, and they won't be able to break out of the bad state because of positive feedback loop: large load delta -> heavier load migration -> larger imbalance and the cycle goes on. 2. severe imbalance in CPU queue depth. This causes really long scheduling latency blip which affects badly on application that has tight latency requirement. The fix is to have kernel calculate domain avg_load in both cases. This will ensure that imbalance calculation is always sensible and the target is usually half way between busiest and puller CPU. Signed-off-by: Ken Chen Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7f00772..60f9d40 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. */ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v0.10.2 From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Fri, 8 Apr 2011 12:20:16 -0700 Subject: sched: Fix erroneous all_pinned logic The scheduler load balancer has specific code to deal with cases of unbalanced system due to lots of unmovable tasks (for example because of hard CPU affinity). In those situation, it excludes the busiest CPU that has pinned tasks for load balance consideration such that it can perform second 2nd load balance pass on the rest of the system. This all works as designed if there is only one cgroup in the system. However, when we have multiple cgroups, this logic has false positives and triggers multiple load balance passes despite there are actually no pinned tasks at all. The reason it has false positives is that the all pinned logic is deep in the lowest function of can_migrate_task() and is too low level: load_balance_fair() iterates each task group and calls balance_tasks() to migrate target load. Along the way, balance_tasks() will also set a all_pinned variable. Given that task-groups are iterated, this all_pinned variable is essentially the status of last group in the scanning process. Task group can have number of reasons that no load being migrated, none due to cpu affinity. However, this status bit is being propagated back up to the higher level load_balance(), which incorrectly think that no tasks were moved. It kick off the all pinned logic and start multiple passes attempt to move load onto puller CPU. To fix this, move the all_pinned aggregation up at the iterator level. This ensures that the status is aggregated over all task-groups, not just last one in the list. Signed-off-by: Ken Chen Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 60f9d40..6fa833a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2153,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3341,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, -- cgit v0.10.2 From 30d746c68025ee69ac17219aacc9b1614d951f01 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Apr 2011 14:13:15 +0200 Subject: x86/ce4100: Add reg property to bridges without the reg property Ben's new code won't find the PCI & ISA bridge and the devices won't get the DT-node attached. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Thomas Gleixner Cc: davem@davemloft.net Cc: monstr@monstr.eu Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/20110407121315.GA9204@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index dc701ea..2d6d226 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -74,6 +74,7 @@ compatible = "intel,ce4100-pci", "pci"; device_type = "pci"; bus-range = <1 1>; + reg = <0x0800 0x0 0x0 0x0 0x0>; ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; interrupt-parent = <&ioapic2>; @@ -412,6 +413,7 @@ #address-cells = <2>; #size-cells = <1>; compatible = "isa"; + reg = <0xf800 0x0 0x0 0x0 0x0>; ranges = <1 0 0 0 0 0x100>; rtc@70 { -- cgit v0.10.2 From 9d90e49da57fe73a2f35334fdd2fb60dbf3933ed Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 8 Apr 2011 11:23:00 -0700 Subject: x86/mrst: Fix boot crash caused by incorrect pin to irq mapping Moorestown systems crash on boot because the secondary CPU clockevent (apbt1) will fail to request irq#1, which does not have ioapic chip in its irq_desc[] entry. Background: Moorestown platform does not have ISA bus nor legacy IRQs. It reuses the range of legacy IRQs for regular device interrupts. The routing information of early system device IRQs (timers) are obtained from firmware provided SFI tables. We reuse/fake MP configuration table to facilitate IRQ setup with IOAPIC. Maintaining a 1:1 mapping of IOAPIC pin (RTE entry) and IRQ# makes routing information clean and easy to understand on Moorestown. Though optional. This patch allows SFI timer and vRTC IRQ to be treated as ISA IRQ so that pin2irq mapping will be 1:1. Also fixed MP table type and use macros to clearly set MP IRQ entries. As a result, apbt timer and RTC interrupts on Moorestown are within legacy IRQ range: # cat /proc/interrupts CPU0 CPU1 0: 11249 0 IO-APIC-edge apbt0 1: 0 12271 IO-APIC-edge apbt1 8: 887 0 IO-APIC-fasteoi dw_spi 13: 0 0 IO-APIC-fasteoi INTEL_MID_DMAC2 14: 0 0 IO-APIC-fasteoi rtc0 Further discussion of this patch can be found at: https://lkml.org/lkml/2010/6/10/70 Suggested-by: "Eric W. Biederman" Signed-off-by: Jacob Pan Cc: Feng Tang Cc: Alan Cox Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/1302286980-21139-1-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 5c0207b..275dbc1 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -97,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) pentry->freq_hz, pentry->irq); if (!pentry->irq) continue; - mp_irq.type = MP_IOAPIC; + mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ mp_irq.irqflag = 5; - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -168,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; + mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; mp_irq.irqflag = 0xf; /* level trigger and active low */ - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -282,7 +282,7 @@ void __init x86_mrst_early_setup(void) /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; - + set_bit(MP_BUS_ISA, mp_bus_not_pci); } /* -- cgit v0.10.2 From 0cd9c6494ee5c19aef085152bc37f3a4e774a9e1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 14 Apr 2011 15:41:57 -0700 Subject: futex: Set FLAGS_HAS_TIMEOUT during futex_wait restart setup The FLAGS_HAS_TIMEOUT flag was not getting set, causing the restart_block to restart futex_wait() without a timeout after a signal. Commit b41277dc7a18ee332d in 2.6.38 introduced the regression by accidentally removing the the FLAGS_HAS_TIMEOUT assignment from futex_wait() during the setup of the restart block. Restore the originaly behavior. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=32922 Reported-by: Tim Smith Reported-by: Torsten Hilbrich Signed-off-by: Darren Hart Signed-off-by: Eric Dumazet Cc: Peter Zijlstra Cc: John Kacur Cc: stable@kernel.org Link: http://lkml.kernel.org/r/%3Cdaac0eb3af607f72b9a4d3126b2ba8fb5ed3b883.1302820917.git.dvhart%40linux.intel.com%3E Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index dfb924f..fe28dc2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; -- cgit v0.10.2 From 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Apr 2011 20:39:01 +0900 Subject: x86, NUMA: Fix fakenuma boot failure Currently, numa=fake boot parameter is broken. If it's used, kernel may panic due to devide by zero error depending on CPU configuration Call Trace: [] find_busiest_group+0x38c/0xd30 [] ? local_clock+0x6f/0x80 [] load_balance+0xa3/0x600 [] idle_balance+0xf3/0x180 [] schedule+0x722/0x7d0 [] ? wait_for_common+0x128/0x190 [] schedule_timeout+0x265/0x320 [] ? lock_release_holdtime+0x35/0x1a0 [] ? wait_for_common+0x128/0x190 [] ? __lock_release+0x9c/0x1d0 [] ? _raw_spin_unlock_irq+0x30/0x40 [] ? _raw_spin_unlock_irq+0x30/0x40 [] wait_for_common+0x130/0x190 [] ? try_to_wake_up+0x510/0x510 [] wait_for_completion+0x1d/0x20 [] kthread_create_on_node+0xac/0x150 [] ? process_scheduled_works+0x40/0x40 [] ? wait_for_common+0x4f/0x190 [] __alloc_workqueue_key+0x1a3/0x590 [] cpuset_init_smp+0x6b/0x7b [] kernel_init+0xc3/0x182 [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? start_kernel+0x400/0x400 [] ? gs_change+0x13/0x13 The divede by zero is caused by the following line, group->cpu_power==0: kernel/sched_fair.c::update_sg_lb_stats() /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify emulated distance mapping") because it changes cpu -> node mapping in the process of dropping fake_physnodes(). old) all cpus are assinged node 0 now) cpus are assigned round robin (the logic is implemented by numa_init_array()) Note: The change in behavior only happens if the system doesn't have neither ACPI SRAT table nor AMD northbridge NUMA information. Round robin assignment doesn't work because init_numa_sched_groups_power() assumes all logical cpus in the same physical cpu share the same node (then it only accounts for group_first_cpu()), and the simple round robin breaks the above assumption. Thus, this patch implements a reassignment of node-ids if buggy firmware or numa emulation makes wrong cpu node map. Tt enforce all logical cpus in the same physical cpu share the same node. Signed-off-by: KOSAKI Motohiro Acked-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3..8ed8908 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) +{ + int node1 = early_cpu_to_node(cpu1); + int node2 = early_cpu_to_node(cpu2); + + /* + * Our CPU scheduler assumes all logical cpus in the same physical cpu + * share the same node. But, buggy ACPI or NUMA emulation might assign + * them to different node. Fix it. + */ + if (node1 != node2) { + pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", + cpu1, node1, cpu2, node2, node2); + + numa_remove_cpu(cpu1); + numa_set_node(cpu1, node2); + numa_add_cpu(cpu1); + } +} + static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ -- cgit v0.10.2 From 5bbc097d890409d8eff4e3f1d26f11a9d6b7c07e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 Apr 2011 14:47:40 +0200 Subject: x86, amd: Disable GartTlbWlkErr when BIOS forgets it This patch disables GartTlbWlk errors on AMD Fam10h CPUs if the BIOS forgets to do is (or is just too old). Letting these errors enabled can cause a sync-flood on the CPU causing a reboot. The AMD BKDG recommends disabling GART TLB Wlk Error completely. This patch is the fix for https://bugzilla.kernel.org/show_bug.cgi?id=33012 on my machine. Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/20110415131152.GJ18463@8bytes.org Tested-by: Alexandre Demers Cc: Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fd5a1f3..3cce714 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -96,11 +96,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3ecece0..3532d3b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -615,6 +615,25 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* As a rule processors have APIC timer running in deep C states */ if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + + rdmsrl(MSR_AMD64_MCx_MASK(4), mask); + mask |= (1 << 10); + wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } #ifdef CONFIG_X86_32 -- cgit v0.10.2