From d2c298eb24fe580153f1e122bb6da1bfdd798c59 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 9 Apr 2013 22:31:24 +0000 Subject: powerpc: add a missing label in resume_kernel commit d8b92292408831d86ff7b781e66bf79301934b99 upstream. A label 0 was missed in the patch a9c4e541 (powerpc/kprobe: Complete kprobe and migrate exception frame). This will cause the kernel branch to an undetermined address if there really has a conflict when updating the thread flags. Signed-off-by: Kevin Hao Acked-By: Tiejun Chen Signed-off-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3d990d3..e0822a3 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -634,7 +634,7 @@ resume_kernel: /* Clear _TIF_EMULATE_STACK_STORE flag */ lis r11,_TIF_EMULATE_STACK_STORE@h addi r5,r9,TI_FLAGS - ldarx r4,0,r5 +0: ldarx r4,0,r5 andc r4,r4,r11 stdcx. r4,0,r5 bne- 0b -- cgit v0.10.2 From 518a8ee27d903ee9d2340c264994c813e1d421b5 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 18 Feb 2013 18:13:09 +0000 Subject: kvm/powerpc/e500mc: fix tlb invalidation on cpu migration commit c5e6cb051c5f7d56f05bd6a4af22cb300a4ced79 upstream. The existing check handles the case where we've migrated to a different core than we last ran on, but it doesn't handle the case where we're still on the same cpu we last ran on, but some other vcpu has run on this cpu in the meantime. Without this, guest segfaults (and other misbehavior) have been seen in smp guests. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Greg Kroah-Hartman diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 1f89d26..2f4baa0 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -108,6 +108,8 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -136,8 +138,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); mtspr(SPRN_GESR, vcpu->arch.shared->esr); - if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) + if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || + __get_cpu_var(last_vcpu_on_cpu) != vcpu) { kvmppc_e500_tlbil_all(vcpu_e500); + __get_cpu_var(last_vcpu_on_cpu) = vcpu; + } kvmppc_load_guest_fp(vcpu); } -- cgit v0.10.2 From 3d9203cb7216794cd49cfb2aa90e7f69976fbe6d Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 8 Apr 2013 11:44:57 +0100 Subject: ARM: Do 15e0d9e37c (ARM: pm: let platforms select cpu_suspend support) properly commit b6c7aabd923a17af993c5a5d5d7995f0b27c000a upstream. Let's do the changes properly and fix the same problem everywhere, not just for one case. 
Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index 2c3b942..2556cf1 100644 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -387,7 +387,7 @@ ENTRY(cpu_arm920_set_pte_ext) /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */ .globl cpu_arm920_suspend_size .equ cpu_arm920_suspend_size, 4 * 3 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_arm920_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c13, c0, 0 @ PID diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index f1803f7..344c8a5 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -402,7 +402,7 @@ ENTRY(cpu_arm926_set_pte_ext) /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */ .globl cpu_arm926_suspend_size .equ cpu_arm926_suspend_size, 4 * 3 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_arm926_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c13, c0, 0 @ PID diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index 82f9cdc..0b60dd3 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -350,7 +350,7 @@ ENTRY(cpu_mohawk_set_pte_ext) .globl cpu_mohawk_suspend_size .equ cpu_mohawk_suspend_size, 4 * 6 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_mohawk_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode diff --git a/arch/arm/mm/proc-sa1100.S b/arch/arm/mm/proc-sa1100.S index 3aa0da1..d92dfd0 100644 --- a/arch/arm/mm/proc-sa1100.S +++ b/arch/arm/mm/proc-sa1100.S @@ -172,7 +172,7 @@ ENTRY(cpu_sa1100_set_pte_ext) .globl cpu_sa1100_suspend_size .equ cpu_sa1100_suspend_size, 4 * 3 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_sa1100_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c3, c0, 0 @ domain ID diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S index 09c5233..d222215 100644 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S @@ -138,7 +138,7 @@ ENTRY(cpu_v6_set_pte_ext) /* Suspend/resume support: taken from arch/arm/mach-s3c64xx/sleep.S */ .globl cpu_v6_suspend_size .equ cpu_v6_suspend_size, 4 * 6 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_v6_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p15, 0, r4, c13, c0, 0 @ FCSE/PID diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index eb93d64..e8efd83 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -413,7 +413,7 @@ ENTRY(cpu_xsc3_set_pte_ext) .globl cpu_xsc3_suspend_size .equ cpu_xsc3_suspend_size, 4 * 6 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_xsc3_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S index 2551036..e766f88 100644 --- a/arch/arm/mm/proc-xscale.S +++ b/arch/arm/mm/proc-xscale.S @@ -528,7 +528,7 @@ ENTRY(cpu_xscale_set_pte_ext) .globl cpu_xscale_suspend_size .equ cpu_xscale_suspend_size, 4 * 6 -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_xscale_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode -- cgit v0.10.2 From 0e55be921acde0e057419d570005913252f4efb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2013 09:33:34 +0200 Subject: kthread: Prevent unpark race which puts threads on the wrong cpu commit f2530dc71cf0822f90bb63ea4600caaef33a66bb upstream. 
The smpboot threads rely on the park/unpark mechanism which binds per cpu threads on a particular core. Though the functionality is racy: CPU0 CPU1 CPU2 unpark(T) wake_up_process(T) clear(SHOULD_PARK) T runs leave parkme() due to !SHOULD_PARK bind_to(CPU2) BUG_ON(wrong CPU) We cannot let the tasks move themself to the target CPU as one of those tasks is actually the migration thread itself, which requires that it starts running on the target cpu right away. The solution to this problem is to prevent wakeups in park mode which are not from unpark(). That way we can guarantee that the association of the task to the target cpu is working correctly. Add a new task state (TASK_PARKED) which prevents other wakeups and use this state explicitly for the unpark wakeup. Peter noticed: Also, since the task state is visible to userspace and all the parked tasks are still in the PID space, its a good hint in ps and friends that these tasks aren't really there for the moment. The migration thread has another related issue. CPU0 CPU1 Bring up CPU2 create_thread(T) park(T) wait_for_completion() parkme() complete() sched_set_stop_task() schedule(TASK_PARKED) The sched_set_stop_task() call is issued while the task is on the runqueue of CPU1 and that confuses the hell out of the stop_task class on that cpu. So we need the same synchronizaion before sched_set_stop_task(). Reported-by: Dave Jones Reported-and-tested-by: Dave Hansen Reported-and-tested-by: Borislav Petkov Acked-by: Peter Ziljstra Cc: Srivatsa S. Bhat Cc: dhillf@gmail.com Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6f..be3c22f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d2112477..7e49270 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_MAX 512 +#define TASK_PARKED 512 +#define TASK_STATE_MAX 1024 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5a8671e..e5586ca 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -147,7 +147,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "W" }) : "R", + { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", __entry->prev_state & TASK_STATE_MAX ? 
"+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2e..9eb7fed 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; @@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) return NULL; } +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +{ + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). @@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = task_get_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. 
- */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } + if (kthread) + __kthread_unpark(k, kthread); put_task_struct(k); } @@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } -- cgit v0.10.2 From d7e0ec47be2ac4c26c4a2ed44e9541d790e7bb94 Mon Sep 17 00:00:00 2001 From: Michael Bohan Date: Tue, 19 Mar 2013 19:19:25 -0700 Subject: hrtimer: Don't reinitialize a cpu_base lock on CPU_UP commit 84cc8fd2fe65866e49d70b38b3fdf7219dd92fe0 upstream. The current code makes the assumption that a cpu_base lock won't be held if the CPU corresponding to that cpu_base is offline, which isn't always true. If a hrtimer is not queued, then it will not be migrated by migrate_hrtimers() when a CPU is offlined. Therefore, the hrtimer's cpu_base may still point to a CPU which has subsequently gone offline if the timer wasn't enqueued at the time the CPU went down. Normally this wouldn't be a problem, but a cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is performing a hrtimer operation on a stale hrtimer, then the lock will be reinitialized under its feet, and a SPIN_BUG() like the following will be observed: <0>[ 28.082085] BUG: spinlock already unlocked on CPU#0, swapper/0/0 <0>[ 28.087078] lock: 0xc4780b40, value 0x0 .magic: dead4ead, .owner: /-1, .owner_cpu: -1 <4>[ 42.451150] [] (unwind_backtrace+0x0/0x120) from [] (do_raw_spin_unlock+0x44/0xdc) <4>[ 42.460430] [] (do_raw_spin_unlock+0x44/0xdc) from [] (_raw_spin_unlock+0x8/0x30) <4>[ 42.469632] [] (_raw_spin_unlock+0x8/0x30) from [] (__hrtimer_start_range_ns+0x1e4/0x4f8) <4>[ 42.479521] [] (__hrtimer_start_range_ns+0x1e4/0x4f8) from [] (hrtimer_start+0x20/0x28) <4>[ 42.489247] [] (hrtimer_start+0x20/0x28) from [] (rcu_idle_enter_common+0x1ac/0x320) <4>[ 42.498709] [] (rcu_idle_enter_common+0x1ac/0x320) from [] (rcu_idle_enter+0xa0/0xb8) <4>[ 42.508259] [] (rcu_idle_enter+0xa0/0xb8) from [] (cpu_idle+0x24/0xf0) <4>[ 42.516503] [] (cpu_idle+0x24/0xf0) from [] (rest_init+0x88/0xa0) <4>[ 42.524319] [] (rest_init+0x88/0xa0) from [] (start_kernel+0x3d0/0x434) As an example, this particular crash occurred when hrtimer_start() was executed on CPU #0. The code locked the hrtimer's current cpu_base corresponding to CPU #1. CPU #0 then tried to switch the hrtimer's cpu_base to an optimal CPU which was online. In this case, it selected the cpu_base corresponding to CPU #3. Before it could proceed, CPU #1 came online and reinitialized the spinlock corresponding to its cpu_base. Thus now CPU #0 held a lock which was reinitialized. When CPU #0 finally ended up unlocking the old cpu_base corresponding to CPU #1 so that it could switch to CPU #3, we hit this SPIN_BUG() above while in switch_hrtimer_base(). CPU #0 CPU #1 ---- ---- ... hrtimer_start() lock_hrtimer_base(base #1) ... init_hrtimers_cpu() switch_hrtimer_base() ... ... raw_spin_lock_init(&cpu_base->lock) raw_spin_unlock(&cpu_base->lock) ... Solve this by statically initializing the lock. 
Signed-off-by: Michael Bohan Link: http://lkml.kernel.org/r/1363745965-23475-1-git-send-email-mbohan@codeaurora.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cdd5607..e4cee8d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -61,6 +61,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { @@ -1640,8 +1641,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); -- cgit v0.10.2 From b15bae9faecfb714cbf0c4585cd46b32d0a40e5a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 11 Apr 2013 10:08:27 +0200 Subject: can: mcp251x: add missing IRQF_ONESHOT to request_threaded_irq commit db388d6460ffa53b3b38429da6f70a913f89b048 upstream. Since commit: 1c6c695 genirq: Reject bogus threaded irq requests threaded irqs must provide a primary handler or set the IRQF_ONESHOT flag. Since the mcp251x driver doesn't make use of a primary handler set the IRQF_ONESHOT flag. Reported-by: Mylene Josserand Tested-by: Mylene Josserand Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 5eaf47b..42b6d69 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -922,6 +922,7 @@ static int mcp251x_open(struct net_device *net) struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; struct mcp251x_platform_data *pdata = spi->dev.platform_data; + unsigned long flags; int ret; ret = open_candev(net); @@ -938,9 +939,14 @@ static int mcp251x_open(struct net_device *net) priv->tx_skb = NULL; priv->tx_len = 0; + flags = IRQF_ONESHOT; + if (pdata->irq_flags) + flags |= pdata->irq_flags; + else + flags |= IRQF_TRIGGER_FALLING; + ret = request_threaded_irq(spi->irq, NULL, mcp251x_can_ist, - pdata->irq_flags ? pdata->irq_flags : IRQF_TRIGGER_FALLING, - DEVICE_NAME, priv); + flags, DEVICE_NAME, priv); if (ret) { dev_err(&spi->dev, "failed to acquire irq %d\n", spi->irq); if (pdata->transceiver_enable) -- cgit v0.10.2 From 3f78a9f022c9bc045a6d2cc864a1d77ad3c3f784 Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Thu, 11 Apr 2013 21:32:57 +0200 Subject: can: sja1000: fix handling on dt properties on little endian systems commit 0443de5fbf224abf41f688d8487b0c307dc5a4b4 upstream. To get correct endianes on little endian cpus (like arm) while reading device tree properties, this patch replaces of_get_property() with of_property_read_u32(). While there use of_property_read_bool() for the handling of the boolean "nxp,no-comparator-bypass" property. 
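As a rough illustration of the difference (a sketch for context, not code from the patch; the property name is just the one used in the driver below): device-tree cells are stored big-endian, so a raw pointer from of_get_property() must be byte-swapped by the caller on little-endian CPUs, whereas of_property_read_u32() hands back a value already converted to CPU byte order.

	/* illustrative sketch only */
	const __be32 *raw = of_get_property(np, "nxp,external-clock-frequency", NULL);
	u32 val;

	if (raw)
		val = be32_to_cpup(raw);	/* caller must convert the big-endian cell */

	if (!of_property_read_u32(np, "nxp,external-clock-frequency", &val))
		pr_debug("clock: %u Hz\n", val);	/* already in CPU byte order */
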
Signed-off-by: Christoph Fritz Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/can/sja1000/sja1000_of_platform.c b/drivers/net/can/sja1000/sja1000_of_platform.c index 6433b81..8e0c4a0 100644 --- a/drivers/net/can/sja1000/sja1000_of_platform.c +++ b/drivers/net/can/sja1000/sja1000_of_platform.c @@ -96,8 +96,8 @@ static int sja1000_ofp_probe(struct platform_device *ofdev) struct net_device *dev; struct sja1000_priv *priv; struct resource res; - const u32 *prop; - int err, irq, res_size, prop_size; + u32 prop; + int err, irq, res_size; void __iomem *base; err = of_address_to_resource(np, 0, &res); @@ -138,27 +138,27 @@ static int sja1000_ofp_probe(struct platform_device *ofdev) priv->read_reg = sja1000_ofp_read_reg; priv->write_reg = sja1000_ofp_write_reg; - prop = of_get_property(np, "nxp,external-clock-frequency", &prop_size); - if (prop && (prop_size == sizeof(u32))) - priv->can.clock.freq = *prop / 2; + err = of_property_read_u32(np, "nxp,external-clock-frequency", &prop); + if (!err) + priv->can.clock.freq = prop / 2; else priv->can.clock.freq = SJA1000_OFP_CAN_CLOCK; /* default */ - prop = of_get_property(np, "nxp,tx-output-mode", &prop_size); - if (prop && (prop_size == sizeof(u32))) - priv->ocr |= *prop & OCR_MODE_MASK; + err = of_property_read_u32(np, "nxp,tx-output-mode", &prop); + if (!err) + priv->ocr |= prop & OCR_MODE_MASK; else priv->ocr |= OCR_MODE_NORMAL; /* default */ - prop = of_get_property(np, "nxp,tx-output-config", &prop_size); - if (prop && (prop_size == sizeof(u32))) - priv->ocr |= (*prop << OCR_TX_SHIFT) & OCR_TX_MASK; + err = of_property_read_u32(np, "nxp,tx-output-config", &prop); + if (!err) + priv->ocr |= (prop << OCR_TX_SHIFT) & OCR_TX_MASK; else priv->ocr |= OCR_TX0_PULLDOWN; /* default */ - prop = of_get_property(np, "nxp,clock-out-frequency", &prop_size); - if (prop && (prop_size == sizeof(u32)) && *prop) { - u32 divider = priv->can.clock.freq * 2 / *prop; + err = of_property_read_u32(np, "nxp,clock-out-frequency", &prop); + if (!err && prop) { + u32 divider = priv->can.clock.freq * 2 / prop; if (divider > 1) priv->cdr |= divider / 2 - 1; @@ -168,8 +168,7 @@ static int sja1000_ofp_probe(struct platform_device *ofdev) priv->cdr |= CDR_CLK_OFF; /* default */ } - prop = of_get_property(np, "nxp,no-comparator-bypass", NULL); - if (!prop) + if (!of_property_read_bool(np, "nxp,no-comparator-bypass")) priv->cdr |= CDR_CBP; /* default */ priv->irq_flags = IRQF_SHARED; -- cgit v0.10.2 From 17e4c92ba62fc45697bd214592248e41db65888b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:27 -0700 Subject: hugetlbfs: stop setting VM_DONTDUMP in initializing vma(VM_HUGETLB) commit a2fce9143057f4eb7675a21cca1b6beabe585c8b upstream. Currently we fail to include any data on hugepages into coredump, because VM_DONTDUMP is set on hugetlbfs's vma. This behavior was recently introduced by commit 314e51b9851b ("mm: kill vma flag VM_RESERVED and mm->reserved_vm counter"). This looks to me a serious regression, so let's fix it. 
Signed-off-by: Naoya Horiguchi Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Reviewed-by: Rik van Riel Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 78bde32..ccee8cc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) -- cgit v0.10.2 From bff55c65c423aad2d509e01e6703f098620b3c58 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:30 -0700 Subject: hugetlbfs: add swap entry check in follow_hugetlb_page() commit 9cc3a5bd40067b9a0fbd49199d0780463fc2140f upstream. With applying the previous patch "hugetlbfs: stop setting VM_DONTDUMP in initializing vma(VM_HUGETLB)" to reenable hugepage coredump, if a memory error happens on a hugepage and the affected processes try to access the error hugepage, we hit VM_BUG_ON(atomic_read(&page->_count) <= 0) in get_page(). The reason for this bug is that coredump-related code doesn't recognise "hugepage hwpoison entry" with which a pmd entry is replaced when a memory error occurs on a hugepage. In other words, physical address information is stored in different bit layout between hugepage hwpoison entry and pmd entry, so follow_hugetlb_page() which is called in get_dump_page() returns a wrong page from a given address. The expected behavior is like this: absent is_swap_pte FOLL_DUMP Expected behavior ------------------------------------------------------------------- true false false hugetlb_fault false true false hugetlb_fault false false false return page true false true skip page (to avoid allocation) false true true hugetlb_fault false false true return page With this patch, we can call hugetlb_fault() and take proper actions (we wait for migration entries, fail with VM_FAULT_HWPOISON_LARGE for hwpoisoned entries,) and as the result we can dump all hugepages except for hwpoisoned ones. Signed-off-by: Naoya Horiguchi Cc: Rik van Riel Acked-by: Michal Hocko Cc: HATAYAMA Daisuke Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7cec92..88eb939 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2965,7 +2965,17 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (absent || + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. 
+ */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; -- cgit v0.10.2 From eb586a6aac367fe5589ae115d470c2c2de45f6fb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:28 -0700 Subject: fs/binfmt_elf.c: fix hugetlb memory check in vma_dump_size() commit 23d9e482136e31c9d287633a6e473daa172767c4 upstream. Documentation/filesystems/proc.txt says about coredump_filter bitmask, Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only effected by bit 5-6. However current code can go into the subsequent flag checks of bit 0-4 for vma(VM_HUGETLB). So this patch inserts 'return' and makes it work as written in the document. Signed-off-by: Naoya Horiguchi Reviewed-by: Rik van Riel Acked-by: Michal Hocko Reviewed-by: HATAYAMA Daisuke Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdb..5843a47 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1132,6 +1132,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) goto whole; + return 0; } /* Do not dump I/O mapped devices or special mappings */ -- cgit v0.10.2 From 998f85133fd3067d715050826cf066efd59775f1 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Wed, 17 Apr 2013 15:58:36 -0700 Subject: kernel/signal.c: stop info leak via the tkill and the tgkill syscalls commit b9e146d8eb3b9ecae5086d373b50fa0c1f3e7f0f upstream. This fixes a kernel memory contents leak via the tkill and tgkill syscalls for compat processes. This is visible in the siginfo_t->_sifields._rt.si_sigval.sival_ptr field when handling signals delivered from tkill. The place of the infoleak: int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { ... put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); ... } Signed-off-by: Emese Revfy Reviewed-by: PaX Team Signed-off-by: Kees Cook Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/signal.c b/kernel/signal.c index dec9c30..50e425c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2880,7 +2880,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct siginfo info = {}; info.si_signo = sig; info.si_errno = 0; -- cgit v0.10.2 From a7be4986fe82c5a8465749c4a0f3e1367d588b03 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 17 Apr 2013 15:58:33 -0700 Subject: hfsplus: fix potential overflow in hfsplus_file_truncate() commit 12f267a20aecf8b84a2a9069b9011f1661c779b4 upstream. Change a u32 to loff_t hfsplus_file_truncate(). 
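The practical effect, with illustrative numbers (not taken from the patch): i_size is a 64-bit loff_t, so storing it in a u32 silently drops the upper bits for any file of 4 GiB or more, and the truncation code then operates on the wrong offset.

	loff_t i_size = 5ULL << 30;	/* a 5 GiB file */
	u32 size32 = i_size;		/* wraps to 1 GiB: the high 32 bits are lost */
	loff_t size64 = i_size;		/* keeps the full value, as the fix does */
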
Signed-off-by: Vyacheslav Dubeyko Cc: Christoph Hellwig Cc: Al Viro Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index eba76ea..fc8ddc1 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -533,7 +533,7 @@ void hfsplus_file_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, -- cgit v0.10.2 From c402e9ee7a5313c962bf38da206ff593a53454a7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 21 Feb 2013 13:28:09 +1100 Subject: md: raid1,10: Handle REQ_WRITE_SAME flag in write bios commit c8dc9c654794a765ca61baed07f84ed8aaa7ca8c upstream. Set mddev queue's max_write_same_sectors to its chunk_sector value (before disk_stack_limits merges the underlying disk limits.) With that in place, be sure to handle writes coming down from the block layer that have the REQ_WRITE_SAME flag set. That flag needs to be copied into any newly cloned write bio. Signed-off-by: Joe Lawrence Acked-by: "Martin K. Petersen" Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 75b1f89..fd86b37 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1001,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1302,7 +1303,8 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; + mbio->bi_rw = + WRITE | do_flush_fua | do_sync | do_discard | do_same; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -2819,6 +2821,9 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (mddev->queue) + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8d925dc..b3898d4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1106,6 +1106,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1461,7 +1462,8 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1503,7 +1505,8 @@ retry_write: r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -3570,6 +3573,8 @@ static int 
run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); -- cgit v0.10.2 From 23488565ac95f40c5d6804614f36e0a1b0d27ec9 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Mon, 11 Mar 2013 09:34:52 -0700 Subject: KVM: x86: fix for buffer overflow in handling of MSR_KVM_SYSTEM_TIME (CVE-2013-1796) commit c300aa64ddf57d9c5d9c898a64b36877345dd4a9 upstream. If the guest sets the GPA of the time_page so that the request to update the time straddles a page then KVM will write onto an incorrect page. The write is done byusing kmap atomic to get a pointer to the page for the time structure and then performing a memcpy to that page starting at an offset that the guest controls. Well behaved guests always provide a 32-byte aligned address, however a malicious guest could use this to corrupt host kernel memory. Tested: Tested against kvmclock unit test. Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c243b81..f1616cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1960,6 +1960,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + /* Check that the address is 32-byte aligned. */ + if (vcpu->arch.time_offset & + (sizeof(struct pvclock_vcpu_time_info) - 1)) + break; + vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); -- cgit v0.10.2 From 17b61c3d1ca2a739c44e0c24565ce97a2086156f Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Feb 2013 14:48:10 -0800 Subject: KVM: x86: Convert MSR_KVM_SYSTEM_TIME to use gfn_to_hva_cache functions (CVE-2013-1797) commit 0b79459b482e85cb7426aa7da683a9f2c97aeae1 upstream. There is a potential use after free issue with the handling of MSR_KVM_SYSTEM_TIME. If the guest specifies a GPA in a movable or removable memory such as frame buffers then KVM might continue to write to that address even after it's removed via KVM_SET_USER_MEMORY_REGION. KVM pins the page in memory so it's unlikely to cause an issue, but if the user space component re-purposes the memory previously used for the guest, then the guest will be able to corrupt that memory. 
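In rough terms, a hedged sketch of the pattern the patch switches to (gpa and hv_clock here are stand-ins for the values handled in the real code): instead of pinning the page once at MSR-write time, the guest address is registered in a gfn_to_hva_cache, and every clock update goes through kvm_write_guest_cached(), which revalidates the translation against the current memslots before writing.

	struct gfn_to_hva_cache pv_time;
	bool pv_time_enabled;

	pv_time_enabled = !kvm_gfn_to_hva_cache_init(vcpu->kvm, &pv_time, gpa);
	if (pv_time_enabled)
		kvm_write_guest_cached(vcpu->kvm, &pv_time, &hv_clock,
				       sizeof(hv_clock));
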
Tested: Tested against kvmclock unit test Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dc87b65..85039f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -419,8 +419,8 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - unsigned int time_offset; - struct page *time_page; + struct gfn_to_hva_cache pv_time; + bool pv_time_enabled; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1616cb..d47934e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1408,10 +1408,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; - void *shared_kaddr; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; @@ -1465,7 +1464,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->time_page) + if (!vcpu->pv_time_enabled) return 0; /* @@ -1527,12 +1526,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page); - - guest_hv_clock = shared_kaddr + vcpu->time_offset; + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { pvclock_flags |= PVCLOCK_GUEST_STOPPED; @@ -1545,12 +1544,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - kunmap_atomic(shared_kaddr); - - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); return 0; } @@ -1839,10 +1835,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + vcpu->arch.pv_time_enabled = false; } static void accumulate_steal_time(struct kvm_vcpu *vcpu) @@ -1948,6 +1941,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { + u64 gpa_offset; kvmclock_reset(vcpu); vcpu->arch.time = data; @@ -1957,19 +1951,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + gpa_offset = data & ~(PAGE_MASK | 1); /* Check that the address is 32-byte aligned. 
*/ - if (vcpu->arch.time_offset & - (sizeof(struct pvclock_vcpu_time_info) - 1)) + if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1)) break; - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - - if (is_error_page(vcpu->arch.time_page)) - vcpu->arch.time_page = NULL; + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL)) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; break; } @@ -2972,7 +2964,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.time_page) + if (!vcpu->arch.pv_time_enabled) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -6666,6 +6658,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_wbinvd_dirty_mask; vcpu->arch.ia32_tsc_adjust_msr = 0x0; + vcpu->arch.pv_time_enabled = false; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); -- cgit v0.10.2 From 24a5d26686ee4bb22fcf5d3c8b9d9227cbc0c187 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Feb 2013 14:49:16 -0800 Subject: KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798) commit a2c118bfab8bc6b8bb213abfc35201e441693d55 upstream. If the guest specifies a IOAPIC_REG_SELECT with an invalid value and follows that with a read of the IOAPIC_REG_WINDOW KVM does not properly validate that request. ioapic_read_indirect contains an ASSERT(redir_index < IOAPIC_NUM_PINS), but the ASSERT has no effect in non-debug builds. In recent kernels this allows a guest to cause a kernel oops by reading invalid memory. In older kernels (pre-3.3) this allows a guest to read from large ranges of host memory. Tested: tested against apic unit tests. Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index cfb7e4d..52058f0 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -73,9 +73,12 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; u64 redir_content; - ASSERT(redir_index < IOAPIC_NUM_PINS); + if (redir_index < IOAPIC_NUM_PINS) + redir_content = + ioapic->redirtbl[redir_index].bits; + else + redir_content = ~0ULL; - redir_content = ioapic->redirtbl[redir_index].bits; result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : redir_content & 0xffffffff; -- cgit v0.10.2 From cdbaff4e7c7d4730d37aa1da8c00b37bbbadbf23 Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Fri, 29 Mar 2013 09:35:21 -0700 Subject: KVM: Allow cross page reads and writes from cached translations. commit 8f964525a121f2ff2df948dac908dcc65be21b5b upstream. This patch adds support for kvm_gfn_to_hva_cache_init functions for reads and writes that will cross a page. If the range falls within the same memslot, then this will be a fast operation. If the range is split between two memslots, then the slower kvm_read_guest and kvm_write_guest are used. Tested: Test against kvm_clock unit tests. 
Signed-off-by: Andrew Honig Signed-off-by: Gleb Natapov Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9392f52..a2f492c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1781,7 +1781,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) if (!pv_eoi_enabled(vcpu)) return 0; return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr); + addr, sizeof(u8)); } void kvm_lapic_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d47934e..9a51121 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1825,7 +1825,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u32))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); @@ -1953,12 +1954,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) gpa_offset = data & ~(PAGE_MASK | 1); - /* Check that the address is 32-byte aligned. */ - if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1)) - break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL)) + &vcpu->arch.pv_time, data & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; else vcpu->arch.pv_time_enabled = true; @@ -1978,7 +1976,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS)) + data & KVM_STEAL_VALID_BITS, + sizeof(struct kvm_steal_time))) return 1; vcpu->arch.st.msr_val = data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c497ab..ffdf8b7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -511,7 +511,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa); + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index fa7cc72..b0bcce0 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -71,6 +71,7 @@ struct gfn_to_hva_cache { u64 generation; gpa_t gpa; unsigned long hva; + unsigned long len; struct kvm_memory_slot *memslot; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cd693a..10afa34 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1476,21 +1476,38 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa) + gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int offset = offset_in_page(gpa); - gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t start_gfn = gpa >> PAGE_SHIFT; + gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; + gfn_t nr_pages_needed = end_gfn - start_gfn + 1; + gfn_t nr_pages_avail; ghc->gpa = gpa; ghc->generation = slots->generation; - ghc->memslot = gfn_to_memslot(kvm, gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); - if 
(!kvm_is_error_hva(ghc->hva)) + ghc->len = len; + ghc->memslot = gfn_to_memslot(kvm, start_gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); + if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) { ghc->hva += offset; - else - return -EFAULT; - + } else { + /* + * If the requested region crosses two memslots, we still + * verify that the entire region is valid here. + */ + while (start_gfn <= end_gfn) { + ghc->memslot = gfn_to_memslot(kvm, start_gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, + &nr_pages_avail); + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + start_gfn += nr_pages_avail; + } + /* Use the slow path for cross page reads and writes. */ + ghc->memslot = NULL; + } return 0; } EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); @@ -1501,8 +1518,13 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); int r; + BUG_ON(len > ghc->len); + if (slots->generation != ghc->generation) - kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + + if (unlikely(!ghc->memslot)) + return kvm_write_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -1522,8 +1544,13 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); int r; + BUG_ON(len > ghc->len); + if (slots->generation != ghc->generation) - kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + + if (unlikely(!ghc->memslot)) + return kvm_read_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; -- cgit v0.10.2 From 1ed8d46cfb1af390003553d504d6ad38be31a443 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 26 Feb 2013 10:55:18 +0100 Subject: ARM: i.MX35: enable MAX clock commit 5dc2eb7da1e387e31ce54f54af580c6a6f512ca6 upstream. The i.MX35 has two bits per clock gate which are decoded as follows: 0b00 -> clock off 0b01 -> clock is on in run mode, off in wait/doze 0b10 -> clock is on in run/wait mode, off in doze 0b11 -> clock is always on The reset value for the MAX clock is 0b10. The MAX clock is needed by the SoC, yet unused in the Kernel, so the common clock framework will disable it during late init time. It will only disable clocks though which it detects as being turned on. This detection is made depending on the lower bit of the gate. If the reset value has been altered by the bootloader to 0b11 the clock framework will detect the clock as turned on, yet unused, hence it will turn it off and the system locks up. This patch turns the MAX clock on unconditionally making the Kernel independent of the bootloader. Signed-off-by: Sascha Hauer Cc: Lingzhu Xiang Signed-off-by: Greg Kroah-Hartman diff --git a/arch/arm/mach-imx/clk-imx35.c b/arch/arm/mach-imx/clk-imx35.c index 0edce4b..9f6ea4c 100644 --- a/arch/arm/mach-imx/clk-imx35.c +++ b/arch/arm/mach-imx/clk-imx35.c @@ -265,6 +265,7 @@ int __init mx35_clocks_init() clk_prepare_enable(clk[gpio3_gate]); clk_prepare_enable(clk[iim_gate]); clk_prepare_enable(clk[emi_gate]); + clk_prepare_enable(clk[max_gate]); /* * SCC is needed to boot via mmc after a watchdog reset. The clock code -- cgit v0.10.2 From 0234185d8e7157d2f5eb884cdc1b3a00e95ba759 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Fri, 29 Mar 2013 16:20:09 +0100 Subject: ARM: clk-imx35: Bugfix iomux clock commit cab1e0a36c9dd0b0671fb84197ed294513f5adc1 upstream. 
This patch enables iomuxc_gate clock. It is necessary to be able to reconfigure iomux pads. Without this clock enabled, the clk_disable_unused function will disable this clock and the iomux pads are not configurable anymore. This happens at every boot. After a reboot (watchdog system reset) the clock is not enabled again, so all iomux pad reconfigurations in boot code are without effect. The iomux pads should be always configurable, so this patch always enables it. Signed-off-by: Markus Pargmann Signed-off-by: Sascha Hauer Signed-off-by: Shawn Guo Cc: Lingzhu Xiang Signed-off-by: Greg Kroah-Hartman diff --git a/arch/arm/mach-imx/clk-imx35.c b/arch/arm/mach-imx/clk-imx35.c index 9f6ea4c..5e3ca7a 100644 --- a/arch/arm/mach-imx/clk-imx35.c +++ b/arch/arm/mach-imx/clk-imx35.c @@ -266,6 +266,7 @@ int __init mx35_clocks_init() clk_prepare_enable(clk[iim_gate]); clk_prepare_enable(clk[emi_gate]); clk_prepare_enable(clk[max_gate]); + clk_prepare_enable(clk[iomuxc_gate]); /* * SCC is needed to boot via mmc after a watchdog reset. The clock code -- cgit v0.10.2 From 641a15a68775a9e477c4c0738f9844d36ac3b59e Mon Sep 17 00:00:00 2001 From: Matt Carlson Date: Thu, 14 Feb 2013 14:27:51 +0000 Subject: tg3: Add 57766 device support. commit d3f677afb8076d09d090ff0a5d1229c9dd9f136e upstream. The patch also adds a couple of fixes - For the 57766 and non Ax versions of 57765, bootcode needs to setup the PCIE Fast Training Sequence (FTS) value to prevent transmit hangs. Unfortunately, it does not have enough room in the selfboot case (i.e. devices with no NVRAM). The driver needs to implement this. - For performance reasons, the 2k DMA engine mode on the 57766 should be enabled and dma size limited to 2k for standard sized packets. Signed-off-by: Nithin Nayak Sujir Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Joseph Salisbury Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 8a5253c..6917998 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -330,6 +330,7 @@ static DEFINE_PCI_DEVICE_TABLE(tg3_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, TG3PCI_DEVICE_TIGON3_5719)}, {PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, TG3PCI_DEVICE_TIGON3_5720)}, {PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, TG3PCI_DEVICE_TIGON3_57762)}, + {PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, TG3PCI_DEVICE_TIGON3_57766)}, {PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, PCI_DEVICE_ID_SYSKONNECT_9DXX)}, {PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, PCI_DEVICE_ID_SYSKONNECT_9MXX)}, {PCI_DEVICE(PCI_VENDOR_ID_ALTIMA, PCI_DEVICE_ID_ALTIMA_AC1000)}, @@ -9103,7 +9104,14 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) } if (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_57765_AX) { - u32 grc_mode = tr32(GRC_MODE); + u32 grc_mode; + + /* Fix transmit hangs */ + val = tr32(TG3_CPMU_PADRNG_CTL); + val |= TG3_CPMU_PADRNG_CTL_RDIV2; + tw32(TG3_CPMU_PADRNG_CTL, val); + + grc_mode = tr32(GRC_MODE); /* Access the lower 1K of DL PCIE block registers. 
*/ val = grc_mode & ~GRC_MODE_PCIE_PORT_MASK; @@ -9413,6 +9421,14 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) if (tg3_flag(tp, PCI_EXPRESS)) rdmac_mode |= RDMAC_MODE_FIFO_LONG_BURST; + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57766) { + tp->dma_limit = 0; + if (tp->dev->mtu <= ETH_DATA_LEN) { + rdmac_mode |= RDMAC_MODE_JMB_2K_MMRR; + tp->dma_limit = TG3_TX_BD_DMA_MAX_2K; + } + } + if (tg3_flag(tp, HW_TSO_1) || tg3_flag(tp, HW_TSO_2) || tg3_flag(tp, HW_TSO_3)) diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index d330e81..6f9b74c 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -1159,6 +1159,8 @@ #define CPMU_MUTEX_GNT_DRIVER 0x00001000 #define TG3_CPMU_PHY_STRAP 0x00003664 #define TG3_CPMU_PHY_STRAP_IS_SERDES 0x00000020 +#define TG3_CPMU_PADRNG_CTL 0x00003668 +#define TG3_CPMU_PADRNG_CTL_RDIV2 0x00040000 /* 0x3664 --> 0x36b0 unused */ #define TG3_CPMU_EEE_MODE 0x000036b0 -- cgit v0.10.2 From 4f795d18597696cc0da217c5177ddd8e2fbbb932 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s commit 383efcd00053ec40023010ce5034bd702e7ab373 upstream. try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue and BUG_ON()s are used to enforce the rule. Missing try_to_wake_up_local() can stall workqueue execution but such stalls are likely to be finite either by another work item being queued or the one blocked getting unblocked. There's no reason to trigger BUG while holding rq lock crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26058d0..32ab413 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v0.10.2 From dd135b0d8977ef119c7908c0153c3e94d8110e58 Mon Sep 17 00:00:00 2001 From: libin Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow commit fd9b86d37a600488dbd80fe60cca46b822bff1cd upstream. Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1] avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX on sysctl. 
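The reason for the -1, shown as a small illustrative sketch (not part of the patch): the *_idx values index the per-runqueue cpu_load[] array, which has exactly CPU_LOAD_IDX_MAX entries, so the largest index that may legally arrive from sysctl is CPU_LOAD_IDX_MAX - 1.

	unsigned long cpu_load[CPU_LOAD_IDX_MAX];	/* valid indices: 0 .. CPU_LOAD_IDX_MAX-1 */
	unsigned long load = cpu_load[sd->busy_idx];	/* reads past the array if busy_idx == CPU_LOAD_IDX_MAX */
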
Signed-off-by: Libin Cc: Cc: Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar Signed-off-by: Jonghwan Choi Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32ab413..5e2f7c3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4950,7 +4950,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v0.10.2 From e644d5af6228ef011179bae1ba7052fd2c481a3f Mon Sep 17 00:00:00 2001 From: Illia Ragozin Date: Wed, 10 Apr 2013 19:43:34 +0100 Subject: ARM: 7696/1: Fix kexec by setting outer_cache.inv_all for Feroceon commit cd272d1ea71583170e95dde02c76166c7f9017e6 upstream. On Feroceon the L2 cache becomes non-coherent with the CPU when the L1 caches are disabled. Thus the L2 needs to be invalidated after both L1 caches are disabled. On kexec before the starting the code for relocation the kernel, the L1 caches are disabled in cpu_froc_fin (cpu_v7_proc_fin for Feroceon), but after L2 cache is never invalidated, because inv_all is not set in cache-feroceon-l2.c. So kernel relocation and decompression may has (and usually has) errors. Setting the function enables L2 invalidation and fixes the issue. Signed-off-by: Illia Ragozin Acked-by: Jason Cooper Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c index dd3d591..48bc3c0 100644 --- a/arch/arm/mm/cache-feroceon-l2.c +++ b/arch/arm/mm/cache-feroceon-l2.c @@ -343,6 +343,7 @@ void __init feroceon_l2_init(int __l2_wt_override) outer_cache.inv_range = feroceon_l2_inv_range; outer_cache.clean_range = feroceon_l2_clean_range; outer_cache.flush_range = feroceon_l2_flush_range; + outer_cache.inv_all = l2_inv_all; enable_l2(); -- cgit v0.10.2 From e64371e1f2a968e70e950b2d9742718eb5699e35 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 12 Apr 2013 19:04:19 +0100 Subject: ARM: 7698/1: perf: fix group validation when using enable_on_exec commit cb2d8b342aa084d1f3ac29966245dec9163677fb upstream. Events may be created with attr->disabled == 1 and attr->enable_on_exec == 1, which confuses the group validation code because events with the PERF_EVENT_STATE_OFF are not considered candidates for scheduling, which may lead to failure at group scheduling time. This patch fixes the validation check for ARM, so that events in the OFF state are still considered when enable_on_exec is true. 
Reported-by: Sudeep KarkadaNagesha Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Signed-off-by: Will Deacon Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index f9e8657..23fa6a2 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -261,7 +261,10 @@ validate_event(struct pmu_hw_events *hw_events, struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu *leader_pmu = event->group_leader->pmu; - if (event->pmu != leader_pmu || event->state <= PERF_EVENT_STATE_OFF) + if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) + return 1; + + if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) return 1; return armpmu->get_event_idx(hw_events, event) >= 0; -- cgit v0.10.2 From 8387f3cbc4710e2e2df013ba1b0e7ba09499c016 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 7 Apr 2013 21:10:48 +0200 Subject: ath9k_htc: accept 1.x firmware newer than 1.3 commit 319e7bd96aca64a478f3aad40711c928405b8b77 upstream. Since the firmware has been open sourced, the minor version has been bumped to 1.4 and the API/ABI will stay compatible across further 1.x releases. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_init.c b/drivers/net/wireless/ath/ath9k/htc_drv_init.c index 05d5ba6..0663653 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_init.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_init.c @@ -796,7 +796,7 @@ static int ath9k_init_firmware_version(struct ath9k_htc_priv *priv) * required version. */ if (priv->fw_version_major != MAJOR_VERSION_REQ || - priv->fw_version_minor != MINOR_VERSION_REQ) { + priv->fw_version_minor < MINOR_VERSION_REQ) { dev_err(priv->dev, "ath9k_htc: Please upgrade to FW version %d.%d\n", MAJOR_VERSION_REQ, MINOR_VERSION_REQ); return -EINVAL; -- cgit v0.10.2 From 42a2d01229b66c76d96fc2e5eb96e8d9c3caaac0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 10 Apr 2013 15:26:06 +0200 Subject: ath9k_hw: change AR9580 initvals to fix a stability issue commit f09a878511997c25a76bf111a32f6b8345a701a5 upstream. The hardware parsing of Control Wrapper Frames needs to be disabled, as it has been causing spurious decryption error reports. The initvals for other chips have been updated to disable it, but AR9580 was left out for some reason. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/wireless/ath/ath9k/ar9580_1p0_initvals.h b/drivers/net/wireless/ath/ath9k/ar9580_1p0_initvals.h index 6e1915a..c00c13a 100644 --- a/drivers/net/wireless/ath/ath9k/ar9580_1p0_initvals.h +++ b/drivers/net/wireless/ath/ath9k/ar9580_1p0_initvals.h @@ -519,7 +519,7 @@ static const u32 ar9580_1p0_mac_core[][2] = { {0x00008258, 0x00000000}, {0x0000825c, 0x40000000}, {0x00008260, 0x00080922}, - {0x00008264, 0x9bc00010}, + {0x00008264, 0x9d400010}, {0x00008268, 0xffffffff}, {0x0000826c, 0x0000ffff}, {0x00008270, 0x00000000}, -- cgit v0.10.2 From 7ac7dd50d62c02b055f4e7f53b39040f863ff48a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Apr 2013 21:38:36 +0200 Subject: mac80211: fix cfg80211 interaction on auth/assoc request commit 7b119dc06d871405fc7c3e9a73a6c987409ba639 upstream. If authentication (or association with FT) is requested by userspace, mac80211 currently doesn't tell cfg80211 that it disconnected from the AP. 
That leaves inconsistent state: cfg80211 thinks it's connected while mac80211 thinks it's not. Typically this won't last long, as soon as mac80211 reports the new association to cfg80211 the old one goes away. If, however, the new authentication or association doesn't succeed, then cfg80211 will forever think the old one still exists and will refuse attempts to authenticate or associate with the AP it thinks it's connected to. Anders reported that this leads to it taking a very long time to reconnect to a network, or never even succeeding. I tested this with an AP hacked to never respond to auth frames, and one that works, and with just those two the system never recovers because one won't work and cfg80211 thinks it's connected to the other so refuses connections to it. To fix this, simply make mac80211 tell cfg80211 when it is no longer connected to the old AP, while authenticating or associating to a new one. Reported-by: Anders Kaseorg Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e14e676..a1a7997 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3723,8 +3723,16 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, /* prep auth_data so we don't go into idle on disassoc */ ifmgd->auth_data = auth_data; - if (ifmgd->associated) - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->associated) { + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_UNSPECIFIED, + false, frame_buf); + + __cfg80211_send_deauth(sdata->dev, frame_buf, + sizeof(frame_buf)); + } sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); @@ -3783,8 +3791,16 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, mutex_lock(&ifmgd->mtx); - if (ifmgd->associated) - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->associated) { + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_UNSPECIFIED, + false, frame_buf); + + __cfg80211_send_deauth(sdata->dev, frame_buf, + sizeof(frame_buf)); + } if (ifmgd->auth_data && !ifmgd->auth_data->done) { err = -EBUSY; -- cgit v0.10.2 From 401716d9514ff77c86d93805f3a45f5e6fdd8f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 2 Apr 2013 15:57:26 +0200 Subject: ssb: implement spurious tone avoidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 46fc4c909339f5a84d1679045297d9d2fb596987 upstream. And make use of it in b43. This fixes a regression introduced with 49d55cef5b1925a5c1efb6aaddaa40fc7c693335 b43: N-PHY: implement spurious tone avoidance This commit made BCM4322 use only MCS 0 on channel 13, which of course resulted in performance drop (down to 0.7Mb/s). Reported-by: Stefan Brüns Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: John W. 
Linville Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/net/wireless/b43/phy_n.c b/drivers/net/wireless/b43/phy_n.c index e8486c1..b70f220 100644 --- a/drivers/net/wireless/b43/phy_n.c +++ b/drivers/net/wireless/b43/phy_n.c @@ -5165,7 +5165,8 @@ static void b43_nphy_pmu_spur_avoid(struct b43_wldev *dev, bool avoid) #endif #ifdef CONFIG_B43_SSB case B43_BUS_SSB: - /* FIXME */ + ssb_pmu_spuravoid_pllupdate(&dev->dev->sdev->bus->chipco, + avoid); break; #endif } diff --git a/drivers/ssb/driver_chipcommon_pmu.c b/drivers/ssb/driver_chipcommon_pmu.c index a43415a..bc75528 100644 --- a/drivers/ssb/driver_chipcommon_pmu.c +++ b/drivers/ssb/driver_chipcommon_pmu.c @@ -675,3 +675,32 @@ u32 ssb_pmu_get_controlclock(struct ssb_chipcommon *cc) return 0; } } + +void ssb_pmu_spuravoid_pllupdate(struct ssb_chipcommon *cc, int spuravoid) +{ + u32 pmu_ctl = 0; + + switch (cc->dev->bus->chip_id) { + case 0x4322: + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL0, 0x11100070); + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL1, 0x1014140a); + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL5, 0x88888854); + if (spuravoid == 1) + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL2, 0x05201828); + else + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL2, 0x05001828); + pmu_ctl = SSB_CHIPCO_PMU_CTL_PLL_UPD; + break; + case 43222: + /* TODO: BCM43222 requires updating PLLs too */ + return; + default: + ssb_printk(KERN_ERR PFX + "Unknown spuravoidance settings for chip 0x%04X, not changing PLL\n", + cc->dev->bus->chip_id); + return; + } + + chipco_set32(cc, SSB_CHIPCO_PMU_CTL, pmu_ctl); +} +EXPORT_SYMBOL_GPL(ssb_pmu_spuravoid_pllupdate); diff --git a/include/linux/ssb/ssb_driver_chipcommon.h b/include/linux/ssb/ssb_driver_chipcommon.h index 9e492be..6fcfe99 100644 --- a/include/linux/ssb/ssb_driver_chipcommon.h +++ b/include/linux/ssb/ssb_driver_chipcommon.h @@ -219,6 +219,7 @@ #define SSB_CHIPCO_PMU_CTL 0x0600 /* PMU control */ #define SSB_CHIPCO_PMU_CTL_ILP_DIV 0xFFFF0000 /* ILP div mask */ #define SSB_CHIPCO_PMU_CTL_ILP_DIV_SHIFT 16 +#define SSB_CHIPCO_PMU_CTL_PLL_UPD 0x00000400 #define SSB_CHIPCO_PMU_CTL_NOILPONW 0x00000200 /* No ILP on wait */ #define SSB_CHIPCO_PMU_CTL_HTREQEN 0x00000100 /* HT req enable */ #define SSB_CHIPCO_PMU_CTL_ALPREQEN 0x00000080 /* ALP req enable */ @@ -667,5 +668,6 @@ enum ssb_pmu_ldo_volt_id { void ssb_pmu_set_ldo_voltage(struct ssb_chipcommon *cc, enum ssb_pmu_ldo_volt_id id, u32 voltage); void ssb_pmu_set_ldo_paref(struct ssb_chipcommon *cc, bool on); +void ssb_pmu_spuravoid_pllupdate(struct ssb_chipcommon *cc, int spuravoid); #endif /* LINUX_SSB_CHIPCO_H_ */ -- cgit v0.10.2 From 2822050e7d8bb82bced08b21c711ff79a4636f3e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 7 Apr 2013 14:05:39 +0200 Subject: crypto: algif - suppress sending source address information in recvmsg commit 72a763d805a48ac8c0bf48fdb510e84c12de51fe upstream. The current code does not set the msg_namelen member to 0 and therefore makes net/socket.c leak the local sockaddr_storage variable to userland -- 128 bytes of kernel stack memory. Fix that. 
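The effect is easy to demonstrate from userspace. Below is a small sketch (an illustration added for this log, not from the patch; error handling omitted) that hashes a short string over AF_ALG while supplying a msg_name buffer; on an unfixed kernel recvmsg() reports a nonzero msg_namelen and fills the buffer with leaked stack bytes, on a fixed kernel it comes back as zero:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <linux/if_alg.h>

    #ifndef AF_ALG
    #define AF_ALG 38
    #endif

    int main(void)
    {
        struct sockaddr_alg sa = {
            .salg_family = AF_ALG,
            .salg_type   = "hash",
            .salg_name   = "sha1",
        };
        struct sockaddr_storage addr;
        char digest[20];
        struct iovec iov = { .iov_base = digest, .iov_len = sizeof(digest) };
        struct msghdr msg = {
            .msg_name    = &addr,
            .msg_namelen = sizeof(addr),  /* ask for up to 128 address bytes */
            .msg_iov     = &iov,
            .msg_iovlen  = 1,
        };
        int tfm, op;

        tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
        bind(tfm, (struct sockaddr *)&sa, sizeof(sa));
        op = accept(tfm, NULL, NULL);
        write(op, "abc", 3);
        recvmsg(op, &msg, 0);

        printf("msg_namelen after recvmsg: %u\n", (unsigned)msg.msg_namelen);
        return 0;
    }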
Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..0262210 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -161,6 +161,8 @@ static int hash_recvmsg(struct kiocb *unused, struct socket *sock, else if (len < ds) msg->msg_flags |= MSG_TRUNC; + msg->msg_namelen = 0; + lock_sock(sk); if (ctx->more) { ctx->more = 0; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a1c4f0a 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -432,6 +432,7 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock, long copied = 0; lock_sock(sk); + msg->msg_namelen = 0; for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0; iovlen--, iov++) { unsigned long seglen = iov->iov_len; -- cgit v0.10.2 From ff91fd5bc105f29a34755a6dd6d547c877b7d027 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Sat, 13 Apr 2013 22:49:14 +0300 Subject: perf: Treat attr.config as u64 in perf_swevent_init() commit 8176cced706b5e5d15887584150764894e94e02f upstream. Trinity discovered that we fail to check all 64 bits of attr.config passed by user space, resulting to out-of-bounds access of the perf_swevent_enabled array in sw_perf_event_destroy(). Introduced in commit b0a873ebb ("perf: Register PMU implementations"). Signed-off-by: Tommi Rantala Cc: Peter Zijlstra Cc: davej@redhat.com Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1365882554-30259-1-git-send-email-tt.rantala@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b6646a..0600d3b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5328,7 +5328,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; -- cgit v0.10.2 From 73f25feefdcc89cce395b86e135e0936cbd2c7ae Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 16 Apr 2013 13:51:43 +0200 Subject: perf/x86: Fix offcore_rsp valid mask for SNB/IVB commit f1923820c447e986a9da0fc6bf60c1dccdf0408e upstream. The valid mask for both offcore_response_0 and offcore_response_1 was wrong for SNB/SNB-EP, IVB/IVB-EP. It was possible to write to reserved bit and cause a GP fault crashing the kernel. This patch fixes the problem by correctly marking the reserved bits in the valid mask for all the processors mentioned above. A distinction between desktop and server parts is introduced because bits 24-30 are only available on the server parts. This version of the patch is just a rebase to perf/urgent tree and should apply to older kernels as well. 
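For background, a simplified sketch of the check involved (a paraphrase for this log, not code from the patch): the value user space passes in attr.config1 for an offcore_response event is only accepted if it fits the per-model valid mask, and that mask is what this patch corrects (0x3f807f8fff for client SNB/IVB, 0x3fffff8fff for SNB-EP/IVB-EP):

    /* illustrative only; the real check lives in the x86 PMU core */
    static int offcore_config_ok(unsigned long long config1,
                                 unsigned long long valid_mask)
    {
        if (config1 & ~valid_mask)
            return 0;   /* would set reserved MSR bits and #GP on wrmsr */
        return 1;
    }

The model test added below (x86_model 45 for SNB-EP, 62 for IVB-EP) simply selects which of the two masks applies.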
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: jolsa@redhat.com Cc: ak@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 4914e94..70602f8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -128,8 +128,14 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = }; static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), EVENT_EXTRA_END }; @@ -2072,7 +2078,10 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - x86_pmu.extra_regs = intel_snb_extra_regs; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; @@ -2098,7 +2107,10 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - x86_pmu.extra_regs = intel_snb_extra_regs; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; -- cgit v0.10.2 From d2b12161ea5208d87dae24bcc5f3fed9da79262e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 14 Apr 2013 13:47:02 -0700 Subject: userns: Don't let unprivileged users trick privileged users into setting the id_map commit 6708075f104c3c9b04b23336bb0366ca30c3931b upstream. When we require privilege for setting /proc//uid_map or /proc//gid_map no longer allow an unprivileged user to open the file and pass it to a privileged program to write to the file. Instead when privilege is required require both the opener and the writer to have the necessary capabilities. I have tested this code and verified that setting /proc//uid_map fails when an unprivileged user opens the file and a privielged user attempts to set the mapping, that unprivileged users can still map their own id, and that a privileged users can still setup an arbitrary mapping. Reported-by: Andy Lutomirski Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Andy Lutomirski Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f45e128..6cc21a5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -25,7 +25,8 @@ static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -666,7 +667,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; /* Map the lower ids from the parent user namespace to the @@ -753,7 +754,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { /* Allow mapping to your own filesystem ids */ @@ -777,8 +779,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + * And the opener of the id file also had the approprpiate capability. */ - if (ns_capable(ns->parent, cap_setid)) + if (ns_capable(ns->parent, cap_setid) && + file_ns_capable(file, ns->parent, cap_setid)) return true; return false; -- cgit v0.10.2 From be4c1b46075d42cbd82bc0d1e833dc4de052ae85 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2013 16:28:19 -0700 Subject: userns: Check uid_map's opener's fsuid, not the current fsuid commit e3211c120a85b792978bcb4be7b2886df18d27f0 upstream. Signed-off-by: Andy Lutomirski Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6cc21a5..fdf40d6c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -763,12 +763,12 @@ static bool new_idmap_permitted(const struct file *file, u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, current_fsuid())) + if (uid_eq(uid, file->f_cred->fsuid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, current_fsgid())) + if (gid_eq(gid, file->f_cred->fsgid)) return true; } } -- cgit v0.10.2 From 37ed4a3f3bb107bfb307610b9ab2b87db03df3a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2013 11:44:04 -0700 Subject: userns: Changing any namespace id mappings should require privileges commit 41c21e351e79004dbb4efa4bc14a53a7e0af38c5 upstream. Changing uid/gid/projid mappings doesn't change your id within the namespace; it reconfigures the namespace. Unprivileged programs should *not* be able to write these files. (We're also checking the privileges on the wrong task.) Given the write-once nature of these files and the other security checks, this is likely impossible to usefully exploit. 
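To make the blocked pattern concrete, here is a hypothetical sketch (the path, the pid 1234 and the single-process framing are made up for illustration; error handling omitted). An unprivileged process opens the map file and hands the descriptor to a privileged writer; with this change the write fails because the opener's credentials (file->f_cred) are checked in addition to the writer's:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* step 1: unprivileged process opens the map file */
        int fd = open("/proc/1234/uid_map", O_WRONLY);

        /* step 2: fd is passed to a privileged process, e.g. over an
         * SCM_RIGHTS message or by exec()ing a setuid helper that
         * inherits it */

        /* step 3: the privileged process tries to install a mapping;
         * "0 0 1" maps uid 0 inside the namespace to uid 0 outside */
        if (write(fd, "0 0 1\n", 6) < 0)
            perror("write uid_map");   /* now fails with EPERM */
        return 0;
    }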
Signed-off-by: Andy Lutomirski Signed-off-by: Greg Kroah-Hartman diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fdf40d6c..f359dc7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -576,10 +576,10 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (map->nr_extents != 0) goto out; - /* Require the appropriate privilege CAP_SETUID or CAP_SETGID - * over the user namespace in order to set the id mapping. + /* + * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; /* Get a buffer */ -- cgit v0.10.2 From d8b1c028720bb2f3ba634c64ac2e834982b7ee7a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Apr 2013 13:45:37 -0700 Subject: vm: add vm_iomap_memory() helper function commit b4cbb197c7e7a68dbad0d491242e3ca67420c13e upstream. Various drivers end up replicating the code to mmap() their memory buffers into user space, and our core memory remapping function may be very flexible but it is unnecessarily complicated for the common cases to use. Our internal VM uses pfn's ("page frame numbers") which simplifies things for the VM, and allows us to pass physical addresses around in a denser and more efficient format than passing a "phys_addr_t" around, and having to shift it up and down by the page size. But it just means that drivers end up doing that shifting instead at the interface level. It also means that drivers end up mucking around with internal VM things like the vma details (vm_pgoff, vm_start/end) way more than they really need to. So this just exports a function to map a certain physical memory range into user space (using a phys_addr_t based interface that is much more natural for a driver) and hides all the complexity from the driver. Some drivers will still end up tweaking the vm_page_prot details for things like prefetching or cacheability etc, but that's actually relevant to the driver, rather than caring about what the page offset of the mapping is into the particular IO memory region. Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e2f7c..9568b90 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1623,6 +1623,8 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); + struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); diff --git a/mm/memory.c b/mm/memory.c index f8b734a..32a495a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2358,6 +2358,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. 
+ */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? */ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) -- cgit v0.10.2 From b88e2557725053ef3ca54ee46be4143ba38b0cd4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Apr 2013 10:01:04 -0700 Subject: vm: convert snd_pcm_lib_mmap_iomem() to vm_iomap_memory() helper commit 0fe09a45c4848b5b5607b968d959fdc1821c161d upstream. This is my example conversion of a few existing mmap users. The pcm mmap case is one of the more straightforward ones. Acked-by: Takashi Iwai Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 09b4286..f4aaf5a 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3222,18 +3222,10 @@ EXPORT_SYMBOL_GPL(snd_pcm_lib_default_mmap); int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - long size; - unsigned long offset; + struct snd_pcm_runtime *runtime = substream->runtime;; area->vm_page_prot = pgprot_noncached(area->vm_page_prot); - area->vm_flags |= VM_IO; - size = area->vm_end - area->vm_start; - offset = area->vm_pgoff << PAGE_SHIFT; - if (io_remap_pfn_range(area, area->vm_start, - (substream->runtime->dma_addr + offset) >> PAGE_SHIFT, - size, area->vm_page_prot)) - return -EAGAIN; - return 0; + return vm_iomap_memory(area, runtime->dma_addr, runtime->dma_bytes); } EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem); -- cgit v0.10.2 From b3ea73c2d47633f9c1059e6832c7784c12c6aa75 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Apr 2013 09:57:35 -0700 Subject: vm: convert fb_mmap to vm_iomap_memory() helper commit fc9bbca8f650e5f738af8806317c0a041a48ae4a upstream. This is my example conversion of a few existing mmap users. The fb_mmap() case is a good example because it is a bit more complicated than some: fb_mmap() mmaps one of two different memory areas depending on the page offset of the mmap (but happily there is never any mixing of the two, so the helper function still works). 
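After conversion, a typical driver mmap handler reduces to a fragment like this (a hypothetical driver; the mydev structure and its phys_base/phys_len fields are assumptions made up for the illustration, not code from this series):

    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
        struct mydev *dev = file->private_data;

        /* optional: tweak caching before handing over to the helper */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

        /* vm_iomap_memory() does the pgoff/length checks and the remap */
        return vm_iomap_memory(vma, dev->phys_base, dev->phys_len);
    }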
Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index dc61c12..0a49456 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1373,15 +1373,12 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) { struct fb_info *info = file_fb_info(file); struct fb_ops *fb; - unsigned long off; + unsigned long mmio_pgoff; unsigned long start; u32 len; if (!info) return -ENODEV; - if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT)) - return -EINVAL; - off = vma->vm_pgoff << PAGE_SHIFT; fb = info->fbops; if (!fb) return -ENODEV; @@ -1393,32 +1390,24 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) return res; } - /* frame buffer memory */ + /* + * Ugh. This can be either the frame buffer mapping, or + * if pgoff points past it, the mmio mapping. + */ start = info->fix.smem_start; - len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.smem_len); - if (off >= len) { - /* memory mapped io */ - off -= len; - if (info->var.accel_flags) { - mutex_unlock(&info->mm_lock); - return -EINVAL; - } + len = info->fix.smem_len; + mmio_pgoff = PAGE_ALIGN((start & ~PAGE_MASK) + len) >> PAGE_SHIFT; + if (vma->vm_pgoff >= mmio_pgoff) { + vma->vm_pgoff -= mmio_pgoff; start = info->fix.mmio_start; - len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.mmio_len); + len = info->fix.mmio_len; } mutex_unlock(&info->mm_lock); - start &= PAGE_MASK; - if ((vma->vm_end - vma->vm_start + off) > len) - return -EINVAL; - off += start; - vma->vm_pgoff = off >> PAGE_SHIFT; - /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/ + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - fb_pgprotect(file, vma, off); - if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; + fb_pgprotect(file, vma, start); + + return vm_iomap_memory(vma, start, len); } static int -- cgit v0.10.2 From 969163b21e0449ffc341a07d8cd73c8859b7e70a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Apr 2013 09:46:39 -0700 Subject: vm: convert HPET mmap to vm_iomap_memory() helper commit 2323036dfec8ce3ce6e1c86a49a31b039f3300d1 upstream. This is my example conversion of a few existing mmap users. The HPET case is simple, widely available, and easy to test (Clemens Ladisch sent a trivial test-program for it). 
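A check along those lines can look like the sketch below (a reconstruction for this log, not Clemens Ladisch's actual program; it assumes the kernel provides HPET mmap support): map one page of /dev/hpet and read the general capabilities register at offset 0.

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        int fd = open("/dev/hpet", O_RDONLY);
        volatile uint64_t *hpet;

        if (fd < 0) {
            perror("open /dev/hpet");
            return 1;
        }
        hpet = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        if (hpet == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* offset 0 holds the general capabilities and ID register */
        printf("HPET capabilities: %#llx\n", (unsigned long long)hpet[0]);
        munmap((void *)hpet, 4096);
        close(fd);
        return 0;
    }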
Test-program-by: Clemens Ladisch Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index fe6d4be..615d262 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -373,26 +373,14 @@ static int hpet_mmap(struct file *file, struct vm_area_struct *vma) struct hpet_dev *devp; unsigned long addr; - if (((vma->vm_end - vma->vm_start) != PAGE_SIZE) || vma->vm_pgoff) - return -EINVAL; - devp = file->private_data; addr = devp->hd_hpets->hp_hpet_phys; if (addr & (PAGE_SIZE - 1)) return -ENOSYS; - vma->vm_flags |= VM_IO; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "%s: io_remap_pfn_range failed\n", - __func__); - return -EAGAIN; - } - - return 0; + return vm_iomap_memory(vma, addr, PAGE_SIZE); #else return -ENOSYS; #endif -- cgit v0.10.2 From fdb6d5c17e8fe339de73879c18d5084ca3264e86 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Apr 2013 09:53:07 -0700 Subject: vm: convert mtdchar mmap to vm_iomap_memory() helper commit 8558e4a26b00225efeb085725bc319f91201b239 upstream. This is my example conversion of a few existing mmap users. The mtdchar case is actually disabled right now (and stays disabled), but I did it because it showed up on my "git grep", and I was familiar with the code due to fixing an overflow problem in the code in commit 9c603e53d380 ("mtdchar: fix offset overflow detection"). Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 82c0616..6e3d6dc 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1159,45 +1159,17 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) struct mtd_file_info *mfi = file->private_data; struct mtd_info *mtd = mfi->mtd; struct map_info *map = mtd->priv; - resource_size_t start, off; - unsigned long len, vma_len; /* This is broken because it assumes the MTD device is map-based and that mtd->priv is a valid struct map_info. It should be replaced with something that uses the mtd_get_unmapped_area() operation properly. */ if (0 /*mtd->type == MTD_RAM || mtd->type == MTD_ROM*/) { - off = get_vm_offset(vma); - start = map->phys; - len = PAGE_ALIGN((start & ~PAGE_MASK) + map->size); - start &= PAGE_MASK; - vma_len = get_vm_size(vma); - - /* Overflow in off+len? */ - if (vma_len + off < off) - return -EINVAL; - /* Does it fit in the mapping? */ - if (vma_len + off > len) - return -EINVAL; - - off += start; - /* Did that overflow? */ - if (off < start) - return -EINVAL; - if (set_vm_offset(vma, off) < 0) - return -EINVAL; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - #ifdef pgprot_noncached - if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) + if (file->f_flags & O_DSYNC || map->phys >= __pa(high_memory)) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); #endif - if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - - return 0; + return vm_iomap_memory(vma, map->phys, map->size); } return -ENOSYS; #else -- cgit v0.10.2 From c5627099f70b1187b84fa711f6b5daf32f16b10e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Apr 2013 20:50:09 +0000 Subject: Btrfs: make sure nbytes are right after log replay commit 4bc4bee4595662d8bff92180d5c32e3313a704b0 upstream. 
While trying to track down a tree log replay bug I noticed that fsck was always complaining about nbytes not being right for our fsynced file. That is because the new fsync stuff doesn't wait for ordered extents to complete, so the inodes nbytes are not necessarily updated properly when we log it. So to fix this we need to set nbytes to whatever it is on the inode that is on disk, so when we replay the extents we can just add the bytes that are being added as we replay the extent. This makes it work for the case that we have the wrong nbytes or the case that we logged everything and nbytes is actually correct. With this I'm no longer getting nbytes errors out of btrfsck. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Lingzhu Xiang Reviewed-by: CAI Qian Signed-off-by: Greg Kroah-Hartman diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 744a69b..8a00e2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -318,6 +318,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, unsigned long src_ptr; unsigned long dst_ptr; int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; @@ -327,6 +328,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, /* look for the key in the destination tree */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { char *src_copy; char *dst_copy; @@ -368,6 +372,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, return 0; } + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); } insert: btrfs_release_path(path); @@ -488,7 +516,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 mask = root->sectorsize - 1; u64 extent_end; u64 start = key->offset; - u64 saved_nbytes; + u64 nbytes = 0; struct btrfs_file_extent_item *item; struct inode *inode = NULL; unsigned long size; @@ -498,10 +526,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. 
+ */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = (start + size + mask) & ~mask; } else { ret = 0; @@ -550,7 +587,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); @@ -637,7 +673,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } - inode_set_bytes(inode, saved_nbytes); + inode_add_bytes(inode, nbytes); ret = btrfs_update_inode(trans, root, inode); out: if (inode) -- cgit v0.10.2 From 7a5324f9edf0e920c99ed90872e1baeaf04acbc1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 17 Apr 2013 08:46:19 -0700 Subject: s390: move dummy io_remap_pfn_range() to asm/pgtable.h commit 4f2e29031e6c67802e7370292dd050fd62f337ee upstream. Commit b4cbb197c7e7 ("vm: add vm_iomap_memory() helper function") added a helper function wrapper around io_remap_pfn_range(), and every other architecture defined it in . The s390 choice of may make sense, but is not very convenient for this case, and gratuitous differences like that cause unexpected errors like this: mm/memory.c: In function 'vm_iomap_memory': mm/memory.c:2439:2: error: implicit declaration of function 'io_remap_pfn_range' [-Werror=implicit-function-declaration] Glory be the kbuild test robot who noticed this, bisected it, and reported it to the guilty parties (ie me). Signed-off-by: Linus Torvalds Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Greg Kroah-Hartman diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 27cb321..379d96e 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -50,10 +50,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr); #define ioremap_nocache(addr, size) ioremap(addr, size) #define ioremap_wc ioremap_nocache -/* TODO: s390 cannot support io_remap_pfn_range... */ -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - static inline void __iomem *ioremap(unsigned long offset, unsigned long size) { return (void __iomem *) offset; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 098adbb..1532d7f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -56,6 +56,10 @@ extern unsigned long zero_page_mask; (((unsigned long)(vaddr)) &zero_page_mask)))) #define __HAVE_COLOR_ZERO_PAGE +/* TODO: s390 cannot support io_remap_pfn_range... */ +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + #endif /* !__ASSEMBLY__ */ /* -- cgit v0.10.2 From 65dcded2f1e5582e8f0fe6ef315b2274270a6302 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 22 Apr 2013 17:57:54 +0200 Subject: Revert "MIPS: page.h: Provide more readable definition for PAGE_MASK." commit 3b5e50edaf500f392f4a372296afc0b99ffa7e70 upstream. This reverts commit c17a6554782ad531f4713b33fd6339ba67ef6391. Manuel Lauss writes: lmo commit c17a6554 (MIPS: page.h: Provide more readable definition for PAGE_MASK) apparently breaks ioremap of 36-bit addresses on my Alchemy systems (PCI and PCMCIA) The reason is that in arch/mips/mm/ioremap.c line 157 (phys_addr &= PAGE_MASK) bits 32-35 are cut off. 
Seems the new PAGE_MASK is explicitly 32bit, or one could make it signed instead of unsigned long. From: Ralf Baechle Signed-off-by: Greg Kroah-Hartman diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index dbaec94..21bff32 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -31,7 +31,7 @@ #define PAGE_SHIFT 16 #endif #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT #define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) -- cgit v0.10.2 From 8299a17a0544a45529a954cf27f9bf40e8890ac6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 25 Apr 2013 12:52:16 -0700 Subject: Linux 3.8.9 diff --git a/Makefile b/Makefile index 7684f95..3ae4796 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 8 -SUBLEVEL = 8 +SUBLEVEL = 9 EXTRAVERSION = NAME = Displaced Humerus Anterior -- cgit v0.10.2 From 97129f70b9a32ddf8929daacd2abee36a7f0ddad Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2013 10:06:31 -0700 Subject: Add file_ns_capable() helper function for open-time capability checking commit 935d8aabd4331f47a89c3e1daa5779d23cf244ee upstream. Nothing is using it yet, but this will allow us to delay the open-time checks to use time, without breaking the normal UNIX permission semantics where permissions are determined by the opener (and the file descriptor can then be passed to a different process, or the process can drop capabilities). Signed-off-by: Linus Torvalds Cc: Shea Levy Signed-off-by: Greg Kroah-Hartman diff --git a/include/linux/capability.h b/include/linux/capability.h index 98503b7..d9a4f7f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -35,6 +35,7 @@ struct cpu_vfs_cap_data { #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) +struct file; struct inode; struct dentry; struct user_namespace; @@ -211,6 +212,7 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool nsown_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); +extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/kernel/capability.c b/kernel/capability.c index 493d972..f6c2ce5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap) EXPORT_SYMBOL(ns_capable); /** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. 
+ */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * -- cgit v0.10.2 From bb8dd670874b0a460a472582ac9e91acf0293d11 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 26 Apr 2013 12:18:32 -0700 Subject: Linux 3.8.10 diff --git a/Makefile b/Makefile index 3ae4796..e2b10b9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 8 -SUBLEVEL = 9 +SUBLEVEL = 10 EXTRAVERSION = NAME = Displaced Humerus Anterior -- cgit v0.10.2 From 5cd6cf7f85179fbcceef4cd4907f3ff95565a31b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Jan 2013 21:45:47 +0000 Subject: FIX [1/2] slub: Do not dereference NULL pointer in node_match The variables accessed in slab_alloc are volatile and therefore the page pointer passed to node_match can be NULL. The processing of data in slab_alloc is tentative until either the cmpxchg succeeds or the __slab_alloc slowpath is invoked. Both are able to perform the same allocation from the freelist. Check for the NULL pointer in node_match. A false positive will lead to a retry of the loop in __slab_alloc. Signed-off-by: Christoph Lameter Cc: Steven Rostedt Cc: Pekka Enberg Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index ba2ca53..0f270db 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2041,7 +2041,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && page_to_nid(page) != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; -- cgit v0.10.2 From 664310c4cc4f91a3f08eda173f7a43e8e31823bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Jan 2013 21:45:48 +0000 Subject: FIX [2/2] slub: Tid must be retrieved from the percpu area of the current processor As Steven Rostedt has pointed out: Rescheduling could occur on a different processor after the determination of the per cpu pointer and before the tid is retrieved. This could result in allocation from the wrong node in slab_alloc. The effect is much more severe in slab_free() where we could free to the freelist of the wrong page. The window for something like that occurring is pretty small but it is possible. Signed-off-by: Christoph Lameter Cc: Steven Rostedt Cc: Pekka Enberg Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index 0f270db..3d2308f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2331,13 +2331,13 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = memcg_kmem_get_cache(s, gfpflags); redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); /* @@ -2347,7 +2347,7 @@ redo: * linked list in between.
*/ tid = c->tid; - barrier(); + preempt_enable(); object = c->freelist; page = c->page; @@ -2594,10 +2594,11 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); tid = c->tid; - barrier(); + preempt_enable(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); -- cgit v0.10.2 From 122e7aafec37e161b4e03f1b37f4eed24f74faac Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). This happened probably when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index 267ce78..e698e80 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v0.10.2 From f6377e88e9a7c46783c8bf13e35114d433a11468 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 16:59:47 +0200 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled we can end up with irqs on non boot cpus. Allow to restrict the default irq affinity mask. Signed-off-by: Thomas Gleixner diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 986614d..1ba0afe 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1182,6 +1182,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See comment before ip2_setup() in drivers/char/ip2/ip2base.c. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302..473b2b6 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -23,10 +23,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. 
We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v0.10.2 From c43cde8c524978a84fdfb8c422543a2cb021894e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Jan 2013 13:21:47 -0500 Subject: OF: Fixup resursive locking code paths There is no real reason to use a rwlock for devtree_lock. It even could be a mutex, but unfortunately it's locked from cpu hotplug paths which can't schedule :( So it needs to become a raw lock on rt as well. The devtree_lock would be the only user of a raw_rw_lock, so we are better off cleaning up the recursive locking paths which allows us to convert devtree_lock to a read_lock. Here we do the standard thing of introducing __foo() as the "raw" version of foo(), so that we can take better control of the locking. The "raw" versions are not exported and are for internal use within the file itself. Signed-off-by: Thomas Gleixner Signed-off-by: Paul Gortmaker Cc: devicetree-discuss@lists.ozlabs.org Cc: Grant Likely Cc: Rob Herring Link: http://lkml.kernel.org/r/1359138107-14159-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner diff --git a/drivers/of/base.c b/drivers/of/base.c index 2390ddb..21195a1 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -164,16 +164,14 @@ void of_node_put(struct device_node *node) EXPORT_SYMBOL(of_node_put); #endif /* CONFIG_OF_DYNAMIC */ -struct property *of_find_property(const struct device_node *np, - const char *name, - int *lenp) +static struct property *__of_find_property(const struct device_node *np, + const char *name, int *lenp) { struct property *pp; if (!np) return NULL; - read_lock(&devtree_lock); for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp) @@ -181,6 +179,18 @@ struct property *of_find_property(const struct device_node *np, break; } } + + return pp; +} + +struct property *of_find_property(const struct device_node *np, + const char *name, + int *lenp) +{ + struct property *pp; + + read_lock(&devtree_lock); + pp = __of_find_property(np, name, lenp); read_unlock(&devtree_lock); return pp; @@ -214,8 +224,20 @@ EXPORT_SYMBOL(of_find_all_nodes); * Find a property with a given name for a given node * and return the value. */ +static const void *__of_get_property(const struct device_node *np, + const char *name, int *lenp) +{ + struct property *pp = __of_find_property(np, name, lenp); + + return pp ? pp->value : NULL; +} + +/* + * Find a property with a given name for a given node + * and return the value. 
+ */ const void *of_get_property(const struct device_node *np, const char *name, - int *lenp) + int *lenp) { struct property *pp = of_find_property(np, name, lenp); @@ -226,13 +248,13 @@ EXPORT_SYMBOL(of_get_property); /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ -int of_device_is_compatible(const struct device_node *device, - const char *compat) +static int __of_device_is_compatible(const struct device_node *device, + const char *compat) { const char* cp; int cplen, l; - cp = of_get_property(device, "compatible", &cplen); + cp = __of_get_property(device, "compatible", &cplen); if (cp == NULL) return 0; while (cplen > 0) { @@ -245,6 +267,20 @@ int of_device_is_compatible(const struct device_node *device, return 0; } + +/** Checks if the given "compat" string matches one of the strings in + * the device's "compatible" property + */ +int of_device_is_compatible(const struct device_node *device, + const char *compat) +{ + int res; + + read_lock(&devtree_lock); + res = __of_device_is_compatible(device, compat); + read_unlock(&devtree_lock); + return res; +} EXPORT_SYMBOL(of_device_is_compatible); /** @@ -518,7 +554,8 @@ struct device_node *of_find_compatible_node(struct device_node *from, if (type && !(np->type && (of_node_cmp(np->type, type) == 0))) continue; - if (of_device_is_compatible(np, compatible) && of_node_get(np)) + if (__of_device_is_compatible(np, compatible) && + of_node_get(np)) break; } of_node_put(from); @@ -562,15 +599,9 @@ out: } EXPORT_SYMBOL(of_find_node_with_property); -/** - * of_match_node - Tell if an device_node has a matching of_match structure - * @matches: array of of device match structures to search in - * @node: the of device structure to match against - * - * Low level utility function used by device matching. - */ -const struct of_device_id *of_match_node(const struct of_device_id *matches, - const struct device_node *node) +static +const struct of_device_id *__of_match_node(const struct of_device_id *matches, + const struct device_node *node) { if (!matches) return NULL; @@ -584,14 +615,32 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, match &= node->type && !strcmp(matches->type, node->type); if (matches->compatible[0]) - match &= of_device_is_compatible(node, - matches->compatible); + match &= __of_device_is_compatible(node, + matches->compatible); if (match) return matches; matches++; } return NULL; } + +/** + * of_match_node - Tell if an device_node has a matching of_match structure + * @matches: array of of device match structures to search in + * @node: the of device structure to match against + * + * Low level utility function used by device matching. + */ +const struct of_device_id *of_match_node(const struct of_device_id *matches, + const struct device_node *node) +{ + const struct of_device_id *match; + + read_lock(&devtree_lock); + match = __of_match_node(matches, node); + read_unlock(&devtree_lock); + return match; +} EXPORT_SYMBOL(of_match_node); /** @@ -619,7 +668,7 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, read_lock(&devtree_lock); np = from ? 
from->allnext : of_allnodes; for (; np; np = np->allnext) { - if (of_match_node(matches, np) && of_node_get(np)) { + if (__of_match_node(matches, np) && of_node_get(np)) { if (match) *match = matches; break; -- cgit v0.10.2 From 444fb52ec868bf9c52ccd04dc14059e698b08e8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Feb 2013 15:30:56 -0500 Subject: OF: Convert devtree lock from rw_lock to raw spinlock With the locking cleanup in place (from "OF: Fixup resursive locking code paths"), we can now do the conversion from the rw_lock to a raw spinlock as required for preempt-rt. The previous cleanup and this conversion were originally separate since they predated when mainline got raw spinlock (in commit c2f21ce2e31286a "locking: Implement new raw_spinlock"). So, at that point in time, the cleanup was considered plausible for mainline, but not this conversion. In any case, we've kept them separate as it makes for easier review and better bisection. Signed-off-by: Thomas Gleixner Cc: Grant Likely Cc: Sam Ravnborg Cc: Cc: Rob Herring Link: http://lkml.kernel.org/r/1360182656-15898-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner [PG: taken from preempt-rt, update subject & add a commit log] Signed-off-by: Paul Gortmaker diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index 1303021..9f20566 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -64,7 +64,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len err = -ENODEV; mutex_lock(&of_set_property_mutex); - write_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); prevp = &dp->properties; while (*prevp) { struct property *prop = *prevp; @@ -91,7 +91,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len } prevp = &(*prevp)->next; } - write_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); mutex_unlock(&of_set_property_mutex); /* XXX Upate procfs if necessary... */ diff --git a/drivers/of/base.c b/drivers/of/base.c index 21195a1..662e009 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -55,7 +55,7 @@ static DEFINE_MUTEX(of_aliases_mutex); /* use when traversing tree through the allnext, child, sibling, * or parent members of struct device_node. */ -DEFINE_RWLOCK(devtree_lock); +DEFINE_RAW_SPINLOCK(devtree_lock); int of_n_addr_cells(struct device_node *np) { @@ -188,10 +188,11 @@ struct property *of_find_property(const struct device_node *np, int *lenp) { struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); pp = __of_find_property(np, name, lenp); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } @@ -209,13 +210,13 @@ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); np = prev ? 
prev->allnext : of_allnodes; for (; np != NULL; np = np->allnext) if (of_node_get(np)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_all_nodes); @@ -274,11 +275,12 @@ static int __of_device_is_compatible(const struct device_node *device, int of_device_is_compatible(const struct device_node *device, const char *compat) { + unsigned long flags; int res; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_compatible(device, compat); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_compatible); @@ -340,13 +342,14 @@ EXPORT_SYMBOL(of_device_is_available); struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); @@ -365,14 +368,15 @@ EXPORT_SYMBOL(of_get_parent); struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); parent = of_node_get(node->parent); of_node_put(node); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } @@ -388,14 +392,15 @@ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) if (of_node_get(next)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); @@ -413,7 +418,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, { struct device_node *next; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!of_device_is_available(next)) @@ -422,7 +427,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, break; } of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return next; } EXPORT_SYMBOL(of_get_next_available_child); @@ -460,14 +465,15 @@ EXPORT_SYMBOL(of_get_child_by_name); struct device_node *of_find_node_by_path(const char *path) { struct device_node *np = of_allnodes; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); for (; np; np = np->allnext) { if (np->full_name && (of_node_cmp(np->full_name, path) == 0) && of_node_get(np)) break; } - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_path); @@ -487,15 +493,16 @@ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : of_allnodes; for (; np; np = np->allnext) if (np->name && (of_node_cmp(np->name, name) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); @@ -516,15 +523,16 @@ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) if (np->type && (of_node_cmp(np->type, type) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); @@ -547,8 +555,9 @@ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) { if (type @@ -559,7 +568,7 @@ struct device_node *of_find_compatible_node(struct device_node *from, break; } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); @@ -581,8 +590,9 @@ struct device_node *of_find_node_with_property(struct device_node *from, { struct device_node *np; struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) { for (pp = np->properties; pp; pp = pp->next) { @@ -594,7 +604,7 @@ struct device_node *of_find_node_with_property(struct device_node *from, } out: of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); @@ -635,10 +645,11 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *match; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); match = __of_match_node(matches, node); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return match; } EXPORT_SYMBOL(of_match_node); @@ -661,11 +672,12 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, const struct of_device_id **match) { struct device_node *np; + unsigned long flags; if (match) *match = NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : of_allnodes; for (; np; np = np->allnext) { if (__of_match_node(matches, np) && of_node_get(np)) { @@ -675,7 +687,7 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, } } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node_and_match); @@ -718,12 +730,12 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); for (np = of_allnodes; np; np = np->allnext) if (np->phandle == handle) break; of_node_get(np); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); @@ -1195,18 +1207,18 @@ int of_add_property(struct device_node *np, struct property *prop) return rc; prop->next = NULL; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! don't insert it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return -1; } next = &(*next)->next; } *next = prop; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); #ifdef CONFIG_PROC_DEVICETREE /* try to add to proc as well if it was initialized */ @@ -1236,7 +1248,7 @@ int of_remove_property(struct device_node *np, struct property *prop) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == prop) { @@ -1249,7 +1261,7 @@ int of_remove_property(struct device_node *np, struct property *prop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1289,7 +1301,7 @@ int of_update_property(struct device_node *np, struct property *newprop) if (!oldprop) return of_add_property(np, newprop); - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == oldprop) { @@ -1303,7 +1315,7 @@ int of_update_property(struct device_node *np, struct property *newprop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1376,12 +1388,12 @@ int of_attach_node(struct device_node *np) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); np->sibling = np->parent->child; np->allnext = of_allnodes; np->parent->child = np; of_allnodes = np; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); of_add_proc_dt_entry(np); return 0; @@ -1424,17 +1436,17 @@ int of_detach_node(struct device_node *np) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); if (of_node_check_flag(np, OF_DETACHED)) { /* someone already detached it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return rc; } parent = np->parent; if (!parent) { - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return rc; } @@ -1461,7 +1473,7 @@ int of_detach_node(struct device_node *np) } of_node_set_flag(np, OF_DETACHED); - 
write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); of_remove_proc_dt_entry(np); return rc; diff --git a/include/linux/of.h b/include/linux/of.h index 5ebcc5c..bb35c42 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -92,7 +92,7 @@ static inline void of_node_put(struct device_node *node) { } extern struct device_node *of_allnodes; extern struct device_node *of_chosen; extern struct device_node *of_aliases; -extern rwlock_t devtree_lock; +extern raw_spinlock_t devtree_lock; static inline bool of_have_populated_dt(void) { -- cgit v0.10.2 From 5e752828df6d23f42e8f0f5da2faea05e9012a81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:25:03 +0200 Subject: locking-various-init-fixes.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/char/random.c b/drivers/char/random.c index 57d4b15..32a6c57 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -445,7 +445,7 @@ static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), .pool = input_pool_data }; @@ -454,7 +454,7 @@ static struct entropy_store blocking_pool = { .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock), .pool = blocking_pool_data }; @@ -462,7 +462,7 @@ static struct entropy_store nonblocking_pool = { .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), .pool = nonblocking_pool_data }; diff --git a/drivers/usb/chipidea/debug.c b/drivers/usb/chipidea/debug.c index 3bc244d..a62c4a4 100644 --- a/drivers/usb/chipidea/debug.c +++ b/drivers/usb/chipidea/debug.c @@ -222,7 +222,7 @@ static struct { } dbg_data = { .idx = 0, .tty = 0, - .lck = __RW_LOCK_UNLOCKED(lck) + .lck = __RW_LOCK_UNLOCKED(dbg_data.lck) }; /** diff --git a/fs/file.c b/fs/file.c index 2b3570b..3906d95 100644 --- a/fs/file.c +++ b/fs/file.c @@ -516,7 +516,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; /* diff --git a/include/linux/idr.h b/include/linux/idr.h index de7e190..e5eb125 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -136,7 +136,7 @@ struct ida { struct ida_bitmap *free_bitmap; }; -#define IDA_INIT(name) { .idr = IDR_INIT(name), .free_bitmap = NULL, } +#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); -- cgit v0.10.2 From a279b931bdc475fee1ba0454b5ce9d6e988c55b1 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 7 Dec 2011 12:48:42 +0100 Subject: intel_idle: Convert i7300_idle_lock to raw spinlock 24 core Intel box's first exposure to 3.0.12-rt30-rc3 didn't go well. 
[ 27.104159] i7300_idle: loaded v1.55 [ 27.104192] BUG: scheduling while atomic: swapper/2/0/0x00000002 [ 27.104309] Pid: 0, comm: swapper/2 Tainted: G N 3.0.12-rt30-rc3-rt #1 [ 27.104317] Call Trace: [ 27.104338] [] dump_trace+0x85/0x2e0 [ 27.104372] [] thread_return+0x12b/0x30b [ 27.104381] [] schedule+0x29/0xb0 [ 27.104389] [] rt_spin_lock_slowlock+0xc5/0x240 [ 27.104401] [] i7300_idle_notifier+0x3f/0x360 [i7300_idle] [ 27.104415] [] notifier_call_chain+0x37/0x70 [ 27.104426] [] __atomic_notifier_call_chain+0x48/0x70 [ 27.104439] [] cpu_idle+0x89/0xb0 [ 27.104449] bad: scheduling from the idle thread! Signed-off-by: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1323258522.5057.73.camel@marge.simson.net Signed-off-by: Thomas Gleixner diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index fa080eb..ffeebc7 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -75,7 +75,7 @@ static unsigned long past_skip; static struct pci_dev *fbd_dev; -static spinlock_t i7300_idle_lock; +static raw_spinlock_t i7300_idle_lock; static int i7300_idle_active; static u8 i7300_idle_thrtctl_saved; @@ -457,7 +457,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, idle_begin_time = ktime_get(); } - spin_lock_irqsave(&i7300_idle_lock, flags); + raw_spin_lock_irqsave(&i7300_idle_lock, flags); if (val == IDLE_START) { cpumask_set_cpu(smp_processor_id(), idle_cpumask); @@ -506,7 +506,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, } } end: - spin_unlock_irqrestore(&i7300_idle_lock, flags); + raw_spin_unlock_irqrestore(&i7300_idle_lock, flags); return 0; } @@ -548,7 +548,7 @@ struct debugfs_file_info { static int __init i7300_idle_init(void) { - spin_lock_init(&i7300_idle_lock); + raw_spin_lock_init(&i7300_idle_lock); total_us = 0; if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) -- cgit v0.10.2 From 82cad18aa12126453fdb0e3b342531e6c9e2d587 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 11:14:55 +0200 Subject: ntp: Make ntp_lock raw. This needs to be revisited. Not sure whether we can avoid to make this lock raw, but it'd really like to. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4..bb1edfa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,7 +22,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -347,7 +347,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -361,7 +361,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -371,9 +371,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -394,7 +394,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. 
If in leap-insert state at the end of the @@ -478,7 +478,7 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } @@ -660,7 +660,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -702,7 +702,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +900,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +913,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -928,7 +928,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -945,7 +945,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v0.10.2 From c2ba4c5e5ab96d314b2191c71ab4e26c0fa410d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:38:22 +0200 Subject: seqlock: Remove unused functions Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 600060e2..cb0599c 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -69,17 +69,6 @@ static inline void write_sequnlock(seqlock_t *sl) spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) -{ - int ret = spin_trylock(&sl->lock); - - if (ret) { - ++sl->sequence; - smp_wmb(); - } - return ret; -} - /* Start of read calculation -- fetch last complete writer token */ static __always_inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -269,14 +258,4 @@ static inline void write_seqcount_barrier(seqcount_t *s) #define write_sequnlock_bh(lock) \ do { write_sequnlock(lock); local_bh_enable(); } while(0) -#define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) - -#define read_seqretry_irqrestore(lock, iv, flags) \ - ({ \ - int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ - ret; \ - }) - #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From aefe52578185afc3f3891e15bea9315f052cfca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:40:26 +0200 Subject: seqlock: Use seqcount No point in having different implementations for the same thing. 
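Conceptually the new seqlock_t is a thin wrapper around a seqcount_t plus a spinlock: the spinlock serializes writers, and the sequence counter carries the reader protocol (sample an even sequence, read the data, retry if the sequence changed). A minimal userspace sketch of that protocol follows; the names are illustrative, it runs single threaded and the smp_wmb()/smp_rmb() barriers are elided, so it is not the kernel implementation.

#include <stdio.h>

/* Bare sequence counter, the seqcount_t analogue in this sketch. */
struct sketch_seqcount {
	unsigned sequence;		/* even: stable, odd: writer active */
};

/*
 * seqlock_t analogue: seqcount plus the data it protects. A real
 * seqlock_t also embeds a spinlock_t to serialize writers.
 */
struct sketch_seqlock {
	struct sketch_seqcount seqcount;
	long data;
};

static unsigned sketch_read_begin(const struct sketch_seqcount *s)
{
	unsigned ret;

	do {				/* wait until no writer is active */
		ret = s->sequence;
	} while (ret & 1);
	return ret;			/* the kernel adds smp_rmb() here */
}

static int sketch_read_retry(const struct sketch_seqcount *s, unsigned start)
{
	return s->sequence != start;	/* the kernel adds smp_rmb() first */
}

static void sketch_write(struct sketch_seqlock *sl, long val)
{
	/* kernel: spin_lock(&sl->lock); write_seqcount_begin(&sl->seqcount); */
	sl->seqcount.sequence++;	/* odd: readers spin or retry */
	sl->data = val;
	sl->seqcount.sequence++;	/* even again: data is consistent */
	/* kernel: write_seqcount_end(&sl->seqcount); spin_unlock(&sl->lock); */
}

int main(void)
{
	struct sketch_seqlock sl = { { 0 }, 0 };
	unsigned seq;
	long snapshot;

	sketch_write(&sl, 42);
	do {
		seq = sketch_read_begin(&sl.seqcount);
		snapshot = sl.data;
	} while (sketch_read_retry(&sl.seqcount, seq));

	printf("read %ld at sequence %u\n", snapshot, seq);
	return 0;
}

With this layering, read_seqbegin()/read_seqretry() reduce to the seqcount helpers, and write_seqlock()/write_sequnlock() become lock plus seqcount begin/end, which is the shape of the replacement code in the diff below.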
Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cb0599c..1829905 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -30,81 +30,12 @@ #include #include -typedef struct { - unsigned sequence; - spinlock_t lock; -} seqlock_t; - -/* - * These macros triggered gcc-3.x compile-time problems. We think these are - * OK now. Be cautious. - */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } - -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) - -#define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) - -/* Lock out other writers and update the count. - * Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. - */ -static inline void write_seqlock(seqlock_t *sl) -{ - spin_lock(&sl->lock); - ++sl->sequence; - smp_wmb(); -} - -static inline void write_sequnlock(seqlock_t *sl) -{ - smp_wmb(); - sl->sequence++; - spin_unlock(&sl->lock); -} - -/* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) -{ - unsigned ret; - -repeat: - ret = ACCESS_ONCE(sl->sequence); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - smp_rmb(); - - return ret; -} - -/* - * Test if reader processed invalid data. - * - * If sequence value changed then writer changed data while in section. - */ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) -{ - smp_rmb(); - - return unlikely(sl->sequence != start); -} - - /* * Version using sequence counter only. * This can be used when code has its own mutex protecting the * updating starting before the write_seqcountbeqin() and ending * after the write_seqcount_end(). */ - typedef struct seqcount { unsigned sequence; } seqcount_t; @@ -207,7 +138,6 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_retry(s, start); } @@ -241,21 +171,101 @@ static inline void write_seqcount_barrier(seqcount_t *s) s->sequence+=2; } +typedef struct { + struct seqcount seqcount; + spinlock_t lock; +} seqlock_t; + +/* + * These macros triggered gcc-3.x compile-time problems. We think these are + * OK now. Be cautious. + */ +#define __SEQLOCK_UNLOCKED(lockname) \ + { \ + .seqcount = SEQCNT_ZERO, \ + .lock = __SPIN_LOCK_UNLOCKED(lockname) \ + } + +#define seqlock_init(x) \ + do { \ + seqcount_init(&(x)->seqcount); \ + spin_lock_init(&(x)->lock); \ + } while (0) + +#define DEFINE_SEQLOCK(x) \ + seqlock_t x = __SEQLOCK_UNLOCKED(x) + +/* + * Read side functions for starting and finalizing a read side section. + */ +static inline unsigned read_seqbegin(const seqlock_t *sl) +{ + return read_seqcount_begin(&sl->seqcount); +} + +static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) +{ + return read_seqcount_retry(&sl->seqcount, start); +} + /* - * Possible sw/hw IRQ protected versions of the interfaces. + * Lock out other writers and update the count. + * Acts like a normal spin_lock/unlock. + * Don't need preempt_disable() because that is in the spin_lock already. 
*/ +static inline void write_seqlock(seqlock_t *sl) +{ + spin_lock(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock(&sl->lock); +} + +static inline void write_seqlock_bh(seqlock_t *sl) +{ + spin_lock_bh(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_bh(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_bh(&sl->lock); +} + +static inline void write_seqlock_irq(seqlock_t *sl) +{ + spin_lock_irq(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_irq(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irq(&sl->lock); +} + +static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + spin_lock_irqsave(&sl->lock, flags); + write_seqcount_begin(&sl->seqcount); + return flags; +} + #define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) -#define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) -#define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) + do { flags = __write_seqlock_irqsave(lock); } while (0) -#define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) -#define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) -#define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) +static inline void +write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irqrestore(&sl->lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From ca2aeac6ea3007005d9efaffb3598c9e13ba6937 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: generic: Use raw local irq variant for generic cmpxchg No point in tracing those. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index 2533fdd..d8d4c89 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } -- cgit v0.10.2 From c2b417ef3e7386dc53a281d1cbc354eb8a9a6c21 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Mon, 11 Feb 2013 14:15:32 -0700 Subject: of: fix recursive locking in of_get_next_available_child() of_get_next_available_child() acquires devtree_lock, then calls of_device_is_available() which calls of_get_property() which calls of_find_property() which tries to re-acquire devtree_lock, thus causing deadlock. 
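The fix, described next, follows the usual locked/unlocked helper convention in drivers/of: a plain-named function that takes devtree_lock itself, and a double-underscore variant that assumes the caller already holds the lock, so lock holders can reuse the logic without re-acquiring a non-recursive lock. A generic userspace sketch of that convention follows; names and data are made up for illustration and this is not the OF API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static int node_status_ok = 1;		/* stand-in for a "status" property */

/* __variant: caller must already hold tree_lock, takes no lock itself */
static int __node_is_available(void)
{
	return node_status_ok;
}

/* public variant: takes the lock, then defers to the __variant */
static int node_is_available(void)
{
	int res;

	pthread_mutex_lock(&tree_lock);
	res = __node_is_available();
	pthread_mutex_unlock(&tree_lock);
	return res;
}

/*
 * An iterator that already holds tree_lock must call the __variant;
 * calling node_is_available() from here would try to re-acquire the
 * non-recursive tree_lock and deadlock, which is the bug described above.
 */
static void walk_available_children(void)
{
	pthread_mutex_lock(&tree_lock);
	if (__node_is_available())
		printf("child is available\n");
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	walk_available_children();
	printf("standalone check: %d\n", node_is_available());
	return 0;
}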
To avoid this, create a new __of_device_is_available() which calls __of_get_property() instead, which calls __of_find_property(), which does not take the lock,. Update of_get_next_available_child() to call the new __of_device_is_available() since it already owns the lock. Signed-off-by: Stephen Warren Signed-off-by: Grant Likely diff --git a/drivers/of/base.c b/drivers/of/base.c index 662e009..ec2fd1f 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -307,19 +307,19 @@ int of_machine_is_compatible(const char *compat) EXPORT_SYMBOL(of_machine_is_compatible); /** - * of_device_is_available - check if a device is available for use + * __of_device_is_available - check if a device is available for use * - * @device: Node to check for availability + * @device: Node to check for availability, with locks already held * * Returns 1 if the status property is absent or set to "okay" or "ok", * 0 otherwise */ -int of_device_is_available(const struct device_node *device) +static int __of_device_is_available(const struct device_node *device) { const char *status; int statlen; - status = of_get_property(device, "status", &statlen); + status = __of_get_property(device, "status", &statlen); if (status == NULL) return 1; @@ -330,6 +330,26 @@ int of_device_is_available(const struct device_node *device) return 0; } + +/** + * of_device_is_available - check if a device is available for use + * + * @device: Node to check for availability + * + * Returns 1 if the status property is absent or set to "okay" or "ok", + * 0 otherwise + */ +int of_device_is_available(const struct device_node *device) +{ + unsigned long flags; + int res; + + raw_spin_lock_irqsave(&devtree_lock, flags); + res = __of_device_is_available(device); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + return res; + +} EXPORT_SYMBOL(of_device_is_available); /** @@ -421,7 +441,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, raw_spin_lock(&devtree_lock); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { - if (!of_device_is_available(next)) + if (!__of_device_is_available(next)) continue; if (of_node_get(next)) break; -- cgit v0.10.2 From 7a8b1270b7114c13d04e42efc1f5bbbe76b123a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Apr 2013 18:34:55 -0700 Subject: tcp: force a dst refcount when prequeue packet Before escaping RCU protected section and adding packet into prequeue, make sure the dst is refcounted. Reported-by: Mike Galbraith Signed-off-by: Eric Dumazet Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/net/tcp.h b/include/net/tcp.h index aed42c7..4da2167 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1045,6 +1045,7 @@ static inline bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (sysctl_tcp_low_latency || !tp->ucopy.task) return false; + skb_dst_force(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; if (tp->ucopy.memory > sk->sk_rcvbuf) { -- cgit v0.10.2 From c634207da9830edaa1da6caaaea6afb66225b64f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 30 Sep 2011 20:03:37 +0200 Subject: x86: hpet: Disable MSI on Lenovo W510 MSI based per cpu timers lose interrupts when intel_idle() is enabled - independent of the c-state. With idle=poll the problem cannot be observed. We have no idea yet, whether this is a W510 specific issue or a general chipset oddity. Blacklist the known problem machine. 
Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e28670f..5ce3c25 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -573,6 +574,30 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) #define RESERVE_TIMERS 0 #endif +static int __init dmi_disable_hpet_msi(const struct dmi_system_id *d) +{ + hpet_msi_disable = 1; + return 0; +} + +static struct dmi_system_id __initdata dmi_hpet_table[] = { + /* + * MSI based per cpu timers lose interrupts when intel_idle() + * is enabled - independent of the c-state. With idle=poll the + * problem cannot be observed. We have no idea yet, whether + * this is a W510 specific issue or a general chipset oddity. + */ + { + .callback = dmi_disable_hpet_msi, + .ident = "Lenovo W510", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W510"), + }, + }, + {} +}; + static void hpet_msi_capability_lookup(unsigned int start_timer) { unsigned int id; @@ -580,6 +605,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + dmi_check_system(dmi_hpet_table); + if (hpet_msi_disable) return; -- cgit v0.10.2 From 64fccd15a57d32f381fdf8297779f9710345a208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jul 2011 11:04:08 +0200 Subject: early-printk-consolidate.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/early_printk.c b/arch/arm/kernel/early_printk.c index 85aa2b2..4307653 100644 --- a/arch/arm/kernel/early_printk.c +++ b/arch/arm/kernel/early_printk.c @@ -29,28 +29,17 @@ static void early_console_write(struct console *con, const char *s, unsigned n) early_write(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1, }; -asmlinkage void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_write(buf, n); - va_end(ap); -} - static int __init setup_early_printk(char *buf) { - register_console(&early_console); + early_console = &early_console_dev; + register_console(&early_console_dev); return 0; } diff --git a/arch/blackfin/kernel/early_printk.c b/arch/blackfin/kernel/early_printk.c index 84ed837..61fbd2d 100644 --- a/arch/blackfin/kernel/early_printk.c +++ b/arch/blackfin/kernel/early_printk.c @@ -25,8 +25,6 @@ extern struct console *bfin_earlyserial_init(unsigned int port, extern struct console *bfin_jc_early_init(void); #endif -static struct console *early_console; - /* Default console */ #define DEFAULT_PORT 0 #define DEFAULT_CFLAG CS8|B57600 diff --git a/arch/microblaze/kernel/early_printk.c b/arch/microblaze/kernel/early_printk.c index aba1f9a..b099a86 100644 --- a/arch/microblaze/kernel/early_printk.c +++ b/arch/microblaze/kernel/early_printk.c @@ -21,7 +21,6 @@ #include #include -static u32 early_console_initialized; static u32 base_addr; #ifdef CONFIG_SERIAL_UARTLITE_CONSOLE @@ -109,27 +108,11 @@ static struct console early_serial_uart16550_console = { }; #endif /* CONFIG_SERIAL_8250_CONSOLE */ -static struct console *early_console; - -void early_printk(const char *fmt, ...) 
-{ - char buf[512]; - int n; - va_list ap; - - if (early_console_initialized) { - va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); - } -} - int __init setup_early_printk(char *opt) { int version = 0; - if (early_console_initialized) + if (early_console) return 1; base_addr = of_early_console(&version); @@ -159,7 +142,6 @@ int __init setup_early_printk(char *opt) } register_console(early_console); - early_console_initialized = 1; return 0; } return 1; @@ -169,7 +151,7 @@ int __init setup_early_printk(char *opt) * only for early console because of performance degression */ void __init remap_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; printk(KERN_INFO "early_printk_console remapping from 0x%x to ", base_addr); @@ -195,9 +177,9 @@ void __init remap_early_printk(void) void __init disable_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; printk(KERN_WARNING "disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } diff --git a/arch/mips/kernel/early_printk.c b/arch/mips/kernel/early_printk.c index 9ae813e..86d325a 100644 --- a/arch/mips/kernel/early_printk.c +++ b/arch/mips/kernel/early_printk.c @@ -8,6 +8,7 @@ * written by Ralf Baechle (ralf@linux-mips.org) */ #include +#include #include #include @@ -25,20 +26,18 @@ early_console_write(struct console *con, const char *s, unsigned n) } } -static struct console early_console __initdata = { +static struct console early_console_prom = { .name = "early", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1 }; -static int early_console_initialized __initdata; - void __init setup_early_printk(void) { - if (early_console_initialized) + if (early_console) return; - early_console_initialized = 1; + early_console = &early_console_prom; - register_console(&early_console); + register_console(&early_console_prom); } diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index f974849..13b8670 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -156,15 +156,13 @@ static struct console udbg_console = { .index = 0, }; -static int early_console_initialized; - /* * Called by setup_system after ppc_md->probe and ppc_md->early_init. * Call it again after setting udbg_putc in ppc_md->setup_arch. 
*/ void __init register_early_udbg_console(void) { - if (early_console_initialized) + if (early_console) return; if (!udbg_putc) @@ -174,7 +172,7 @@ void __init register_early_udbg_console(void) printk(KERN_INFO "early console immortal !\n"); udbg_console.flags &= ~CON_BOOT; } - early_console_initialized = 1; + early_console = &udbg_console; register_console(&udbg_console); } diff --git a/arch/sh/kernel/sh_bios.c b/arch/sh/kernel/sh_bios.c index 47475cc..a5b51b9 100644 --- a/arch/sh/kernel/sh_bios.c +++ b/arch/sh/kernel/sh_bios.c @@ -144,8 +144,6 @@ static struct console bios_console = { .index = -1, }; -static struct console *early_console; - static int __init setup_early_printk(char *buf) { int keep_early = 0; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 38bf80a..f4fb00e 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -309,6 +309,7 @@ void __init setup_arch(char **cmdline_p) boot_flags_init(*cmdline_p); + early_console = &prom_early_console; register_console(&prom_early_console); printk("ARCH: "); diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index 0eaf005..ede7dc3 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -551,6 +551,12 @@ static void __init init_sparc64_elf_hwcap(void) pause_patch(); } +static inline void register_prom_console(void) +{ + early_console = &prom_early_console; + register_console(&prom_early_console); +} + void __init setup_arch(char **cmdline_p) { /* Initialize PROM console and command line. */ @@ -562,7 +568,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EARLYFB if (btext_find_display()) #endif - register_console(&prom_early_console); + register_prom_console(); if (tlb_type == hypervisor) printk("ARCH: SUN4V\n"); diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index afb9c9a..34d72a1 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -33,25 +34,8 @@ static struct console early_hv_console = { }; /* Direct interface for emergencies */ -static struct console *early_console = &early_hv_console; -static int early_console_initialized; static int early_console_complete; -static void early_vprintk(const char *fmt, va_list ap) -{ - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); -} - -void early_printk(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - void early_panic(const char *fmt, ...) 
{ va_list ap; @@ -69,14 +53,13 @@ static int __initdata keep_early; static int __init setup_early_printk(char *str) { - if (early_console_initialized) + if (early_console) return 1; if (str != NULL && strncmp(str, "keep", 4) == 0) keep_early = 1; early_console = &early_hv_console; - early_console_initialized = 1; register_console(early_console); return 0; @@ -85,12 +68,12 @@ static int __init setup_early_printk(char *str) void __init disable_early_printk(void) { early_console_complete = 1; - if (!early_console_initialized || !early_console) + if (!early_console) return; if (!keep_early) { early_printk("disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } else { early_printk("keeping early console\n"); } @@ -98,7 +81,7 @@ void __init disable_early_printk(void) void warn_early_printk(void) { - if (early_console_complete || early_console_initialized) + if (early_console_complete || early_console) return; early_printk("\ Machine shutting down before console output is fully initialized.\n\ diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c index 49480f0..4a0800b 100644 --- a/arch/um/kernel/early_printk.c +++ b/arch/um/kernel/early_printk.c @@ -16,7 +16,7 @@ static void early_console_write(struct console *con, const char *s, unsigned int um_early_printk(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_BOOT, @@ -25,8 +25,10 @@ static struct console early_console = { static int __init setup_early_printk(char *buf) { - register_console(&early_console); - + if (!early_console) { + early_console = &early_console_dev; + register_console(&early_console_dev); + } return 0; } diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c index 3922255..9be0d5d 100644 --- a/arch/unicore32/kernel/early_printk.c +++ b/arch/unicore32/kernel/early_printk.c @@ -33,21 +33,17 @@ static struct console early_ocd_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_ocd_console; - -static int __initdata keep_early; - static int __init setup_early_printk(char *buf) { - if (!buf) + int keep_early; + + if (!buf || early_console) return 0; if (strstr(buf, "keep")) keep_early = 1; - if (!strncmp(buf, "ocd", 3)) - early_console = &early_ocd_console; + early_console = &early_ocd_console; if (keep_early) early_console->flags &= ~CON_BOOT; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b9f18b..d15f575 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -169,25 +169,9 @@ static struct console early_serial_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_vga_console; -static int __initdata early_console_initialized; - -asmlinkage void early_printk(const char *fmt, ...) 
-{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); -} - static inline void early_console_register(struct console *con, int keep_early) { - if (early_console->index != -1) { + if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", con->name); return; @@ -207,9 +191,8 @@ static int __init setup_early_printk(char *buf) if (!buf) return 0; - if (early_console_initialized) + if (early_console) return 0; - early_console_initialized = 1; keep = (strstr(buf, "keep") != NULL); diff --git a/include/linux/console.h b/include/linux/console.h index 47b858c..4a6948a 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -141,6 +141,7 @@ struct console { for (con = console_drivers; con != NULL; con = con->next) extern int console_set_on_cmdline; +extern struct console *early_console; extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); diff --git a/include/linux/printk.h b/include/linux/printk.h index 9afc01e..8111ffa 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -95,8 +95,14 @@ int no_printk(const char *fmt, ...) return 0; } +#ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); +void early_vprintk(const char *fmt, va_list ap); +#else +static inline __printf(1, 2) __cold +void early_printk(const char *s, ...) { } +#endif extern int printk_needs_cpu(int cpu); extern void printk_tick(void); diff --git a/kernel/printk.c b/kernel/printk.c index e698e80..69270a6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -48,13 +48,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -756,6 +749,29 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ -- cgit v0.10.2 From 5e3196ea5f58697c58dbcdf8beb762247be61411 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2013 14:41:04 +0100 Subject: kernel/srcu: merge common code into a macro DEFINE_SRCU() and DEFINE_STATIC_SRCU() does the same thing except for the "static" attribute. This patch moves the common pieces into _DEFINE_SRCU() which is used by the the former macros either adding the static attribute or not. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 6eb691b..d04acb8 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,13 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. 
* dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define _DEFINE_SRCU(name, mod) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); + mod struct srcu_struct name = \ + __SRCU_STRUCT_INIT(name); -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); +#define DEFINE_SRCU(name) _DEFINE_SRCU(name, ) +#define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v0.10.2 From 1ec4f67c4115cb47b76ba420d77dc3c84ae6d262 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2013 14:44:30 +0100 Subject: kernel/SRCU: provide a static initializer There are macros for static initializer for the three out of four possible notifier types, that are: ATOMIC_NOTIFIER_HEAD() BLOCKING_NOTIFIER_HEAD() RAW_NOTIFIER_HEAD() This patch provides a static initilizer for the forth type to make it complete. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d65746e..6bfd703 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -42,9 +42,7 @@ * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very - * often but notifier_blocks will seldom be removed. Also, SRCU notifier - * chains are slightly more difficult to use because they require special - * runtime initialization. + * often but notifier_blocks will seldom be removed. 
*/ struct notifier_block { @@ -85,7 +83,7 @@ struct srcu_notifier_head { (name)->head = NULL; \ } while (0) -/* srcu_notifier_heads must be initialized and cleaned up dynamically */ +/* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); @@ -98,7 +96,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } -/* srcu_notifier_heads cannot be initialized statically */ + +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ + } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ @@ -110,6 +114,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) +#define _SRCU_NOTIFIER_HEAD(name, mod) \ + static DEFINE_PER_CPU(struct srcu_struct_array, \ + name##_head_srcu_array); \ + mod struct srcu_notifier_head name = \ + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array) + +#define SRCU_NOTIFIER_HEAD(name) \ + _SRCU_NOTIFIER_HEAD(name, ) + +#define SRCU_NOTIFIER_HEAD_STATIC(name) \ + _SRCU_NOTIFIER_HEAD(name, static) + #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d04acb8..fe9efd4 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp); void process_srcu(struct work_struct *work); -#define __SRCU_STRUCT_INIT(name) \ +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ { \ .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ + .per_cpu_ref = &pcpu_name, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .running = false, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ @@ -105,7 +105,7 @@ void process_srcu(struct work_struct *work); #define _DEFINE_SRCU(name, mod) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ mod struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name); + __SRCU_STRUCT_INIT(name, name##_srcu_array); #define DEFINE_SRCU(name) _DEFINE_SRCU(name, ) #define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static) -- cgit v0.10.2 From 8964a2cd84a4c6b27dd4c797ebf99f7aea1ea2ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Mar 2011 14:45:31 +0100 Subject: arm: Mark pmu interupt IRQF_NO_THREAD PMU interrupts must not be threaded. Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 5f66206..aa1b171 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -118,7 +118,8 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) continue; } - err = request_irq(irq, handler, IRQF_NOBALANCING, "arm-pmu", + err = request_irq(irq, handler, + IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", cpu_pmu); if (err) { pr_err("unable to request IRQ%d for ARM PMU counters\n", -- cgit v0.10.2 From 842efab0dc722d920372a65c588bae62cecbfcd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 13:15:20 +0200 Subject: arm: Allow forced irq threading All timer interrupts and the perf interrupt are marked NO_THREAD, so its safe to allow forced interrupt threading. 
Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 67874b8..4890796 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -17,6 +17,7 @@ config ARM select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HARDIRQS_SW_RESEND + select IRQ_FORCED_THREADING select HAVE_AOUT select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB -- cgit v0.10.2 From 7c2536624cc3b23e02e625bfea90294231d00b22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jun 2012 19:53:17 +0200 Subject: powerpc: Mark low level irq handlers NO_THREAD These low level handlers cannot be threaded. Mark them NO_THREAD Reported-by: leroy christophe Tested-by: leroy christophe Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 1e12108..806cbbd 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev) static struct irqaction tbint_irqaction = { .handler = timebase_interrupt, + .flags = IRQF_NO_THREAD, .name = "tbint", }; diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index d4fa03f..5e6ff38 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev) static struct irqaction cpm_error_irqaction = { .handler = cpm_error_interrupt, + .flags = IRQF_NO_THREAD, .name = "error", }; -- cgit v0.10.2 From e6560b0938cbf673f791a4dc61cfb084d13afec5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:12:36 +0100 Subject: timekeeping: Calc stuff once Calculate the cycle interval shifted value once. No functional change, just makes the code more readable. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb..06dc034 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1077,15 +1077,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->clock->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); -- cgit v0.10.2 From edb936740c089ba26126bb5973579a73d4c2f9b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2013 22:38:07 +0100 Subject: timekeeping: Make jiffies_lock internal Nothing outside of the timekeeping core needs that lock. 
Signed-off-by: Thomas Gleixner diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 82ed068..8fb8edf 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -75,7 +75,6 @@ extern int register_refined_jiffies(long clock_tick_rate); */ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; -extern seqlock_t jiffies_lock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59e..f5c9207 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 06dc034..e2892da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include #include +#include "tick-internal.h" static struct timekeeper timekeeper; -- cgit v0.10.2 From 6955694c73ed7fa59df3647c460d3c0e786073fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:05:48 +0100 Subject: timekeeping: Move lock out of timekeeper struct Make the lock a separate entity. Preparatory patch for shadow timekeeper structure. Signed-off-by: Thomas Gleixner diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1d558e..851bc43 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -62,8 +62,6 @@ struct timekeeper { ktime_t offs_boot; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; }; static inline struct timespec tk_xtime(struct timekeeper *tk) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e2892da..37887dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,7 @@ #include "tick-internal.h" static struct timekeeper timekeeper; +static DEFINE_SEQLOCK(timekeeper_lock); /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -194,11 +195,11 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -212,13 +213,12 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -279,12 +279,12 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -300,11 +300,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq 
= read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -331,12 +331,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -364,7 +364,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -373,7 +373,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -413,7 +413,7 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -452,7 +452,7 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -469,7 +469,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -491,7 +491,7 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -502,7 +502,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -552,11 +552,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -572,11 +572,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } 
while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -591,11 +591,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -656,11 +656,9 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -679,7 +677,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -726,7 +724,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -734,7 +732,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -758,7 +756,7 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -769,7 +767,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -788,7 +786,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -811,7 +809,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1149,7 +1147,7 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1204,7 +1202,7 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } @@ -1252,13 +1250,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1317,10 +1315,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do 
{ - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return now; } @@ -1333,11 +1331,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1368,11 +1366,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1392,14 +1390,14 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1417,9 +1415,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return timespec_to_ktime(wtom); } -- cgit v0.10.2 From 4ef0e53558d3d43c0ab4dc6600d64d8448729c4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:03:17 +0100 Subject: timekeeping: Split timekeeper_lock into lock and seqcount We want to shorten the seqcount write hold time. So split the seqlock into a lock and a seqcount. Open code the seqwrite_lock in the places which matter and drop the sequence counter update where it's pointless. 
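Viewed in isolation, the split produces the following locking pattern (a minimal illustrative sketch, not code from this patch; my_lock, my_seq, my_data and the two accessors are hypothetical names):

#include <linux/spinlock.h>
#include <linux/seqlock.h>

static DEFINE_RAW_SPINLOCK(my_lock);    /* serializes writers against each other */
static seqcount_t my_seq;               /* zero-initialized, like timekeeper_seq below */
static u64 my_data;

static void my_write(u64 val)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_lock, flags);
        /* Only the store itself sits in the seqcount write section, so
         * work done before this point no longer makes readers retry. */
        write_seqcount_begin(&my_seq);
        my_data = val;
        write_seqcount_end(&my_seq);
        raw_spin_unlock_irqrestore(&my_lock, flags);
}

static u64 my_read(void)
{
        unsigned int seq;
        u64 val;

        /* Lockless read side: retry if a writer ran concurrently. */
        do {
                seq = read_seqcount_begin(&my_seq);
                val = my_data;
        } while (read_seqcount_retry(&my_seq, seq));

        return val;
}

With a plain seqlock both roles are fused into one object and every writer holds the sequence count for its entire critical section; splitting them lets the later patches move the expensive computation outside the readers' retry window.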
Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 37887dc..bb62b9f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,7 +26,8 @@ #include "tick-internal.h" static struct timekeeper timekeeper; -static DEFINE_SEQLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -186,8 +187,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -195,11 +194,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -208,23 +206,21 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ +/* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { if (clearntp) { @@ -279,12 +275,12 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -300,11 +296,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -331,12 +327,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -364,7 +360,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -373,7 +369,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -413,7 +409,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -427,7 +424,8 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -452,7 +450,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -469,7 +468,8 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -491,7 +491,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -502,7 +503,8 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -552,11 +554,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -572,11 +574,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper_lock, seq)); + } while 
(read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -591,11 +593,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -658,7 +660,8 @@ void __init timekeeping_init(void) ntp_init(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -677,7 +680,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -724,7 +728,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -732,7 +737,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -756,7 +762,8 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -767,7 +774,8 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -786,7 +794,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -809,7 +818,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1147,7 +1157,8 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1202,7 +1213,8 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -1250,13 +1262,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); 
do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1315,10 +1327,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1331,11 +1343,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1366,11 +1378,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1390,14 +1402,14 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1415,9 +1427,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } -- cgit v0.10.2 From 2b056dbb3b8f849fdfbfee215207e1a67ebc0c75 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 17:15:49 +0100 Subject: timekeeping: Store cycle_last value in timekeeper struct as well For implementing a shadow timekeeper and a split calculation/update region we need to store the cycle_last value in the timekeeper and update the value in the clocksource struct only in the update region. Add the extra storage to the timekeeper. Signed-off-by: Thomas Gleixner diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 851bc43..2adf9c3 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -20,6 +20,8 @@ struct timekeeper { u32 shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; + /* Last cycle value (also stored in clock->cycle_last) */ + cycle_t cycle_last; /* Number of clock shifted nano seconds in one NTP interval. 
*/ u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bb62b9f..c59c9c6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -96,7 +96,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -247,7 +247,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; -- cgit v0.10.2 From fc681abebae9e29b68e361ca2c85a88ca6e2e92f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:06:18 +0100 Subject: timekeeping: Delay update of clock->cycle_last For calculating the new timekeeper values store the new cycle_last value in the timekeeper and update the clock->cycle_last just when we actually update the new values. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c59c9c6..d317692 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1095,7 +1095,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->clock->cycle_last += interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1210,6 +1210,8 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; timekeeping_update(tk, false); out: -- cgit v0.10.2 From a0c49feaa1d7a65d293e906e2ff7e1d98f6b6e32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:47:13 +0100 Subject: timekeeping: Implement a shadow timekeeper Use the shadow timekeeper to do the update_wall_time() adjustments and then copy it over to the real timekeeper. Keep the shadow timekeeper in sync when updating stuff outside of update_wall_time(). This allows us to limit the timekeeper_seq hold time to the update of the real timekeeper and the vsyscall data in the next patch. 
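Stripped of the timekeeping specifics, the scheme is a double-buffered update: the expensive recalculation runs on a private shadow copy with only the writer lock held, and publishing the result is a single memcpy. This patch still keeps the whole section inside the seqcount; the follow-up "Shorten seq_count region" patch below narrows it to just the publish step, roughly as in this hedged sketch (struct state, state_lock, state_seq and update_state() are hypothetical names):

#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>

struct state { u64 a; u64 b; };         /* stands in for struct timekeeper */

static struct state real_state;         /* the copy readers see */
static struct state shadow_state;       /* scratch copy for updates */
static DEFINE_RAW_SPINLOCK(state_lock);
static seqcount_t state_seq;

static void update_state(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&state_lock, flags);

        /* Expensive recalculation on the shadow copy; readers keep
         * using real_state undisturbed while this runs. */
        shadow_state.a++;
        shadow_state.b = 2 * shadow_state.a;

        /* Publish: readers only ever retry for the duration of the
         * memcpy plus the derived-data update. */
        write_seqcount_begin(&state_seq);
        memcpy(&real_state, &shadow_state, sizeof(real_state));
        write_seqcount_end(&state_seq);

        raw_spin_unlock_irqrestore(&state_lock, flags);
}

The price is that any writer which modifies the live copy directly must mirror it back into the shadow, which is what the new mirror argument to timekeeping_update() does below.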
Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d317692..9a0100b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -28,6 +28,7 @@ static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -221,7 +222,7 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -229,6 +230,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -422,7 +426,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -466,7 +470,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -501,7 +505,7 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -680,6 +684,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -735,7 +741,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -773,7 +779,7 @@ static void timekeeping_resume(void) tk->clock->cycle_last = tk->clock->read(tk->clock); tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); + timekeeping_update(tk, false, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1152,7 +1158,8 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; @@ -1164,16 +1171,16 @@ static void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < 
real_tk->cycle_interval) goto out; /* @@ -1212,12 +1219,22 @@ static void update_wall_time(void) /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; - timekeeping_update(tk, false); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); out: write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - } /** -- cgit v0.10.2 From 85d01af6660a4f542758213a288d1c0c5c8a328f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:39:49 +0100 Subject: timekeeping: Shorten seq_count region Shorten the seqcount write hold region to the actual update of the timekeeper and the related data (e.g. vsyscall). On a contemporary x86 system this reduces the maximum latencies on Preempt-RT from 8us to 4us on the non-timekeeping cores. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0100b..1612708 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1165,7 +1165,6 @@ static void update_wall_time(void) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1217,6 +1216,7 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; /* @@ -1231,9 +1231,8 @@ static void update_wall_time(void) */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, false, false); - -out: write_seqcount_end(&timekeeper_seq); +out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -- cgit v0.10.2 From 0c174f0ffa7face4670b88d0b9c326b2da790b28 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:16:52 -0500 Subject: net/cpsw: revert stable patches v3.8..v3.8.9 and apply them from net & net-next branch.
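One of the hunks reverted below switches the tx-completion path from netif_wake_queue() back to netif_start_queue(); the reapplied upstream commit (b56d6b3fca6d, quoted further down) then restores the wake variant. The functional difference is that both re-enable a stopped queue, but only netif_wake_queue() also reschedules the qdisc. A hypothetical completion-handler fragment, not code from this patch:

#include <linux/netdevice.h>

static void my_tx_complete(struct net_device *ndev)
{
        /* Both calls clear the queue's stopped state, but only
         * netif_wake_queue() also kicks the qdisc so already-queued
         * packets go out immediately instead of waiting for the next
         * transmit attempt. */
        if (unlikely(netif_queue_stopped(ndev)))
                netif_wake_queue(ndev);
}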
Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 3b1be52..40aff68 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -375,7 +375,7 @@ void cpsw_tx_handler(void *token, int len, int status) struct cpsw_priv *priv = netdev_priv(ndev); if (unlikely(netif_queue_stopped(ndev))) - netif_wake_queue(ndev); + netif_start_queue(ndev); cpts_tx_timestamp(&priv->cpts, skb); priv->stats.tx_packets++; priv->stats.tx_bytes += len; @@ -1111,7 +1111,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, struct platform_device *mdio; parp = of_get_property(slave_node, "phy_id", &lenp); - if ((parp == NULL) || (lenp != (sizeof(void *) * 2))) { + if ((parp == NULL) && (lenp != (sizeof(void *) * 2))) { pr_err("Missing slave[%d] phy_id property\n", i); ret = -EINVAL; goto error_ret; diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 4ebcb24..2a3e2c5 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1055,7 +1055,7 @@ static void emac_tx_handler(void *token, int len, int status) atomic_dec(&priv->cur_tx); if (unlikely(netif_queue_stopped(ndev))) - netif_wake_queue(ndev); + netif_start_queue(ndev); ndev->stats.tx_packets++; ndev->stats.tx_bytes += len; dev_kfree_skb_any(skb); -- cgit v0.10.2 From fe14a3e9f04d8c3cbeb37672732b1a88ed150f91 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Tue, 30 Apr 2013 03:16:52 -0500 Subject: net/cpsw: giant cpsw patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch contains: git diff v3.8..net-merge drivers/net/ethernet/ti arch/arm/boot/dts/am33* include/linux/platform_data/cpsw.h where the net-merge branch contains the following branches merged: net-next as of 37fe066 net: fix address check in rtnl_fdb_del net as of b9e48de isdn/sc: Fix incorrect module_param_array types linus as of 697dfd8 Merge tag 'efi-urgent' into x86/urgent including the following patches: |commit 15c6ff3bc0ff3464a8c7efcdea09c86454571622 |Author: Jiri Pirko |Date: Tue Jan 1 03:30:17 2013 +0000 | | net: remove unnecessary NET_ADDR_RANDOM "bitclean" | | NET_ADDR_SET is set in dev_set_mac_address() no need to alter | dev->addr_assign_type value in drivers. | | Signed-off-by: Jiri Pirko | Signed-off-by: David S. Miller | |commit 7826d43f2db45c9305a6e0ba165650e1a203f517 |Author: Jiri Pirko |Date: Sun Jan 6 00:44:26 2013 +0000 | | ethtool: fix drvinfo strings set in drivers | | Use strlcpy where possible to ensure the string is \0 terminated. | Use always sizeof(string) instead of 32, ETHTOOL_BUSINFO_LEN | and custom defines. | Use snprintf instead of sprint. | Remove unnecessary inits of ->fw_version | Remove unnecessary inits of drvinfo struct. | | Signed-off-by: Jiri Pirko | Signed-off-by: David S. Miller | |commit 8ff25eebb8161d08ddc48539e6b4afa0b18d778f |Author: Kees Cook |Date: Tue Oct 2 11:18:24 2012 -0700 | | drivers/net/ethernet/ti: remove depends on CONFIG_EXPERIMENTAL | | The CONFIG_EXPERIMENTAL config item has not carried much meaning for a | while now and is almost always enabled by default. As agreed during the | Linux kernel summit, remove it from any "depends on" lines in Kconfigs. | | CC: Tony Lindgren | CC: Mugunthan V N | CC: Kevin Hilman | CC: "David S. Miller" | CC: Cyril Chemparathy | Signed-off-by: Kees Cook | Acked-by: David S. 
Miller | |commit f9a8f83b04e0c362a2fc660dbad980d24af209fc |Author: Florian Fainelli |Date: Mon Jan 14 00:52:52 2013 +0000 | | net: phy: remove flags argument from phy_{attach, connect, connect_direct} | | The flags argument of the phy_{attach,connect,connect_direct} functions | is then used to assign a struct phy_device dev_flags with its value. | All callers but the tg3 driver pass the flag 0, which results in the | underlying PHY drivers in drivers/net/phy/ not being able to actually | use any of the flags they would set in dev_flags. This patch gets rid of | the flags argument, and passes phydev->dev_flags to the internal PHY | library call phy_attach_direct() such that drivers which actually modify | a phy device dev_flags get the value preserved for use by the underlying | phy driver. | | Acked-by: Kosta Zertsekel | Signed-off-by: Florian Fainelli | Signed-off-by: David S. Miller | |commit fae50823d0ee579e006a7ba2b20880e354388b25 |Author: Mugunthan V N |Date: Thu Jan 17 06:31:34 2013 +0000 | | net: ethernet: davinci_cpdma: Add boundary for rx and tx descriptors | | When there is heavy transmission traffic in the CPDMA, then Rx descriptors | memory is also utilized as tx desc memory looses all rx descriptors and the | driver stops working then. | | This patch adds boundary for tx and rx descriptors in bd ram dividing the | descriptor memory to ensure that during heavy transmission tx doesn't use | rx descriptors. | | This patch is already applied to davinci_emac driver, since CPSW and | davici_dmac shares the same CPDMA, moving the boundry seperation from | Davinci EMAC driver to CPDMA driver which was done in the following | commit | | commit 86d8c07ff2448eb4e860e50f34ef6ee78e45c40c | Author: Sascha Hauer | Date: Tue Jan 3 05:27:47 2012 +0000 | | net/davinci: do not use all descriptors for tx packets | | The driver uses a shared pool for both rx and tx descriptors. | During open it queues fixed number of 128 descriptors for receive | packets. For each received packet it tries to queue another | descriptor. If this fails the descriptor is lost for rx. | The driver has no limitation on tx descriptors to use, so it | can happen during a nmap / ping -f attack that the driver | allocates all descriptors for tx and looses all rx descriptors. | The driver stops working then. | To fix this limit the number of tx descriptors used to half of | the descriptors available, the rx path uses the other half. | | Tested on a custom board using nmap / ping -f to the board from | two different hosts. | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 7373470202a3c98e1a338bf1acf51247cd100868 |Author: Thierry Reding |Date: Mon Jan 21 10:38:39 2013 +0100 | | net: ethernet: davinci: Fix build breakage | | The correct name of the transmit DMA channel field in struct emac_priv | is txchan, not txch. | | Signed-off-by: Thierry Reding | Signed-off-by: David S. Miller | |commit b2adaca92c63b9bb8beb021d554f656e387a7648 |Author: Joe Perches |Date: Sun Feb 3 17:43:58 2013 +0000 | | ethernet: Remove unnecessary alloc/OOM messages, alloc cleanups | | alloc failures already get standardized OOM | messages and a dump_stack. | | Convert kzalloc's with multiplies to kcalloc. | Convert kmalloc's with multiplies to kmalloc_array. | Fix a few whitespace defects. | Convert a constant 6 to ETH_ALEN. | Use parentheses around sizeof. | Convert vmalloc/memset to vzalloc. | Remove now unused size variables. | | Signed-off-by: Joe Perches | Signed-off-by: David S. 
Miller | |commit e11b220f336c654db876027d40953acef90b0cae |Author: Mugunthan V N |Date: Tue Feb 5 08:26:47 2013 +0000 | | drivers: net: cpsw: Add helper functions for VLAN ALE implementation | | Add helper functions for VLAN ALE implementations for Add, Delete | Dump VLAN related ALE entries | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 3b72c2fe0c6bbec42ed7f899931daef227b80322 |Author: Mugunthan V N |Date: Tue Feb 5 08:26:48 2013 +0000 | | drivers: net:ethernet: cpsw: add support for VLAN | | adding support for VLAN interface for cpsw. | | CPSW VLAN Capability | * Can filter VLAN packets in Hardware | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit f6575c90f6fc637697f130ea4a05892296c9a473 |Author: Vaibhav Bedia |Date: Tue Jan 29 16:45:07 2013 +0530 | | ARM: DTS: AM33XX: Add nodes for OCMC RAM and WKUP-M3 | | Since AM33XX supports only DT-boot, this is needed | for the appropriate device nodes to be created. | | Note: OCMC RAM is part of the PER power domain and supports | retention. The assembly code for low power entry/exit will | run from OCMC RAM. To ensure that the OMAP PM code does not | attempt to disable the clock to OCMC RAM as part of the | suspend process add the no_idle_on_suspend flag. | | Signed-off-by: Vaibhav Bedia | Acked-by: Santosh Shilimkar | Acked-by: Peter Korsgaard | Signed-off-by: Paul Walmsley | |commit f6e135c81eeb648c6addc6aeff2ee80f28ea413b |Author: Mugunthan V N |Date: Mon Feb 11 09:52:18 2013 +0000 | | driver: net: ethernet: davinci_cpdma: add support for directed packet and source port detection | | * Introduced parameter to add port number for directed packet in cpdma_chan_submit | * Source port detection macro with DMA descriptor status | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 9232b16df2167c8afcb89de39ee85f5091ebacff |Author: Mugunthan V N |Date: Mon Feb 11 09:52:19 2013 +0000 | | driver: net: ethernet: cpsw: make cpts as pointer | | As CPTS is common module for both EMAC in Dual EMAC mode so making cpts as | pointer. | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit d9ba8f9e6298af71ec1c1fd3d88c3ef68abd0ec3 |Author: Mugunthan V N |Date: Mon Feb 11 09:52:20 2013 +0000 | | driver: net: ethernet: cpsw: dual emac interface implementation | | The CPSW switch can act as Dual EMAC by segregating the switch ports | using VLAN and port VLAN as per the TRM description in | 14.3.2.10.2 Dual Mac Mode | | Following CPSW components will be common for both the interfaces. | * Interrupt source is common for both eth interfaces | * Interrupt pacing is common for both interfaces | * Hardware statistics is common for all the ports | * CPDMA is common for both eth interface | * CPTS is common for both the interface and it should not be enabled on | both the interface as timestamping information doesn't contain port | information. | | Constrains | * Reserved VID of One port should not be used in other interface which will | enable switching functionality | * Same VID must not be used in both the interface which will enable switching | functionality | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 79876e0394aa46e74267a5871c4f4469544dcacf |Author: Cyril Roelandt |Date: Tue Feb 12 12:52:30 2013 +0000 | | net: ethernet: ti: remove redundant NULL check. | | cpdma_chan_destroy() on a NULL pointer is a no-op, so the NULL check in | cpdma_ctlr_destroy() can safely be removed. | | Signed-off-by: Cyril Roelandt | Signed-off-by: David S. 
Miller | |commit 6929e24e4cc46ce8d5b7dd8f8bdf4244c8d77f76 |Author: Arnd Bergmann |Date: Thu Feb 14 17:53:01 2013 +0100 | | net: cwdavinci_cpdma: export symbols for cpsw | | With the support for ARM AM33xx in multiplatform kernels | in 3.9, an older bug appears in ARM allmodconfig: | When the cpsw driver is built as a module with cpdma | support enabled, it uses symbols that the cpdma driver | does not export. | | Without this patch, building allmodconfig results in: | | ERROR: "cpdma_ctlr_int_ctrl" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined! | ERROR: "cpdma_control_set" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined! | ERROR: "cpdma_ctlr_eoi" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined! | | Signed-off-by: Arnd Bergmann | Acked-by: David S. Miller | Cc: Mugunthan V N | Cc: Vaibhav Hiremath | Cc: Richard Cochran | Cc: netdev@vger.kernel.org | |commit 510a1e7249298f6bbd049e1ec98041ddf5ef6452 |Author: Mugunthan V N |Date: Sun Feb 17 22:19:20 2013 +0000 | | drivers: net: davinci_cpdma: acknowledge interrupt properly | | CPDMA interrupts are not properly acknowledged which leads to interrupt | storm, only cpdma interrupt 0 is acknowledged in Davinci CPDMA driver. | Changed cpdma_ctlr_eoi api to acknowledge 1 and 2 interrupts which are | used for rx and tx respectively. | | Reported-by: Pantelis Antoniou | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 06991c28f37ad68e5c03777f5c3b679b56e3dac1 |Merge: 460dc1e 74fef7a |Author: Linus Torvalds |Date: Thu Feb 21 12:05:51 2013 -0800 | | Merge tag 'driver-core-3.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core | | Pull driver core patches from Greg Kroah-Hartman: | "Here is the big driver core merge for 3.9-rc1 | | There are two major series here, both of which touch lots of drivers | all over the kernel, and will cause you some merge conflicts: | | - add a new function called devm_ioremap_resource() to properly be | able to check return values. | | - remove CONFIG_EXPERIMENTAL | | Other than those patches, there's not much here, some minor fixes and | updates" | | Fix up trivial conflicts | | * tag 'driver-core-3.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core: (221 commits) | base: memory: fix soft/hard_offline_page permissions | drivercore: Fix ordering between deferred_probe and exiting initcalls | backlight: fix class_find_device() arguments | TTY: mark tty_get_device call with the proper const values | driver-core: constify data for class_find_device() | firmware: Ignore abort check when no user-helper is used | firmware: Reduce ifdef CONFIG_FW_LOADER_USER_HELPER | firmware: Make user-mode helper optional | firmware: Refactoring for splitting user-mode helper code | Driver core: treat unregistered bus_types as having no devices | watchdog: Convert to devm_ioremap_resource() | thermal: Convert to devm_ioremap_resource() | spi: Convert to devm_ioremap_resource() | power: Convert to devm_ioremap_resource() | mtd: Convert to devm_ioremap_resource() | mmc: Convert to devm_ioremap_resource() | mfd: Convert to devm_ioremap_resource() | media: Convert to devm_ioremap_resource() | iommu: Convert to devm_ioremap_resource() | drm: Convert to devm_ioremap_resource() | ... 
| |commit 3298a3511f1e73255a8dc023efd909e569eea037 |Merge: 5ce7aba acb7452 |Author: Linus Torvalds |Date: Thu Feb 21 15:20:41 2013 -0800 | | Merge tag 'multiplatform' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc | | Pull ARM SoC multiplatform support from Arnd Bergmann: | "Converting more ARM platforms to multiplatform support. This time, | OMAP gets converted, which is a major step since this is by far the | largest platform in terms of code size. The same thing happens to the | vt8500 platform." | | * tag 'multiplatform' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc: | net: cwdavinci_cpdma: export symbols for cpsw | remoteproc: omap: depend on OMAP_MBOX_FWK | [media] davinci: do not include mach/hardware.h | ARM: OMAP2+: Make sure files with omap initcalls include soc.h | ARM: OMAP2+: Include soc.h to drm.c to fix compiling | ARM: OMAP2+: Fix warning for hwspinlock omap_postcore_initcall | ARM: multi_v7_defconfig: add ARCH_ZYNQ | ARM: multi_v7_defconfig: remove unnecessary CONFIG_GPIOLIB | arm: vt8500: Remove remaining mach includes | arm: vt8500: Convert debug-macro.S to be multiplatform friendly | arm: vt8500: Remove single platform Kconfig options | ARM: OMAP2+: Remove now obsolete uncompress.h and debug-macro.S | ARM: OMAP2+: Add minimal support for booting vexpress | ARM: OMAP2+: Enable ARCH_MULTIPLATFORM support | ARM: OMAP2+: Disable code that currently does not work with multiplaform | ARM: OMAP2+: Add multiplatform debug_ll support | ARM: OMAP: Fix dmaengine init for multiplatform | ARM: OMAP: Fix i2c cmdline initcall for multiplatform | ARM: OMAP2+: Use omap initcalls | ARM: OMAP2+: Limit omap initcalls to omap only on multiplatform kernels | |commit 0237c11044b3670adcbe80cd6dd721285347f497 |Author: Daniel Mack |Date: Tue Feb 26 04:06:20 2013 +0000 | | drivers: net: ethernet: cpsw: consider number of slaves in interation | | Make cpsw_add_default_vlan() look at the actual number of slaves for its | iteration, so boards with less than 2 slaves don't ooops at boot. | | Signed-off-by: Daniel Mack | Cc: Mugunthan V N | Cc: David S. Miller | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 7307c00f335a4e986586b12334696098d2fc2bcd |Merge: f8f466c 55ccb1a |Author: Linus Torvalds |Date: Thu Feb 28 20:00:40 2013 -0800 | | Merge tag 'late-omap' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc | | Pull ARM SoC late OMAP changes from Olof Johansson: | "This branch contains changes for OMAP that came in late during the | release staging, close to when the merge window opened. | | It contains, among other things: | | - OMAP PM fixes and some patches for audio device integration | - OMAP clock fixes related to common clock conversion | - A set of patches cleaning up WFI entry and blocking. 
| - A set of fixes and IP block support for PM on TI AM33xx SoCs | (Beaglebone, etc) | - A set of smaller fixes and cleanups around AM33xx restart and | revision detection, as well as removal of some dead code | (CONFIG_32K_TIMER_HZ)" | | * tag 'late-omap' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc: (34 commits) | ARM: omap2: include linux/errno.h in hwmod_reset | ARM: OMAP2+: fix some omap_device_build() calls that aren't compiled by default | ARM: OMAP4: hwmod data: Enable AESS hwmod device | ARM: OMAP4: hwmod data: Update AESS data with memory bank area | ARM: OMAP4+: AESS: enable internal auto-gating during initial setup | ASoC: TI AESS: add autogating-enable function, callable from architecture code | ARM: OMAP2+: hwmod: add enable_preprogram hook | ARM: OMAP4: clock data: Add missing clkdm association for dpll_usb | ARM: OMAP2+: PM: Fix the dt return condition in pm_late_init() | ARM: OMAP2: am33xx-hwmod: Fix "register offset NULL check" bug | ARM: OMAP2+: AM33xx: hwmod: add missing HWMOD_NO_IDLEST flags | ARM: OMAP: AM33xx hwmod: Add parent-child relationship for PWM subsystem | ARM: OMAP: AM33xx hwmod: Corrects PWM subsystem HWMOD entries | ARM: DTS: AM33XX: Add nodes for OCMC RAM and WKUP-M3 | ARM: OMAP2+: AM33XX: Update the hardreset API | ARM: OMAP2+: AM33XX: hwmod: Update the WKUP-M3 hwmod with reset status bit | ARM: OMAP2+: AM33XX: hwmod: Fixup cpgmac0 hwmod entry | ARM: OMAP2+: AM33XX: hwmod: Update TPTC0 hwmod with the right flags | ARM: OMAP2+: AM33XX: hwmod: Register OCMC RAM hwmod | ARM: OMAP2+: AM33XX: CM/PRM: Use __ASSEMBLER__ macros in header files | ... | |commit 9da060d0ed571bbff434c4a1ef3e48db99a37ee0 |Merge: e3b5951 aab2b4b |Author: Linus Torvalds |Date: Tue Mar 5 18:42:29 2013 -0800 | | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net | | Pull networking fixes from David Miller: | "A moderately sized pile of fixes, some specifically for merge window | introduced regressions although others are for longer standing items | and have been queued up for -stable. | | I'm kind of tired of all the RDS protocol bugs over the years, to be | honest, it's way out of proportion to the number of people who | actually use it. | | 1) Fix missing range initialization in netfilter IPSET, from Jozsef | Kadlecsik. | | 2) ieee80211_local->tim_lock needs to use BH disabling, from Johannes | Berg. | | 3) Fix DMA syncing in SFC driver, from Ben Hutchings. | | 4) Fix regression in BOND device MAC address setting, from Jiri | Pirko. | | 5) Missing usb_free_urb in ISDN Hisax driver, from Marina Makienko. | | 6) Fix UDP checksumming in bnx2x driver for 57710 and 57711 chips, | fix from Dmitry Kravkov. | | 7) Missing cfgspace_lock initialization in BCMA driver. | | 8) Validate parameter size for SCTP assoc stats getsockopt(), from | Guenter Roeck. | | 9) Fix SCTP association hangs, from Lee A Roberts. | | 10) Fix jumbo frame handling in r8169, from Francois Romieu. | | 11) Fix phy_device memory leak, from Petr Malat. | | 12) Omit trailing FCS from frames received in BGMAC driver, from Hauke | Mehrtens. | | 13) Missing socket refcount release in L2TP, from Guillaume Nault. | | 14) sctp_endpoint_init should respect passed in gfp_t, rather than use | GFP_KERNEL unconditionally. From Dan Carpenter. | | 15) Add AISX AX88179 USB driver, from Freddy Xin. | | 16) Remove MAINTAINERS entries for drivers deleted during the merge | window, from Cesar Eduardo Barros. 
| | 17) RDS protocol can try to allocate huge amounts of memory, check | that the user's request length makes sense, from Cong Wang. | | 18) SCTP should use the provided KMALLOC_MAX_SIZE instead of it's own, | bogus, definition. From Cong Wang. | | 19) Fix deadlocks in FEC driver by moving TX reclaim into NAPI poll, | from Frank Li. Also, fix a build error introduced in the merge | window. | | 20) Fix bogus purging of default routes in ipv6, from Lorenzo Colitti. | | 21) Don't double count RTT measurements when we leave the TCP receive | fast path, from Neal Cardwell." | | * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (61 commits) | tcp: fix double-counted receiver RTT when leaving receiver fast path | CAIF: fix sparse warning for caif_usb | rds: simplify a warning message | net: fec: fix build error in no MXC platform | net: ipv6: Don't purge default router if accept_ra=2 | net: fec: put tx to napi poll function to fix dead lock | sctp: use KMALLOC_MAX_SIZE instead of its own MAX_KMALLOC_SIZE | rds: limit the size allocated by rds_message_alloc() | MAINTAINERS: remove eexpress | MAINTAINERS: remove drivers/net/wan/cycx* | MAINTAINERS: remove 3c505 | caif_dev: fix sparse warnings for caif_flow_cb | ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver | sctp: use the passed in gfp flags instead GFP_KERNEL | ipv[4|6]: correct dropwatch false positive in local_deliver_finish | l2tp: Restore socket refcount when sendmsg succeeds | net/phy: micrel: Disable asymmetric pause for KSZ9021 | bgmac: omit the fcs | phy: Fix phy_device_free memory leak | bnx2x: Fix KR2 work-around condition | ... | |commit 720a43efd30f04a0a492c85fb997361c44fbae05 |Author: Joe Perches |Date: Fri Mar 8 15:03:25 2013 +0000 | | drivers:net: Remove unnecessary OOM messages after netdev_alloc_skb | | Emitting netdev_alloc_skb and netdev_alloc_skb_ip_align OOM | messages is unnecessary as there is already a dump_stack | after allocation failures. | | Other trivial changes around these removals: | | Convert a few comparisons of pointer to 0 to !pointer. | Change flow to remove unnecessary label. | Remove now unused variable. | Hoist assignment from if. | | Signed-off-by: Joe Perches | Signed-off-by: David S. Miller | |commit e86ac13b031cf71d8f40ff513e627aac80e6b765 |Author: Mugunthan V N |Date: Mon Mar 11 23:16:35 2013 +0000 | | drivers: net: ethernet: cpsw: change cpts_active_slave to active_slave | | Change cpts_active_slave to active_slave so that the same DT property | can be used to ethtool and SIOCGMIIPHY. | | CC: Richard Cochran | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit d3bb9c58b567d240eaaa2dc8bd778696eaed5fbd |Author: Mugunthan V N |Date: Mon Mar 11 23:16:36 2013 +0000 | | driver: net: ethernet: cpsw: implement ethtool get/set phy setting | | This patch implements get/set of the phy settings via ethtool apis | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit ff5b8ef2ef3af0fd7e1cf6c8c1ed9ec5afbda422 |Author: Mugunthan V N |Date: Mon Mar 11 23:16:37 2013 +0000 | | driver: net: ethernet: cpsw: implement interrupt pacing via ethtool | | This patch implements support for interrupt pacing block of CPSW via ethtool | Inetrrupt pacing block is common of both the ethernet interface in | dual emac mode | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. 
Miller | |commit 11f2c988382b880e602a005c26436043c5d2c274 |Author: Mugunthan V N |Date: Mon Mar 11 23:16:38 2013 +0000 | | drivers: net: ethernet: cpsw: implement get phy_id via ioctl | | Implement get phy_id via ioctl SIOCGMIIPHY. In switch mode active phy_id | is returned and in dual EMAC mode slave's specific phy_id is returned. | | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit d35162f89b8f00537d7b240b76d2d0e8b8d29aa0 |Author: Daniel Mack |Date: Tue Mar 12 06:31:19 2013 +0000 | | net: ethernet: cpsw: fix usage of cpdma_check_free_tx_desc() | | Commit fae50823d0 ("net: ethernet: davinci_cpdma: Add boundary for rx | and tx descriptors") introduced a function to check the current | allocation state of tx packets. The return value is taken into account | to stop the netqork queue on the adapter in case there are no free | slots. | | However, cpdma_check_free_tx_desc() returns 'true' if there is room in | the bitmap, not 'false', so the usage of the function is wrong. | | Signed-off-by: Daniel Mack | Cc: Mugunthan V N | Reported-by: Sven Neumann | Reported-by: Andreas Fenkart | Tested-by: Mugunthan V N | Acked-by: Mugunthan V N | Tested-by: Andreas Fenkart | Signed-off-by: David S. Miller | |commit 75b9b61bb8a18e75afe7b10dd55681e748fa27df |Author: Mugunthan V N |Date: Fri Mar 15 04:10:16 2013 +0000 | | drivers: net: ethernet: ti: davinci_emac: fix usage of cpdma_check_free_tx_desc() | | Fix which was done in the following commit in cpsw driver has | to be taken forward to davinci emac driver as well. | | commit d35162f89b8f00537d7b240b76d2d0e8b8d29aa0 | Author: Daniel Mack | Date: Tue Mar 12 06:31:19 2013 +0000 | | net: ethernet: cpsw: fix usage of cpdma_check_free_tx_desc() | | Commit fae50823d0 ("net: ethernet: davinci_cpdma: Add boundary for rx | and tx descriptors") introduced a function to check the current | allocation state of tx packets. The return value is taken into account | to stop the netqork queue on the adapter in case there are no free | slots. | | However, cpdma_check_free_tx_desc() returns 'true' if there is room in | the bitmap, not 'false', so the usage of the function is wrong. | | Reported-by: Prabhakar Lad | Tested-by: Prabhakar Lad | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 61816596d1c9026d0ecb20c44f90452c41596ffe |Merge: 23a9072 da2191e |Author: David S. Miller |Date: Wed Mar 20 12:46:26 2013 -0400 | | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net | | Pull in the 'net' tree to get Daniel Borkmann's flow dissector | infrastructure change. | | Signed-off-by: David S. Miller | |commit e052a5893b78d43bd183c6cc33bc346efe6bc6e5 |Author: Wei Yongjun |Date: Wed Mar 20 05:01:45 2013 +0000 | | net: ethernet: davinci_emac: make local function emac_poll_controller() static | | emac_poll_controller() was not declared. It should be static. | | Signed-off-by: Wei Yongjun | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit ce16294fda230c787ce5c35f61b2f80d14d70a72 |Author: Lothar Waßmann |Date: Thu Mar 21 02:20:11 2013 +0000 | | net: ethernet: cpsw: fix erroneous condition in error check | | The error check in cpsw_probe_dt() has an '&&' where an '||' is | meant to be. This causes a NULL pointer dereference when incomplet DT | data is passed to the driver ('phy_id' property for cpsw_emac1 | missing). | | Signed-off-by: Lothar Waßmann | Signed-off-by: David S. Miller | |commit ea3d1cc285bf1ae1fa81b47418cd7fd79990bb06 |Merge: 2fa70df f4541d6 |Author: David S. 
Miller |Date: Fri Mar 22 12:53:09 2013 -0400 | | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net | | Pull to get the thermal netlink multicast group name fix, otherwise | the assertion added in net-next to netlink to detect that kind of bug | makes systems unbootable for some folks. | | Signed-off-by: David S. Miller | |commit b8092861efd827deb8d84292674704ee8bf41b04 |Author: Sekhar Nori |Date: Sun Mar 24 23:25:46 2013 +0000 | | net/davinci_emac: use devres APIs | | Use devres APIs where possible to simplify error handling | in driver probe. | | While at it, also rename the goto targets in error path to | introduce some consistency in how they are named. | | Signed-off-by: Sekhar Nori | Signed-off-by: David S. Miller | |commit b56d6b3fca6d1214dbc9c5655f26e5d4ec04afc8 |Author: Mugunthan V N |Date: Wed Mar 27 04:41:59 2013 +0000 | | drivers: net: ethernet: cpsw: use netif_wake_queue() while restarting tx queue | | To restart tx queue use netif_wake_queue() intead of netif_start_queue() | so that net schedule will restart transmission immediately which will | increase network performance while doing huge data transfers. | | Reported-by: Dan Franke | Suggested-by: Sriramakrishnan A G | Signed-off-by: Mugunthan V N | Acked-by: Eric Dumazet | Signed-off-by: David S. Miller | |commit 7e51cde276ca820d526c6c21cf8147df595a36bf |Author: Mugunthan V N |Date: Wed Mar 27 04:42:00 2013 +0000 | | drivers: net: ethernet: davinci_emac: use netif_wake_queue() while restarting tx queue | | To restart tx queue use netif_wake_queue() intead of netif_start_queue() | so that net schedule will restart transmission immediately which will | increase network performance while doing huge data transfers. | | Reported-by: Dan Franke | Suggested-by: Sriramakrishnan A G | Signed-off-by: Mugunthan V N | Acked-by: Eric Dumazet | Signed-off-by: David S. Miller | |commit a210576cf891e9e6d2c238eabcf5c1286b1e7526 |Merge: 7d4c04f 3658f36 |Author: David S. Miller |Date: Mon Apr 1 13:36:50 2013 -0400 | | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net | | Conflicts: | net/mac80211/sta_info.c | net/wireless/core.h | | Two minor conflicts in wireless. Overlapping additions of extern | declarations in net/wireless/core.h and a bug fix overlapping with | the addition of a boolean parameter to __ieee80211_key_free(). | | Signed-off-by: David S. Miller | |commit 91c4166c1a01c00b8bed74f7a7defa620071de88 |Author: Mugunthan V N |Date: Mon Apr 15 07:31:28 2013 +0000 | | drivers: net: ethernet: cpsw: get slave VLAN id from slave node instead of cpsw node | | Dual EMAC slave VLAN id must be got from slave node instead of cpsw node as | VLAN id for each slave will be different. | | Reported-by: Mark Jackson | Signed-off-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 1e0a8b13d35510e711fdf72e9a3e30bcb2bd49fa |Author: Devendra Naga |Date: Tue Apr 16 01:30:38 2013 +0000 | | tlan: cancel work at remove path | | the work has been scheduled from interrupt, and not been | cancelled when the driver is unloaded, which doesn't remove | the work item from the global workqueue. call the | cancel_work_sync when the driver is removed (rmmod'ed). | | Cc: Sriram | Cc: Cyril Chemparathy | Cc: Vinay Hegde | Signed-off-by: Devendra Naga | Signed-off-by: David S. 
Miller | |commit f646968f8f7c624587de729115d802372b9063dd |Author: Patrick McHardy |Date: Fri Apr 19 02:04:27 2013 +0000 | | net: vlan: rename NETIF_F_HW_VLAN_* feature flags to NETIF_F_HW_VLAN_CTAG_* | | Rename the hardware VLAN acceleration features to include "CTAG" to indicate | that they only support CTAGs. Follow up patches will introduce 802.1ad | server provider tagging (STAGs) and require the distinction for hardware not | supporting acclerating both. | | Signed-off-by: Patrick McHardy | Signed-off-by: David S. Miller | |commit 80d5c3689b886308247da295a228a54df49a44f6 |Author: Patrick McHardy |Date: Fri Apr 19 02:04:28 2013 +0000 | | net: vlan: prepare for 802.1ad VLAN filtering offload | | Change the rx_{add,kill}_vid callbacks to take a protocol argument in | preparation of 802.1ad support. The protocol argument used so far is | always htons(ETH_P_8021Q). | | Signed-off-by: Patrick McHardy | Signed-off-by: David S. Miller | |commit 6e0895c2ea326cc4bb11e8fa2f654628d5754c31 |Merge: 55fbbe4 60d509f |Author: David S. Miller |Date: Mon Apr 22 20:32:51 2013 -0400 | | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net | | Conflicts: | drivers/net/ethernet/emulex/benet/be_main.c | drivers/net/ethernet/intel/igb/igb_main.c | drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c | include/net/scm.h | net/batman-adv/routing.c | net/ipv4/tcp_input.c | | The e{uid,gid} --> {uid,gid} credentials fix conflicted with the | cleanup in net-next to now pass cred structs around. | | The be2net driver had a bug fix in 'net' that overlapped with the VLAN | interface changes by Patrick McHardy in net-next. | | An IGB conflict existed because in 'net' the build_skb() support was | reverted, and in 'net-next' there was a comment style fix within that | code. | | Several batman-adv conflicts were resolved by making sure that all | calls to batadv_is_my_mac() are changed to have a new bat_priv first | argument. | | Eric Dumazet's TS ECR fix in TCP in 'net' conflicted with the F-RTO | rewrite in 'net-next', mostly overlapping changes. | | Thanks to Stephen Rothwell and Antonio Quartulli for help with several | of these merge resolutions. | | Signed-off-by: David S. Miller | |commit 817f6d1a13754b043e1a6c1cb713763022860689 |Author: Sebastian Siewior |Date: Tue Apr 23 07:31:35 2013 +0000 | | net/davinci_cpdma: don't check for jiffies with interrupts | | __cpdma_chan_process() holds the lock with interrupts off (and its | caller as well), same goes for cpdma_ctlr_start(). With interrupts off, | jiffies will not make any progress and if the wait condition never gets | true we wait for ever. | Tgis patch adds a a simple udelay and counting down attempt. | | Acked-by: Mugunthan V N | Signed-off-by: Sebastian Andrzej Siewior | Signed-off-by: David S. Miller | |commit aacebbf8026ecdae1b55db3912e65c6b1308f5ed |Author: Sebastian Siewior |Date: Tue Apr 23 07:31:36 2013 +0000 | | net/cpsw: don't continue if we miss to allocate rx skbs | | if during "ifconfig up" we run out of mem we continue regardless how | many skbs we got. In worst case we have zero RX skbs and can't ever | receive further packets since the RX skbs are never reallocated. If | cpdma_chan_submit() fails we even leak the skb. | This patch changes the behavior here: | If we fail to allocate an skb during bring up we don't continue and | report that error. Same goes for errors from cpdma_chan_submit(). | While here I changed to __netdev_alloc_skb_ip_align() so GFP_KERNEL can | be used. 
|commit fd51cf199421197d14099b4ba382301cc28e5544 |Author: Sebastian Siewior |Date: Tue Apr 23 07:31:37 2013 +0000 | | net/cpsw: don't rely only on netif_running() to check which device is active | | netif_running() reports false before the ->ndo_stop() callback is | called. That means if one executes "ifconfig down" and the system | receives an interrupt before the interrupt source has been disabled we | hang forever for two reasons: | - we never disable the interrupt source because devices claim to be | already inactive and don't feel responsible. | - since the ISR always reports IRQ_HANDLED the line is never deactivated | because it looks like the ISR feels responsible. | | This patch changes the logic in the ISR a little: | - If none of the status registers reports an active source (RX or TX, | misc is ignored because it is not activated) we leave with IRQ_NONE. | - the interrupt is deactivated | - The first active network device is taken and napi is scheduled. If | none are active (a small race window between ndo_down() and the | interrupt) then we leave and should not come back because the | source is off. | There is no need to schedule the second NAPI because both share the | same dma queue. | | Signed-off-by: Sebastian Andrzej Siewior | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit aef614e13dfbdd3b9ae44ad110159f75b9029bba |Author: Sebastian Siewior |Date: Tue Apr 23 07:31:38 2013 +0000 | | net/davinci_cpdma: remove unused argument in cpdma_chan_submit() | | The gfp_mask argument is not used in cpdma_chan_submit() and always set | to GFP_KERNEL even in atomic sections. This patch drops it since it is | unused. | | Acked-by: Mugunthan V N | Signed-off-by: Sebastian Andrzej Siewior | Signed-off-by: David S. Miller | |commit b4727e69b81b71c6e9696185091e8256d863f9be |Author: Sebastian Siewior |Date: Tue Apr 23 07:31:39 2013 +0000 | | net/cpsw: redo rx skb allocation in rx path | | If we run into OOM during the allocation of the new rx-skb we | don't get one and we have one skb less than we used to have. If this | continues to happen then we end up with no rx-skbs at all. | This patch changes the following: | - if we fail to allocate the new skb, then we treat the currently | completed skb as the new one and so drop the currently received data. | - instead of testing multiple times if the device is gone we rely on | the status field which is set to -ENOSYS in case the channel is going | down and incomplete requests are purged. | cpdma_chan_stop() removes most of the packets with -ENOSYS. The | currently active packet which is removed has the "tear down" bit set. | So if that bit is set, we send ENOSYS as well; otherwise we pass the | status bits which are required to figure out which of the two possible | cases just finished. | | Acked-by: Mugunthan V N | Signed-off-by: Sebastian Andrzej Siewior | Signed-off-by: David S. Miller | |commit 4bc21d4162366bb892dc1a4a92110c656e2622ca |Author: Sebastian Siewior |Date: Wed Apr 24 08:48:22 2013 +0000 | | net/ti: add MODULE_DEVICE_TABLE + MODULE_LICENSE | | If compiled as modules each one of these modules is missing something. | With this patch the modules are loaded on demand and don't taint the | kernel due to license issues. | | Signed-off-by: Sebastian Andrzej Siewior | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller |
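The rx-path rework above keeps the rx ring populated even under memory pressure: if a replacement skb cannot be allocated, the just-completed buffer is recycled and the received frame is dropped. A rough sketch of that strategy, with hypothetical names and the DMA resubmission left to the caller:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Rx-completion sketch: allocate the replacement buffer first; only hand
 * the filled skb to the stack if the replacement succeeded, otherwise drop
 * the frame and reuse the old buffer so the ring never runs dry. */
static struct sk_buff *example_rx_complete(struct net_device *ndev,
					   struct sk_buff *skb, int len)
{
	struct sk_buff *new_skb;

	new_skb = netdev_alloc_skb_ip_align(ndev, ndev->mtu + ETH_HLEN);
	if (new_skb) {
		skb_put(skb, len);
		skb->protocol = eth_type_trans(skb, ndev);
		netif_receive_skb(skb);
		ndev->stats.rx_packets++;
	} else {
		ndev->stats.rx_dropped++;
		new_skb = skb;		/* recycle the old buffer */
	}
	return new_skb;			/* caller resubmits this to the rx DMA ring */
}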
|commit d1bd9acfa3419dc9d5c32589b34a370ca6ae100e |Author: Sebastian Siewior |Date: Wed Apr 24 08:48:23 2013 +0000 | | net/cpsw: make sure module remove does not leak any resources | | This driver does not clean up properly after leaving. Here is a list: | - Use unregister_netdev(). free_netdev() is good but not enough | - Use the above also on the other ndev in case of dual mac | - Free data.slave_data. The name of the structure makes it look like | it is platform_data but it is not. It is just a trick! | - Free all irqs. Again: freeing one irq is a good start, but freeing all | of them is better. | | With this, rmmod & modprobe of cpsw seems to work. The remaining issue | is: | |WARNING: at fs/sysfs/dir.c:536 sysfs_add_one+0x9c/0xd4() | |sysfs: cannot create duplicate filename '/devices/ocp.2/4a100000.ethernet/4a101000.mdio' | |WARNING: at lib/kobject.c:196 kobject_add_internal+0x1a4/0x1c8() | | coming from of_platform_populate() and I am not sure that this belongs | here. | | Signed-off-by: Sebastian Andrzej Siewior | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit 6e6ceaedb5901c7ebd23e5222726dab5362938bd |Author: Sebastian Siewior |Date: Wed Apr 24 08:48:24 2013 +0000 | | net/cpsw: optimize the for_each_slave() macro | | text data bss dec hex filename | 15530 92 4 15626 3d0a cpsw.o.before | 15478 92 4 15574 3cd6 cpsw.o.after | | 52 bytes smaller, 13 for each invocation. | | Signed-off-by: Sebastian Andrzej Siewior | Acked-by: Mugunthan V N | Signed-off-by: David S. Miller | |commit a11fbba9a7d338c4a4e4be624af0334bbf2c9a5a |Author: Sebastian Siewior |Date: Wed Apr 24 08:48:25 2013 +0000 | | net/cpsw: fix irq_disable() with threaded interrupts | | During high throughput it is likely that we receive both: an RX and TX | interrupt. The normal behaviour is that once we enter the ISR the | interrupts are disabled in the IRQ chip and so the ISR is invoked only | once and the interrupt line is disabled once. It will be re-enabled | after napi completes. | With threaded interrupts on the other hand the interrupt | is disabled immediately and the ISR is marked for "later". By having TX | and RX interrupts marked pending we invoke them both and disable the | interrupt line twice. The napi callback is still executed once and so | after it completes we remain with interrupts disabled. | | The initial patch simply removed the cpsw_{enable|disable}_irq() calls | and it worked well on my AM335X ES1.0 (beagle bone). On ES2.0 (beagle | bone black) it caused a never ending interrupt (even after the mask via | cpsw_intr_disable()) according to Mugunthan V N. Since I don't have the | ES2.0 and have no idea what is going on, this patch tracks the state of the | irq_disable() call and executes it only when not yet done. | The bookkeeping is done on the first struct since with dual_emac we can | have two of those and only one interrupt line. | | Signed-off-by: Sebastian Andrzej Siewior | Acked-by: Mugunthan V N | Signed-off-by: David S.
Miller Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index c2f14e8..91fe4f1 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -349,7 +349,7 @@ rx_descs = <64>; mac_control = <0x20>; slaves = <2>; - cpts_active_slave = <0>; + active_slave = <0>; cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; reg = <0x4a100000 0x800 @@ -385,5 +385,19 @@ mac-address = [ 00 00 00 00 00 00 ]; }; }; + + ocmcram: ocmcram@40300000 { + compatible = "ti,am3352-ocmcram"; + reg = <0x40300000 0x10000>; + ti,hwmods = "ocmcram"; + ti,no_idle_on_suspend; + }; + + wkup_m3: wkup_m3@44d00000 { + compatible = "ti,am3353-wkup-m3"; + reg = <0x44d00000 0x4000 /* M3 UMEM */ + 0x44d80000 0x2000>; /* M3 DMEM */ + ti,hwmods = "wkup_m3"; + }; }; }; diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 4426151..de71b1e 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -88,8 +88,8 @@ config TLAN Please email feedback to . config CPMAC - tristate "TI AR7 CPMAC Ethernet support (EXPERIMENTAL)" - depends on EXPERIMENTAL && AR7 + tristate "TI AR7 CPMAC Ethernet support" + depends on AR7 select PHYLIB ---help--- TI AR7 CPMAC Ethernet support diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index d9625f6..31bbbca 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -904,10 +904,9 @@ static int cpmac_set_ringparam(struct net_device *dev, static void cpmac_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strcpy(info->driver, "cpmac"); - strcpy(info->version, CPMAC_VERSION); - info->fw_version[0] = '\0'; - sprintf(info->bus_info, "%s", "cpmac"); + strlcpy(info->driver, "cpmac", sizeof(info->driver)); + strlcpy(info->version, CPMAC_VERSION, sizeof(info->version)); + snprintf(info->bus_info, sizeof(info->bus_info), "%s", "cpmac"); info->regdump_len = 0; } @@ -1173,8 +1172,8 @@ static int cpmac_probe(struct platform_device *pdev) snprintf(priv->phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id); - priv->phy = phy_connect(dev, priv->phy_name, cpmac_adjust_link, 0, - PHY_INTERFACE_MODE_MII); + priv->phy = phy_connect(dev, priv->phy_name, cpmac_adjust_link, + PHY_INTERFACE_MODE_MII); if (IS_ERR(priv->phy)) { if (netif_msg_drv(priv)) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 40aff68..4e2d224 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -118,6 +119,20 @@ do { \ #define TX_PRIORITY_MAPPING 0x33221100 #define CPDMA_TX_PRIORITY_MAP 0x76543210 +#define CPSW_VLAN_AWARE BIT(1) +#define CPSW_ALE_VLAN_AWARE 1 + +#define CPSW_FIFO_NORMAL_MODE (0 << 15) +#define CPSW_FIFO_DUAL_MAC_MODE (1 << 15) +#define CPSW_FIFO_RATE_LIMIT_MODE (2 << 15) + +#define CPSW_INTPACEEN (0x3f << 16) +#define CPSW_INTPRESCALE_MASK (0x7FF << 0) +#define CPSW_CMINTMAX_CNT 63 +#define CPSW_CMINTMIN_CNT 2 +#define CPSW_CMINTMAX_INTVL (1000 / CPSW_CMINTMIN_CNT) +#define CPSW_CMINTMIN_INTVL ((1000 / CPSW_CMINTMAX_CNT) + 1) + #define cpsw_enable_irq(priv) \ do { \ u32 i; \ @@ -131,6 +146,10 @@ do { \ disable_irq_nosync(priv->irqs_table[i]); \ } while (0); +#define cpsw_slave_index(priv) \ + ((priv->data.dual_emac) ? 
priv->emac_port : \ + priv->data.active_slave) + static int debug_level; module_param(debug_level, int, 0); MODULE_PARM_DESC(debug_level, "cpsw debug level (NETIF_MSG bits)"); @@ -152,6 +171,15 @@ struct cpsw_wr_regs { u32 rx_en; u32 tx_en; u32 misc_en; + u32 mem_allign1[8]; + u32 rx_thresh_stat; + u32 rx_stat; + u32 tx_stat; + u32 misc_stat; + u32 mem_allign2[8]; + u32 rx_imax; + u32 tx_imax; + }; struct cpsw_ss_regs { @@ -250,7 +278,7 @@ struct cpsw_ss_regs { struct cpsw_host_regs { u32 max_blks; u32 blk_cnt; - u32 flow_thresh; + u32 tx_in_ctl; u32 port_vlan; u32 tx_pri_map; u32 cpdma_tx_pri_map; @@ -277,6 +305,9 @@ struct cpsw_slave { u32 mac_control; struct cpsw_slave_data *data; struct phy_device *phy; + struct net_device *ndev; + u32 port_vlan; + u32 open_stat; }; static inline u32 slave_read(struct cpsw_slave *slave, u32 offset) @@ -303,6 +334,8 @@ struct cpsw_priv { struct cpsw_host_regs __iomem *host_port_regs; u32 msg_enable; u32 version; + u32 coal_intvl; + u32 bus_freq_mhz; struct net_device_stats stats; int rx_packet_max; int host_port; @@ -315,17 +348,69 @@ struct cpsw_priv { /* snapshot of IRQ numbers */ u32 irqs_table[4]; u32 num_irqs; - struct cpts cpts; + bool irq_enabled; + struct cpts *cpts; + u32 emac_port; }; #define napi_to_priv(napi) container_of(napi, struct cpsw_priv, napi) -#define for_each_slave(priv, func, arg...) \ - do { \ - int idx; \ - for (idx = 0; idx < (priv)->data.slaves; idx++) \ - (func)((priv)->slaves + idx, ##arg); \ +#define for_each_slave(priv, func, arg...) \ + do { \ + struct cpsw_slave *slave; \ + int n; \ + if (priv->data.dual_emac) \ + (func)((priv)->slaves + priv->emac_port, ##arg);\ + else \ + for (n = (priv)->data.slaves, \ + slave = (priv)->slaves; \ + n; n--) \ + (func)(slave++, ##arg); \ + } while (0) +#define cpsw_get_slave_ndev(priv, __slave_no__) \ + (priv->slaves[__slave_no__].ndev) +#define cpsw_get_slave_priv(priv, __slave_no__) \ + ((priv->slaves[__slave_no__].ndev) ? 
\ + netdev_priv(priv->slaves[__slave_no__].ndev) : NULL) \ + +#define cpsw_dual_emac_src_port_detect(status, priv, ndev, skb) \ + do { \ + if (!priv->data.dual_emac) \ + break; \ + if (CPDMA_RX_SOURCE_PORT(status) == 1) { \ + ndev = cpsw_get_slave_ndev(priv, 0); \ + priv = netdev_priv(ndev); \ + skb->dev = ndev; \ + } else if (CPDMA_RX_SOURCE_PORT(status) == 2) { \ + ndev = cpsw_get_slave_ndev(priv, 1); \ + priv = netdev_priv(ndev); \ + skb->dev = ndev; \ + } \ + } while (0) +#define cpsw_add_mcast(priv, addr) \ + do { \ + if (priv->data.dual_emac) { \ + struct cpsw_slave *slave = priv->slaves + \ + priv->emac_port; \ + int slave_port = cpsw_get_slave_port(priv, \ + slave->slave_num); \ + cpsw_ale_add_mcast(priv->ale, addr, \ + 1 << slave_port | 1 << priv->host_port, \ + ALE_VLAN, slave->port_vlan, 0); \ + } else { \ + cpsw_ale_add_mcast(priv->ale, addr, \ + ALE_ALL_PORTS << priv->host_port, \ + 0, 0, 0); \ + } \ } while (0) +static inline int cpsw_get_slave_port(struct cpsw_priv *priv, u32 slave_num) +{ + if (priv->host_port == 0) + return slave_num + 1; + else + return slave_num; +} + static void cpsw_ndo_set_rx_mode(struct net_device *ndev) { struct cpsw_priv *priv = netdev_priv(ndev); @@ -344,8 +429,7 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev) /* program multicast address list into ALE register */ netdev_for_each_mc_addr(ha, ndev) { - cpsw_ale_add_mcast(priv->ale, (u8 *)ha->addr, - ALE_ALL_PORTS << priv->host_port, 0, 0); + cpsw_add_mcast(priv, (u8 *)ha->addr); } } } @@ -374,9 +458,12 @@ void cpsw_tx_handler(void *token, int len, int status) struct net_device *ndev = skb->dev; struct cpsw_priv *priv = netdev_priv(ndev); + /* Check whether the queue is stopped due to stalled tx dma, if the + * queue is stopped then start the queue as we have free desc for tx + */ if (unlikely(netif_queue_stopped(ndev))) - netif_start_queue(ndev); - cpts_tx_timestamp(&priv->cpts, skb); + netif_wake_queue(ndev); + cpts_tx_timestamp(priv->cpts, skb); priv->stats.tx_packets++; priv->stats.tx_bytes += len; dev_kfree_skb_any(skb); @@ -385,61 +472,69 @@ void cpsw_tx_handler(void *token, int len, int status) void cpsw_rx_handler(void *token, int len, int status) { struct sk_buff *skb = token; + struct sk_buff *new_skb; struct net_device *ndev = skb->dev; struct cpsw_priv *priv = netdev_priv(ndev); int ret = 0; - /* free and bail if we are shutting down */ - if (unlikely(!netif_running(ndev)) || - unlikely(!netif_carrier_ok(ndev))) { + cpsw_dual_emac_src_port_detect(status, priv, ndev, skb); + + if (unlikely(status < 0)) { + /* the interface is going down, skbs are purged */ dev_kfree_skb_any(skb); return; } - if (likely(status >= 0)) { + + new_skb = netdev_alloc_skb_ip_align(ndev, priv->rx_packet_max); + if (new_skb) { skb_put(skb, len); - cpts_rx_timestamp(&priv->cpts, skb); + cpts_rx_timestamp(priv->cpts, skb); skb->protocol = eth_type_trans(skb, ndev); netif_receive_skb(skb); priv->stats.rx_bytes += len; priv->stats.rx_packets++; - skb = NULL; - } - - if (unlikely(!netif_running(ndev))) { - if (skb) - dev_kfree_skb_any(skb); - return; + } else { + priv->stats.rx_dropped++; + new_skb = skb; } - if (likely(!skb)) { - skb = netdev_alloc_skb_ip_align(ndev, priv->rx_packet_max); - if (WARN_ON(!skb)) - return; - - ret = cpdma_chan_submit(priv->rxch, skb, skb->data, - skb_tailroom(skb), GFP_KERNEL); - } - WARN_ON(ret < 0); + ret = cpdma_chan_submit(priv->rxch, new_skb, new_skb->data, + skb_tailroom(new_skb), 0); + if (WARN_ON(ret < 0)) + dev_kfree_skb_any(new_skb); } static irqreturn_t 
cpsw_interrupt(int irq, void *dev_id) { struct cpsw_priv *priv = dev_id; + u32 rx, tx, rx_thresh; - if (likely(netif_running(priv->ndev))) { - cpsw_intr_disable(priv); + rx_thresh = __raw_readl(&priv->wr_regs->rx_thresh_stat); + rx = __raw_readl(&priv->wr_regs->rx_stat); + tx = __raw_readl(&priv->wr_regs->tx_stat); + if (!rx_thresh && !rx && !tx) + return IRQ_NONE; + + cpsw_intr_disable(priv); + if (priv->irq_enabled == true) { cpsw_disable_irq(priv); + priv->irq_enabled = false; + } + + if (netif_running(priv->ndev)) { napi_schedule(&priv->napi); + return IRQ_HANDLED; } - return IRQ_HANDLED; -} -static inline int cpsw_get_slave_port(struct cpsw_priv *priv, u32 slave_num) -{ - if (priv->host_port == 0) - return slave_num + 1; - else - return slave_num; + priv = cpsw_get_slave_priv(priv, 1); + if (!priv) + return IRQ_NONE; + + if (netif_running(priv->ndev)) { + napi_schedule(&priv->napi); + return IRQ_HANDLED; + } + return IRQ_NONE; } static int cpsw_poll(struct napi_struct *napi, int budget) @@ -448,19 +543,27 @@ static int cpsw_poll(struct napi_struct *napi, int budget) int num_tx, num_rx; num_tx = cpdma_chan_process(priv->txch, 128); - num_rx = cpdma_chan_process(priv->rxch, budget); - - if (num_rx || num_tx) - cpsw_dbg(priv, intr, "poll %d rx, %d tx pkts\n", - num_rx, num_tx); + if (num_tx) + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_TX); + num_rx = cpdma_chan_process(priv->rxch, budget); if (num_rx < budget) { + struct cpsw_priv *prim_cpsw; + napi_complete(napi); cpsw_intr_enable(priv); - cpdma_ctlr_eoi(priv->dma); - cpsw_enable_irq(priv); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_RX); + prim_cpsw = cpsw_get_slave_priv(priv, 0); + if (prim_cpsw->irq_enabled == false) { + cpsw_enable_irq(priv); + prim_cpsw->irq_enabled = true; + } } + if (num_rx || num_tx) + cpsw_dbg(priv, intr, "poll %d rx, %d tx pkts\n", + num_rx, num_tx); + return num_rx; } @@ -548,6 +651,77 @@ static void cpsw_adjust_link(struct net_device *ndev) } } +static int cpsw_get_coalesce(struct net_device *ndev, + struct ethtool_coalesce *coal) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + + coal->rx_coalesce_usecs = priv->coal_intvl; + return 0; +} + +static int cpsw_set_coalesce(struct net_device *ndev, + struct ethtool_coalesce *coal) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + u32 int_ctrl; + u32 num_interrupts = 0; + u32 prescale = 0; + u32 addnl_dvdr = 1; + u32 coal_intvl = 0; + + if (!coal->rx_coalesce_usecs) + return -EINVAL; + + coal_intvl = coal->rx_coalesce_usecs; + + int_ctrl = readl(&priv->wr_regs->int_control); + prescale = priv->bus_freq_mhz * 4; + + if (coal_intvl < CPSW_CMINTMIN_INTVL) + coal_intvl = CPSW_CMINTMIN_INTVL; + + if (coal_intvl > CPSW_CMINTMAX_INTVL) { + /* Interrupt pacer works with 4us Pulse, we can + * throttle further by dilating the 4us pulse. 
+ */ + addnl_dvdr = CPSW_INTPRESCALE_MASK / prescale; + + if (addnl_dvdr > 1) { + prescale *= addnl_dvdr; + if (coal_intvl > (CPSW_CMINTMAX_INTVL * addnl_dvdr)) + coal_intvl = (CPSW_CMINTMAX_INTVL + * addnl_dvdr); + } else { + addnl_dvdr = 1; + coal_intvl = CPSW_CMINTMAX_INTVL; + } + } + + num_interrupts = (1000 * addnl_dvdr) / coal_intvl; + writel(num_interrupts, &priv->wr_regs->rx_imax); + writel(num_interrupts, &priv->wr_regs->tx_imax); + + int_ctrl |= CPSW_INTPACEEN; + int_ctrl &= (~CPSW_INTPRESCALE_MASK); + int_ctrl |= (prescale & CPSW_INTPRESCALE_MASK); + writel(int_ctrl, &priv->wr_regs->int_control); + + cpsw_notice(priv, timer, "Set coalesce to %d usecs.\n", coal_intvl); + if (priv->data.dual_emac) { + int i; + + for (i = 0; i < priv->data.slaves; i++) { + priv = netdev_priv(priv->slaves[i].ndev); + priv->coal_intvl = coal_intvl; + } + } else { + priv->coal_intvl = coal_intvl; + } + + return 0; +} + static inline int __show_stat(char *buf, int maxlen, const char *name, u32 val) { static char *leader = "........................................"; @@ -559,6 +733,54 @@ static inline int __show_stat(char *buf, int maxlen, const char *name, u32 val) leader + strlen(name), val); } +static int cpsw_common_res_usage_state(struct cpsw_priv *priv) +{ + u32 i; + u32 usage_count = 0; + + if (!priv->data.dual_emac) + return 0; + + for (i = 0; i < priv->data.slaves; i++) + if (priv->slaves[i].open_stat) + usage_count++; + + return usage_count; +} + +static inline int cpsw_tx_packet_submit(struct net_device *ndev, + struct cpsw_priv *priv, struct sk_buff *skb) +{ + if (!priv->data.dual_emac) + return cpdma_chan_submit(priv->txch, skb, skb->data, + skb->len, 0); + + if (ndev == cpsw_get_slave_ndev(priv, 0)) + return cpdma_chan_submit(priv->txch, skb, skb->data, + skb->len, 1); + else + return cpdma_chan_submit(priv->txch, skb, skb->data, + skb->len, 2); +} + +static inline void cpsw_add_dual_emac_def_ale_entries( + struct cpsw_priv *priv, struct cpsw_slave *slave, + u32 slave_port) +{ + u32 port_mask = 1 << slave_port | 1 << priv->host_port; + + if (priv->version == CPSW_VERSION_1) + slave_write(slave, slave->port_vlan, CPSW1_PORT_VLAN); + else + slave_write(slave, slave->port_vlan, CPSW2_PORT_VLAN); + cpsw_ale_add_vlan(priv->ale, slave->port_vlan, port_mask, + port_mask, port_mask, 0); + cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, + port_mask, ALE_VLAN, slave->port_vlan, 0); + cpsw_ale_add_ucast(priv->ale, priv->mac_addr, + priv->host_port, ALE_VLAN, slave->port_vlan); +} + static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) { char name[32]; @@ -588,11 +810,14 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) slave_port = cpsw_get_slave_port(priv, slave->slave_num); - cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, - 1 << slave_port, 0, ALE_MCAST_FWD_2); + if (priv->data.dual_emac) + cpsw_add_dual_emac_def_ale_entries(priv, slave, slave_port); + else + cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, + 1 << slave_port, 0, 0, ALE_MCAST_FWD_2); slave->phy = phy_connect(priv->ndev, slave->data->phy_id, - &cpsw_adjust_link, 0, slave->data->phy_if); + &cpsw_adjust_link, slave->data->phy_if); if (IS_ERR(slave->phy)) { dev_err(priv->dev, "phy %s not found on slave %d\n", slave->data->phy_id, slave->slave_num); @@ -604,14 +829,44 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) } } +static inline void cpsw_add_default_vlan(struct cpsw_priv *priv) +{ + const int vlan = priv->data.default_vlan; + const 
int port = priv->host_port; + u32 reg; + int i; + + reg = (priv->version == CPSW_VERSION_1) ? CPSW1_PORT_VLAN : + CPSW2_PORT_VLAN; + + writel(vlan, &priv->host_port_regs->port_vlan); + + for (i = 0; i < priv->data.slaves; i++) + slave_write(priv->slaves + i, vlan, reg); + + cpsw_ale_add_vlan(priv->ale, vlan, ALE_ALL_PORTS << port, + ALE_ALL_PORTS << port, ALE_ALL_PORTS << port, + (ALE_PORT_1 | ALE_PORT_2) << port); +} + static void cpsw_init_host_port(struct cpsw_priv *priv) { + u32 control_reg; + u32 fifo_mode; + /* soft reset the controller and initialize ale */ soft_reset("cpsw", &priv->regs->soft_reset); cpsw_ale_start(priv->ale); /* switch to vlan unaware mode */ - cpsw_ale_control_set(priv->ale, 0, ALE_VLAN_AWARE, 0); + cpsw_ale_control_set(priv->ale, priv->host_port, ALE_VLAN_AWARE, + CPSW_ALE_VLAN_AWARE); + control_reg = readl(&priv->regs->control); + control_reg |= CPSW_VLAN_AWARE; + writel(control_reg, &priv->regs->control); + fifo_mode = (priv->data.dual_emac) ? CPSW_FIFO_DUAL_MAC_MODE : + CPSW_FIFO_NORMAL_MODE; + writel(fifo_mode, &priv->host_port_regs->tx_in_ctl); /* setup host port priority mapping */ __raw_writel(CPDMA_TX_PRIORITY_MAP, @@ -621,18 +876,32 @@ static void cpsw_init_host_port(struct cpsw_priv *priv) cpsw_ale_control_set(priv->ale, priv->host_port, ALE_PORT_STATE, ALE_PORT_STATE_FORWARD); - cpsw_ale_add_ucast(priv->ale, priv->mac_addr, priv->host_port, 0); - cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, - 1 << priv->host_port, 0, ALE_MCAST_FWD_2); + if (!priv->data.dual_emac) { + cpsw_ale_add_ucast(priv->ale, priv->mac_addr, priv->host_port, + 0, 0); + cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, + 1 << priv->host_port, 0, 0, ALE_MCAST_FWD_2); + } +} + +static void cpsw_slave_stop(struct cpsw_slave *slave, struct cpsw_priv *priv) +{ + if (!slave->phy) + return; + phy_stop(slave->phy); + phy_disconnect(slave->phy); + slave->phy = NULL; } static int cpsw_ndo_open(struct net_device *ndev) { struct cpsw_priv *priv = netdev_priv(ndev); + struct cpsw_priv *prim_cpsw; int i, ret; u32 reg; - cpsw_intr_disable(priv); + if (!cpsw_common_res_usage_state(priv)) + cpsw_intr_disable(priv); netif_carrier_off(ndev); pm_runtime_get_sync(&priv->pdev->dev); @@ -644,53 +913,81 @@ static int cpsw_ndo_open(struct net_device *ndev) CPSW_RTL_VERSION(reg)); /* initialize host and slave ports */ - cpsw_init_host_port(priv); + if (!cpsw_common_res_usage_state(priv)) + cpsw_init_host_port(priv); for_each_slave(priv, cpsw_slave_open, priv); - /* setup tx dma to fixed prio and zero offset */ - cpdma_control_set(priv->dma, CPDMA_TX_PRIO_FIXED, 1); - cpdma_control_set(priv->dma, CPDMA_RX_BUFFER_OFFSET, 0); - - /* disable priority elevation and enable statistics on all ports */ - __raw_writel(0, &priv->regs->ptype); - - /* enable statistics collection only on the host port */ - __raw_writel(0x7, &priv->regs->stat_port_en); + /* Add default VLAN */ + if (!priv->data.dual_emac) + cpsw_add_default_vlan(priv); + + if (!cpsw_common_res_usage_state(priv)) { + /* setup tx dma to fixed prio and zero offset */ + cpdma_control_set(priv->dma, CPDMA_TX_PRIO_FIXED, 1); + cpdma_control_set(priv->dma, CPDMA_RX_BUFFER_OFFSET, 0); + + /* disable priority elevation */ + __raw_writel(0, &priv->regs->ptype); + + /* enable statistics collection only on all ports */ + __raw_writel(0x7, &priv->regs->stat_port_en); + + if (WARN_ON(!priv->data.rx_descs)) + priv->data.rx_descs = 128; + + for (i = 0; i < priv->data.rx_descs; i++) { + struct sk_buff *skb; + + ret = -ENOMEM; + skb = 
__netdev_alloc_skb_ip_align(priv->ndev, + priv->rx_packet_max, GFP_KERNEL); + if (!skb) + goto err_cleanup; + ret = cpdma_chan_submit(priv->rxch, skb, skb->data, + skb_tailroom(skb), 0); + if (ret < 0) { + kfree_skb(skb); + goto err_cleanup; + } + } + /* continue even if we didn't manage to submit all + * receive descs + */ + cpsw_info(priv, ifup, "submitted %d rx descriptors\n", i); + } - if (WARN_ON(!priv->data.rx_descs)) - priv->data.rx_descs = 128; + /* Enable Interrupt pacing if configured */ + if (priv->coal_intvl != 0) { + struct ethtool_coalesce coal; - for (i = 0; i < priv->data.rx_descs; i++) { - struct sk_buff *skb; + coal.rx_coalesce_usecs = (priv->coal_intvl << 4); + cpsw_set_coalesce(ndev, &coal); + } - ret = -ENOMEM; - skb = netdev_alloc_skb_ip_align(priv->ndev, - priv->rx_packet_max); - if (!skb) - break; - ret = cpdma_chan_submit(priv->rxch, skb, skb->data, - skb_tailroom(skb), GFP_KERNEL); - if (WARN_ON(ret < 0)) - break; + prim_cpsw = cpsw_get_slave_priv(priv, 0); + if (prim_cpsw->irq_enabled == false) { + if ((priv == prim_cpsw) || !netif_running(prim_cpsw->ndev)) { + prim_cpsw->irq_enabled = true; + cpsw_enable_irq(prim_cpsw); + } } - /* continue even if we didn't manage to submit all receive descs */ - cpsw_info(priv, ifup, "submitted %d rx descriptors\n", i); cpdma_ctlr_start(priv->dma); cpsw_intr_enable(priv); napi_enable(&priv->napi); - cpdma_ctlr_eoi(priv->dma); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_RX); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_TX); + if (priv->data.dual_emac) + priv->slaves[priv->emac_port].open_stat = true; return 0; -} -static void cpsw_slave_stop(struct cpsw_slave *slave, struct cpsw_priv *priv) -{ - if (!slave->phy) - return; - phy_stop(slave->phy); - phy_disconnect(slave->phy); - slave->phy = NULL; +err_cleanup: + cpdma_ctlr_stop(priv->dma); + for_each_slave(priv, cpsw_slave_stop, priv); + pm_runtime_put_sync(&priv->pdev->dev); + netif_carrier_off(priv->ndev); + return ret; } static int cpsw_ndo_stop(struct net_device *ndev) @@ -701,12 +998,17 @@ static int cpsw_ndo_stop(struct net_device *ndev) netif_stop_queue(priv->ndev); napi_disable(&priv->napi); netif_carrier_off(priv->ndev); - cpsw_intr_disable(priv); - cpdma_ctlr_int_ctrl(priv->dma, false); - cpdma_ctlr_stop(priv->dma); - cpsw_ale_stop(priv->ale); + + if (cpsw_common_res_usage_state(priv) <= 1) { + cpsw_intr_disable(priv); + cpdma_ctlr_int_ctrl(priv->dma, false); + cpdma_ctlr_stop(priv->dma); + cpsw_ale_stop(priv->ale); + } for_each_slave(priv, cpsw_slave_stop, priv); pm_runtime_put_sync(&priv->pdev->dev); + if (priv->data.dual_emac) + priv->slaves[priv->emac_port].open_stat = false; return 0; } @@ -724,18 +1026,24 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } - if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP && priv->cpts.tx_enable) + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP && + priv->cpts->tx_enable) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_tx_timestamp(skb); - ret = cpdma_chan_submit(priv->txch, skb, skb->data, - skb->len, GFP_KERNEL); + ret = cpsw_tx_packet_submit(ndev, priv, skb); if (unlikely(ret != 0)) { cpsw_err(priv, tx_err, "desc submit failed\n"); goto fail; } + /* If there is no more tx desc left free then we need to + * tell the kernel to stop sending us tx frames. 
+ */ + if (unlikely(!cpdma_check_free_tx_desc(priv->txch))) + netif_stop_queue(ndev); + return NETDEV_TX_OK; fail: priv->stats.tx_dropped++; @@ -770,10 +1078,10 @@ static void cpsw_ndo_change_rx_flags(struct net_device *ndev, int flags) static void cpsw_hwtstamp_v1(struct cpsw_priv *priv) { - struct cpsw_slave *slave = &priv->slaves[priv->data.cpts_active_slave]; + struct cpsw_slave *slave = &priv->slaves[priv->data.active_slave]; u32 ts_en, seq_id; - if (!priv->cpts.tx_enable && !priv->cpts.rx_enable) { + if (!priv->cpts->tx_enable && !priv->cpts->rx_enable) { slave_write(slave, 0, CPSW1_TS_CTL); return; } @@ -781,10 +1089,10 @@ static void cpsw_hwtstamp_v1(struct cpsw_priv *priv) seq_id = (30 << CPSW_V1_SEQ_ID_OFS_SHIFT) | ETH_P_1588; ts_en = EVENT_MSG_BITS << CPSW_V1_MSG_TYPE_OFS; - if (priv->cpts.tx_enable) + if (priv->cpts->tx_enable) ts_en |= CPSW_V1_TS_TX_EN; - if (priv->cpts.rx_enable) + if (priv->cpts->rx_enable) ts_en |= CPSW_V1_TS_RX_EN; slave_write(slave, ts_en, CPSW1_TS_CTL); @@ -793,16 +1101,21 @@ static void cpsw_hwtstamp_v1(struct cpsw_priv *priv) static void cpsw_hwtstamp_v2(struct cpsw_priv *priv) { - struct cpsw_slave *slave = &priv->slaves[priv->data.cpts_active_slave]; + struct cpsw_slave *slave; u32 ctrl, mtype; + if (priv->data.dual_emac) + slave = &priv->slaves[priv->emac_port]; + else + slave = &priv->slaves[priv->data.active_slave]; + ctrl = slave_read(slave, CPSW2_CONTROL); ctrl &= ~CTRL_ALL_TS_MASK; - if (priv->cpts.tx_enable) + if (priv->cpts->tx_enable) ctrl |= CTRL_TX_TS_BITS; - if (priv->cpts.rx_enable) + if (priv->cpts->rx_enable) ctrl |= CTRL_RX_TS_BITS; mtype = (30 << TS_SEQ_ID_OFFSET_SHIFT) | EVENT_MSG_BITS; @@ -815,7 +1128,7 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv) static int cpsw_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr) { struct cpsw_priv *priv = netdev_priv(dev); - struct cpts *cpts = &priv->cpts; + struct cpts *cpts = priv->cpts; struct hwtstamp_config cfg; if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) @@ -879,14 +1192,26 @@ static int cpsw_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr) static int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd) { + struct cpsw_priv *priv = netdev_priv(dev); + struct mii_ioctl_data *data = if_mii(req); + int slave_no = cpsw_slave_index(priv); + if (!netif_running(dev)) return -EINVAL; + switch (cmd) { #ifdef CONFIG_TI_CPTS - if (cmd == SIOCSHWTSTAMP) + case SIOCSHWTSTAMP: return cpsw_hwtstamp_ioctl(dev, req); #endif - return -ENOTSUPP; + case SIOCGMIIPHY: + data->phy_id = priv->slaves[slave_no].phy->addr; + break; + default: + return -ENOTSUPP; + } + + return 0; } static void cpsw_ndo_tx_timeout(struct net_device *ndev) @@ -901,7 +1226,9 @@ static void cpsw_ndo_tx_timeout(struct net_device *ndev) cpdma_chan_start(priv->txch); cpdma_ctlr_int_ctrl(priv->dma, true); cpsw_intr_enable(priv); - cpdma_ctlr_eoi(priv->dma); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_RX); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_TX); + } static struct net_device_stats *cpsw_ndo_get_stats(struct net_device *ndev) @@ -920,10 +1247,79 @@ static void cpsw_ndo_poll_controller(struct net_device *ndev) cpsw_interrupt(ndev->irq, priv); cpdma_ctlr_int_ctrl(priv->dma, true); cpsw_intr_enable(priv); - cpdma_ctlr_eoi(priv->dma); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_RX); + cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_TX); + } #endif +static inline int cpsw_add_vlan_ale_entry(struct cpsw_priv *priv, + unsigned short vid) +{ + int ret; + + ret = cpsw_ale_add_vlan(priv->ale, vid, + ALE_ALL_PORTS << 
priv->host_port, + 0, ALE_ALL_PORTS << priv->host_port, + (ALE_PORT_1 | ALE_PORT_2) << priv->host_port); + if (ret != 0) + return ret; + + ret = cpsw_ale_add_ucast(priv->ale, priv->mac_addr, + priv->host_port, ALE_VLAN, vid); + if (ret != 0) + goto clean_vid; + + ret = cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, + ALE_ALL_PORTS << priv->host_port, + ALE_VLAN, vid, 0); + if (ret != 0) + goto clean_vlan_ucast; + return 0; + +clean_vlan_ucast: + cpsw_ale_del_ucast(priv->ale, priv->mac_addr, + priv->host_port, ALE_VLAN, vid); +clean_vid: + cpsw_ale_del_vlan(priv->ale, vid, 0); + return ret; +} + +static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev, + __be16 proto, u16 vid) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + + if (vid == priv->data.default_vlan) + return 0; + + dev_info(priv->dev, "Adding vlanid %d to vlan filter\n", vid); + return cpsw_add_vlan_ale_entry(priv, vid); +} + +static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev, + __be16 proto, u16 vid) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + int ret; + + if (vid == priv->data.default_vlan) + return 0; + + dev_info(priv->dev, "removing vlanid %d from vlan filter\n", vid); + ret = cpsw_ale_del_vlan(priv->ale, vid, 0); + if (ret != 0) + return ret; + + ret = cpsw_ale_del_ucast(priv->ale, priv->mac_addr, + priv->host_port, ALE_VLAN, vid); + if (ret != 0) + return ret; + + return cpsw_ale_del_mcast(priv->ale, priv->ndev->broadcast, + 0, ALE_VLAN, vid); +} + static const struct net_device_ops cpsw_netdev_ops = { .ndo_open = cpsw_ndo_open, .ndo_stop = cpsw_ndo_stop, @@ -938,15 +1334,18 @@ static const struct net_device_ops cpsw_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = cpsw_ndo_poll_controller, #endif + .ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid, }; static void cpsw_get_drvinfo(struct net_device *ndev, struct ethtool_drvinfo *info) { struct cpsw_priv *priv = netdev_priv(ndev); - strcpy(info->driver, "TI CPSW Driver v1.0"); - strcpy(info->version, "1.0"); - strcpy(info->bus_info, priv->pdev->name); + + strlcpy(info->driver, "TI CPSW Driver v1.0", sizeof(info->driver)); + strlcpy(info->version, "1.0", sizeof(info->version)); + strlcpy(info->bus_info, priv->pdev->name, sizeof(info->bus_info)); } static u32 cpsw_get_msglevel(struct net_device *ndev) @@ -974,7 +1373,7 @@ static int cpsw_get_ts_info(struct net_device *ndev, SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE; - info->phc_index = priv->cpts.phc_index; + info->phc_index = priv->cpts->phc_index; info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON); @@ -993,12 +1392,39 @@ static int cpsw_get_ts_info(struct net_device *ndev, return 0; } +static int cpsw_get_settings(struct net_device *ndev, + struct ethtool_cmd *ecmd) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + int slave_no = cpsw_slave_index(priv); + + if (priv->slaves[slave_no].phy) + return phy_ethtool_gset(priv->slaves[slave_no].phy, ecmd); + else + return -EOPNOTSUPP; +} + +static int cpsw_set_settings(struct net_device *ndev, struct ethtool_cmd *ecmd) +{ + struct cpsw_priv *priv = netdev_priv(ndev); + int slave_no = cpsw_slave_index(priv); + + if (priv->slaves[slave_no].phy) + return phy_ethtool_sset(priv->slaves[slave_no].phy, ecmd); + else + return -EOPNOTSUPP; +} + static const struct ethtool_ops cpsw_ethtool_ops = { .get_drvinfo = cpsw_get_drvinfo, .get_msglevel = cpsw_get_msglevel, .set_msglevel = cpsw_set_msglevel, .get_link = 
ethtool_op_get_link, .get_ts_info = cpsw_get_ts_info, + .get_settings = cpsw_get_settings, + .set_settings = cpsw_set_settings, + .get_coalesce = cpsw_get_coalesce, + .set_coalesce = cpsw_set_coalesce, }; static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv, @@ -1011,6 +1437,7 @@ static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv, slave->data = data; slave->regs = regs + slave_reg_ofs; slave->sliver = regs + sliver_reg_ofs; + slave->port_vlan = data->dual_emac_res_vlan; } static int cpsw_probe_dt(struct cpsw_platform_data *data, @@ -1030,12 +1457,12 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } data->slaves = prop; - if (of_property_read_u32(node, "cpts_active_slave", &prop)) { - pr_err("Missing cpts_active_slave property in the DT.\n"); + if (of_property_read_u32(node, "active_slave", &prop)) { + pr_err("Missing active_slave property in the DT.\n"); ret = -EINVAL; goto error_ret; } - data->cpts_active_slave = prop; + data->active_slave = prop; if (of_property_read_u32(node, "cpts_clock_mult", &prop)) { pr_err("Missing cpts_clock_mult property in the DT.\n"); @@ -1051,12 +1478,10 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } data->cpts_clock_shift = prop; - data->slave_data = kzalloc(sizeof(struct cpsw_slave_data) * - data->slaves, GFP_KERNEL); - if (!data->slave_data) { - pr_err("Could not allocate slave memory.\n"); + data->slave_data = kcalloc(data->slaves, sizeof(struct cpsw_slave_data), + GFP_KERNEL); + if (!data->slave_data) return -EINVAL; - } if (of_property_read_u32(node, "cpdma_channels", &prop)) { pr_err("Missing cpdma_channels property in the DT.\n"); @@ -1093,6 +1518,9 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } data->mac_control = prop; + if (!of_property_read_u32(node, "dual_emac", &prop)) + data->dual_emac = prop; + /* * Populate all the child nodes here... 
*/ @@ -1111,7 +1539,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, struct platform_device *mdio; parp = of_get_property(slave_node, "phy_id", &lenp); - if ((parp == NULL) && (lenp != (sizeof(void *) * 2))) { + if ((parp == NULL) || (lenp != (sizeof(void *) * 2))) { pr_err("Missing slave[%d] phy_id property\n", i); ret = -EINVAL; goto error_ret; @@ -1126,6 +1554,18 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, if (mac_addr) memcpy(slave_data->mac_addr, mac_addr, ETH_ALEN); + if (data->dual_emac) { + if (of_property_read_u32(slave_node, "dual_emac_res_vlan", + &prop)) { + pr_err("Missing dual_emac_res_vlan in DT.\n"); + slave_data->dual_emac_res_vlan = i+1; + pr_err("Using %d as Reserved VLAN for %d slave\n", + slave_data->dual_emac_res_vlan, i); + } else { + slave_data->dual_emac_res_vlan = prop; + } + } + i++; } @@ -1136,9 +1576,85 @@ error_ret: return ret; } +static int cpsw_probe_dual_emac(struct platform_device *pdev, + struct cpsw_priv *priv) +{ + struct cpsw_platform_data *data = &priv->data; + struct net_device *ndev; + struct cpsw_priv *priv_sl2; + int ret = 0, i; + + ndev = alloc_etherdev(sizeof(struct cpsw_priv)); + if (!ndev) { + pr_err("cpsw: error allocating net_device\n"); + return -ENOMEM; + } + + priv_sl2 = netdev_priv(ndev); + spin_lock_init(&priv_sl2->lock); + priv_sl2->data = *data; + priv_sl2->pdev = pdev; + priv_sl2->ndev = ndev; + priv_sl2->dev = &ndev->dev; + priv_sl2->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG); + priv_sl2->rx_packet_max = max(rx_packet_max, 128); + + if (is_valid_ether_addr(data->slave_data[1].mac_addr)) { + memcpy(priv_sl2->mac_addr, data->slave_data[1].mac_addr, + ETH_ALEN); + pr_info("cpsw: Detected MACID = %pM\n", priv_sl2->mac_addr); + } else { + random_ether_addr(priv_sl2->mac_addr); + pr_info("cpsw: Random MACID = %pM\n", priv_sl2->mac_addr); + } + memcpy(ndev->dev_addr, priv_sl2->mac_addr, ETH_ALEN); + + priv_sl2->slaves = priv->slaves; + priv_sl2->clk = priv->clk; + + priv_sl2->coal_intvl = 0; + priv_sl2->bus_freq_mhz = priv->bus_freq_mhz; + + priv_sl2->cpsw_res = priv->cpsw_res; + priv_sl2->regs = priv->regs; + priv_sl2->host_port = priv->host_port; + priv_sl2->host_port_regs = priv->host_port_regs; + priv_sl2->wr_regs = priv->wr_regs; + priv_sl2->dma = priv->dma; + priv_sl2->txch = priv->txch; + priv_sl2->rxch = priv->rxch; + priv_sl2->ale = priv->ale; + priv_sl2->emac_port = 1; + priv->slaves[1].ndev = ndev; + priv_sl2->cpts = priv->cpts; + priv_sl2->version = priv->version; + + for (i = 0; i < priv->num_irqs; i++) { + priv_sl2->irqs_table[i] = priv->irqs_table[i]; + priv_sl2->num_irqs = priv->num_irqs; + } + priv->irq_enabled = true; + ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + ndev->netdev_ops = &cpsw_netdev_ops; + SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); + netif_napi_add(ndev, &priv_sl2->napi, cpsw_poll, CPSW_POLL_WEIGHT); + + /* register the network device */ + SET_NETDEV_DEV(ndev, &pdev->dev); + ret = register_netdev(ndev); + if (ret) { + pr_err("cpsw: error registering net device\n"); + free_netdev(ndev); + ret = -ENODEV; + } + + return ret; +} + static int cpsw_probe(struct platform_device *pdev) { - struct cpsw_platform_data *data = pdev->dev.platform_data; + struct cpsw_platform_data *data; struct net_device *ndev; struct cpsw_priv *priv; struct cpdma_params dma_params; @@ -1162,6 +1678,11 @@ static int cpsw_probe(struct platform_device *pdev) priv->dev = &ndev->dev; priv->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG); priv->rx_packet_max = max(rx_packet_max, 128); + 
priv->cpts = devm_kzalloc(&pdev->dev, sizeof(struct cpts), GFP_KERNEL); + if (!ndev) { + pr_err("error allocating cpts\n"); + goto clean_ndev_ret; + } /* * This may be required here for child devices. @@ -1194,12 +1715,17 @@ static int cpsw_probe(struct platform_device *pdev) for (i = 0; i < data->slaves; i++) priv->slaves[i].slave_num = i; + priv->slaves[0].ndev = ndev; + priv->emac_port = 0; + priv->clk = clk_get(&pdev->dev, "fck"); if (IS_ERR(priv->clk)) { dev_err(&pdev->dev, "fck is not found\n"); ret = -ENODEV; goto clean_slave_ret; } + priv->coal_intvl = 0; + priv->bus_freq_mhz = clk_get_rate(priv->clk) / 1000000; priv->cpsw_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!priv->cpsw_res) { @@ -1248,7 +1774,7 @@ static int cpsw_probe(struct platform_device *pdev) switch (priv->version) { case CPSW_VERSION_1: priv->host_port_regs = ss_regs + CPSW1_HOST_PORT_OFFSET; - priv->cpts.reg = ss_regs + CPSW1_CPTS_OFFSET; + priv->cpts->reg = ss_regs + CPSW1_CPTS_OFFSET; dma_params.dmaregs = ss_regs + CPSW1_CPDMA_OFFSET; dma_params.txhdp = ss_regs + CPSW1_STATERAM_OFFSET; ale_params.ale_regs = ss_regs + CPSW1_ALE_OFFSET; @@ -1259,7 +1785,7 @@ static int cpsw_probe(struct platform_device *pdev) break; case CPSW_VERSION_2: priv->host_port_regs = ss_regs + CPSW2_HOST_PORT_OFFSET; - priv->cpts.reg = ss_regs + CPSW2_CPTS_OFFSET; + priv->cpts->reg = ss_regs + CPSW2_CPTS_OFFSET; dma_params.dmaregs = ss_regs + CPSW2_CPDMA_OFFSET; dma_params.txhdp = ss_regs + CPSW2_STATERAM_OFFSET; ale_params.ale_regs = ss_regs + CPSW2_ALE_OFFSET; @@ -1341,12 +1867,12 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_ale_ret; } priv->irqs_table[k] = i; - priv->num_irqs = k; + priv->num_irqs = k + 1; } k++; } - ndev->flags |= IFF_ALLMULTI; /* see cpsw_ndo_change_rx_flags() */ + ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); @@ -1361,17 +1887,26 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_irq_ret; } - if (cpts_register(&pdev->dev, &priv->cpts, + if (cpts_register(&pdev->dev, priv->cpts, data->cpts_clock_mult, data->cpts_clock_shift)) dev_err(priv->dev, "error registering cpts device\n"); cpsw_notice(priv, probe, "initialized device (regs %x, irq %d)\n", priv->cpsw_res->start, ndev->irq); + if (priv->data.dual_emac) { + ret = cpsw_probe_dual_emac(pdev, priv); + if (ret) { + cpsw_err(priv, probe, "error probe slave 2 emac interface\n"); + goto clean_irq_ret; + } + } + return 0; clean_irq_ret: - free_irq(ndev->irq, priv); + for (i = 0; i < priv->num_irqs; i++) + free_irq(priv->irqs_table[i], priv); clean_ale_ret: cpsw_ale_destroy(priv->ale); clean_dma_ret: @@ -1394,7 +1929,8 @@ clean_slave_ret: pm_runtime_disable(&pdev->dev); kfree(priv->slaves); clean_ndev_ret: - free_netdev(ndev); + kfree(priv->data.slave_data); + free_netdev(priv->ndev); return ret; } @@ -1402,12 +1938,17 @@ static int cpsw_remove(struct platform_device *pdev) { struct net_device *ndev = platform_get_drvdata(pdev); struct cpsw_priv *priv = netdev_priv(ndev); + int i; - pr_info("removing device"); platform_set_drvdata(pdev, NULL); + if (priv->data.dual_emac) + unregister_netdev(cpsw_get_slave_ndev(priv, 1)); + unregister_netdev(ndev); + + cpts_unregister(priv->cpts); + for (i = 0; i < priv->num_irqs; i++) + free_irq(priv->irqs_table[i], priv); - cpts_unregister(&priv->cpts); - free_irq(ndev->irq, priv); cpsw_ale_destroy(priv->ale); cpdma_chan_destroy(priv->txch); cpdma_chan_destroy(priv->rxch); @@ -1421,8 +1962,10 @@ static int 
cpsw_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); clk_put(priv->clk); kfree(priv->slaves); + kfree(priv->data.slave_data); + if (priv->data.dual_emac) + free_netdev(cpsw_get_slave_ndev(priv, 1)); free_netdev(ndev); - return 0; } @@ -1458,6 +2001,7 @@ static const struct of_device_id cpsw_of_mtable[] = { { .compatible = "ti,cpsw", }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, cpsw_of_mtable); static struct platform_driver cpsw_driver = { .driver = { diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 0e9ccc2..7fa60d6 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -148,7 +148,7 @@ static int cpsw_ale_write(struct cpsw_ale *ale, int idx, u32 *ale_entry) return idx; } -static int cpsw_ale_match_addr(struct cpsw_ale *ale, u8 *addr) +int cpsw_ale_match_addr(struct cpsw_ale *ale, u8 *addr, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS]; int type, idx; @@ -160,6 +160,8 @@ static int cpsw_ale_match_addr(struct cpsw_ale *ale, u8 *addr) type = cpsw_ale_get_entry_type(ale_entry); if (type != ALE_TYPE_ADDR && type != ALE_TYPE_VLAN_ADDR) continue; + if (cpsw_ale_get_vlan_id(ale_entry) != vid) + continue; cpsw_ale_get_addr(ale_entry, entry_addr); if (memcmp(entry_addr, addr, 6) == 0) return idx; @@ -167,6 +169,22 @@ static int cpsw_ale_match_addr(struct cpsw_ale *ale, u8 *addr) return -ENOENT; } +int cpsw_ale_match_vlan(struct cpsw_ale *ale, u16 vid) +{ + u32 ale_entry[ALE_ENTRY_WORDS]; + int type, idx; + + for (idx = 0; idx < ale->params.ale_entries; idx++) { + cpsw_ale_read(ale, idx, ale_entry); + type = cpsw_ale_get_entry_type(ale_entry); + if (type != ALE_TYPE_VLAN) + continue; + if (cpsw_ale_get_vlan_id(ale_entry) == vid) + return idx; + } + return -ENOENT; +} + static int cpsw_ale_match_free(struct cpsw_ale *ale) { u32 ale_entry[ALE_ENTRY_WORDS]; @@ -274,19 +292,32 @@ int cpsw_ale_flush(struct cpsw_ale *ale, int port_mask) return 0; } -int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port, int flags) +static inline void cpsw_ale_set_vlan_entry_type(u32 *ale_entry, + int flags, u16 vid) +{ + if (flags & ALE_VLAN) { + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_VLAN_ADDR); + cpsw_ale_set_vlan_id(ale_entry, vid); + } else { + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_ADDR); + } +} + +int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port, + int flags, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; int idx; - cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_ADDR); + cpsw_ale_set_vlan_entry_type(ale_entry, flags, vid); + cpsw_ale_set_addr(ale_entry, addr); cpsw_ale_set_ucast_type(ale_entry, ALE_UCAST_PERSISTANT); cpsw_ale_set_secure(ale_entry, (flags & ALE_SECURE) ? 1 : 0); cpsw_ale_set_blocked(ale_entry, (flags & ALE_BLOCKED) ? 1 : 0); cpsw_ale_set_port_num(ale_entry, port); - idx = cpsw_ale_match_addr(ale, addr); + idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0); if (idx < 0) idx = cpsw_ale_match_free(ale); if (idx < 0) @@ -298,12 +329,13 @@ int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port, int flags) return 0; } -int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port) +int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port, + int flags, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; int idx; - idx = cpsw_ale_match_addr(ale, addr); + idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? 
vid : 0); if (idx < 0) return -ENOENT; @@ -313,18 +345,19 @@ int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port) } int cpsw_ale_add_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask, - int super, int mcast_state) + int flags, u16 vid, int mcast_state) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; int idx, mask; - idx = cpsw_ale_match_addr(ale, addr); + idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0); if (idx >= 0) cpsw_ale_read(ale, idx, ale_entry); - cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_ADDR); + cpsw_ale_set_vlan_entry_type(ale_entry, flags, vid); + cpsw_ale_set_addr(ale_entry, addr); - cpsw_ale_set_super(ale_entry, super); + cpsw_ale_set_super(ale_entry, (flags & ALE_BLOCKED) ? 1 : 0); cpsw_ale_set_mcast_state(ale_entry, mcast_state); mask = cpsw_ale_get_port_mask(ale_entry); @@ -342,12 +375,13 @@ int cpsw_ale_add_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask, return 0; } -int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask) +int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask, + int flags, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; int idx; - idx = cpsw_ale_match_addr(ale, addr); + idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0); if (idx < 0) return -EINVAL; @@ -362,6 +396,55 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask) return 0; } +int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag, + int reg_mcast, int unreg_mcast) +{ + u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; + int idx; + + idx = cpsw_ale_match_vlan(ale, vid); + if (idx >= 0) + cpsw_ale_read(ale, idx, ale_entry); + + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_VLAN); + cpsw_ale_set_vlan_id(ale_entry, vid); + + cpsw_ale_set_vlan_untag_force(ale_entry, untag); + cpsw_ale_set_vlan_reg_mcast(ale_entry, reg_mcast); + cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast); + cpsw_ale_set_vlan_member_list(ale_entry, port); + + if (idx < 0) + idx = cpsw_ale_match_free(ale); + if (idx < 0) + idx = cpsw_ale_find_ageable(ale); + if (idx < 0) + return -ENOMEM; + + cpsw_ale_write(ale, idx, ale_entry); + return 0; +} + +int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port_mask) +{ + u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; + int idx; + + idx = cpsw_ale_match_vlan(ale, vid); + if (idx < 0) + return -ENOENT; + + cpsw_ale_read(ale, idx, ale_entry); + + if (port_mask) + cpsw_ale_set_vlan_member_list(ale_entry, port_mask); + else + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); + + cpsw_ale_write(ale, idx, ale_entry); + return 0; +} + struct ale_control_info { const char *name; int offset, port_offset; diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h index 2bd09cb..30daa12 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.h +++ b/drivers/net/ethernet/ti/cpsw_ale.h @@ -64,8 +64,14 @@ enum cpsw_ale_port_state { }; /* ALE unicast entry flags - passed into cpsw_ale_add_ucast() */ -#define ALE_SECURE 1 -#define ALE_BLOCKED 2 +#define ALE_SECURE BIT(0) +#define ALE_BLOCKED BIT(1) +#define ALE_SUPER BIT(2) +#define ALE_VLAN BIT(3) + +#define ALE_PORT_HOST BIT(0) +#define ALE_PORT_1 BIT(1) +#define ALE_PORT_2 BIT(2) #define ALE_MCAST_FWD 0 #define ALE_MCAST_BLOCK_LEARN_FWD 1 @@ -81,11 +87,17 @@ void cpsw_ale_stop(struct cpsw_ale *ale); int cpsw_ale_set_ageout(struct cpsw_ale *ale, int ageout); int cpsw_ale_flush(struct cpsw_ale *ale, int port_mask); int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask); -int cpsw_ale_add_ucast(struct 
cpsw_ale *ale, u8 *addr, int port, int flags); -int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port); +int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port, + int flags, u16 vid); +int cpsw_ale_del_ucast(struct cpsw_ale *ale, u8 *addr, int port, + int flags, u16 vid); int cpsw_ale_add_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask, - int super, int mcast_state); -int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask); + int flags, u16 vid, int mcast_state); +int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask, + int flags, u16 vid); +int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag, + int reg_mcast, int unreg_mcast); +int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port); int cpsw_ale_control_get(struct cpsw_ale *ale, int port, int control); int cpsw_ale_control_set(struct cpsw_ale *ale, int port, diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c index 4995673..49dfd59 100644 --- a/drivers/net/ethernet/ti/davinci_cpdma.c +++ b/drivers/net/ethernet/ti/davinci_cpdma.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "davinci_cpdma.h" @@ -60,6 +61,9 @@ #define CPDMA_DESC_EOQ BIT(28) #define CPDMA_DESC_TD_COMPLETE BIT(27) #define CPDMA_DESC_PASS_CRC BIT(26) +#define CPDMA_DESC_TO_PORT_EN BIT(20) +#define CPDMA_TO_PORT_SHIFT 16 +#define CPDMA_DESC_PORT_MASK (BIT(18) | BIT(17) | BIT(16)) #define CPDMA_TEARDOWN_VALUE 0xfffffffc @@ -105,13 +109,13 @@ struct cpdma_ctlr { }; struct cpdma_chan { + struct cpdma_desc __iomem *head, *tail; + void __iomem *hdp, *cp, *rxfree; enum cpdma_state state; struct cpdma_ctlr *ctlr; int chan_num; spinlock_t lock; - struct cpdma_desc __iomem *head, *tail; int count; - void __iomem *hdp, *cp, *rxfree; u32 mask; cpdma_handler_fn handler; enum dma_data_direction dir; @@ -132,6 +136,14 @@ struct cpdma_chan { #define chan_write(chan, fld, v) __raw_writel(v, (chan)->fld) #define desc_write(desc, fld, v) __raw_writel((u32)(v), &(desc)->fld) +#define cpdma_desc_to_port(chan, mode, directed) \ + do { \ + if (!is_rx_chan(chan) && ((directed == 1) || \ + (directed == 2))) \ + mode |= (CPDMA_DESC_TO_PORT_EN | \ + (directed << CPDMA_TO_PORT_SHIFT)); \ + } while (0) + /* * Utility constructs for a cpdma descriptor pool. Some devices (e.g. davinci * emac) have dedicated on-chip memory for these descriptors. 
Some other @@ -217,17 +229,27 @@ desc_from_phys(struct cpdma_desc_pool *pool, dma_addr_t dma) } static struct cpdma_desc __iomem * -cpdma_desc_alloc(struct cpdma_desc_pool *pool, int num_desc) +cpdma_desc_alloc(struct cpdma_desc_pool *pool, int num_desc, bool is_rx) { unsigned long flags; int index; + int desc_start; + int desc_end; struct cpdma_desc __iomem *desc = NULL; spin_lock_irqsave(&pool->lock, flags); - index = bitmap_find_next_zero_area(pool->bitmap, pool->num_desc, 0, - num_desc, 0); - if (index < pool->num_desc) { + if (is_rx) { + desc_start = 0; + desc_end = pool->num_desc/2; + } else { + desc_start = pool->num_desc/2; + desc_end = pool->num_desc; + } + + index = bitmap_find_next_zero_area(pool->bitmap, + desc_end, desc_start, num_desc, 0); + if (index < desc_end) { bitmap_set(pool->bitmap, index, num_desc); desc = pool->iomap + pool->desc_size * index; pool->used_desc++; @@ -291,14 +313,16 @@ int cpdma_ctlr_start(struct cpdma_ctlr *ctlr) } if (ctlr->params.has_soft_reset) { - unsigned long timeout = jiffies + HZ/10; + unsigned timeout = 10 * 100; dma_reg_write(ctlr, CPDMA_SOFTRESET, 1); - while (time_before(jiffies, timeout)) { + while (timeout) { if (dma_reg_read(ctlr, CPDMA_SOFTRESET) == 0) break; + udelay(10); + timeout--; } - WARN_ON(!time_before(jiffies, timeout)); + WARN_ON(!timeout); } for (i = 0; i < ctlr->num_chan; i++) { @@ -439,10 +463,8 @@ int cpdma_ctlr_destroy(struct cpdma_ctlr *ctlr) if (ctlr->state != CPDMA_STATE_IDLE) cpdma_ctlr_stop(ctlr); - for (i = 0; i < ARRAY_SIZE(ctlr->channels); i++) { - if (ctlr->channels[i]) - cpdma_chan_destroy(ctlr->channels[i]); - } + for (i = 0; i < ARRAY_SIZE(ctlr->channels); i++) + cpdma_chan_destroy(ctlr->channels[i]); cpdma_desc_pool_destroy(ctlr->pool); spin_unlock_irqrestore(&ctlr->lock, flags); @@ -473,11 +495,13 @@ int cpdma_ctlr_int_ctrl(struct cpdma_ctlr *ctlr, bool enable) spin_unlock_irqrestore(&ctlr->lock, flags); return 0; } +EXPORT_SYMBOL_GPL(cpdma_ctlr_int_ctrl); -void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr) +void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr, u32 value) { - dma_reg_write(ctlr, CPDMA_MACEOIVECTOR, 0); + dma_reg_write(ctlr, CPDMA_MACEOIVECTOR, value); } +EXPORT_SYMBOL_GPL(cpdma_ctlr_eoi); struct cpdma_chan *cpdma_chan_create(struct cpdma_ctlr *ctlr, int chan_num, cpdma_handler_fn handler) @@ -652,7 +676,7 @@ static void __cpdma_chan_submit(struct cpdma_chan *chan, } int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, - int len, gfp_t gfp_mask) + int len, int directed) { struct cpdma_ctlr *ctlr = chan->ctlr; struct cpdma_desc __iomem *desc; @@ -668,7 +692,7 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, goto unlock_ret; } - desc = cpdma_desc_alloc(ctlr->pool, 1); + desc = cpdma_desc_alloc(ctlr->pool, 1, is_rx_chan(chan)); if (!desc) { chan->stats.desc_alloc_fail++; ret = -ENOMEM; @@ -682,6 +706,7 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, buffer = dma_map_single(ctlr->dev, data, len, chan->dir); mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP; + cpdma_desc_to_port(chan, mode, directed); desc_write(desc, hw_next, 0); desc_write(desc, hw_buffer, buffer); @@ -704,6 +729,29 @@ unlock_ret: } EXPORT_SYMBOL_GPL(cpdma_chan_submit); +bool cpdma_check_free_tx_desc(struct cpdma_chan *chan) +{ + unsigned long flags; + int index; + bool ret; + struct cpdma_ctlr *ctlr = chan->ctlr; + struct cpdma_desc_pool *pool = ctlr->pool; + + spin_lock_irqsave(&pool->lock, flags); + + index = bitmap_find_next_zero_area(pool->bitmap, + 
pool->num_desc, pool->num_desc/2, 1, 0); + + if (index < pool->num_desc) + ret = true; + else + ret = false; + + spin_unlock_irqrestore(&pool->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(cpdma_check_free_tx_desc); + static void __cpdma_chan_free(struct cpdma_chan *chan, struct cpdma_desc __iomem *desc, int outlen, int status) @@ -728,6 +776,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan) struct cpdma_ctlr *ctlr = chan->ctlr; struct cpdma_desc __iomem *desc; int status, outlen; + int cb_status = 0; struct cpdma_desc_pool *pool = ctlr->pool; dma_addr_t desc_dma; unsigned long flags; @@ -749,7 +798,8 @@ static int __cpdma_chan_process(struct cpdma_chan *chan) status = -EBUSY; goto unlock_ret; } - status = status & (CPDMA_DESC_EOQ | CPDMA_DESC_TD_COMPLETE); + status = status & (CPDMA_DESC_EOQ | CPDMA_DESC_TD_COMPLETE | + CPDMA_DESC_PORT_MASK); chan->head = desc_from_phys(pool, desc_read(desc, hw_next)); chan_write(chan, cp, desc_dma); @@ -762,8 +812,12 @@ static int __cpdma_chan_process(struct cpdma_chan *chan) } spin_unlock_irqrestore(&chan->lock, flags); + if (unlikely(status & CPDMA_DESC_TD_COMPLETE)) + cb_status = -ENOSYS; + else + cb_status = status; - __cpdma_chan_free(chan, desc, outlen, status); + __cpdma_chan_free(chan, desc, outlen, cb_status); return status; unlock_ret: @@ -822,7 +876,7 @@ int cpdma_chan_stop(struct cpdma_chan *chan) struct cpdma_desc_pool *pool = ctlr->pool; unsigned long flags; int ret; - unsigned long timeout; + unsigned timeout; spin_lock_irqsave(&chan->lock, flags); if (chan->state != CPDMA_STATE_ACTIVE) { @@ -837,14 +891,15 @@ int cpdma_chan_stop(struct cpdma_chan *chan) dma_reg_write(ctlr, chan->td, chan_linear(chan)); /* wait for teardown complete */ - timeout = jiffies + HZ/10; /* 100 msec */ - while (time_before(jiffies, timeout)) { + timeout = 100 * 100; /* 100 ms */ + while (timeout) { u32 cp = chan_read(chan, cp); if ((cp & CPDMA_TEARDOWN_VALUE) == CPDMA_TEARDOWN_VALUE) break; - cpu_relax(); + udelay(10); + timeout--; } - WARN_ON(!time_before(jiffies, timeout)); + WARN_ON(!timeout); chan_write(chan, cp, CPDMA_TEARDOWN_VALUE); /* handle completed packets */ @@ -984,3 +1039,6 @@ unlock_ret: spin_unlock_irqrestore(&ctlr->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(cpdma_control_set); + +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h index afa19a0..86dee48 100644 --- a/drivers/net/ethernet/ti/davinci_cpdma.h +++ b/drivers/net/ethernet/ti/davinci_cpdma.h @@ -24,6 +24,13 @@ #define __chan_linear(chan_num) ((chan_num) & (CPDMA_MAX_CHANNELS - 1)) #define chan_linear(chan) __chan_linear((chan)->chan_num) +#define CPDMA_RX_SOURCE_PORT(__status__) ((__status__ >> 16) & 0x7) + +#define CPDMA_EOI_RX_THRESH 0x0 +#define CPDMA_EOI_RX 0x1 +#define CPDMA_EOI_TX 0x2 +#define CPDMA_EOI_MISC 0x3 + struct cpdma_params { struct device *dev; void __iomem *dmaregs; @@ -82,12 +89,13 @@ int cpdma_chan_dump(struct cpdma_chan *chan); int cpdma_chan_get_stats(struct cpdma_chan *chan, struct cpdma_chan_stats *stats); int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data, - int len, gfp_t gfp_mask); + int len, int directed); int cpdma_chan_process(struct cpdma_chan *chan, int quota); int cpdma_ctlr_int_ctrl(struct cpdma_ctlr *ctlr, bool enable); -void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr); +void cpdma_ctlr_eoi(struct cpdma_ctlr *ctlr, u32 value); int cpdma_chan_int_ctrl(struct cpdma_chan *chan, bool enable); +bool cpdma_check_free_tx_desc(struct cpdma_chan *chan); enum 
cpdma_control { CPDMA_CMD_IDLE, /* write-only */ diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 2a3e2c5..860e15d 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -120,7 +120,6 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1"; #define EMAC_DEF_TX_CH (0) /* Default 0th channel */ #define EMAC_DEF_RX_CH (0) /* Default 0th channel */ #define EMAC_DEF_RX_NUM_DESC (128) -#define EMAC_DEF_TX_NUM_DESC (128) #define EMAC_DEF_MAX_TX_CH (1) /* Max TX channels configured */ #define EMAC_DEF_MAX_RX_CH (1) /* Max RX channels configured */ #define EMAC_POLL_WEIGHT (64) /* Default NAPI poll weight */ @@ -342,7 +341,6 @@ struct emac_priv { u32 mac_hash2; u32 multicast_hash_cnt[EMAC_NUM_MULTICAST_BITS]; u32 rx_addr_type; - atomic_t cur_tx; const char *phy_id; #ifdef CONFIG_OF struct device_node *phy_node; @@ -480,8 +478,8 @@ static void emac_dump_regs(struct emac_priv *priv) static void emac_get_drvinfo(struct net_device *ndev, struct ethtool_drvinfo *info) { - strcpy(info->driver, emac_version_string); - strcpy(info->version, EMAC_MODULE_VERSION); + strlcpy(info->driver, emac_version_string, sizeof(info->driver)); + strlcpy(info->version, EMAC_MODULE_VERSION, sizeof(info->version)); } /** @@ -1039,7 +1037,7 @@ static void emac_rx_handler(void *token, int len, int status) recycle: ret = cpdma_chan_submit(priv->rxchan, skb, skb->data, - skb_tailroom(skb), GFP_KERNEL); + skb_tailroom(skb), 0); WARN_ON(ret == -ENOMEM); if (unlikely(ret < 0)) @@ -1050,12 +1048,12 @@ static void emac_tx_handler(void *token, int len, int status) { struct sk_buff *skb = token; struct net_device *ndev = skb->dev; - struct emac_priv *priv = netdev_priv(ndev); - - atomic_dec(&priv->cur_tx); + /* Check whether the queue is stopped due to stalled tx dma, if the + * queue is stopped then start the queue as we have free desc for tx + */ if (unlikely(netif_queue_stopped(ndev))) - netif_start_queue(ndev); + netif_wake_queue(ndev); ndev->stats.tx_packets++; ndev->stats.tx_bytes += len; dev_kfree_skb_any(skb); @@ -1094,14 +1092,17 @@ static int emac_dev_xmit(struct sk_buff *skb, struct net_device *ndev) skb_tx_timestamp(skb); ret_code = cpdma_chan_submit(priv->txchan, skb, skb->data, skb->len, - GFP_KERNEL); + 0); if (unlikely(ret_code != 0)) { if (netif_msg_tx_err(priv) && net_ratelimit()) dev_err(emac_dev, "DaVinci EMAC: desc submit failed"); goto fail_tx; } - if (atomic_inc_return(&priv->cur_tx) >= EMAC_DEF_TX_NUM_DESC) + /* If there is no more tx desc left free then we need to + * tell the kernel to stop sending us tx frames. + */ + if (unlikely(!cpdma_check_free_tx_desc(priv->txchan))) netif_stop_queue(ndev); return NETDEV_TX_OK; @@ -1264,7 +1265,6 @@ static int emac_dev_setmac_addr(struct net_device *ndev, void *addr) /* Store mac addr in priv and rx channel and set it in EMAC hw */ memcpy(priv->mac_addr, sa->sa_data, ndev->addr_len); memcpy(ndev->dev_addr, sa->sa_data, ndev->addr_len); - ndev->addr_assign_type &= ~NET_ADDR_RANDOM; /* MAC address is configured only after the interface is enabled. 
*/ if (netif_running(ndev)) { @@ -1438,7 +1438,7 @@ static int emac_poll(struct napi_struct *napi, int budget) * Polled functionality used by netconsole and others in non interrupt mode * */ -void emac_poll_controller(struct net_device *ndev) +static void emac_poll_controller(struct net_device *ndev) { struct emac_priv *priv = netdev_priv(ndev); @@ -1558,7 +1558,7 @@ static int emac_dev_open(struct net_device *ndev) break; ret = cpdma_chan_submit(priv->rxchan, skb, skb->data, - skb_tailroom(skb), GFP_KERNEL); + skb_tailroom(skb), 0); if (WARN_ON(ret < 0)) break; } @@ -1600,7 +1600,7 @@ static int emac_dev_open(struct net_device *ndev) if (priv->phy_id && *priv->phy_id) { priv->phydev = phy_connect(ndev, priv->phy_id, - &emac_adjust_link, 0, + &emac_adjust_link, PHY_INTERFACE_MODE_MII); if (IS_ERR(priv->phydev)) { @@ -1865,21 +1865,18 @@ static int davinci_emac_probe(struct platform_device *pdev) /* obtain emac clock from kernel */ - emac_clk = clk_get(&pdev->dev, NULL); + emac_clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(emac_clk)) { dev_err(&pdev->dev, "failed to get EMAC clock\n"); return -EBUSY; } emac_bus_frequency = clk_get_rate(emac_clk); - clk_put(emac_clk); /* TODO: Probe PHY here if possible */ ndev = alloc_etherdev(sizeof(struct emac_priv)); - if (!ndev) { - rc = -ENOMEM; - goto no_ndev; - } + if (!ndev) + return -ENOMEM; platform_set_drvdata(pdev, ndev); priv = netdev_priv(ndev); @@ -1893,7 +1890,7 @@ static int davinci_emac_probe(struct platform_device *pdev) if (!pdata) { dev_err(&pdev->dev, "no platform data\n"); rc = -ENODEV; - goto probe_quit; + goto no_pdata; } /* MAC addr and PHY mask , RMII enable info from platform_data */ @@ -1913,23 +1910,23 @@ static int davinci_emac_probe(struct platform_device *pdev) if (!res) { dev_err(&pdev->dev,"error getting res\n"); rc = -ENOENT; - goto probe_quit; + goto no_pdata; } priv->emac_base_phys = res->start + pdata->ctrl_reg_offset; size = resource_size(res); - if (!request_mem_region(res->start, size, ndev->name)) { + if (!devm_request_mem_region(&pdev->dev, res->start, + size, ndev->name)) { dev_err(&pdev->dev, "failed request_mem_region() for regs\n"); rc = -ENXIO; - goto probe_quit; + goto no_pdata; } - priv->remap_addr = ioremap(res->start, size); + priv->remap_addr = devm_ioremap(&pdev->dev, res->start, size); if (!priv->remap_addr) { dev_err(&pdev->dev, "unable to map IO\n"); rc = -ENOMEM; - release_mem_region(res->start, size); - goto probe_quit; + goto no_pdata; } priv->emac_base = priv->remap_addr + pdata->ctrl_reg_offset; ndev->base_addr = (unsigned long)priv->remap_addr; @@ -1962,7 +1959,7 @@ static int davinci_emac_probe(struct platform_device *pdev) if (!priv->dma) { dev_err(&pdev->dev, "error initializing DMA\n"); rc = -ENOMEM; - goto no_dma; + goto no_pdata; } priv->txchan = cpdma_chan_create(priv->dma, tx_chan_num(EMAC_DEF_TX_CH), @@ -1971,14 +1968,14 @@ static int davinci_emac_probe(struct platform_device *pdev) emac_rx_handler); if (WARN_ON(!priv->txchan || !priv->rxchan)) { rc = -ENOMEM; - goto no_irq_res; + goto no_cpdma_chan; } res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!res) { dev_err(&pdev->dev, "error getting irq res\n"); rc = -ENOENT; - goto no_irq_res; + goto no_cpdma_chan; } ndev->irq = res->start; @@ -2000,7 +1997,7 @@ static int davinci_emac_probe(struct platform_device *pdev) if (rc) { dev_err(&pdev->dev, "error in register_netdev\n"); rc = -ENODEV; - goto no_irq_res; + goto no_cpdma_chan; } @@ -2015,20 +2012,14 @@ static int davinci_emac_probe(struct platform_device *pdev) return 0; 
-no_irq_res: +no_cpdma_chan: if (priv->txchan) cpdma_chan_destroy(priv->txchan); if (priv->rxchan) cpdma_chan_destroy(priv->rxchan); cpdma_ctlr_destroy(priv->dma); -no_dma: - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(res->start, resource_size(res)); - iounmap(priv->remap_addr); - -probe_quit: +no_pdata: free_netdev(ndev); -no_ndev: return rc; } @@ -2041,14 +2032,12 @@ no_ndev: */ static int davinci_emac_remove(struct platform_device *pdev) { - struct resource *res; struct net_device *ndev = platform_get_drvdata(pdev); struct emac_priv *priv = netdev_priv(ndev); dev_notice(&ndev->dev, "DaVinci EMAC: davinci_emac_remove()\n"); platform_set_drvdata(pdev, NULL); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (priv->txchan) cpdma_chan_destroy(priv->txchan); @@ -2056,10 +2045,7 @@ static int davinci_emac_remove(struct platform_device *pdev) cpdma_chan_destroy(priv->rxchan); cpdma_ctlr_destroy(priv->dma); - release_mem_region(res->start, resource_size(res)); - unregister_netdev(ndev); - iounmap(priv->remap_addr); free_netdev(ndev); return 0; diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c index cca2550..12aec17 100644 --- a/drivers/net/ethernet/ti/davinci_mdio.c +++ b/drivers/net/ethernet/ti/davinci_mdio.c @@ -320,10 +320,8 @@ static int davinci_mdio_probe(struct platform_device *pdev) int ret, addr; data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) { - dev_err(dev, "failed to alloc device data\n"); + if (!data) return -ENOMEM; - } data->bus = mdiobus_alloc(); if (!data->bus) { @@ -487,6 +485,7 @@ static const struct of_device_id davinci_mdio_of_mtable[] = { { .compatible = "ti,davinci_mdio", }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, davinci_mdio_of_mtable); static struct platform_driver davinci_mdio_driver = { .driver = { diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 2272538..60c400f 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -320,6 +320,7 @@ static void tlan_remove_one(struct pci_dev *pdev) free_netdev(dev); pci_set_drvdata(pdev, NULL); + cancel_work_sync(&priv->tlan_tqueue); } static void tlan_start(struct net_device *dev) @@ -1911,10 +1912,8 @@ static void tlan_reset_lists(struct net_device *dev) list->frame_size = TLAN_MAX_FRAME_SIZE; list->buffer[0].count = TLAN_MAX_FRAME_SIZE | TLAN_LAST_BUFFER; skb = netdev_alloc_skb_ip_align(dev, TLAN_MAX_FRAME_SIZE + 5); - if (!skb) { - netdev_err(dev, "Out of memory for received data\n"); + if (!skb) break; - } list->buffer[0].address = pci_map_single(priv->pci_dev, skb->data, diff --git a/include/linux/platform_data/cpsw.h b/include/linux/platform_data/cpsw.h index 24368a2..bb3cd58 100644 --- a/include/linux/platform_data/cpsw.h +++ b/include/linux/platform_data/cpsw.h @@ -21,6 +21,8 @@ struct cpsw_slave_data { char phy_id[MII_BUS_ID_SIZE]; int phy_if; u8 mac_addr[ETH_ALEN]; + u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */ + }; struct cpsw_platform_data { @@ -28,13 +30,15 @@ struct cpsw_platform_data { u32 channels; /* number of cpdma channels (symmetric) */ u32 slaves; /* number of slave cpgmac ports */ struct cpsw_slave_data *slave_data; - u32 cpts_active_slave; /* time stamping slave */ + u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */ u32 cpts_clock_mult; /* convert input clock ticks to nanoseconds */ u32 cpts_clock_shift; /* convert input clock ticks to nanoseconds */ u32 ale_entries; /* ale table size */ u32 bd_ram_size; /*buffer 
descriptor ram size */ u32 rx_descs; /* Number of Rx Descriptios */ u32 mac_control; /* Mac control register */ + u16 default_vlan; /* Def VLAN for ALE lookup in VLAN aware mode*/ + bool dual_emac; /* Enable Dual EMAC mode */ }; #endif /* __CPSW_H__ */ -- cgit v0.10.2 From 79d1e267c951243160a69197fd789c9f737c304d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:16:52 -0500 Subject: net/cpsw: revert f9a8f83b04e0c362a2fc660dbad980d24af209fc Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index 31bbbca..70d1920 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -1172,8 +1172,8 @@ static int cpmac_probe(struct platform_device *pdev) snprintf(priv->phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id); - priv->phy = phy_connect(dev, priv->phy_name, cpmac_adjust_link, - PHY_INTERFACE_MODE_MII); + priv->phy = phy_connect(dev, priv->phy_name, cpmac_adjust_link, 0, + PHY_INTERFACE_MODE_MII); if (IS_ERR(priv->phy)) { if (netif_msg_drv(priv)) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 4e2d224..0a211c5 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -817,7 +817,7 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) 1 << slave_port, 0, 0, ALE_MCAST_FWD_2); slave->phy = phy_connect(priv->ndev, slave->data->phy_id, - &cpsw_adjust_link, slave->data->phy_if); + &cpsw_adjust_link, 0, slave->data->phy_if); if (IS_ERR(slave->phy)) { dev_err(priv->dev, "phy %s not found on slave %d\n", slave->data->phy_id, slave->slave_num); diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 860e15d..5aa9e4d 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1600,7 +1600,7 @@ static int emac_dev_open(struct net_device *ndev) if (priv->phy_id && *priv->phy_id) { priv->phydev = phy_connect(ndev, priv->phy_id, - &emac_adjust_link, + &emac_adjust_link, 0, PHY_INTERFACE_MODE_MII); if (IS_ERR(priv->phydev)) { -- cgit v0.10.2 From da3a388231d98dadb89dc0d9f1842e9cbcc01bbc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:16:52 -0500 Subject: net/cpsw: revert f646968f8f7c624587de729115d802372b9063dd Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 0a211c5..d289638 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1634,7 +1634,7 @@ static int cpsw_probe_dual_emac(struct platform_device *pdev, priv_sl2->num_irqs = priv->num_irqs; } priv->irq_enabled = true; - ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + ndev->features |= NETIF_F_HW_VLAN_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); @@ -1872,7 +1872,7 @@ static int cpsw_probe(struct platform_device *pdev) k++; } - ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + ndev->features |= NETIF_F_HW_VLAN_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); -- cgit v0.10.2 From 0cbaf4d585e454a87a718be69185846ce774db4c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:16:53 -0500 Subject: net/cpsw: revert 80d5c3689b886308247da295a228a54df49a44f6 Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index d289638..f0d8fdf 100644 --- 
a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1286,7 +1286,7 @@ clean_vid: } static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev, - __be16 proto, u16 vid) + unsigned short vid) { struct cpsw_priv *priv = netdev_priv(ndev); @@ -1298,7 +1298,7 @@ static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev, } static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev, - __be16 proto, u16 vid) + unsigned short vid) { struct cpsw_priv *priv = netdev_priv(ndev); int ret; -- cgit v0.10.2 From 27b9d37a0f1dd165e195e4365df8808d2d51257a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:16:53 -0500 Subject: net/cpsw: fix missplaced init chunk this is a fixup for "net/cpsw: fix irq_disable() with threaded interrupts" where the assignment made into the slave and not into the main device. Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index f0d8fdf..3c97c21 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1633,7 +1633,6 @@ static int cpsw_probe_dual_emac(struct platform_device *pdev, priv_sl2->irqs_table[i] = priv->irqs_table[i]; priv_sl2->num_irqs = priv->num_irqs; } - priv->irq_enabled = true; ndev->features |= NETIF_F_HW_VLAN_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; @@ -1872,6 +1871,7 @@ static int cpsw_probe(struct platform_device *pdev) k++; } + priv->irq_enabled = true; ndev->features |= NETIF_F_HW_VLAN_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; -- cgit v0.10.2 From 9fa62c8906da587aa5d651ca11c55795f454d2fb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 Apr 2013 20:01:22 +0200 Subject: net/cpsw: use a lock around source testing For some reason on RT it happens that the TX interrupt fires over and over again but according to tx_stat there is nothing going on. Same goes for RX but not that often. With this lock around it this is gone. However I still see from time to time interrupts where each source is set to 0. 
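[ Illustrative sketch, not part of the patch: the pattern applied below serializes the hard-IRQ source check and the irq_enabled bookkeeping against the NAPI poll path with one driver-private spinlock. struct my_priv, my_read_irq_status() and my_mask_irqs() are hypothetical stand-ins, not cpsw symbols. ]

#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/netdevice.h>

struct my_priv {
	spinlock_t lock;		/* guards status read vs. poll */
	struct napi_struct napi;
	bool irq_enabled;
};

static u32 my_read_irq_status(struct my_priv *priv);	/* hypothetical helper */
static void my_mask_irqs(struct my_priv *priv);		/* hypothetical helper */

static irqreturn_t my_isr(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;
	unsigned long flags;
	u32 pending;

	spin_lock_irqsave(&priv->lock, flags);
	pending = my_read_irq_status(priv);
	if (!pending) {
		spin_unlock_irqrestore(&priv->lock, flags);
		return IRQ_NONE;		/* none of our sources fired */
	}
	if (priv->irq_enabled) {
		my_mask_irqs(priv);		/* poll path re-enables later */
		priv->irq_enabled = false;
	}
	spin_unlock_irqrestore(&priv->lock, flags);

	napi_schedule(&priv->napi);
	return IRQ_HANDLED;
}
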
Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 3c97c21..e660122 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -490,7 +490,7 @@ void cpsw_rx_handler(void *token, int len, int status) skb_put(skb, len); cpts_rx_timestamp(priv->cpts, skb); skb->protocol = eth_type_trans(skb, ndev); - netif_receive_skb(skb); + netif_rx(skb); priv->stats.rx_bytes += len; priv->stats.rx_packets++; } else { @@ -507,19 +507,24 @@ void cpsw_rx_handler(void *token, int len, int status) static irqreturn_t cpsw_interrupt(int irq, void *dev_id) { struct cpsw_priv *priv = dev_id; + unsigned long flags; u32 rx, tx, rx_thresh; + spin_lock_irqsave(&priv->lock, flags); rx_thresh = __raw_readl(&priv->wr_regs->rx_thresh_stat); rx = __raw_readl(&priv->wr_regs->rx_stat); tx = __raw_readl(&priv->wr_regs->tx_stat); - if (!rx_thresh && !rx && !tx) + if (!rx_thresh && !rx && !tx) { + spin_unlock_irqrestore(&priv->lock, flags); return IRQ_NONE; + } cpsw_intr_disable(priv); if (priv->irq_enabled == true) { cpsw_disable_irq(priv); priv->irq_enabled = false; } + spin_unlock_irqrestore(&priv->lock, flags); if (netif_running(priv->ndev)) { napi_schedule(&priv->napi); @@ -541,7 +546,9 @@ static int cpsw_poll(struct napi_struct *napi, int budget) { struct cpsw_priv *priv = napi_to_priv(napi); int num_tx, num_rx; + unsigned long flags; + spin_lock_irqsave(&priv->lock, flags); num_tx = cpdma_chan_process(priv->txch, 128); if (num_tx) cpdma_ctlr_eoi(priv->dma, CPDMA_EOI_TX); @@ -559,6 +566,7 @@ static int cpsw_poll(struct napi_struct *napi, int budget) prim_cpsw->irq_enabled = true; } } + spin_unlock_irqrestore(&priv->lock, flags); if (num_rx || num_tx) cpsw_dbg(priv, intr, "poll %d rx, %d tx pkts\n", -- cgit v0.10.2 From 4e688559f0176561b1956728329a3cf7045aa5d6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 16 Apr 2013 12:34:09 +0200 Subject: net/cpsw: Use fallback for active_slave In case the .dts has not been yet updated we also try to look for the old "cpts_active_slave" property. Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index e660122..1cfde0c 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1467,8 +1467,12 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, if (of_property_read_u32(node, "active_slave", &prop)) { pr_err("Missing active_slave property in the DT.\n"); - ret = -EINVAL; - goto error_ret; + if (of_property_read_u32(node, "cpts_active_slave", &prop)) { + ret = -EINVAL; + goto error_ret; + } else { + pr_err("Using old cpts_active_slave as fallback.\n"); + } } data->active_slave = prop; -- cgit v0.10.2 From fb2da1aeb8aa74df00f4b9dfc44e816d40ba1f6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:02 +0200 Subject: block: Shorten interrupt disabled regions Moving the blk_sched_flush_plug() call out of the interrupt/preempt disabled region in the scheduler allows us to replace local_irq_save/restore(flags) by local_irq_disable/enable() in blk_flush_plug(). Now instead of doing this we disable interrupts explicitely when we lock the request_queue and reenable them when we drop the lock. That allows interrupts to be handled when the plug list contains requests for more than one queue. Aside of that this change makes the scope of the irq disabled region more obvious. 
The current code confused the hell out of me when looking at: local_irq_save(flags); spin_lock(q->queue_lock); ... queue_unplugged(q...); scsi_request_fn(); spin_unlock(q->queue_lock); spin_lock(shost->host_lock); spin_unlock_irq(shost->host_lock); -------------------^^^ ???? spin_lock_irq(q->queue_lock); spin_unlock(q->lock); local_irq_restore(flags); Also add a comment to __blk_run_queue() documenting that q->request_fn() can drop q->queue_lock and reenable interrupts, but must return with q->queue_lock held and interrupts disabled. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20110622174919.025446432@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/block/blk-core.c b/block/blk-core.c index c973249..0ff1564 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2929,7 +2929,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, blk_run_queue_async(q); else __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) @@ -2977,7 +2977,6 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -2998,11 +2997,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. - */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -3015,7 +3009,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* @@ -3042,8 +3036,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) -- cgit v0.10.2 From b243750b7a4b82dea6f3b7d2b27dc230d976329a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2013 22:36:59 +0100 Subject: timekeeping-split-jiffies-lock.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba..ca9e113 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -67,7 +67,8 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -76,9 +77,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6..e8076c6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,13 +63,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the 
next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -130,9 +132,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f5c9207..ad8edee 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,7 +4,8 @@ #include #include -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e78feff..3365223 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -56,7 +56,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,7 +80,8 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } /* @@ -89,12 +91,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? 
*/ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -325,11 +329,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1612708..99f943b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1461,7 +1461,9 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } -- cgit v0.10.2 From ab6f717aa03edcae09003a6a05a4dcdbd1ddf9c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 21:32:10 +0200 Subject: mips-enable-interrupts-in-signal.patch Signed-off-by: Thomas Gleixner diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index b6aa770..bfcaea6 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -601,6 +601,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { local_irq_enable(); + preempt_check_resched(); /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) -- cgit v0.10.2 From 0f0ad79a0e233b732a11b1c2cddf3a84d725a534 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Sep 2011 12:24:30 -0500 Subject: tracing: Account for preempt off in preempt_schedule() The preempt_schedule() uses the preempt_disable_notrace() version because it can cause infinite recursion by the function tracer as the function tracer uses preempt_enable_notrace() which may call back into the preempt_schedule() code as the NEED_RESCHED is still set and the PREEMPT_ACTIVE has not been set yet. See commit: d1f74e20b5b064a130cd0743a256c2d3cfe84010 that made this change. The preemptoff and preemptirqsoff latency tracers require the first and last preempt count modifiers to enable tracing. But this skips the checks. Since we can not convert them back to the non notrace version, we can use the idle() hooks for the latency tracers here. That is, the start/stop_critical_timings() works well to manually start and stop the latency tracer for preempt off timings. Signed-off-by: Steven Rostedt Signed-off-by: Clark Williams Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e2f7c3..643549c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3073,7 +3073,16 @@ asmlinkage void __sched notrace preempt_schedule(void) do { add_preempt_count_notrace(PREEMPT_ACTIVE); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. 
+ */ + start_critical_timings(); __schedule(); + stop_critical_timings(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* -- cgit v0.10.2 From 5c487908cfce3a3690fd605cd203ff176dd6fe88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Sep 2011 19:57:12 +0200 Subject: signal-revert-ptrace-preempt-magic.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/signal.c b/kernel/signal.c index 50e425c..fee4a3e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1902,15 +1902,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); freezable_schedule(); } else { /* -- cgit v0.10.2 From ef4c54b94f379aa140b304f6758aadcbe65baf22 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 19 Sep 2011 14:51:14 -0700 Subject: preempt-rt: Convert arm boot_lock to raw The arm boot_lock is used by the secondary processor startup code. The locking task is the idle thread, which has idle->sched_class == &idle_sched_class. idle_sched_class->enqueue_task == NULL, so if the idle task blocks on the lock, the attempt to wake it when the lock becomes available will fail: try_to_wake_up() ... activate_task() enqueue_task() p->sched_class->enqueue_task(rq, p, flags) Fix by converting boot_lock to a raw spin lock. Signed-off-by: Frank Rowand Link: http://lkml.kernel.org/r/4E77B952.3010606@am.sony.com Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index c5c840e..d044812 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -71,7 +71,7 @@ static void __iomem *scu_base_addr(void) return (void __iomem *)(S5P_VA_SCU); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __cpuinit exynos_secondary_init(unsigned int cpu) { @@ -91,8 +91,8 @@ static void __cpuinit exynos_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -104,7 +104,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -133,7 +133,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct if (timeout == 0) { printk(KERN_ERR "cpu1 power enable failed"); - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return -ETIMEDOUT; } } @@ -161,7 +161,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? 
-ENOSYS : 0; } diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 7ed69b69..7b8043a 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -31,7 +31,7 @@ extern void msm_secondary_startup(void); -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static inline int get_core_count(void) { @@ -58,8 +58,8 @@ static void __cpuinit msm_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static __cpuinit void prepare_cold_cpu(unsigned int cpu) @@ -96,7 +96,7 @@ static int __cpuinit msm_boot_secondary(unsigned int cpu, struct task_struct *id * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -130,7 +130,7 @@ static int __cpuinit msm_boot_secondary(unsigned int cpu, struct task_struct *id * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index cd42d92..ef1fbcc 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -45,7 +45,7 @@ u16 pm44xx_errata; /* SCU base address */ static void __iomem *scu_base; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __iomem *omap4_get_scu_base(void) { @@ -76,8 +76,8 @@ static void __cpuinit omap4_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -90,7 +90,7 @@ static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct * * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * Update the AuxCoreBoot0 with boot state for secondary core. @@ -163,7 +163,7 @@ static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct * * Now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return 0; } diff --git a/arch/arm/mach-spear13xx/platsmp.c b/arch/arm/mach-spear13xx/platsmp.c index 2eaa3fa..bb61b1f 100644 --- a/arch/arm/mach-spear13xx/platsmp.c +++ b/arch/arm/mach-spear13xx/platsmp.c @@ -21,7 +21,7 @@ #include #include -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __iomem *scu_base = IOMEM(VA_SCU_BASE); @@ -44,8 +44,8 @@ static void __cpuinit spear13xx_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. 
*/ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -56,7 +56,7 @@ static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_stru * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -83,7 +83,7 @@ static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_stru * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index 3db7782..dcecae1 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -50,7 +50,7 @@ static void __iomem *scu_base_addr(void) return NULL; } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __cpuinit ux500_secondary_init(unsigned int cpu) { @@ -70,8 +70,8 @@ static void __cpuinit ux500_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -82,7 +82,7 @@ static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct * * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -103,7 +103,7 @@ static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct * * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c index 04ca493..20f9044 100644 --- a/arch/arm/plat-versatile/platsmp.c +++ b/arch/arm/plat-versatile/platsmp.c @@ -32,7 +32,7 @@ static void __cpuinit write_pen_release(int val) outer_clean_range(__pa(&pen_release), __pa(&pen_release + 1)); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit versatile_secondary_init(unsigned int cpu) { @@ -52,8 +52,8 @@ void __cpuinit versatile_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -64,7 +64,7 @@ int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idl * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * This is really belt and braces; we hold unintended secondary @@ -94,7 +94,7 @@ int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idl * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? 
-ENOSYS : 0; } -- cgit v0.10.2 From b79cd097fcdecea75eec849844c462bc5eff961c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Apr 2012 11:26:38 +0200 Subject: arm-omap-make-wakeupgen_lock-raw.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-omap2/omap-wakeupgen.c b/arch/arm/mach-omap2/omap-wakeupgen.c index 5d3b4f4..8633a43 100644 --- a/arch/arm/mach-omap2/omap-wakeupgen.c +++ b/arch/arm/mach-omap2/omap-wakeupgen.c @@ -46,7 +46,7 @@ static void __iomem *wakeupgen_base; static void __iomem *sar_base; -static DEFINE_SPINLOCK(wakeupgen_lock); +static DEFINE_RAW_SPINLOCK(wakeupgen_lock); static unsigned int irq_target_cpu[MAX_IRQS]; static unsigned int irq_banks = MAX_NR_REG_BANKS; static unsigned int max_irqs = MAX_IRQS; @@ -134,9 +134,9 @@ static void wakeupgen_mask(struct irq_data *d) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); _wakeupgen_clear(d->irq, irq_target_cpu[d->irq]); - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } /* @@ -146,9 +146,9 @@ static void wakeupgen_unmask(struct irq_data *d) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); _wakeupgen_set(d->irq, irq_target_cpu[d->irq]); - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } #ifdef CONFIG_HOTPLUG_CPU @@ -189,7 +189,7 @@ static void wakeupgen_irqmask_all(unsigned int cpu, unsigned int set) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); if (set) { _wakeupgen_save_masks(cpu); _wakeupgen_set_all(cpu, WKG_MASK_ALL); @@ -197,7 +197,7 @@ static void wakeupgen_irqmask_all(unsigned int cpu, unsigned int set) _wakeupgen_set_all(cpu, WKG_UNMASK_ALL); _wakeupgen_restore_masks(cpu); } - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } #endif -- cgit v0.10.2 From 0bad546518745928b4871fd595426886806dfe48 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:29:20 -0500 Subject: posix-timers: Prevent broadcast signals Posix timers should not send broadcast signals and kernel only signals. Prevent it. Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e885be1..4b7183c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -439,6 +439,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; + int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -447,7 +448,8 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || + sig_kernel_coredump(sig))) return NULL; return task_pid(rtn); -- cgit v0.10.2 From 6d955efe2a2e4c9610c2afd03790a4fb917df88c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:56 -0500 Subject: signals: Allow rt tasks to cache one sigqueue struct To avoid allocation allow rt tasks to cache one sigqueue struct in task struct. 
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 7e49270..b36847e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1408,6 +1408,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 0a89ffc..8942895 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -226,6 +226,7 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/kernel/exit.c b/kernel/exit.c index b4df219..8fb4a48 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->sighand = NULL; spin_unlock(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index 5630e52..e4cee21 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1230,6 +1230,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; diff --git a/kernel/signal.c b/kernel/signal.c index fee4a3e..9683ef8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -348,13 +348,45 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +#ifdef __HAVE_ARCH_CMPXCHG +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + return 1; +} + +#else + +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + return NULL; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + return 1; +} + +#endif + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -371,7 +403,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= task_rlimit(t, RLIMIT_SIGPENDING)) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -388,6 +423,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, 
gfp_t flags, + int override_rlimit) +{ + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -397,6 +439,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + atomic_dec(&up->sigpending); + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -410,6 +467,21 @@ void flush_sigqueue(struct sigpending *queue) } /* + * Called from __exit_signal. Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + +/* * Flush all pending signals for a task. */ void __flush_signals(struct task_struct *t) @@ -561,7 +633,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -607,6 +679,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -1545,7 +1619,8 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); + /* Preallocated sigqueue objects always from the slabcache ! */ + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); if (q) q->flags |= SIGQUEUE_PREALLOC; -- cgit v0.10.2 From 543a01ffb0e31f7954e7e605ecd03b520710a1e7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Apr 2013 03:16:55 -0500 Subject: signal/x86: Delay calling signals in atomic On x86_64 we must disable preemption before we enable interrupts for stack faults, int3 and debugging, because the current task is using a per CPU debug stack defined by the IST. If we schedule out, another task can come in and use the same stack and cause the stack to be corrupted and crash the kernel on return. When CONFIG_PREEMPT_RT_FULL is enabled, spin_locks become mutexes, and one of these is the spin lock used in signal handling. Some of the debug code (int3) causes do_trap() to send a signal. This function calls a spin lock that has been converted to a mutex and has the possibility to sleep. If this happens, the above issues with the corrupted stack is possible. Instead of calling the signal right away, for PREEMPT_RT and x86_64, the signal information is stored on the stacks task_struct and TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume code will send the signal when preemption is enabled. [ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT_FULL to ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. 
] Cc: stable-rt@vger.kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 111d272..aa4f8be 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -23,6 +23,19 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT_FULL are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. + */ +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d6bf1f3..4a3a5dd 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -808,6 +808,14 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(t->forced_info.si_signo, &t->forced_info, t); + t->forced_info.si_signo = 0; + } +#endif + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/include/linux/sched.h b/include/linux/sched.h index b36847e..b02a1b0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1413,6 +1413,10 @@ struct task_struct { sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT_FULL + /* TODO: move me into ->restart_block ? */ + struct siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; diff --git a/kernel/signal.c b/kernel/signal.c index 9683ef8..491cb9b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1302,8 +1302,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +static int +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; @@ -1328,6 +1328,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ +/* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. 
+ */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + if (WARN_ON_ONCE(t != current)) + return 0; + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = sig; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif + return do_force_sig_info(sig, info, t); +} + /* * Nuke all other threads in the group. */ -- cgit v0.10.2 From 67a47d9fa0d9c1c7c90973cb3e57f8612397377f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: drivers: random: Reduce preempt disabled region No need to keep preemption disabled across the whole function. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/char/random.c b/drivers/char/random.c index 32a6c57..50007e1 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -676,9 +676,12 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - ((__this_cpu_inc_return(trickle_count) - 1) & 0xfff)) - goto out; + ((__this_cpu_inc_return(trickle_count) - 1) & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); sample.num = num; @@ -719,8 +722,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) credit_entropy_bits(&input_pool, min_t(int, fls(delta>>1), 11)); } -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, -- cgit v0.10.2 From 64cc85425dd384d678e9772e202fb6316c946f1e Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Sat, 6 Mar 2010 17:47:10 +0100 Subject: ARM: AT91: PIT: Remove irq handler when clock event is unused Setup and remove the interrupt handler in clock event mode selection. This avoids calling the (shared) interrupt handler when the device is not used. 
Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index cafe988..0c36dcd 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -134,6 +134,7 @@ clkevt32k_mode(enum clock_event_mode mode, struct clock_event_device *dev) break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: + remove_irq(AT91_ID_SYS, &at91rm9200_timer_irq); case CLOCK_EVT_MODE_RESUME: irqmask = 0; break; diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c index 358412f..a6f9751 100644 --- a/arch/arm/mach-at91/at91sam926x_time.c +++ b/arch/arm/mach-at91/at91sam926x_time.c @@ -77,7 +77,7 @@ static struct clocksource pit_clk = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; - +static struct irqaction at91sam926x_pit_irq; /* * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16) */ @@ -86,6 +86,8 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) { switch (mode) { case CLOCK_EVT_MODE_PERIODIC: + /* Set up irq handler */ + setup_irq(AT91_ID_SYS, &at91sam926x_pit_irq); /* update clocksource counter */ pit_cnt += pit_cycle * PIT_PICNT(pit_read(AT91_PIT_PIVR)); pit_write(AT91_PIT_MR, (pit_cycle - 1) | AT91_PIT_PITEN @@ -98,6 +100,7 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) case CLOCK_EVT_MODE_UNUSED: /* disable irq, leaving the clocksource active */ pit_write(AT91_PIT_MR, (pit_cycle - 1) | AT91_PIT_PITEN); + remove_irq(AT91_ID_SYS, &at91sam926x_pit_irq); break; case CLOCK_EVT_MODE_RESUME: break; -- cgit v0.10.2 From d3b42f3a4e2b3fdbb3aa7d54a397f9005bbd6878 Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Mon, 8 Mar 2010 18:57:04 +0100 Subject: clocksource: TCLIB: Allow higher clock rates for clock events By default the TCLIB uses the 32KiHz base clock rate for clock events. Add a compile time selection to allow higher clock resolution. Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 32cb929..ac0bb2e 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -23,8 +23,7 @@ * this 32 bit free-running counter. the second channel is not used. * * - The third channel may be used to provide a 16-bit clockevent - * source, used in either periodic or oneshot mode. This runs - * at 32 KiHZ, and can handle delays of up to two seconds. + * source, used in either periodic or oneshot mode. * * A boot clocksource and clockevent source are also currently needed, * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so @@ -74,6 +73,7 @@ static struct clocksource clksrc = { struct tc_clkevt_device { struct clock_event_device clkevt; struct clk *clk; + u32 freq; void __iomem *regs; }; @@ -82,13 +82,6 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt) return container_of(clkevt, struct tc_clkevt_device, clkevt); } -/* For now, we always use the 32K clock ... this optimizes for NO_HZ, - * because using one of the divided clocks would usually mean the - * tick rate can never be less than several dozen Hz (vs 0.5 Hz). - * - * A divided clock could be good for high resolution timers, since - * 30.5 usec resolution can seem "low".
- */ static u32 timer_clock; static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) @@ -111,11 +104,12 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_PERIODIC: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and restart */ + /* count up to RC, then irq and restart */ __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); - __raw_writel((32768 + HZ/2) / HZ, tcaddr + ATMEL_TC_REG(2, RC)); + __raw_writel((tcd->freq + HZ/2)/HZ, + tcaddr + ATMEL_TC_REG(2, RC)); /* Enable clock and interrupts on RC compare */ __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER)); @@ -128,7 +122,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_ONESHOT: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and stop */ + /* count up to RC, then irq and stop */ __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); @@ -158,8 +152,12 @@ static struct tc_clkevt_device clkevt = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK /* Should be lower than at91rm9200's system timer */ .rating = 125, +#else + .rating = 200, +#endif .set_next_event = tc_next_event, .set_mode = tc_mode, }, @@ -185,8 +183,9 @@ static struct irqaction tc_irqaction = { .handler = ch2_irq, }; -static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) +static void __init setup_clkevents(struct atmel_tc *tc, int divisor_idx) { + unsigned divisor = atmel_tc_divisors[divisor_idx]; struct clk *t2_clk = tc->clk[2]; int irq = tc->irq[2]; @@ -194,11 +193,17 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) clkevt.clk = t2_clk; tc_irqaction.dev_id = &clkevt; - timer_clock = clk32k_divisor_idx; + timer_clock = divisor_idx; - clkevt.clkevt.mult = div_sc(32768, NSEC_PER_SEC, clkevt.clkevt.shift); - clkevt.clkevt.max_delta_ns - = clockevent_delta2ns(0xffff, &clkevt.clkevt); + if (!divisor) + clkevt.freq = 32768; + else + clkevt.freq = clk_get_rate(t2_clk)/divisor; + + clkevt.clkevt.mult = div_sc(clkevt.freq, NSEC_PER_SEC, + clkevt.clkevt.shift); + clkevt.clkevt.max_delta_ns = + clockevent_delta2ns(0xffff, &clkevt.clkevt); clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1; clkevt.clkevt.cpumask = cpumask_of(0); @@ -327,8 +332,11 @@ static int __init tcb_clksrc_init(void) clocksource_register_hz(&clksrc, divided_rate); /* channel 2: periodic and oneshot timer support */ +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK setup_clkevents(tc, clk32k_divisor_idx); - +#else + setup_clkevents(tc, best_divisor_idx); +#endif return 0; } arch_initcall(tcb_clksrc_init); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index b151b7c..e94f19d 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -78,8 +78,7 @@ config ATMEL_TCB_CLKSRC are combined to make a single 32-bit timer. When GENERIC_CLOCKEVENTS is defined, the third timer channel - may be used as a clock event device supporting oneshot mode - (delays of up to two seconds) based on the 32 KiHz clock. + may be used as a clock event device supporting oneshot mode. config ATMEL_TCB_CLKSRC_BLOCK int @@ -93,6 +92,14 @@ config ATMEL_TCB_CLKSRC_BLOCK TC can be used for other purposes, such as PWM generation and interval timing. 
+config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK + bool "TC Block use 32 KiHz clock" + depends on ATMEL_TCB_CLKSRC + default y + help + Select this to use 32 KiHz base clock rate as TC block clock + source for clock events. + config IBM_ASM tristate "Device driver for IBM RSA service processor" depends on X86 && PCI && INPUT -- cgit v0.10.2 From 5a939076a92a302c82d93d1ae4ebbe79387eb363 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:18 -0500 Subject: drivers/net: tulip_remove_one needs to call pci_disable_device() Otherwise the device is not completely shut down. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 1e9443d..d25961b 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1943,6 +1943,7 @@ static void tulip_remove_one(struct pci_dev *pdev) pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ -- cgit v0.10.2 From ed4596f4ffcc53c1158a64ea43417ca4981f5d35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:24 -0500 Subject: drivers/net: Use disable_irq_nosync() in 8139too Use disable_irq_nosync() instead of disable_irq() as this might be called in atomic context with netpoll. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 5dc1616..3bed27d 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2216,7 +2216,7 @@ static void rtl8139_poll_controller(struct net_device *dev) struct rtl8139_private *tp = netdev_priv(dev); const int irq = tp->pci_dev->irq; - disable_irq(irq); + disable_irq_nosync(irq); rtl8139_interrupt(irq, dev); enable_irq(irq); } -- cgit v0.10.2 From 0f6faa2ae9ba93eb01cefa34dcfb9ba305a33ea8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:37 -0500 Subject: mm: Prepare decoupling the page fault disabling logic Add a pagefault_disabled variable to task_struct to allow decoupling the pagefault-disabled logic from the preempt count. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index b02a1b0..c57929c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1453,6 +1453,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951..9414a1b 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,37 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. - * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - inc_preempt_count(); - /* - * make sure to have issued the store before a pagefault - * can hit. 
- */ - barrier(); -} - -static inline void pagefault_enable(void) -{ - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); - preempt_check_resched(); -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS diff --git a/kernel/fork.c b/kernel/fork.c index e4cee21..4349b83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,6 +1285,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; diff --git a/mm/memory.c b/mm/memory.c index 32a495a..0b1fbfd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3717,6 +3717,35 @@ unlock: return 0; } +void pagefault_disable(void) +{ + inc_preempt_count(); + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; + dec_preempt_count(); + /* + * make sure we do.. + */ + barrier(); + preempt_check_resched(); +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ -- cgit v0.10.2 From 1d24a2ea21323c00e8b143b170fe1db68b591675 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Mar 2011 11:32:28 +0100 Subject: mm: Fixup all fault handlers to check current->pagefault_disable Necessary for decoupling pagefault disable from preempt count. Signed-off-by: Thomas Gleixner diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 0c4132d..f7d6c49 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -108,7 +108,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. */ - if (!mm || in_atomic()) + if (!mm || in_atomic() || current->pagefault_disabled) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 5dbf13f..56269df 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -279,7 +279,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b2f2d2d..4850f93 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,7 +81,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) + if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || + current->pagefault_disabled) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 73312ab..c82f44c 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -114,7 +114,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 331c1e2..6372088 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 6cf0341..ecac819 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -98,7 +98,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 3cdfa9c..1eec8af 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index a563727..152361e 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -85,7 +85,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 714b35a..fb9add1 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -108,7 +108,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index ddcec1e..63fec6e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; retry: diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index d48a84f..b245bdf 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 18162ce..09ecc8a 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -176,7 +176,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long acc_type; int fault; - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3a8489a..feef7a6 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -259,7 +259,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2fb9e63..a5fb7bc 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -296,7 +296,8 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || + tsk->pagefault_disabled)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; @@ -435,7 +436,8 @@ void __kprobes do_asce_exception(struct pt_regs *regs) clear_tsk_thread_flag(current, TIF_PER_TRAP); trans_exc_code = regs->int_parm_long; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || + current->pagefault_disabled())); goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 47b600e..4c12824 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -72,7 +72,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1f49c28..c89dd12 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e98bfda..ef35d69 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -200,7 +200,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 5062ff3..e852238 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -321,7 +321,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_enabled) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 3d2b81c..a0fba28 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -360,7 +360,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f398..bd897f6 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -39,7 +39,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. */ - if (in_atomic()) + if (in_atomic() || current->pagefault_disabled) goto out_nosemaphore; retry: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4f7d793..fd0964b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1110,7 +1110,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 4b7bc8d..0d82503 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { bad_page_fault(regs, address, SIGSEGV); return; } -- cgit v0.10.2 From f144545a86be62b9138ca6a0ab56a51561d33292 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:31:31 +0200 Subject: mm: pagefault_disabled() Wrap the test for pagefault_disabled() into a helper, this allows us to remove the need for current->pagefault_disabled on !-rt kernels. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3yy517m8zsi9fpsf14xfaqkw@git.kernel.org diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index f7d6c49..fb9eaa4 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -108,7 +108,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. 
*/ - if (!mm || in_atomic() || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 56269df..149bab5 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -279,7 +279,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; /* diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 4850f93..9577e69 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,8 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || - current->pagefault_disabled) + if (!mm || regs->sr & SYSREG_BIT(GM) || pagefault_disabled()) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c82f44c..1a403d9 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -114,7 +114,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 6372088..e87972c 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index ecac819..dd88415 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -98,7 +98,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 1eec8af..6945056 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 152361e..9ea40db 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -85,7 +85,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index fb9add1..dc1d8c1 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -108,7 +108,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 63fec6e..90d193a 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; retry: diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index b245bdf..34a83b9 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 09ecc8a..df22f39 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -176,7 +176,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long acc_type; int fault; - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index feef7a6..66fdd82 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -259,7 +259,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index a5fb7bc..62d659d 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -296,8 +296,8 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. 
*/ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || - tsk->pagefault_disabled)) + if (unlikely(!user_space_fault(trans_exc_code) || + !mm || pagefault_disabled())) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; @@ -436,8 +436,8 @@ void __kprobes do_asce_exception(struct pt_regs *regs) clear_tsk_thread_flag(current, TIF_PER_TRAP); trans_exc_code = regs->int_parm_long; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || - current->pagefault_disabled())); + if (unlikely(!user_space_fault(trans_exc_code) || !mm || + pagefault_disabled())) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 4c12824..59fccbe 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -72,7 +72,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index c89dd12..8ff1613 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index ef35d69..18cbe13 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -200,7 +200,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index e852238..2764ac6 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -321,7 +321,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_enabled) + if (!mm || pagefault_disabled()) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index a0fba28..1ba0ccc 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -360,7 +360,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index bd897f6..991b33a 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -39,7 +39,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. 
*/ - if (in_atomic() || current->pagefault_disabled) + if (pagefault_disabled()) goto out_nosemaphore; retry: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd0964b..9cc2653 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1110,7 +1110,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 0d82503..d57c257 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { bad_page_fault(regs, address, SIGSEGV); return; } diff --git a/include/linux/sched.h b/include/linux/sched.h index c57929c..7fbfd4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include @@ -1453,7 +1454,9 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_PREEMPT_RT_FULL int pagefault_disabled; +#endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; @@ -1629,6 +1632,17 @@ static inline void set_numabalancing_state(bool enabled) } #endif +#ifdef CONFIG_PREEMPT_RT_FULL +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; } +#else +static inline bool cur_pf_disabled(void) { return false; } +#endif + +static inline bool pagefault_disabled(void) +{ + return in_atomic() || cur_pf_disabled(); +} + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH diff --git a/kernel/fork.c b/kernel/fork.c index 4349b83..6b22be6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,7 +1285,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif +#ifdef CONFIG_PREEMPT_RT_FULL p->pagefault_disabled = 0; +#endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; -- cgit v0.10.2 From 11d43f0e8e974e600a18c3db67b986afddd42a3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Aug 2011 17:16:58 +0200 Subject: mm: raw_pagefault_disable Adding migrate_disable() to pagefault_disable() to preserve the per-cpu thing for kmap_atomic might not have been the best of choices. But short of adding preempt_disable/migrate_disable foo all over the kmap code it still seems the best way. It does however yield the below borkage as well as wreck !-rt builds since !-rt does rely on pagefault_disable() not preempting. So fix all that up by adding raw_pagefault_disable(). [] warn_slowpath_common+0x85/0x9d [] warn_slowpath_fmt+0x46/0x48 [] ? _raw_spin_lock+0x6c/0x73 [] ? watchdog_overflow_callback+0x9b/0xd0 [] watchdog_overflow_callback+0xb7/0xd0 [] __perf_event_overflow+0x11c/0x1fe [] ? perf_event_update_userpage+0x149/0x151 [] ? 
perf_event_task_disable+0x7c/0x7c [] perf_event_overflow+0x14/0x16 [] x86_pmu_handle_irq+0xcb/0x108 [] perf_event_nmi_handler+0x46/0x91 [] notifier_call_chain+0x79/0xa6 [] __atomic_notifier_call_chain+0x66/0x98 [] ? notifier_call_chain+0xa6/0xa6 [] atomic_notifier_call_chain+0x14/0x16 [] notify_die+0x2e/0x30 [] do_nmi+0x7e/0x22b [] nmi+0x1a/0x2c [] ? sub_preempt_count+0x4b/0xaa <> [] delay_tsc+0xac/0xd1 [] __delay+0xf/0x11 [] do_raw_spin_lock+0xd2/0x13c [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? task_rq_lock+0x35/0x8d [] task_rq_lock+0x35/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] show_trace_log_lvl+0x54/0x5d [] show_trace+0x15/0x17 [] dump_stack+0x77/0x80 [] spin_bug+0x9c/0xa3 [] ? task_rq_lock+0x50/0x8d [] do_raw_spin_lock+0x47/0x13c [] _raw_spin_lock+0x60/0x73 [] ? task_rq_lock+0x50/0x8d [] task_rq_lock+0x50/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] save_stack_trace+0x2f/0x4c [] save_trace+0x3f/0xaf [] mark_lock+0x228/0x530 [] __lock_acquire+0x662/0x1812 [] ? native_sched_clock+0x37/0x6d [] ? trace_hardirqs_off_caller+0x1f/0x99 [] ? sched_rt_period_timer+0xbd/0x218 [] lock_acquire+0x145/0x18a [] ? sched_rt_period_timer+0xbd/0x218 [] _raw_spin_lock+0x40/0x73 [] ? sched_rt_period_timer+0xbd/0x218 [] sched_rt_period_timer+0xbd/0x218 [] __run_hrtimer+0x1e4/0x347 [] ? can_migrate_task.clone.82+0x14a/0x14a [] hrtimer_interrupt+0xee/0x1d6 [] ? add_preempt_count+0xae/0xb2 [] smp_apic_timer_interrupt+0x85/0x98 [] apic_timer_interrupt+0x13/0x20 Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-31keae8mkjiv8esq4rl76cib@git.kernel.org diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9414a1b..44b3751 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,8 +8,34 @@ * These routines enable/disable the pagefault handler in that * it will not take any MM locks and go straight to the fixup table. 
*/ +static inline void raw_pagefault_disable(void) +{ + inc_preempt_count(); + barrier(); +} + +static inline void raw_pagefault_enable(void) +{ + barrier(); + dec_preempt_count(); + barrier(); + preempt_check_resched(); +} + +#ifndef CONFIG_PREEMPT_RT_FULL +static inline void pagefault_disable(void) +{ + raw_pagefault_disable(); +} + +static inline void pagefault_enable(void) +{ + raw_pagefault_enable(); +} +#else extern void pagefault_disable(void); extern void pagefault_enable(void); +#endif #ifndef ARCH_HAS_NOCACHE_UACCESS @@ -50,9 +76,9 @@ static inline unsigned long __copy_from_user_nocache(void *to, mm_segment_t old_fs = get_fs(); \ \ set_fs(KERNEL_DS); \ - pagefault_disable(); \ + raw_pagefault_disable(); \ ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ - pagefault_enable(); \ + raw_pagefault_enable(); \ set_fs(old_fs); \ ret; \ }) diff --git a/mm/memory.c b/mm/memory.c index 0b1fbfd..f52c023 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3717,6 +3717,7 @@ unlock: return 0; } +#ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { inc_preempt_count(); @@ -3745,6 +3746,7 @@ void pagefault_enable(void) preempt_check_resched(); } EXPORT_SYMBOL(pagefault_enable); +#endif /* * By the time we get here, we already hold the mm semaphore -- cgit v0.10.2 From 1ae65057e506c77ec70212be622b7e8c1909715f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 18:56:24 +0200 Subject: filemap-fix-up.patch Signed-off-by: Thomas Gleixner Wrecked-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-m6yuzd6ul717hlnl2gj6p3ou@git.kernel.org diff --git a/mm/filemap.c b/mm/filemap.c index 83efee7..cb81968 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1955,7 +1955,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!in_atomic()); + BUG_ON(!pagefault_disabled()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; -- cgit v0.10.2 From 9b3dfb40154505b980b7817098adf9cd5caf7592 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 22:06:27 +0200 Subject: mm: Remove preempt count from pagefault disable/enable Now that all users are cleaned up, we can remove the preemption count. Signed-off-by: Thomas Gleixner diff --git a/mm/memory.c b/mm/memory.c index f52c023..cb7094f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3720,7 +3720,6 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { - inc_preempt_count(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault @@ -3738,12 +3737,6 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); - preempt_check_resched(); } EXPORT_SYMBOL(pagefault_enable); #endif -- cgit v0.10.2 From 69699eaaf713afcdf462bce937817f7fb6364cb9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jul 2010 10:29:00 +0200 Subject: suspend: Prevent might sleep splats timekeeping suspend/resume calls read_persistent_clock() which takes rtc_lock. That results in might sleep warnings because at that point we run with interrupts disabled. We cannot convert rtc_lock to a raw spinlock as that would trigger other might sleep warnings. As a temporary workaround we disable the might sleep warnings by setting system_state to SYSTEM_SUSPEND before calling sysdev_suspend() and restoring it to SYSTEM_RUNNING after sysdev_resume(). Needs to be revisited.
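The new state is meant to be consumed by the sleep debugging checks; a minimal sketch of such a guard (illustrative only, not the actual in-tree check) is:

#include <linux/kernel.h>

/* Hypothetical helper: skip might-sleep style warnings while the
 * suspend/resume core runs with interrupts disabled. */
static inline bool sleep_warnings_enabled(void)
{
	return system_state != SYSTEM_SUSPEND;
}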
Signed-off-by: Thomas Gleixner diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c566927..8b3086d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -412,6 +412,7 @@ extern enum system_states { SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, + SYSTEM_SUSPEND, } system_state; #define TAINT_PROPRIETARY_MODULE 0 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1..3321e2b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -275,6 +275,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " @@ -302,6 +304,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -427,6 +430,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -460,6 +464,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -542,6 +547,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -554,6 +560,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446..ff2dade 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -165,6 +165,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -175,6 +177,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); -- cgit v0.10.2 From 143e48ff08e767c612967fd6a812e64391fbcb01 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011 11:22:36 +0200 Subject: list-add-list-last-entry.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/list.h b/include/linux/list.h index cc6d2aa..7a9851b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -362,6 +362,17 @@ static inline void list_splice_tail_init(struct list_head *list, list_entry((ptr)->next, type, member) /** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. 
-- cgit v0.10.2 From a67c4f0854c03fa166677c2166879eeb45f8c531 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011 11:24:35 +0200 Subject: mm-page-alloc-use-list-last-entry.patch Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a83cd3..f61bc0f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -658,7 +658,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); -- cgit v0.10.2 From 47d77c0dc2a2f46387d98120ed6ea1365a45bfe7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jun 2011 10:42:04 +0200 Subject: mm-slab-move-debug-out.patch Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index e7667a3..2aa9fd4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3895,10 +3895,10 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) if (!cachep) return; - local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); + local_irq_save(flags); __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); @@ -3924,12 +3924,12 @@ void kfree(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); + local_irq_save(flags); __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); } -- cgit v0.10.2 From db0367c1df306d4a22eeee303c74934f8922f964 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Jul 2011 21:24:27 +0200 Subject: rwsem-inlcude-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/pid.h b/include/linux/pid.h index 2381c97..3b67343 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,6 +2,7 @@ #define _LINUX_PID_H #include +#include enum pid_type { -- cgit v0.10.2 From a9cb14c7be732b65f6838339b5f310c086453a76 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:52:34 +0100 Subject: sysctl-include-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 14a8ff2..b15655f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include /* For the /proc/sys support */ -- cgit v0.10.2 From cc5644bd7858c56db9d1142e0456d7c1dd3b38a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 10:59:58 +0200 Subject: net-flip-lock-dep-thingy.patch ======================================================= [ INFO: possible circular locking dependency detected ] 3.0.0-rc3+ #26 ------------------------------------------------------- ip/1104 is trying to acquire lock: (local_softirq_lock){+.+...}, at: [] __local_lock+0x25/0x68 but task is already holding lock: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+...}: [] lock_acquire+0x103/0x12e [] lock_sock_nested+0x82/0x92 [] lock_sock+0x10/0x12 [] tcp_close+0x1b/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b -> #0 (local_softirq_lock){+.+...}: [] __lock_acquire+0xacc/0xdc8 [] lock_acquire+0x103/0x12e [] _raw_spin_lock+0x3b/0x4a [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock(local_softirq_lock); lock(sk_lock-AF_INET); lock(local_softirq_lock); *** DEADLOCK *** 1 lock held by ip/1104: #0: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 stack backtrace: Pid: 1104, comm: ip Not tainted 3.0.0-rc3+ #26 Call Trace: [] print_circular_bug+0x1f8/0x209 [] __lock_acquire+0xacc/0xdc8 [] ? __local_lock+0x25/0x68 [] lock_acquire+0x103/0x12e [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x11/0x41 [] _raw_spin_lock+0x3b/0x4a [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x28/0x41 [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] ? lock_sock+0x10/0x12 [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b Signed-off-by: Thomas Gleixner diff --git a/net/core/sock.c b/net/core/sock.c index bc131d4..45a50ee 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2287,12 +2287,11 @@ void lock_sock_nested(struct sock *sk, int subclass) if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); -- cgit v0.10.2 From 77636e34a1875630ce9a93424c6bc52bcb5d8c78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:44:15 +0200 Subject: softirq-thread-do-softirq.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5fa5afe..5fc9d8b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -443,6 +443,7 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); +static inline void thread_do_softirq(void) { do_softirq(); } extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/net/core/dev.c b/net/core/dev.c index 5d9c43d..f6bc659 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3117,7 +3117,7 @@ int netif_rx_ni(struct sk_buff *skb) preempt_disable(); err = netif_rx(skb); if (local_softirq_pending()) - do_softirq(); + thread_do_softirq(); preempt_enable(); return err; -- cgit v0.10.2 From 7cf0a06821afabc15dc5050d48a2174bb5dd50f4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:46:49 +0200 Subject: softirq-split-out-code.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c 
b/kernel/softirq.c index ed567ba..18784bb 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,6 +77,34 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +static void handle_pending_softirqs(u32 pending, int cpu) +{ + struct softirq_action *h = softirq_vec; + unsigned int prev_count = preempt_count(); + + local_irq_enable(); + for ( ; pending; h++, pending >>= 1) { + unsigned int vec_nr = h - softirq_vec; + + if (!(pending & 1)) + continue; + + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR + "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qs(cpu); + } + local_irq_disable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -207,7 +235,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { - struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; @@ -224,7 +251,7 @@ asmlinkage void __do_softirq(void) vtime_account_irq_enter(current); __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -232,36 +259,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - - h = softirq_vec; - - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count() = prev_count; - } - - rcu_bh_qs(cpu); - } - h++; - pending >>= 1; - } while (pending); - - local_irq_disable(); + handle_pending_softirqs(pending, cpu); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v0.10.2 From 57b5c8653c9cbda2445b5c361fe04536d8ed5627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:27 -0500 Subject: x86: Do not unmask io_apic when interrupt is in progress With threaded interrupts we might see an interrupt in progress on migration. Do not unmask it when this is the case. 
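The resulting condition, as a standalone sketch (the real check is folded into ioapic_irqd_mask() in the diff below):

#include <linux/irq.h>

/* Illustrative predicate: mask for migration only when an affinity change
 * is pending and the irq is not currently being handled. */
static inline bool can_mask_for_move(struct irq_data *data)
{
	return irqd_is_setaffinity_pending(data) && !irqd_irq_inprogress(data);
}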
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b739d39..aaa6399 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2428,7 +2428,8 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ - if (unlikely(irqd_is_setaffinity_pending(data))) { + if (unlikely(irqd_is_setaffinity_pending(data) && + !irqd_irq_inprogress(data))) { mask_ioapic(cfg); return true; } -- cgit v0.10.2 From c17fc8c6bc82c7d7184e7f6dd5d7546ed735afbf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 30 Apr 2013 03:16:58 -0500 Subject: x86: Do not disable preemption in int3 on 32bit Preemption must be disabled before enabling interrupts in do_trap on x86_64 because the stack in use for int3 and debug is a per CPU stack set by the IST. But 32bit does not have an IST and the stack still belongs to the current task and there is no problem in scheduling out the task. Keep preemption enabled on X86_32 when enabling interrupts for do_trap(). The name of the function is changed from preempt_conditional_sti/cli() to conditional_sti/cli_ist(), to annotate that this function is used when the stack is on the IST. Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca1..be18ff6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -85,9 +85,21 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void conditional_sti_ist(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 + /* + * X86_64 uses a per CPU stack on the IST for certain traps + * like int3. The task can not be preempted when using one + * of these stacks, thus preemption must be disabled, otherwise + * the stack can be corrupted if the task is scheduled out, + * and another task comes in and uses this stack. + * + * On x86_32 the task keeps its own stack and it is OK if the + * task schedules out. + */ inc_preempt_count(); +#endif if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -98,11 +110,13 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void conditional_cli_ist(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); +#ifdef CONFIG_X86_64 dec_preempt_count(); +#endif } static int __kprobes @@ -229,9 +243,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) exception_enter(regs); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); + conditional_sti_ist(regs); do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); } exception_exit(regs); } @@ -331,9 +345,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co * as we may switch to the interrupt stack.
*/ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + conditional_sti_ist(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: exception_exit(regs); @@ -438,12 +452,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + conditional_sti_ist(regs); if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); goto exit; } @@ -463,7 +477,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: -- cgit v0.10.2 From 844df9e2244e38ee9770de467b32bb72f1a49d3e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:07:16 +0100 Subject: pci: Use __wake_up_all_locked pci_unblock_user_cfg_access() The waitqueue is protected by the pci_lock, so we can just avoid to lock the waitqueue lock itself. That prevents the might_sleep()/scheduling while atomic problem on RT Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 32046c5..0941838 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -465,7 +465,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev) WARN_ON(!dev->block_cfg_access); dev->block_cfg_access = 0; - wake_up_all(&pci_cfg_wait); + wake_up_all_locked(&pci_cfg_wait); raw_spin_unlock_irqrestore(&pci_lock, flags); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); -- cgit v0.10.2 From 8dc1cce0068dc61990519e527e6939e79337d891 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 14:03:41 +0100 Subject: latency-hist.patch This patch provides a recording mechanism to store data of potential sources of system latencies. The recordings separately determine the latency caused by a delayed timer expiration, by a delayed wakeup of the related user space program and by the sum of both. The histograms can be enabled and reset individually. The data are accessible via the debug filesystem. For details please consult Documentation/trace/histograms.txt. Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt new file mode 100644 index 0000000..6f2aeab --- /dev/null +++ b/Documentation/trace/histograms.txt @@ -0,0 +1,186 @@ + Using the Linux Kernel Latency Histograms + + +This document gives a short explanation how to enable, configure and use +latency histograms. Latency histograms are primarily relevant in the +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT) +and are used in the quality management of the Linux real-time +capabilities. + + +* Purpose of latency histograms + +A latency histogram continuously accumulates the frequencies of latency +data. There are two types of histograms +- potential sources of latencies +- effective latencies + + +* Potential sources of latencies + +Potential sources of latencies are code segments where interrupts, +preemption or both are disabled (aka critical sections). 
To create +histograms of potential sources of latency, the kernel stores the time +stamp at the start of a critical section, determines the time elapsed +when the end of the section is reached, and increments the frequency +counter of that latency value - irrespective of whether any concurrently +running process is affected by latency or not. +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_INTERRUPT_OFF_LATENCY + CONFIG_PREEMPT_OFF_LATENCY + + +* Effective latencies + +Effective latencies are those that actually occur during wakeup of a process. To +determine effective latencies, the kernel stores the time stamp when a +process is scheduled to be woken up, and determines the duration of the +wakeup time shortly before control is passed over to this process. Note +that the apparent latency in user space may be somewhat longer, since the +process may be interrupted after control is passed over to it but before +the execution in user space takes place. Simply measuring the interval +between enqueuing and wakeup may also not be appropriate in cases when a +process is scheduled as a result of a timer expiration. The timer may have +missed its deadline, e.g. due to disabled interrupts, but this latency +would not be registered. Therefore, the offsets of missed timers are +recorded in a separate histogram. If both wakeup latency and missed timer +offsets are configured and enabled, a third histogram may be enabled that +records the overall latency as a sum of the timer latency, if any, and the +wakeup latency. This histogram is called "timerandwakeup". +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_WAKEUP_LATENCY + CONFIG_MISSED_TIMER_OFFSETS + + +* Usage + +The interface to the administration of the latency histograms is located +in the debugfs file system. To mount it, either enter + +mount -t sysfs nodev /sys +mount -t debugfs nodev /sys/kernel/debug + +from shell command line level, or add + +nodev /sys sysfs defaults 0 0 +nodev /sys/kernel/debug debugfs defaults 0 0 + +to the file /etc/fstab. All latency histogram related files are then +available in the directory /sys/kernel/debug/tracing/latency_hist. A +particular histogram type is enabled by writing non-zero to the related +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory. +Select "preemptirqsoff" for the histograms of potential sources of +latencies and "wakeup" for histograms of effective latencies etc. The +histogram data - one per CPU - are available in the files + +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx + +The histograms are reset by writing non-zero to the file "reset" in a +particular latency directory. To reset all latency data, use + +#!/bin/sh + +TRACINGDIR=/sys/kernel/debug/tracing +HISTDIR=$TRACINGDIR/latency_hist + +if test -d $HISTDIR +then + cd $HISTDIR + for i in `find . | grep /reset$` + do + echo 1 >$i + done +fi + + +* Data format + +Latency data are stored with a resolution of one microsecond. The +maximum latency is 10,240 microseconds. The data are only valid if the +overflow register is empty.
Every output line contains the latency in +microseconds in the first column and the number of samples in the second +column. To display only lines with a positive latency count, use, for +example, + +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 + +#Minimum latency: 0 microseconds. +#Average latency: 0 microseconds. +#Maximum latency: 25 microseconds. +#Total samples: 3104770694 +#There are 0 samples greater or equal than 10240 microseconds +#usecs samples + 0 2984486876 + 1 49843506 + 2 58219047 + 3 5348126 + 4 2187960 + 5 3388262 + 6 959289 + 7 208294 + 8 40420 + 9 4485 + 10 14918 + 11 18340 + 12 25052 + 13 19455 + 14 5602 + 15 969 + 16 47 + 17 18 + 18 14 + 19 1 + 20 3 + 21 2 + 22 5 + 23 2 + 25 1 + + +* Wakeup latency of a selected process + +To only collect wakeup latency data of a particular process, write the +PID of the requested process to + +/sys/kernel/debug/tracing/latency_hist/wakeup/pid + +PIDs are not considered if this variable is set to 0. + + +* Details of the process with the highest wakeup latency so far + +Selected data of the process that suffered from the highest wakeup +latency that occurred in a particular CPU are available in the file + +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx. + +In addition, other relevant system data at the time when the +latency occurred are given. + +The format of the data is (all in one line): +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \ +<- <PID> <Priority> <Command> <Timestamp> + +The value of <Timeroffset> is only relevant in the combined timer +and wakeup latency recording. In the wakeup recording, it is +always 0, in the missed_timer_offsets recording, it is the same +as <Latency>. + +When retrospectively searching for the origin of a latency and +tracing was not enabled, it may be helpful to know the name and +some basic data of the task that (finally) was switching to the +late real-time task. In addition to the victim's data, also the +data of the possible culprit are therefore displayed after the +"<-" symbol. + +Finally, the timestamp of the time when the latency occurred +in <seconds>.<microseconds> after the most recent system boot +is provided. + +These data are also reset when the wakeup histogram is reset.
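As a minimal usage sketch based only on the interface described above (the PID 1234 and the CPU number are example values, not part of the interface), enabling the wakeup histogram, restricting it to a single task, and reading back the per-CPU data could look like this:

#!/bin/sh
# Sketch: paths follow the documentation above; debugfs must already be
# mounted as described in the Usage section.
HISTDIR=/sys/kernel/debug/tracing/latency_hist

echo 1 >$HISTDIR/enable/wakeup        # start recording wakeup latencies
echo 1234 >$HISTDIR/wakeup/pid        # optional: limit recording to PID 1234 (example value)

# ... let the system run under load, then inspect CPU0 ...
grep -v " 0$" $HISTDIR/wakeup/CPU0    # histogram lines with a non-zero sample count
cat $HISTDIR/wakeup/max_latency-CPU0  # worst-case record seen on CPU0
echo 1 >$HISTDIR/wakeup/reset         # clear the wakeup histograms again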
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cc07d27..5b9e789 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,9 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + ktime_t praecox; +#endif #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/include/linux/sched.h b/include/linux/sched.h index 7fbfd4d..cb81412 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1599,6 +1599,12 @@ struct task_struct { unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + long timer_offset; +#endif +#endif #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ struct memcg_batch_info { diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h new file mode 100644 index 0000000..28646db --- /dev/null +++ b/include/trace/events/hist.h @@ -0,0 +1,69 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hist + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include "latency_hist.h" +#include + +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST) +#define trace_preemptirqsoff_hist(a,b) +#else +TRACE_EVENT(preemptirqsoff_hist, + + TP_PROTO(int reason, int starthist), + + TP_ARGS(reason, starthist), + + TP_STRUCT__entry( + __field(int, reason ) + __field(int, starthist ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->starthist = starthist; + ), + + TP_printk("reason=%s starthist=%s", getaction(__entry->reason), + __entry->starthist ? "start" : "stop") +); +#endif + +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST +#define trace_hrtimer_interrupt(a,b,c,d) +#else +TRACE_EVENT(hrtimer_interrupt, + + TP_PROTO(int cpu, long long offset, struct task_struct *curr, struct task_struct *task), + + TP_ARGS(cpu, offset, curr, task), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(long long, offset ) + __array(char, ccomm, TASK_COMM_LEN) + __field(int, cprio ) + __array(char, tcomm, TASK_COMM_LEN) + __field(int, tprio ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->offset = offset; + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN); + __entry->cprio = curr->prio; + memcpy(__entry->tcomm, task != NULL ? task->comm : "", task != NULL ? TASK_COMM_LEN : 7); + __entry->tprio = task != NULL ? 
task->prio : -1; + ), + + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]", + __entry->cpu, __entry->offset, __entry->ccomm, __entry->cprio, __entry->tcomm, __entry->tprio) +); +#endif + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h new file mode 100644 index 0000000..7f70794 --- /dev/null +++ b/include/trace/events/latency_hist.h @@ -0,0 +1,29 @@ +#ifndef _LATENCY_HIST_H +#define _LATENCY_HIST_H + +enum hist_action { + IRQS_ON, + PREEMPT_ON, + TRACE_STOP, + IRQS_OFF, + PREEMPT_OFF, + TRACE_START, +}; + +static char *actions[] = { + "IRQS_ON", + "PREEMPT_ON", + "TRACE_STOP", + "IRQS_OFF", + "PREEMPT_OFF", + "TRACE_START", +}; + +static inline char *getaction(int action) +{ + if (action >= 0 && action <= sizeof(actions)/sizeof(actions[0])) + return(actions[action]); + return("unknown"); +} + +#endif /* _LATENCY_HIST_H */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e4cee8d..05b3452 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -49,6 +49,7 @@ #include #include +#include /* * The timer bases: @@ -971,6 +972,17 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, #endif } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + { + ktime_t now = new_base->get_time(); + + if (ktime_to_ns(tim) < ktime_to_ns(now)) + timer->praecox = now; + else + timer->praecox = ktime_set(0, 0); + } +#endif + hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1247,6 +1259,8 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1290,6 +1304,15 @@ retry: timer = container_of(node, struct hrtimer, node); + trace_hrtimer_interrupt(raw_smp_processor_id(), + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? + timer->praecox : hrtimer_get_expires(timer), + basenow)), + current, + timer->function == hrtimer_wakeup ? + container_of(timer, struct hrtimer_sleeper, + timer)->task : NULL); + /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2747967..c872f5f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,6 +202,24 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) +config INTERRUPT_OFF_HIST + bool "Interrupts-off Latency Histogram" + depends on IRQSOFF_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with interrupts disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If PREEMPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/irqsoff + config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -224,6 +242,24 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) 
+config PREEMPT_OFF_HIST + bool "Preemption-off Latency Histogram" + depends on PREEMPT_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with preemption disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If INTERRUPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/preemptoff + config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -233,6 +269,74 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config WAKEUP_LATENCY_HIST + bool "Scheduling Latency Histogram" + depends on SCHED_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the scheduling latency of the highest priority task. + The histograms are disabled by default. To enable them, write a + non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/wakeup + + Two different algorithms are used, one to determine the latency of + processes that exclusively use the highest priority of the system and + another one to determine the latency of processes that share the + highest system priority with other processes. The former is used to + improve hardware and system software, the latter to optimize the + priority design of a given system. The histogram data will be + located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/wakeup + + and + + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + +config MISSED_TIMER_OFFSETS_HIST + depends on HIGH_RES_TIMERS + select GENERIC_TRACER + bool "Missed Timer Offsets Histogram" + help + Generate a histogram of missed timer offsets in microseconds. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets + + The histogram data will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. 
To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068..f5e0243 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c new file mode 100644 index 0000000..6a4c869 --- /dev/null +++ b/kernel/trace/latency_hist.c @@ -0,0 +1,1176 @@ +/* + * kernel/trace/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. + * Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include + +#define NSECS_PER_USECS 1000L + +#define CREATE_TRACE_POINTS +#include + +enum { + IRQSOFF_LATENCY = 0, + PREEMPTOFF_LATENCY, + PREEMPTIRQSOFF_LATENCY, + WAKEUP_LATENCY, + WAKEUP_LATENCY_SHAREDPRIO, + MISSED_TIMER_OFFSETS, + TIMERANDWAKEUP_LATENCY, + MAX_LATENCY_TYPE, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */ + long min_lat; + long max_lat; + unsigned long long below_hist_bound_samples; + unsigned long long above_hist_bound_samples; + long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +struct enable_data { + int latency_type; + int enabled; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist); +static char *irqsoff_hist_dir = "irqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_counting); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist); +static char *preemptoff_hist_dir = "preemptoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist); +static char *preemptirqsoff_hist_dir = "preemptirqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start); +static struct enable_data preemptirqsoff_enabled_data = { + .latency_type = PREEMPTIRQSOFF_LATENCY, + .enabled = 0, +}; 
+#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +struct maxlatproc_data { + char comm[FIELD_SIZEOF(struct task_struct, comm)]; + char current_comm[FIELD_SIZEOF(struct task_struct, comm)]; + int pid; + int current_pid; + int prio; + int current_prio; + long latency; + long timeroffset; + cycle_t timestamp; +}; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio); +static char *wakeup_latency_hist_dir = "wakeup"; +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio"; +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success); +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next); +static notrace void probe_sched_migrate_task(void *, + struct task_struct *task, int cpu); +static struct enable_data wakeup_latency_enabled_data = { + .latency_type = WAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc); +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio); +static DEFINE_PER_CPU(struct task_struct *, wakeup_task); +static DEFINE_PER_CPU(int, wakeup_sharedprio); +static unsigned long wakeup_pid; +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets); +static char *missed_timer_offsets_dir = "missed_timer_offsets"; +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long offset, struct task_struct *curr, struct task_struct *task); +static struct enable_data missed_timer_offsets_enabled_data = { + .latency_type = MISSED_TIMER_OFFSETS, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc); +static unsigned long missed_timer_offsets_pid; +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist); +static char *timerandwakeup_latency_hist_dir = "timerandwakeup"; +static struct enable_data timerandwakeup_enabled_data = { + .latency_type = TIMERANDWAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc); +#endif + +void notrace latency_hist(int latency_type, int cpu, long latency, + long timeroffset, cycle_t stop, + struct task_struct *p) +{ + struct hist_data *my_hist; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + + if (cpu < 0 || cpu >= NR_CPUS || latency_type < 0 || + latency_type >= MAX_LATENCY_TYPE) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + my_hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + my_hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + my_hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef 
CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + my_hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + + default: + return; + } + + latency += my_hist->offset; + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency < 0 || latency >= MAX_ENTRY_NUM) { + if (latency < 0) + my_hist->below_hist_bound_samples++; + else + my_hist->above_hist_bound_samples++; + } else + my_hist->hist_array[latency]++; + + if (unlikely(latency > my_hist->max_lat || + my_hist->min_lat == LONG_MAX)) { +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) { + strncpy(mp->comm, p->comm, sizeof(mp->comm)); + strncpy(mp->current_comm, current->comm, + sizeof(mp->current_comm)); + mp->pid = task_pid_nr(p); + mp->current_pid = task_pid_nr(current); + mp->prio = p->prio; + mp->current_prio = current->prio; + mp->latency = latency; + mp->timeroffset = timeroffset; + mp->timestamp = stop; + } +#endif + my_hist->max_lat = latency; + } + if (unlikely(latency < my_hist->min_lat)) + my_hist->min_lat = latency; + my_hist->total_samples++; + my_hist->accumulate_lat += latency; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = NULL; + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (index == 0) { + char minstr[32], avgstr[32], maxstr[32]; + + atomic_dec(&my_hist->hist_mode); + + if (likely(my_hist->total_samples)) { + long avg = (long) div64_s64(my_hist->accumulate_lat, + my_hist->total_samples); + snprintf(minstr, sizeof(minstr), "%ld", + my_hist->min_lat - my_hist->offset); + snprintf(avgstr, sizeof(avgstr), "%ld", + avg - my_hist->offset); + snprintf(maxstr, sizeof(maxstr), "%ld", + my_hist->max_lat - my_hist->offset); + } else { + strcpy(minstr, ""); + strcpy(avgstr, minstr); + strcpy(maxstr, minstr); + } + + seq_printf(m, "#Minimum latency: %s microseconds\n" + "#Average latency: %s microseconds\n" + "#Maximum latency: %s microseconds\n" + "#Total samples: %llu\n" + "#There are %llu samples lower than %ld" + " microseconds.\n" + "#There are %llu samples greater or equal" + " than %ld microseconds.\n" + "#usecs\t%16s\n", + minstr, avgstr, maxstr, + my_hist->total_samples, + my_hist->below_hist_bound_samples, + -my_hist->offset, + my_hist->above_hist_bound_samples, + MAX_ENTRY_NUM - my_hist->offset, + "samples"); + } + if (index < MAX_ENTRY_NUM) { + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + if (index_ptr) + *index_ptr = index; + } + + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset, + my_hist->hist_array[index]); + return 0; +} + +static 
struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static void clear_maxlatprocdata(struct maxlatproc_data *mp) +{ + mp->comm[0] = mp->current_comm[0] = '\0'; + mp->prio = mp->current_prio = mp->pid = mp->current_pid = + mp->latency = mp->timeroffset = -1; + mp->timestamp = 0; +} +#endif + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->below_hist_bound_samples = 0ULL; + hist->above_hist_bound_samples = 0ULL; + hist->min_lat = LONG_MAX; + hist->max_lat = LONG_MIN; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0LL; + + atomic_inc(&hist->hist_mode); +} + +static ssize_t +latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist = NULL; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + off_t latency_type = (off_t) file->private_data; + + for_each_online_cpu(cpu) { + + switch (latency_type) { +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + } + + hist_reset(hist); +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) + clear_maxlatprocdata(mp); +#endif + } + + return size; +} + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + unsigned long *this_pid = file->private_data; + + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t do_pid(struct file *file, const char 
__user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + unsigned long pid; + unsigned long *this_pid = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + if (strict_strtoul(buf, 10, &pid)) + return(-EINVAL); + + *this_pid = pid; + + return cnt; +} +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int r; + struct maxlatproc_data *mp = file->private_data; + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8); + unsigned long long t; + unsigned long usecs, secs; + char *buf; + + if (mp->pid == -1 || mp->current_pid == -1) { + buf = "(none)\n"; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, + strlen(buf)); + } + + buf = kmalloc(strmaxlen, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + t = ns2usecs(mp->timestamp); + usecs = do_div(t, USEC_PER_SEC); + secs = (unsigned long) t; + r = snprintf(buf, strmaxlen, + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid, + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm, + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm, + secs, usecs); + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return r; +} +#endif + +static ssize_t +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + struct enable_data *ed = file->private_data; + int r; + + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + long enable; + struct enable_data *ed = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strict_strtol(buf, 10, &enable)) + return(-EINVAL); + + if ((enable && ed->enabled) || (!enable && !ed->enabled)) + return cnt; + + if (enable) { + int ret; + + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + ret = register_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_preemptirqsoff_hist " + "to trace_preemptirqsoff_hist\n"); + return ret; + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + ret = register_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup\n"); + return ret; + } + ret = register_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup_new\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_stop " + "to trace_sched_switch\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + 
if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_sched_migrate_task " + "to trace_sched_migrate_task\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + return ret; + } + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + ret = register_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_hrtimer_interrupt " + "to trace_hrtimer_interrupt\n"); + return ret; + } + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + if (!wakeup_latency_enabled_data.enabled || + !missed_timer_offsets_enabled_data.enabled) + return -EINVAL; + break; +#endif + default: + break; + } + } else { + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + { + int cpu; + + unregister_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + for_each_online_cpu(cpu) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + per_cpu(hist_irqsoff_counting, + cpu) = 0; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + per_cpu(hist_preemptoff_counting, + cpu) = 0; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + per_cpu(hist_preemptirqsoff_counting, + cpu) = 0; +#endif + } + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + { + int cpu; + + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + unregister_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + + for_each_online_cpu(cpu) { + per_cpu(wakeup_task, cpu) = NULL; + per_cpu(wakeup_sharedprio, cpu) = 0; + } + } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + unregister_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif + default: + break; + } + } + ed->enabled = enable; + return cnt; +} + +static const struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +static const struct file_operations enable_fops = { + .open = tracing_open_generic, + .read = show_enable, + .write = do_enable, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static const struct file_operations pid_fops = { + .open = tracing_open_generic, + .read = show_pid, + .write = do_pid, +}; + +static const struct file_operations maxlatproc_fops = { + .open = tracing_open_generic, + .read = show_maxlatproc, +}; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(void *v, int reason, + int starthist) +{ + int cpu = raw_smp_processor_id(); + int time_set = 0; + + if (starthist) { + cycle_t uninitialized_var(start); + + if (!preempt_count() && !irqs_disabled()) + return; + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_OFF || reason == TRACE_START) && + !per_cpu(hist_irqsoff_counting, 
cpu)) { + per_cpu(hist_irqsoff_counting, cpu) = 1; + start = ftrace_now(cpu); + time_set++; + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_OFF || reason == TRACE_START) && + !per_cpu(hist_preemptoff_counting, cpu)) { + per_cpu(hist_preemptoff_counting, cpu) = 1; + if (!(time_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (per_cpu(hist_irqsoff_counting, cpu) && + per_cpu(hist_preemptoff_counting, cpu) && + !per_cpu(hist_preemptirqsoff_counting, cpu)) { + per_cpu(hist_preemptirqsoff_counting, cpu) = 1; + if (!time_set) + start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif + } else { + cycle_t uninitialized_var(stop); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_ON || reason == TRACE_STOP) && + per_cpu(hist_irqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_irqsoff_start, cpu); + + stop = ftrace_now(cpu); + time_set++; + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0, + stop, NULL); + } + per_cpu(hist_irqsoff_counting, cpu) = 0; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_ON || reason == TRACE_STOP) && + per_cpu(hist_preemptoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptoff_start, cpu); + + if (!(time_set++)) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTOFF_LATENCY, cpu, latency, + 0, stop, NULL); + } + per_cpu(hist_preemptoff_counting, cpu) = 0; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if ((!per_cpu(hist_irqsoff_counting, cpu) || + !per_cpu(hist_preemptoff_counting, cpu)) && + per_cpu(hist_preemptirqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu); + + if (!time_set) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, + latency, 0, stop, NULL); + } + per_cpu(hist_preemptirqsoff_counting, cpu) = 0; + } +#endif + } +} +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_RAW_SPINLOCK(wakeup_lock); +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task, + int cpu) +{ + int old_cpu = task_cpu(task); + + if (cpu != old_cpu) { + unsigned long flags; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu); + if (task == cpu_wakeup_task) { + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, old_cpu) = NULL; + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task; + get_task_struct(cpu_wakeup_task); + } + + raw_spin_unlock_irqrestore(&wakeup_lock, flags); + } +} + +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success) +{ + unsigned long flags; + struct task_struct *curr = current; + int cpu = task_cpu(p); + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (wakeup_pid) { + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + if (likely(wakeup_pid != task_pid_nr(p))) + goto out; + } else { + if (likely(!rt_task(p)) || + (cpu_wakeup_task && p->prio > 
cpu_wakeup_task->prio) || + p->prio > curr->prio) + goto out; + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + } + + if (cpu_wakeup_task) + put_task_struct(cpu_wakeup_task); + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p; + get_task_struct(cpu_wakeup_task); + cpu_wakeup_task->preempt_timestamp_hist = + ftrace_now(raw_smp_processor_id()); +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long flags; + int cpu = task_cpu(next); + long latency; + cycle_t stop; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (cpu_wakeup_task == NULL) + goto out; + + /* Already running? */ + if (unlikely(current == cpu_wakeup_task)) + goto out_reset; + + if (next != cpu_wakeup_task) { + if (next->prio < cpu_wakeup_task->prio) + goto out_reset; + + if (next->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + goto out; + } + + if (current->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + /* + * The task we are waiting for is about to be switched to. + * Calculate latency and store it in histogram. + */ + stop = ftrace_now(raw_smp_processor_id()); + + latency = ((long) (stop - next->preempt_timestamp_hist)) / + NSECS_PER_USECS; + + if (per_cpu(wakeup_sharedprio, cpu)) { + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop, + next); + per_cpu(wakeup_sharedprio, cpu) = 0; + } else { + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next); +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + if (timerandwakeup_enabled_data.enabled) { + latency_hist(TIMERANDWAKEUP_LATENCY, cpu, + next->timer_offset + latency, next->timer_offset, + stop, next); + } +#endif + } + +out_reset: +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + next->timer_offset = 0; +#endif + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, cpu) = NULL; +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long latency_ns, struct task_struct *curr, struct task_struct *task) +{ + if (latency_ns <= 0 && task != NULL && rt_task(task) && + (task->prio < curr->prio || + (task->prio == curr->prio && + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) { + long latency; + cycle_t now; + + if (missed_timer_offsets_pid) { + if (likely(missed_timer_offsets_pid != + task_pid_nr(task))) + return; + } + + now = ftrace_now(cpu); + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS); + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now, + task); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + task->timer_offset = latency; +#endif + } +} +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + struct dentry *dentry_sharedprio; +#endif + struct dentry *entry; + struct dentry *enable_root; + int i = 0; + struct hist_data *my_hist; + char name[64]; + char *cpufmt = "CPU%d"; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + char *cpufmt_maxlatproc = "max_latency-CPU%d"; + struct maxlatproc_data *mp = NULL; +#endif + + dentry = tracing_init_dentry(); + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry); + enable_root = 
debugfs_create_dir("enable", latency_hist_root); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(irqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(irqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preemptoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preemptirqsoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptirqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + entry = debugfs_create_file("preemptirqsoff", 0644, + enable_root, (void *)&preemptirqsoff_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + dentry_sharedprio = debugfs_create_dir( + wakeup_latency_hist_dir_sharedprio, dentry); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + entry = debugfs_create_file(name, 0444, dentry_sharedprio, + &per_cpu(wakeup_latency_hist_sharedprio, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + + mp = &per_cpu(wakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i); + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&wakeup_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("reset", 0644, dentry_sharedprio, + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops); + entry = debugfs_create_file("wakeup", 0644, + enable_root, (void *)&wakeup_latency_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + dentry = debugfs_create_dir(missed_timer_offsets_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = 
debugfs_create_file(name, 0444, dentry, + &per_cpu(missed_timer_offsets, i), &latency_hist_fops); + my_hist = &per_cpu(missed_timer_offsets, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(missed_timer_offsets_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&missed_timer_offsets_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops); + entry = debugfs_create_file("missed_timer_offsets", 0644, + enable_root, (void *)&missed_timer_offsets_enabled_data, + &enable_fops); +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(timerandwakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(timerandwakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(timerandwakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("timerandwakeup", 0644, + enable_root, (void *)&timerandwakeup_enabled_data, + &enable_fops); +#endif + return 0; +} + +__initcall(latency_hist_init); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 443b25b..f283bd0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,6 +17,7 @@ #include #include "trace.h" +#include static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -438,11 +439,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -452,6 +455,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -460,6 +464,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -485,6 +490,7 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -494,11 +500,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) 
stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -508,6 +516,7 @@ void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -517,12 +526,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 1); if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } -- cgit v0.10.2 From 29c02743e3be15147919e5e0483a52cc976dcf1b Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:53:12 +0100 Subject: hwlatdetect.patch Jon Masters developed this wonderful SMI detector. For details please consult Documentation/hwlat_detector.txt. It could be ported to Linux 3.0 RT without any major change. Signed-off-by: Carsten Emde diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt new file mode 100644 index 0000000..cb61516 --- /dev/null +++ b/Documentation/hwlat_detector.txt @@ -0,0 +1,64 @@ +Introduction: +------------- + +The module hwlat_detector is a special purpose kernel module that is used to +detect large system latencies induced by the behavior of certain underlying +hardware or firmware, independent of Linux itself. The code was developed +originally to detect SMIs (System Management Interrupts) on x86 systems, +however there is nothing x86 specific about this patchset. It was +originally written for use by the "RT" patch since the Real Time +kernel is highly latency sensitive. + +SMIs are usually not serviced by the Linux kernel, which typically does not +even know that they are occurring. SMIs are instead set up by BIOS code +and are serviced by BIOS code, usually for "critical" events such as +management of thermal sensors and fans. Sometimes though, SMIs are used for +other tasks and those tasks can spend an inordinate amount of time in the +handler (sometimes measured in milliseconds). Obviously this is a problem if +you are trying to keep event service latencies down in the microsecond range. + +The hardware latency detector works by hogging all of the cpus for configurable +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter +for some period, then looking for gaps in the TSC data. Any gap indicates a +time when the polling was interrupted and since the machine is stopped and +interrupts turned off the only thing that could do that would be an SMI. + +Note that the SMI detector should *NEVER* be used in a production environment. +It is intended to be run manually to determine if the hardware platform has a +problem with long system firmware service routines. + +Usage: +------ + +Loading the module hwlat_detector passing the parameter "enabled=1" (or by +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only +step required to start the hwlat_detector. It is possible to redefine the +threshold in microseconds (us) above which latency spikes will be taken +into account (parameter "threshold="). + +Example: + + # modprobe hwlat_detector enabled=1 threshold=100 + +After the module is loaded, it creates a directory named "hwlat_detector" under +the debugfs mountpoint, "/debug/hwlat_detector" for this text.
It is necessary +to have debugfs mounted, which might be on /sys/debug on your system. + +The /debug/hwlat_detector interface contains the following files: + +count - number of latency spikes observed since last reset +enable - a global enable/disable toggle (0/1), resets count +max - maximum hardware latency actually observed (usecs) +sample - a pipe from which to read current raw sample data + in the format + (can be opened O_NONBLOCK for a single sample) +threshold - minimum latency value to be considered (usecs) +width - time period to sample with CPUs held (usecs) + must be less than the total window size (enforced) +window - total period of sampling, width being inside (usecs) + +By default we will set width to 500,000 and window to 1,000,000, meaning that +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we +observe any latencies that exceed the threshold (initially 100 usecs), +then we write to a global sample ring buffer of 8K samples, which is +consumed by reading from the "sample" (pipe) debugfs file interface. diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index e94f19d..25ffcfd 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -121,6 +121,35 @@ config IBM_ASM for information on the specific driver level and support statement for your IBM server. +config HWLAT_DETECTOR + tristate "Testing module to detect hardware-induced latencies" + depends on DEBUG_FS + depends on RING_BUFFER + default m + ---help--- + A simple hardware latency detector. Use this module to detect + large latencies introduced by the behavior of the underlying + system firmware external to Linux. We do this using periodic + use of stop_machine to grab all available CPUs and measure + for unexplainable gaps in the CPU timestamp counter(s). By + default, the module is not enabled until the "enable" file + within the "hwlat_detector" debugfs directory is toggled. + + This module is often used to detect SMI (System Management + Interrupts) on x86 systems, though is not x86 specific. To + this end, we default to using a sample window of 1 second, + during which we will sample for 0.5 seconds. If an SMI or + similar event occurs during that time, it is recorded + into an 8K samples global ring buffer until retreived. + + WARNING: This software should never be enabled (it can be built + but should not be turned on after it is loaded) in a production + environment where high latencies are a concern since the + sampling mechanism actually introduces latencies for + regular tasks while the CPU(s) are being held. + + If unsure, say N + config PHANTOM tristate "Sensable PHANToM (PCI)" depends on PCI diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 2129377..ec6ee3f 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -49,3 +49,4 @@ obj-y += carma/ obj-$(CONFIG_USB_SWITCH_FSA9480) += fsa9480.o obj-$(CONFIG_ALTERA_STAPL) +=altera-stapl/ obj-$(CONFIG_INTEL_MEI) += mei/ +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c new file mode 100644 index 0000000..b7b7c90 --- /dev/null +++ b/drivers/misc/hwlat_detector.c @@ -0,0 +1,1212 @@ +/* + * hwlat_detector.c - A simple Hardware Latency detector. + * + * Use this module to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. 
+ * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this module is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this module in a production environment + * requiring any kind of low-latency performance guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * + * Includes useful feedback from Clark Williams + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ +#define U64STR_SIZE 22 /* 20 digits max */ + +#define VERSION "1.0.0" +#define BANNER "hwlat_detector: " +#define DRVNAME "hwlat_detector" +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* Module metadata */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jon Masters "); +MODULE_DESCRIPTION("A simple hardware latency detector"); +MODULE_VERSION(VERSION); + +/* Module parameters */ + +static int debug; +static int enabled; +static int threshold; + +module_param(debug, int, 0); /* enable debug */ +module_param(enabled, int, 0); /* enable detector */ +module_param(threshold, int, 0); /* latency threshold */ + +/* Buffering and sampling */ + +static struct ring_buffer *ring_buffer; /* sample buffer */ +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */ +static unsigned long buf_size = BUF_SIZE_DEFAULT; +static struct task_struct *kthread; /* sampling thread */ + +/* DebugFS filesystem entries */ + +static struct dentry *debug_dir; /* debugfs directory */ +static struct dentry *debug_max; /* maximum TSC delta */ +static struct dentry *debug_count; /* total detect count */ +static struct dentry *debug_sample_width; /* sample width us */ +static struct dentry *debug_sample_window; /* sample window us */ +static struct dentry *debug_sample; /* raw samples us */ +static struct dentry *debug_threshold; /* threshold us */ +static struct dentry *debug_enable; /* enable/disable */ + +/* Individual samples and global state */ + +struct sample; /* latency sample */ +struct data; /* Global state */ + +/* Sampling functions */ +static int __buffer_add_sample(struct sample *sample); +static struct sample *buffer_get_sample(struct sample *sample); +static int get_sample(void *unused); + +/* Threading and state */ +static int kthread_fn(void *unused); +static int start_kthread(void); +static int stop_kthread(void); +static void __reset_stats(void); +static int init_stats(void); + +/* Debugfs interface */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry); +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry); +static int debug_sample_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static int debug_sample_release(struct inode *inode, struct file *filp); +static int debug_enable_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static ssize_t debug_enable_fwrite(struct file *file, + const char __user *user_buffer, + size_t user_size, loff_t *offset); + +/* Initialization functions */ +static int init_debugfs(void); +static void free_debugfs(void); +static int detector_init(void); +static void detector_exit(void); + +/* Individual latency samples are stored here when detected and packed into + * the ring_buffer circular buffer, where they are overwritten when + * more than buf_size/sizeof(sample) samples are received. 
*/ +struct sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* ktime delta */ + struct timespec timestamp; /* wall time */ + unsigned long lost; +}; + +/* keep the global state somewhere. Mostly used under stop_machine. */ +static struct data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + u64 max_sample; /* max hardware latency */ + u64 threshold; /* sample threshold level */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + atomic_t sample_open; /* whether the sample file is open */ + + wait_queue_head_t wq; /* waitqeue for new sample values */ + +} data; + +/** + * __buffer_add_sample - add a new latency sample recording to the ring buffer + * @sample: The new latency sample value + * + * This receives a new latency sample and records it in a global ring buffer. + * No additional locking is used in this case - suited for stop_machine use. + */ +static int __buffer_add_sample(struct sample *sample) +{ + return ring_buffer_write(ring_buffer, + sizeof(struct sample), sample); +} + +/** + * buffer_get_sample - remove a hardware latency sample from the ring buffer + * @sample: Pre-allocated storage for the sample + * + * This retrieves a hardware latency sample from the global circular buffer + */ +static struct sample *buffer_get_sample(struct sample *sample) +{ + struct ring_buffer_event *e = NULL; + struct sample *s = NULL; + unsigned int cpu = 0; + + if (!sample) + return NULL; + + mutex_lock(&ring_buffer_mutex); + for_each_online_cpu(cpu) { + e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost); + if (e) + break; + } + + if (e) { + s = ring_buffer_event_data(e); + memcpy(sample, s, sizeof(struct sample)); + } else + sample = NULL; + mutex_unlock(&ring_buffer_mutex); + + return sample; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * @unused: This is not used but is a part of the stop_machine API + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called under stop_machine, with data.lock held. + */ +static int get_sample(void *unused) +{ + ktime_t start, t1, t2; + s64 diff, total = 0; + u64 sample = 0; + int ret = 1; + + start = ktime_get(); /* start timestamp */ + + do { + + t1 = ktime_get(); /* we'll look for a discontinuity */ + t2 = ktime_get(); + + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + printk(KERN_ERR BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= data.sample_width); + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > data.threshold) { + struct sample s; + + data.count++; + s.seqnum = data.count; + s.duration = sample; + s.timestamp = CURRENT_TIME; + __buffer_add_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > data.max_sample) + data.max_sample = sample; + } + + ret = 0; +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * @unused: A required part of the kthread API. + * + * Used to periodically sample the CPU TSC via a call to get_sample. 
We + * use stop_machine, whith does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus pre-empting). + * Obviously this should never be used in production environments. + * + * stop_machine will schedule us typically only on CPU0 which is fine for + * almost every real-world hardware latency situation - but we might later + * generalize this if we find there are any actualy systems with alternate + * SMI delivery or other non CPU0 hardware latencies. + */ +static int kthread_fn(void *unused) +{ + int err = 0; + u64 interval = 0; + + while (!kthread_should_stop()) { + + mutex_lock(&data.lock); + + err = stop_machine(get_sample, unused, 0); + if (err) { + /* Houston, we have a problem */ + mutex_unlock(&data.lock); + goto err_out; + } + + wake_up(&data.wq); /* wake up reader(s) */ + + interval = data.sample_window - data.sample_width; + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + mutex_unlock(&data.lock); + + if (msleep_interruptible(interval)) + goto out; + } + goto out; +err_out: + printk(KERN_ERR BANNER "could not call stop_machine, disabling\n"); + enabled = 0; +out: + return err; + +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts a kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(void) +{ + kthread = kthread_run(kthread_fn, NULL, + DRVNAME); + if (IS_ERR(kthread)) { + printk(KERN_ERR BANNER "could not start sampling thread\n"); + enabled = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static int stop_kthread(void) +{ + int ret; + + ret = kthread_stop(kthread); + + return ret; +} + +/** + * __reset_stats - Reset statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We call this + * function in order to reset those when "enable" is toggled on or off, and + * also at initialization. Should be called with data.lock held. + */ +static void __reset_stats(void) +{ + data.count = 0; + data.max_sample = 0; + ring_buffer_reset(ring_buffer); /* flush out old sample entries */ +} + +/** + * init_stats - Setup global state statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We also use + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware + * induced system latencies. This function initializes these structures and + * allocates the global ring buffer also. 
+ */ +static int init_stats(void) +{ + int ret = -ENOMEM; + + mutex_init(&data.lock); + init_waitqueue_head(&data.wq); + atomic_set(&data.sample_open, 0); + + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS); + + if (WARN(!ring_buffer, KERN_ERR BANNER + "failed to allocate ring buffer!\n")) + goto out; + + __reset_stats(); + data.threshold = DEFAULT_LAT_THRESHOLD; /* threshold us */ + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ + + ret = 0; + +out: + return ret; + +} + +/* + * simple_data_read - Wrapper read function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * @entry: The entry to read from + * + * This function provides a generic read implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_read directly, but we need to make sure that the data.lock + * spinlock is held during the actual read (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry) +{ + char buf[U64STR_SIZE]; + u64 val = 0; + int len = 0; + + memset(buf, 0, sizeof(buf)); + + if (!entry) + return -EFAULT; + + mutex_lock(&data.lock); + val = *entry; + mutex_unlock(&data.lock); + + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); + +} + +/* + * simple_data_write - Wrapper write function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to write value from + * @cnt: The maximum number of bytes to write + * @ppos: The current "file" position + * @entry: The entry to write to + * + * This function provides a generic write implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_write directly, but we need to make sure that the data.lock + * spinlock is held during the actual write (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (err) + return -EINVAL; + + mutex_lock(&data.lock); + *entry = val; + mutex_unlock(&data.lock); + + return csize; +} + +/** + * debug_count_fopen - Open function for "count" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "count" debugfs + * interface to the hardware latency detector. 
+ */ +static int debug_count_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_count_fread - Read function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to read the + * number of latency readings exceeding the configured threshold since + * the detector was last reset (e.g. by writing a zero into "count"). + */ +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_count_fwrite - Write function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to write a + * desired value, especially to zero the total count. + */ +static ssize_t debug_count_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_enable_fopen - Dummy open function for "enable" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "enable" debugfs + * interface to the hardware latency detector. + */ +static int debug_enable_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_enable_fread - Read function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to determine + * whether the detector is currently enabled ("0\n" or "1\n" returned). + */ +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[4]; + + if ((cnt < sizeof(buf)) || (*ppos)) + return 0; + + buf[0] = enabled ? '1' : '0'; + buf[1] = '\n'; + buf[2] = '\0'; + if (copy_to_user(ubuf, buf, strlen(buf))) + return -EFAULT; + return *ppos = strlen(buf); +} + +/** + * debug_enable_fwrite - Write function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to enable or + * disable the detector, which will have the side-effect of possibly + * also resetting the global stats and kicking off the measuring + * kthread (on an enable) or the converse (upon a disable). 
+ */ +static ssize_t debug_enable_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[4]; + int csize = min(cnt, sizeof(buf)); + long val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[sizeof(buf)-1] = '\0'; /* just in case */ + err = strict_strtoul(buf, 10, &val); + if (0 != err) + return -EINVAL; + + if (val) { + if (enabled) + goto unlock; + enabled = 1; + __reset_stats(); + if (start_kthread()) + return -EFAULT; + } else { + if (!enabled) + goto unlock; + enabled = 0; + err = stop_kthread(); + if (err) { + printk(KERN_ERR BANNER "cannot stop kthread\n"); + return -EFAULT; + } + wake_up(&data.wq); /* reader(s) should return */ + } +unlock: + return csize; +} + +/** + * debug_max_fopen - Open function for "max" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "max" debugfs + * interface to the hardware latency detector. + */ +static int debug_max_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_max_fread - Read function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to determine + * the maximum latency value observed since it was last reset. + */ +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample); +} + +/** + * debug_max_fwrite - Write function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to reset the + * maximum or set it to some other desired value - if, then, subsequent + * measurements exceed this value, the maximum will be updated. + */ +static ssize_t debug_max_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample); +} + + +/** + * debug_sample_fopen - An open function for "sample" debugfs interface + * @inode: The in-kernel inode representation of this debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function handles opening the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. Can be opened blocking or non-blocking, + * affecting whether it behaves as a buffer read pipe, or does not. + * Implements simple locking to prevent multiple simultaneous use. 
+ */ +static int debug_sample_fopen(struct inode *inode, struct file *filp) +{ + if (!atomic_add_unless(&data.sample_open, 1, 1)) + return -EBUSY; + else + return 0; +} + +/** + * debug_sample_fread - A read function for "sample" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that will contain the samples read + * @cnt: The maximum bytes to read from the debugfs "file" + * @ppos: The current position in the debugfs "file" + * + * This function handles reading from the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. By default this will block pending a new + * value written into the sample buffer, unless there are already a + * number of value(s) waiting in the buffer, or the sample file was + * previously opened in a non-blocking mode of operation. + */ +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int len = 0; + char buf[64]; + struct sample *sample = NULL; + + if (!enabled) + return 0; + + sample = kzalloc(sizeof(struct sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + while (!buffer_get_sample(sample)) { + + DEFINE_WAIT(wait); + + if (filp->f_flags & O_NONBLOCK) { + len = -EAGAIN; + goto out; + } + + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&data.wq, &wait); + + if (signal_pending(current)) { + len = -EINTR; + goto out; + } + + if (!enabled) { /* enable was toggled */ + len = 0; + goto out; + } + } + + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration); + + + /* handling partial reads is more trouble than it's worth */ + if (len > cnt) + goto out; + + if (copy_to_user(ubuf, buf, len)) + len = -EFAULT; + +out: + kfree(sample); + return len; +} + +/** + * debug_sample_release - Release function for "sample" debugfs interface + * @inode: The in-kernel inode represenation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function completes the close of the debugfs interface "sample" file. + * Frees the sample_open "lock" so that other users may open the interface. + */ +static int debug_sample_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&data.sample_open); + + return 0; +} + +/** + * debug_threshold_fopen - Open function for "threshold" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "threshold" debugfs + * interface to the hardware latency detector. + */ +static int debug_threshold_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_threshold_fread - Read function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to determine + * the current threshold level at which a latency will be recorded in the + * global ring buffer, typically on the order of 10us. 
+ */ +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold); +} + +/** + * debug_threshold_fwrite - Write function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to configure + * the threshold level at which any subsequently detected latencies will + * be recorded into the global ring buffer. + */ +static ssize_t debug_threshold_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + int ret; + + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold); + + if (enabled) + wake_up_process(kthread); + + return ret; +} + +/** + * debug_width_fopen - Open function for "width" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "width" debugfs + * interface to the hardware latency detector. + */ +static int debug_width_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_width_fread - Read function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to determine + * for how many us of the total window us we will actively sample for any + * hardware-induced latecy periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. + */ +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width); +} + +/** + * debug_width_fwrite - Write function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less that the total window size. 
+ */ +static ssize_t debug_width_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (val < data.sample_window) + data.sample_width = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + if (enabled) + wake_up_process(kthread); + + return csize; +} + +/** + * debug_window_fopen - Open function for "window" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. + */ +static int debug_window_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_window_fread - Read function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to read the total window size. + */ +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window); +} + +/** + * debug_window_fwrite - Write function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "window" debufds + * interface to the hardware latency detetector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enfoced that any value written must be greater than the sample width + * size, or an error results. 
+ */ +static ssize_t debug_window_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (data.sample_width < val) + data.sample_window = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + return csize; +} + +/* + * Function pointers for the "count" debugfs file operations + */ +static const struct file_operations count_fops = { + .open = debug_count_fopen, + .read = debug_count_fread, + .write = debug_count_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "enable" debugfs file operations + */ +static const struct file_operations enable_fops = { + .open = debug_enable_fopen, + .read = debug_enable_fread, + .write = debug_enable_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "max" debugfs file operations + */ +static const struct file_operations max_fops = { + .open = debug_max_fopen, + .read = debug_max_fread, + .write = debug_max_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "sample" debugfs file operations + */ +static const struct file_operations sample_fops = { + .open = debug_sample_fopen, + .read = debug_sample_fread, + .release = debug_sample_release, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "threshold" debugfs file operations + */ +static const struct file_operations threshold_fops = { + .open = debug_threshold_fopen, + .read = debug_threshold_fread, + .write = debug_threshold_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "width" debugfs file operations + */ +static const struct file_operations width_fops = { + .open = debug_width_fopen, + .read = debug_width_fread, + .write = debug_width_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "window" debugfs file operations + */ +static const struct file_operations window_fops = { + .open = debug_window_fopen, + .read = debug_window_fread, + .write = debug_window_fwrite, + .owner = THIS_MODULE, +}; + +/** + * init_debugfs - A function to initialize the debugfs interface files + * + * This function creates entries in debugfs for "hwlat_detector", including + * files to read values from the detector, current samples, and the + * maximum sample that has been captured since the hardware latency + * dectector was started. 
+ */ +static int init_debugfs(void) +{ + int ret = -ENOMEM; + + debug_dir = debugfs_create_dir(DRVNAME, NULL); + if (!debug_dir) + goto err_debug_dir; + + debug_sample = debugfs_create_file("sample", 0444, + debug_dir, NULL, + &sample_fops); + if (!debug_sample) + goto err_sample; + + debug_count = debugfs_create_file("count", 0444, + debug_dir, NULL, + &count_fops); + if (!debug_count) + goto err_count; + + debug_max = debugfs_create_file("max", 0444, + debug_dir, NULL, + &max_fops); + if (!debug_max) + goto err_max; + + debug_sample_window = debugfs_create_file("window", 0644, + debug_dir, NULL, + &window_fops); + if (!debug_sample_window) + goto err_window; + + debug_sample_width = debugfs_create_file("width", 0644, + debug_dir, NULL, + &width_fops); + if (!debug_sample_width) + goto err_width; + + debug_threshold = debugfs_create_file("threshold", 0644, + debug_dir, NULL, + &threshold_fops); + if (!debug_threshold) + goto err_threshold; + + debug_enable = debugfs_create_file("enable", 0644, + debug_dir, &enabled, + &enable_fops); + if (!debug_enable) + goto err_enable; + + else { + ret = 0; + goto out; + } + +err_enable: + debugfs_remove(debug_threshold); +err_threshold: + debugfs_remove(debug_sample_width); +err_width: + debugfs_remove(debug_sample_window); +err_window: + debugfs_remove(debug_max); +err_max: + debugfs_remove(debug_count); +err_count: + debugfs_remove(debug_sample); +err_sample: + debugfs_remove(debug_dir); +err_debug_dir: +out: + return ret; +} + +/** + * free_debugfs - A function to cleanup the debugfs file interface + */ +static void free_debugfs(void) +{ + /* could also use a debugfs_remove_recursive */ + debugfs_remove(debug_enable); + debugfs_remove(debug_threshold); + debugfs_remove(debug_sample_width); + debugfs_remove(debug_sample_window); + debugfs_remove(debug_max); + debugfs_remove(debug_count); + debugfs_remove(debug_sample); + debugfs_remove(debug_dir); +} + +/** + * detector_init - Standard module initialization code + */ +static int detector_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO BANNER "version %s\n", VERSION); + + ret = init_stats(); + if (0 != ret) + goto out; + + ret = init_debugfs(); + if (0 != ret) + goto err_stats; + + if (enabled) + ret = start_kthread(); + + goto out; + +err_stats: + ring_buffer_free(ring_buffer); +out: + return ret; + +} + +/** + * detector_exit - Standard module cleanup code + */ +static void detector_exit(void) +{ + int err; + + if (enabled) { + enabled = 0; + err = stop_kthread(); + if (err) + printk(KERN_ERR BANNER "cannot stop kthread\n"); + } + + free_debugfs(); + ring_buffer_free(ring_buffer); /* free up the ring buffer */ + +} + +module_init(detector_init); +module_exit(detector_exit); -- cgit v0.10.2 From 28cb35c03f157d04d4404e0dde2441059d6289e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 20:25:16 +0200 Subject: localversion.patch Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8vdw4bfcsds27cvox6rpb334@git.kernel.org diff --git a/localversion-rt b/localversion-rt new file mode 100644 index 0000000..8fc605d --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ +-rt6 -- cgit v0.10.2 From a8fdc2b68b4bf34175e97f56cc9810ff86af8243 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Jul 2011 17:58:40 +0200 Subject: printk-kill.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/printk.h b/include/linux/printk.h index 8111ffa..812d102 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -99,9 +99,11 @@ int 
no_printk(const char *fmt, ...) extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); void early_vprintk(const char *fmt, va_list ap); +extern void printk_kill(void); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } +static inline void printk_kill(void) { } #endif extern int printk_needs_cpu(int cpu); @@ -138,7 +140,6 @@ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); - extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; diff --git a/kernel/printk.c b/kernel/printk.c index 69270a6..d429e73 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -770,6 +770,32 @@ asmlinkage void early_printk(const char *fmt, ...) early_vprintk(fmt, ap); va_end(ap); } + +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. + */ +static bool __read_mostly printk_killswitch; + +void printk_kill(void) +{ + printk_killswitch = true; +} + +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; +} +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} #endif #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -1511,6 +1537,13 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; + /* + * Fall back to early_printk if a debugging subsystem has + * killed printk output + */ + if (unlikely(forced_early_printk(fmt, args))) + return 1; + boot_delay_msec(level); printk_delay(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3..608d2fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -206,6 +206,8 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -240,10 +242,19 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + /* + * If early-printk is enabled then make sure we do not + * lock up in printk() and kill console logging: + */ + printk_kill(); + + if (hardlockup_panic) { panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { + raw_spin_lock(&watchdog_output_lock); WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + raw_spin_unlock(&watchdog_output_lock); + } __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v0.10.2 From e344d25f055bb2822780916776de691919996cef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:33 +0200 Subject: printk: 'force_early_printk' boot param to help with debugging Gives me an option to screw printk and actually see what the machine says. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967289.1301.11.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-ykb97nsfmobq44xketrxs977@git.kernel.org diff --git a/kernel/printk.c b/kernel/printk.c index d429e73..98aad49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -779,6 +779,13 @@ asmlinkage void early_printk(const char *fmt, ...) 
*/ static bool __read_mostly printk_killswitch; +static int __init force_early_printk_setup(char *str) +{ + printk_killswitch = true; + return 0; +} +early_param("force_early_printk", force_early_printk_setup); + void printk_kill(void) { printk_killswitch = true; -- cgit v0.10.2 From 270cc42a5c5f12f5dffefad027ab8aa94aed1cfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 12:39:57 +0200 Subject: rt-preempt-base-config.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c974..885efbd 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,10 @@ +config PREEMPT + bool + select PREEMPT_COUNT + +config PREEMPT_RT_BASE + bool + select PREEMPT choice prompt "Preemption Model" @@ -33,9 +40,9 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT__LL bool "Preemptible Kernel (Low-Latency Desktop)" - select PREEMPT_COUNT + select PREEMPT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -52,6 +59,14 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. +config PREEMPT_RTB + bool "Preemptible Kernel (Basic RT)" + select PREEMPT_RT_BASE + help + This option is basically the same as (Low-Latency Desktop) but + enables changes which are preliminary for the full preemptiple + RT kernel. + endchoice config PREEMPT_COUNT -- cgit v0.10.2 From 7c94502d97054342060d053456075f939bcd9ecc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: bug: BUG_ON/WARN_ON variants dependend on RT/!RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7d10f96..aee7fd2 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -202,6 +202,20 @@ extern void warn_slowpath_null(const char *file, const int line); # define WARN_ON_SMP(x) ({0;}) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) +#endif + #endif /* __ASSEMBLY__ */ #endif -- cgit v0.10.2 From 4a0c2e29cc4a3bbf14847f27bd6eeadb0a5d91ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 22:34:14 +0200 Subject: rt: local_irq_* variants depending on RT/!RT Add local_irq_*_(no)rt variant which are mainly used to break interrupt disabled sections on PREEMPT_RT or to explicitely disable interrupts on PREEMPT_RT. 
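(Illustration, not taken from any patch in this series: a sketch of how driver
code is expected to use these helpers. The structure, register offsets and
function name below are made up for the example; only local_irq_save_nort()
and local_irq_restore_nort() are real, as introduced by the irqflags.h hunk
that follows.)

/* Hypothetical device with a small MMIO register window. */
struct example_dev {
	void __iomem	*regs;
};

static void example_set_mode(struct example_dev *edev, u32 mode)
{
	unsigned long flags;

	/*
	 * On a !RT kernel this still hard-disables interrupts around the
	 * register accesses; on PREEMPT_RT_FULL the _nort() variants only
	 * save the flags and the section stays preemptible, relying on the
	 * caller's (sleeping) lock for serialization.
	 */
	local_irq_save_nort(flags);
	writel(mode, edev->regs + 0x04);
	writel(mode >> 16, edev->regs + 0x08);
	local_irq_restore_nort(flags);
}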
Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5fc9d8b..3f3a4e2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -211,7 +211,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index d176d65..37b13c4 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -147,4 +147,23 @@ #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +/* + * local_irq* variants depending on RT/!RT + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +# define local_irq_disable_rt() local_irq_disable() +# define local_irq_enable_rt() local_irq_enable() +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define local_irq_disable_rt() do { } while (0) +# define local_irq_enable_rt() do { } while (0) +#endif + #endif -- cgit v0.10.2 From 6e30639c78b139c14c5e194bd7959620ec524cd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 12:38:56 +0200 Subject: preempt: Provide preempt_*_(no)rt variants RT needs a few preempt_disable/enable points which are not necessary otherwise. Implement variants to avoid #ifdeffery. Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 87a03c7..de0c8af 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -54,11 +54,15 @@ do { \ dec_preempt_count(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT_BASE +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#else +# define preempt_enable_no_resched() preempt_enable() +#endif #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + sched_preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -110,6 +114,18 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() barrier() +# define preempt_enable_nort() barrier() +#else +# define preempt_disable_rt() barrier() +# define preempt_enable_rt() barrier() +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; -- cgit v0.10.2 From 7107a71126bb11653b78d1a0851495895e929063 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:44:29 -0500 Subject: ata: Do not disable interrupts in ide code for preempt-rt Use the local_irq_*_nort variants. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index d8af325..ad3130d 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned long flags; unsigned int consumed; - local_irq_save(flags); + local_irq_save_nort(flags); consumed = ata_sff_data_xfer32(dev, buf, buflen, rw); - local_irq_restore(flags); + local_irq_restore_nort(flags); return consumed; } @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned long flags; /* FIXME: use a bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) do_write); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, @@ -864,7 +864,7 @@ next_sg: unsigned long flags; /* FIXME: use bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -872,7 +872,7 @@ next_sg: count, rw); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); consumed = ap->ops->sff_data_xfer(dev, buf + offset, -- cgit v0.10.2 From c31bb65caaeb43a8c29af408348e33beb59fa13f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: ide: Do not disable interrupts for PREEMPT-RT Use the local_irq_*_nort variants. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 36f76e2..394f142f 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -325,7 +325,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 696b6c1..0d0a966 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, dma_old = inb(base + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 
0x4b : 0x43, &masterdma); @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, if (dma_new != dma_old) outb(dma_new, base + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", hwif->name, base, base + 7); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 1976397..4169433 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 177db6d..079ae6b 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data) /* disable_irq_nosync ?? */ disable_irq(hwif->irq); /* local CPU only, as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwif->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 376f2dc..f014dd1 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, if ((stat & ATA_BUSY) == 0) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 068cef0..38e69e1 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) int bswap = 1; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* read 512 bytes of id info */ hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); + local_irq_restore_nort(flags); drive->dev_flags |= IDE_DFLAG_ID_READ; #ifdef DEBUG diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 729428e..3a9a1fc 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page_is_high = PageHighMem(page); if (page_is_high) - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page) + offset; @@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf); if (page_is_high) - local_irq_restore(flags); + local_irq_restore_nort(flags); len -= nr_bytes; } @@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, } if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); -- cgit v0.10.2 From 5b5b1cc8433ff332470dd74e2a8169e5f24e8182 Mon Sep 17 00:00:00 2001 From: Sven-Thorsten Dietrich Date: Fri, 3 Jul 2009 08:30:35 -0500 Subject: infiniband: Mellanox IB driver patch use _nort() primitives Fixes in_atomic stack-dump, when Mellanox module is loaded into the RT Kernel. Michael S. Tsirkin sayeth: "Basically, if you just make spin_lock_irqsave (and spin_lock_irq) not disable interrupts for non-raw spinlocks, I think all of infiniband will be fine without changes." Signed-off-by: Sven-Thorsten Dietrich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a..3800ef5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -783,7 +783,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); + local_irq_save_nort(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -865,7 +865,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) spin_unlock(&priv->lock); netif_addr_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { -- cgit v0.10.2 From cc313c746dc4e7de178a495731f39bbaa84c44d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: input: gameport: Do not disable interrupts on PREEMPT_RT Use the _nort() primitives. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index da739d9..18fdafe 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -87,12 +87,12 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -111,11 +111,11 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } -- cgit v0.10.2 From bfc02945956e3433f1653781fb734662575b97e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 22:54:51 +0200 Subject: acpi: Do not disable interrupts on PREEMPT_RT Use the local_irq_*_nort() variants. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 0c44630..1785dd7 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -51,8 +51,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); -- cgit v0.10.2 From 52806222034d31cfb636cbd056b4518bba83f52a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 23:06:05 +0200 Subject: core: Do not disable interrupts on RT in kernel/users.c Use the local_irq_*_nort variants to reduce latencies in RT. The code is serialized by the locks. No need to disable interrupts. Signed-off-by: Thomas Gleixner diff --git a/kernel/user.c b/kernel/user.c index 7f6ff2b..68b70d7 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -159,11 +159,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct *alloc_uid(kuid_t uid) -- cgit v0.10.2 From 057e18fd1457ca1a15f1ad867f3b6351f500e93b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:33 -0500 Subject: core: Do not disable interrupts on RT in res_counter.c Frederic Weisbecker reported this warning: [ 45.228562] BUG: sleeping function called from invalid context at kernel/rtmutex.c:683 [ 45.228571] in_atomic(): 0, irqs_disabled(): 1, pid: 4290, name: ntpdate [ 45.228576] INFO: lockdep is turned off. [ 45.228580] irq event stamp: 0 [ 45.228583] hardirqs last enabled at (0): [<(null)>] (null) [ 45.228589] hardirqs last disabled at (0): [] copy_process+0x68d/0x1500 [ 45.228602] softirqs last enabled at (0): [] copy_process+0x68d/0x1500 [ 45.228609] softirqs last disabled at (0): [<(null)>] (null) [ 45.228617] Pid: 4290, comm: ntpdate Tainted: G W 2.6.29-rc4-rt1-tip #1 [ 45.228622] Call Trace: [ 45.228632] [] ? 
print_irqtrace_events+0xd0/0xe0 [ 45.228639] [] __might_sleep+0x113/0x130 [ 45.228646] [] rt_spin_lock+0xa1/0xb0 [ 45.228653] [] res_counter_charge+0x5d/0x130 [ 45.228660] [] __mem_cgroup_try_charge+0x7f/0x180 [ 45.228667] [] mem_cgroup_charge_common+0x57/0x90 [ 45.228674] [] ? ftrace_call+0x5/0x2b [ 45.228680] [] mem_cgroup_newpage_charge+0x5d/0x60 [ 45.228688] [] __do_fault+0x29e/0x4c0 [ 45.228694] [] ? rt_spin_unlock+0x23/0x80 [ 45.228700] [] handle_mm_fault+0x205/0x890 [ 45.228707] [] ? ftrace_call+0x5/0x2b [ 45.228714] [] do_page_fault+0x11e/0x2a0 [ 45.228720] [] page_fault+0x25/0x30 [ 45.228727] [] ? __clear_user+0x3d/0x70 [ 45.228733] [] ? __clear_user+0x21/0x70 The reason is the raw IRQ flag use of kernel/res_counter.c. The irq flags tricks there seem a bit pointless: it cannot protect the c->parent linkage because local_irq_save() is only per CPU. So replace it with _nort(). This code needs a second look. Reported-by: Frederic Weisbecker Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247..cfecbee 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -49,7 +49,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, r = ret = 0; *limit_fail_at = NULL; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); r = res_counter_charge_locked(c, val, force); @@ -69,7 +69,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, spin_unlock(&u->lock); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } @@ -103,7 +103,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *c; u64 ret = 0; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != top; c = c->parent) { u64 r; spin_lock(&c->lock); @@ -112,7 +112,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, ret = r; spin_unlock(&c->lock); } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } -- cgit v0.10.2 From 34c2713705b6612aded83c6033957da61e5caae1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:44:26 -0500 Subject: usb: Use local_irq_*_nort() variants [ tglx: Now that irqf_disabled is dead we should kill that ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 8e64adf..59c4d3c 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2217,7 +2217,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) * when the first handler doesn't use it. So let's just * assume it's never used. 
*/ - local_irq_save(flags); + local_irq_save_nort(flags); if (unlikely(HCD_DEAD(hcd) || !HCD_HW_ACCESSIBLE(hcd))) rc = IRQ_NONE; @@ -2226,7 +2226,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) else rc = IRQ_HANDLED; - local_irq_restore(flags); + local_irq_restore_nort(flags); return rc; } EXPORT_SYMBOL_GPL(usb_hcd_irq); -- cgit v0.10.2 From 95f97c6266b54b268615e1285de07efc881a4bf8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:34 -0500 Subject: mm: scatterlist dont disable irqs on RT Signed-off-by: Thomas Gleixner diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7874b01..43603ee 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -499,7 +499,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); @@ -539,7 +539,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_start(&miter, sgl, nents, sg_flags); - local_irq_save(flags); + local_irq_save_nort(flags); while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -556,7 +556,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); - local_irq_restore(flags); + local_irq_restore_nort(flags); return offset; } -- cgit v0.10.2 From e91d6f69181568af650358c217bb4a124035cb16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Jul 2011 08:07:08 +0200 Subject: signal-fix-up-rcu-wreckage.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/signal.c b/kernel/signal.c index 491cb9b..03b6e8f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1391,12 +1391,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { - local_irq_save(*flags); + local_irq_save_nort(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) { rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); break; } @@ -1407,7 +1407,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } spin_unlock(&sighand->siglock); rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); } return sighand; -- cgit v0.10.2 From 9bbac602b244b4c0b7642aed390b561c49d6e880 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:05:33 +0200 Subject: net-wireless-warn-nort.patch Signed-off-by: Thomas Gleixner diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 580704e..c58f3cd 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3144,7 +3144,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_supported_band *sband; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - WARN_ON_ONCE(softirq_count() == 0); + WARN_ON_ONCE_NONRT(softirq_count() == 0); if (WARN_ON(status->band >= IEEE80211_NUM_BANDS)) goto drop; -- cgit v0.10.2 From d485cf5474e45e7942586043b6539da478b82e1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 09:56:42 +0200 Subject: mm: Replace cgroup_page bit spinlock Bit spinlocks are not working on RT. Replace them. 
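For context, a bit spinlock packs the lock into a single flag bit and spins on that bit with preemption disabled, so there is no lock structure RT could turn into an rtmutex; that is why the patch below adds a real spinlock_t next to the flags word when CONFIG_PREEMPT_RT_BASE is set. A simplified sketch of what bit_spin_lock() does on an SMP kernel (condensed, comments added):

	static inline void bit_spin_lock_sketch(int bitnum, unsigned long *addr)
	{
		/* Spinning with preemption off is exactly what RT cannot tolerate. */
		preempt_disable();
		while (test_and_set_bit_lock(bitnum, addr)) {
			preempt_enable();
			do {
				cpu_relax();
			} while (test_bit(bitnum, addr));
			preempt_disable();
		}
	}
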
Signed-off-by: Thomas Gleixner diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 777a524..ca67e80 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -24,6 +24,9 @@ enum { */ struct page_cgroup { unsigned long flags; +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t pcg_lock; +#endif struct mem_cgroup *mem_cgroup; }; @@ -74,12 +77,20 @@ static inline void lock_page_cgroup(struct page_cgroup *pc) * Don't take this lock in IRQ context. * This lock is for pc->mem_cgroup, USED, MIGRATION */ +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(PCG_LOCK, &pc->flags); +#else + spin_lock(&pc->pcg_lock); +#endif } static inline void unlock_page_cgroup(struct page_cgroup *pc) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(PCG_LOCK, &pc->flags); +#else + spin_unlock(&pc->pcg_lock); +#endif } #else /* CONFIG_MEMCG */ @@ -102,6 +113,10 @@ static inline void __init page_cgroup_init_flatmem(void) { } +static inline void page_cgroup_lock_init(struct page_cgroup *pc) +{ +} + #endif /* CONFIG_MEMCG */ #include diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a..98caeee 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -13,6 +13,14 @@ static unsigned long total_usage; +static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + for (; nr_pages; nr_pages--, pc++) + spin_lock_init(&pc->pcg_lock); +#endif +} + #if !defined(CONFIG_SPARSEMEM) @@ -60,6 +68,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; + page_cgroup_lock_init(base, nr_pages); return 0; } @@ -150,6 +159,8 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } + page_cgroup_lock_init(base, PAGES_PER_SECTION); + /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. -- cgit v0.10.2 From 636f1198b60066d555fb1420f6cac3310764b432 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 09:18:52 +0100 Subject: buffer_head: Replace bh_uptodate_lock for -rt Wrap the bit_spin_lock calls into a separate inline and add the RT replacements with a real spinlock. Signed-off-by: Thomas Gleixner diff --git a/fs/buffer.c b/fs/buffer.c index 7a75c3e..8863f45 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -280,8 +280,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -294,8 +293,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors and they are all @@ -307,9 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /* @@ -343,8 +339,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -356,15 +351,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } EXPORT_SYMBOL(end_buffer_async_write); @@ -3256,6 +3248,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + buffer_head_init_locks(ret); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f..4e4ca73 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 458f497..5c16cf1 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -72,8 +72,42 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t b_uptodate_lock; +#endif }; +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh) +{ + unsigned long flags; + +#ifndef CONFIG_PREEMPT_RT_BASE 
+ local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state); +#else + spin_lock_irqsave(&bh->b_uptodate_lock, flags); +#endif + return flags; +} + +static inline void +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state); + local_irq_restore(flags); +#else + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags); +#endif +} + +static inline void buffer_head_init_locks(struct buffer_head *bh) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + spin_lock_init(&bh->b_uptodate_lock); +#endif +} + /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. -- cgit v0.10.2 From 7447c4f71ce8bc800ebed66ebc380060c312677f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:11:25 +0100 Subject: fs: jbd/jbd2: Make state lock and journal head lock rt safe bit_spin_locks break under RT. Based on a previous patch from Steven Rostedt Signed-off-by: Thomas Gleixner -- include/linux/buffer_head.h | 10 ++++++++++ include/linux/jbd_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5c16cf1..3f8e27b 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -74,6 +74,11 @@ struct buffer_head { atomic_t b_count; /* users using this buffer_head */ #ifdef CONFIG_PREEMPT_RT_BASE spinlock_t b_uptodate_lock; +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spinlock_t b_state_lock; + spinlock_t b_journal_head_lock; +#endif #endif }; @@ -105,6 +110,11 @@ static inline void buffer_head_init_locks(struct buffer_head *bh) { #ifdef CONFIG_PREEMPT_RT_BASE spin_lock_init(&bh->b_uptodate_lock); +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spin_lock_init(&bh->b_state_lock); + spin_lock_init(&bh->b_journal_head_lock); +#endif #endif } diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6133679..0dbc151 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h @@ -39,32 +39,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_State, &bh->b_state); +#else + spin_lock(&bh->b_state_lock); +#endif } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_trylock(BH_State, &bh->b_state); +#else + return spin_trylock(&bh->b_state_lock); +#endif } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_is_locked(BH_State, &bh->b_state); +#else + return spin_is_locked(&bh->b_state_lock); +#endif } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_State, &bh->b_state); +#else + spin_unlock(&bh->b_state_lock); +#endif } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_JournalHead, &bh->b_state); +#else + spin_lock(&bh->b_journal_head_lock); +#endif } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_JournalHead, &bh->b_state); +#else + spin_unlock(&bh->b_journal_head_lock); +#endif } #endif -- cgit v0.10.2 From 
d8af0a1d45ffa7912cdac2c241ec243e7d8c7d3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:22:04 +0100 Subject: genirq: Disable DEBUG_SHIRQ for rt Signed-off-by: Thomas Gleixner diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 67604e5..24b60ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -164,7 +164,7 @@ config DEBUG_KERNEL config DEBUG_SHIRQ bool "Debug shared IRQ handlers" - depends on DEBUG_KERNEL && GENERIC_HARDIRQS + depends on DEBUG_KERNEL && GENERIC_HARDIRQS && !PREEMPT_RT_BASE help Enable this to generate a spurious interrupt as soon as a shared interrupt handler is registered, and just before one is deregistered. -- cgit v0.10.2 From ba9c82c8197317d259b53441db50e520f7e5126e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:57 -0500 Subject: genirq: disable irqpoll on -rt Creates long latencies for no value Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012..38a32b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -340,6 +340,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -352,6 +357,11 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); -- cgit v0.10.2 From 86fc12d4b11807e5815f5fb63da2d510f19d93b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 Apr 2011 11:57:29 +0200 Subject: genirq-force-threading.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3f3a4e2..6fb89c4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -383,9 +383,13 @@ static inline int disable_irq_wake(unsigned int irq) #ifdef CONFIG_IRQ_FORCED_THREADING -extern bool force_irqthreads; +# ifndef CONFIG_PREEMPT_RT_BASE + extern bool force_irqthreads; +# else +# define force_irqthreads (true) +# endif #else -#define force_irqthreads (0) +#define force_irqthreads (false) #endif #ifndef __ARCH_SET_SOFTIRQ_PENDING diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288..902f880 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -21,6 +21,7 @@ #include "internals.h" #ifdef CONFIG_IRQ_FORCED_THREADING +# ifndef CONFIG_PREEMPT_RT_BASE __read_mostly bool force_irqthreads; static int __init setup_forced_irqthreads(char *arg) @@ -29,6 +30,7 @@ static int __init setup_forced_irqthreads(char *arg) return 0; } early_param("threadirqs", setup_forced_irqthreads); +# endif #endif /** -- cgit v0.10.2 From 0d556327b0d780ac310cad8cb9cb9915c89f9dd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Jun 2009 11:36:54 +0200 Subject: drivers/net: fix livelock issues Preempt-RT runs into a live lock issue with the NETDEV_TX_LOCKED micro optimization. The reason is that the softirq thread is rescheduling itself on that return value. Depending on priorities it starts to monoplize the CPU and livelock on UP systems. Remove it. 
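Each driver hunk below follows the same before/after shape; the sketch here uses a hypothetical driver (struct foo_priv and foo_xmit() are placeholders, not code from any of the drivers touched):

	struct foo_priv {
		spinlock_t tx_lock;
	};

	/* Before: bounce the packet back to the core when the lock is contended. */
	static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct foo_priv *priv = netdev_priv(dev);
		unsigned long flags;

		if (!spin_trylock_irqsave(&priv->tx_lock, flags))
			return NETDEV_TX_LOCKED; /* core requeues; softirq thread livelocks on RT */
		/* ... queue the skb to hardware ... */
		spin_unlock_irqrestore(&priv->tx_lock, flags);
		return NETDEV_TX_OK;
	}

The hunks below simply replace the trylock/NETDEV_TX_LOCKED pair with an unconditional spin_lock_irqsave(&priv->tx_lock, flags), so a contended transmit path blocks briefly instead of rescheduling itself forever.
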
Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index bfcb8bc..dbe44ba 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2171,11 +2171,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, } tpd_req = atl1c_cal_tpd_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) { - if (netif_msg_pktdata(adapter)) - dev_info(&adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1c_tpd_avail(adapter, type) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 35faab7..297bf42 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1803,8 +1803,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } tpd_req = atl1e_cal_tdp_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1e_tpd_avail(adapter) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index d84872e..1420ea8 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -1666,8 +1666,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 7c94c08..8757a2c 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -4088,12 +4088,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) [skb->priority & (MAX_TX_FIFOS - 1)]; fifo = &mac_control->fifos[queue]; - if (do_spin_lock) - spin_lock_irqsave(&fifo->tx_lock, flags); - else { - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags))) - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&fifo->tx_lock, flags); if (sp->config.multiq) { if (__netif_subqueue_stopped(dev, fifo->fifo_no)) { diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 73ce7dd..b3ba6fe 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -2114,10 +2114,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev) struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring; unsigned long flags; - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { - /* Collision - tell upper layer to requeue */ - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&tx_ring->tx_lock, flags); + if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) { netif_stop_queue(netdev); spin_unlock_irqrestore(&tx_ring->tx_lock, flags); diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 1e4d743..9dfd4f5 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1630,13 +1630,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, unsigned long flags; ENTER; - 
local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index d8b9b1e..7ae06a7 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) unsigned long flags; int add_num = 1; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if (is_multicast_ether_addr(eth->h_dest)) add_num = nets[rnet->mport->id].nact; -- cgit v0.10.2 From e20a6df62ba9895e147814b8b1829ce8e7bb4f77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: drivers/net: vortex fix locking issues Argh, cut and paste wasn't enough... Use this patch instead. It needs an irq disable. But, believe it or not, on SMP this is actually better. If the irq is shared (as it is in Mark's case), we don't stop the irq of other devices from being handled on another CPU (unfortunately for Mark, he pinned all interrupts to one CPU). Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner drivers/net/ethernet/3com/3c59x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Signed-off-by: Ingo Molnar diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index ed0feb3..0da3917 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -843,9 +843,9 @@ static void poll_vortex(struct net_device *dev) { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1919,12 +1919,12 @@ static void vortex_tx_timeout(struct net_device *dev) * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } -- cgit v0.10.2 From ae5955ff61dab5625fb56b3369518c18c17e9423 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Apr 2010 20:20:57 +0200 Subject: drivers: net: gianfar: Make RT aware The adjust_link() disables interrupts before taking the queue locks. On RT those locks are converted to "sleeping" locks and therefor the local_irq_save/restore must be converted to local_irq_save/restore_nort. 
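The same conversion rule as in the preceding driver patches, shown once as a self-contained pattern (hypothetical helper, not taken from gianfar):

	static void adjust_link_pattern(spinlock_t *qlock)
	{
		unsigned long flags;

		local_irq_save_nort(flags);	/* was: local_irq_save(flags) */
		spin_lock(qlock);		/* rtmutex-backed on RT, ordinary spinlock otherwise */
		/* ... update link state, restart the queues ... */
		spin_unlock(qlock);
		local_irq_restore_nort(flags);	/* was: local_irq_restore(flags) */
	}

On mainline both forms behave identically; on PREEMPT_RT the unconverted form would acquire a sleeping lock with hard interrupts disabled and trigger the in_atomic()/irqs_disabled() splat.
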
Reported-by: Xianghua Xiao Signed-off-by: Thomas Gleixner Tested-by: Xianghua Xiao diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index bffb2ed..5c53535 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1663,7 +1663,7 @@ void stop_gfar(struct net_device *dev) /* Lock it down */ - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); lock_rx_qs(priv); @@ -1671,7 +1671,7 @@ void stop_gfar(struct net_device *dev) unlock_rx_qs(priv); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* Free the IRQs */ if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) { @@ -2951,7 +2951,7 @@ static void adjust_link(struct net_device *dev) struct phy_device *phydev = priv->phydev; int new_state = 0; - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); if (phydev->link) { @@ -3020,7 +3020,7 @@ static void adjust_link(struct net_device *dev) if (new_state && netif_msg_link(priv)) phy_print_status(phydev); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Update the hash table based on the current list of multicast -- cgit v0.10.2 From 79cbe63bb6cf0561921b9ed4dd15e95179b11435 Mon Sep 17 00:00:00 2001 From: Wu Zhangjin Date: Mon, 4 Jan 2010 11:33:02 +0800 Subject: USB: Fix the mouse problem when copying large amounts of data When copying large amounts of data between the USB storage devices and the hard disk, the USB mouse will not work, this patch fixes it. [NOTE: This problem have been found in the Loongson family machines, not sure whether it is producible on other platforms] Signed-off-by: Hu Hongbing Signed-off-by: Wu Zhangjin diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 180a2b0..1a3e81a 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -857,9 +857,13 @@ static irqreturn_t ohci_irq (struct usb_hcd *hcd) } if (ints & OHCI_INTR_WDH) { - spin_lock (&ohci->lock); - dl_done_list (ohci); - spin_unlock (&ohci->lock); + if (ohci->hcca->done_head == 0) { + ints &= ~OHCI_INTR_WDH; + } else { + spin_lock (&ohci->lock); + dl_done_list (ohci); + spin_unlock (&ohci->lock); + } } if (quirk_zfmicro(ohci) && (ints & OHCI_INTR_SF)) { -- cgit v0.10.2 From 0ab9e4ce256b8e66ad33721026812a5d87d80d11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jun 2011 18:40:37 +0200 Subject: local-var.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/percpu.h b/include/linux/percpu.h index cc88172..bf69fd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,6 +48,11 @@ preempt_enable(); \ } while (0) +#define get_local_var(var) get_cpu_var(var) +#define put_local_var(var) put_cpu_var(var) +#define get_local_ptr(var) get_cpu_ptr(var) +#define put_local_ptr(var) put_cpu_ptr(var) + /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v0.10.2 From 8c96abaafd26822121c1e393ddf4d9e9fc9f1e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jun 2011 09:03:47 +0200 Subject: rt-local-irq-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/locallock.h b/include/linux/locallock.h new file mode 100644 index 0000000..0161fbb --- /dev/null +++ b/include/linux/locallock.h @@ -0,0 +1,238 @@ +#ifndef _LINUX_LOCALLOCK_H +#define _LINUX_LOCALLOCK_H + +#include + +#ifdef CONFIG_PREEMPT_RT_BASE + +#ifdef CONFIG_DEBUG_SPINLOCK +# define LL_WARN(cond) 
WARN_ON(cond) +#else +# define LL_WARN(cond) do { } while (0) +#endif + +/* + * per cpu lock based substitute for local_irq_*() + */ +struct local_irq_lock { + spinlock_t lock; + struct task_struct *owner; + int nestcnt; + unsigned long flags; +}; + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \ + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } + +#define local_irq_lock_init(lvar) \ + do { \ + int __cpu; \ + for_each_possible_cpu(__cpu) \ + spin_lock_init(&per_cpu(lvar, __cpu).lock); \ + } while (0) + +static inline void __local_lock(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + spin_lock(&lv->lock); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + } + lv->nestcnt++; +} + +#define local_lock(lvar) \ + do { __local_lock(&get_local_var(lvar)); } while (0) + +static inline int __local_trylock(struct local_irq_lock *lv) +{ + if (lv->owner != current && spin_trylock(&lv->lock)) { + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; + return 1; + } + return 0; +} + +#define local_trylock(lvar) \ + ({ \ + int __locked; \ + __locked = __local_trylock(&get_local_var(lvar)); \ + if (!__locked) \ + put_local_var(lvar); \ + __locked; \ + }) + +static inline void __local_unlock(struct local_irq_lock *lv) +{ + LL_WARN(lv->nestcnt == 0); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return; + + lv->owner = NULL; + spin_unlock(&lv->lock); +} + +#define local_unlock(lvar) \ + do { \ + __local_unlock(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +static inline void __local_lock_irq(struct local_irq_lock *lv) +{ + spin_lock_irqsave(&lv->lock, lv->flags); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; +} + +#define local_lock_irq(lvar) \ + do { __local_lock_irq(&get_local_var(lvar)); } while (0) + +#define local_lock_irq_on(lvar, cpu) \ + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0) + +static inline void __local_unlock_irq(struct local_irq_lock *lv) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + lv->owner = NULL; + lv->nestcnt = 0; + spin_unlock_irq(&lv->lock); +} + +#define local_unlock_irq(lvar) \ + do { \ + __local_unlock_irq(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +#define local_unlock_irq_on(lvar, cpu) \ + do { \ + __local_unlock_irq(&per_cpu(lvar, cpu)); \ + } while (0) + +static inline int __local_lock_irqsave(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + __local_lock_irq(lv); + return 0; + } else { + lv->nestcnt++; + return 1; + } +} + +#define local_lock_irqsave(lvar, _flags) \ + do { \ + if (__local_lock_irqsave(&get_local_var(lvar))) \ + put_local_var(lvar); \ + _flags = __get_cpu_var(lvar).flags; \ + } while (0) + +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, + unsigned long flags) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return 0; + + lv->owner = NULL; + spin_unlock_irqrestore(&lv->lock, lv->flags); + return 1; +} + +#define local_unlock_irqrestore(lvar, flags) \ + do { \ + if (__local_unlock_irqrestore(&__get_cpu_var(lvar), flags)) \ + put_local_var(lvar); \ + } while (0) + +#define local_spin_trylock_irq(lvar, lock) \ + ({ \ + int __locked; \ + local_lock_irq(lvar); \ + __locked = spin_trylock(lock); \ + if (!__locked) \ + local_unlock_irq(lvar); \ + __locked; \ + }) + +#define local_spin_lock_irq(lvar, lock) \ + do { \ + local_lock_irq(lvar); \ + spin_lock(lock); \ + } while 
(0) + +#define local_spin_unlock_irq(lvar, lock) \ + do { \ + spin_unlock(lock); \ + local_unlock_irq(lvar); \ + } while (0) + +#define local_spin_lock_irqsave(lvar, lock, flags) \ + do { \ + local_lock_irqsave(lvar, flags); \ + spin_lock(lock); \ + } while (0) + +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + do { \ + spin_unlock(lock); \ + local_unlock_irqrestore(lvar, flags); \ + } while (0) + +#define get_locked_var(lvar, var) \ + (*({ \ + local_lock(lvar); \ + &__get_cpu_var(var); \ + })) + +#define put_locked_var(lvar, var) local_unlock(lvar) + +#define local_lock_cpu(lvar) \ + ({ \ + local_lock(lvar); \ + smp_processor_id(); \ + }) + +#define local_unlock_cpu(lvar) local_unlock(lvar) + +#else /* PREEMPT_RT_BASE */ + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar + +static inline void local_irq_lock_init(int lvar) { } + +#define local_lock(lvar) preempt_disable() +#define local_unlock(lvar) preempt_enable() +#define local_lock_irq(lvar) local_irq_disable() +#define local_unlock_irq(lvar) local_irq_enable() +#define local_lock_irqsave(lvar, flags) local_irq_save(flags) +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags) + +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock) +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock) +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock) +#define local_spin_lock_irqsave(lvar, lock, flags) \ + spin_lock_irqsave(lock, flags) +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + spin_unlock_irqrestore(lock, flags) + +#define get_locked_var(lvar, var) get_cpu_var(var) +#define put_locked_var(lvar, var) put_cpu_var(var) + +#define local_lock_cpu(lvar) get_cpu() +#define local_unlock_cpu(lvar) put_cpu() + +#endif + +#endif -- cgit v0.10.2 From e339231e5d2a9c7a387726991ffc93e282fc69a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 15:42:38 +0200 Subject: cpu-rt-variants.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/smp.h b/include/linux/smp.h index dd6f06b..8624321 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -218,6 +218,14 @@ static inline void kick_all_cpus_sync(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() +#ifndef CONFIG_PREEMPT_RT_FULL +# define get_cpu_light() get_cpu() +# define put_cpu_light() put_cpu() +#else +# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +# define put_cpu_light() migrate_enable() +#endif + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: -- cgit v0.10.2 From 8399fd7bf0a189c75a14ea576f33b30afe747736 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 18 Jun 2011 19:44:43 +0200 Subject: mm-slab-wrap-functions.patch Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index 2aa9fd4..e083470 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -116,6 +116,7 @@ #include #include #include +#include #include @@ -696,12 +697,49 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) #endif static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); +static DEFINE_LOCAL_IRQ_LOCK(slab_lock); + +#ifndef CONFIG_PREEMPT_RT_BASE +# define slab_on_each_cpu(func, cp) on_each_cpu(func, cp, 1) +#else +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) 
+ */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + get_cpu_light(); + for_each_online_cpu(i) + func(arg, i); + put_cpu_light(); +} + +static void lock_slab_on(unsigned int cpu) +{ + local_lock_irq_on(slab_lock, cpu); +} + +static void unlock_slab_on(unsigned int cpu) +{ + local_unlock_irq_on(slab_lock, cpu); +} +#endif static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; } +static inline struct array_cache *cpu_cache_get_on_cpu(struct kmem_cache *cachep, + int cpu) +{ + return cachep->array[cpu]; +} + static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { @@ -1171,9 +1209,10 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { + if (ac && ac->avail && + local_spin_trylock_irq(slab_lock, &ac->lock)) { __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + local_spin_unlock_irq(slab_lock, &ac->lock); } } } @@ -1188,9 +1227,9 @@ static void drain_alien_cache(struct kmem_cache *cachep, for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); + local_spin_lock_irqsave(slab_lock, &ac->lock, flags); __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + local_spin_unlock_irqrestore(slab_lock, &ac->lock, flags); } } } @@ -1269,11 +1308,11 @@ static int init_cache_nodelists_node(int node) cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + local_spin_lock_irq(slab_lock, &cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + local_spin_unlock_irq(slab_lock, &cachep->nodelists[node]->list_lock); } return 0; } @@ -1298,7 +1337,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (!l3) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; @@ -1306,7 +1345,7 @@ static void __cpuinit cpuup_canceled(long cpu) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); goto free_array_cache; } @@ -1320,7 +1359,7 @@ static void __cpuinit cpuup_canceled(long cpu) alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); if (alien) { @@ -1394,7 +1433,7 @@ static int __cpuinit cpuup_prepare(long cpu) l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1409,7 +1448,7 @@ static int __cpuinit cpuup_prepare(long cpu) alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) @@ -1612,6 +1651,8 @@ void __init kmem_cache_init(void) if (num_possible_nodes() == 1) use_alien_caches = 0; + local_irq_lock_init(slab_lock); + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); @@ -2533,7 +2574,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) #if DEBUG static void 
check_irq_off(void) { - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); } static void check_irq_on(void) @@ -2568,26 +2609,37 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, unsigned int cpu) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_mem_id(); + int node = cpu_to_mem(cpu); - check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get_on_cpu(cachep, cpu); spin_lock(&cachep->nodelists[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#else +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2618,10 +2670,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); goto out; } @@ -2635,7 +2687,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); slab_destroy(cache, slabp); nr_freed++; } @@ -2910,7 +2962,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); /* * The test for missing atomic flag is performed here, rather than @@ -2940,7 +2992,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); check_irq_off(); spin_lock(&l3->list_lock); @@ -2954,7 +3006,7 @@ opps1: kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); return 0; } @@ -3368,11 +3420,11 @@ retry: * set and go into memory reserves if necessary. 
*/ if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); if (obj) { /* * Insert into the appropriate per node queues @@ -3492,7 +3544,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cachep = memcg_kmem_get_cache(cachep, flags); cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); if (nodeid == NUMA_NO_NODE) nodeid = slab_node; @@ -3517,7 +3569,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, /* ___cache_alloc_node can fall back to other nodes */ ptr = ____cache_alloc_node(cachep, flags, nodeid); out: - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); @@ -3579,9 +3631,9 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) cachep = memcg_kmem_get_cache(cachep, flags); cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags); @@ -3898,9 +3950,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); - local_irq_save(flags); + local_lock_irqsave(slab_lock, flags); __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + local_unlock_irqrestore(slab_lock, flags); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3929,9 +3981,9 @@ void kfree(const void *objp) debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); - local_irq_save(flags); + local_lock_irqsave(slab_lock, flags); __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); + local_unlock_irqrestore(slab_lock, flags); } EXPORT_SYMBOL(kfree); @@ -3968,7 +4020,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (shared) free_block(cachep, shared->entry, @@ -3981,7 +4033,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); free_alien_cache(new_alien); continue; @@ -4028,17 +4080,28 @@ struct ccupdate_struct { struct array_cache *new[0]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int cpu) { struct ccupdate_struct *new = info; struct array_cache *old; - check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get_on_cpu(new->cachep, cpu); - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; + new->cachep->array[cpu] = new->new[cpu]; + new->new[cpu] = old; +} + +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_ccupdate_local(void *info) +{ + 
__do_ccupdate_local(info, smp_processor_id()); } +#else +static void do_ccupdate_local(void *info, int cpu) +{ + __do_ccupdate_local(info, cpu); +} +#endif /* Always called with the slab_mutex held */ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -4064,7 +4127,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)new); check_irq_on(); cachep->batchcount = batchcount; @@ -4075,9 +4138,11 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + local_spin_lock_irq(slab_lock, + &cachep->nodelists[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + local_spin_unlock_irq(slab_lock, + &cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4192,7 +4257,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) @@ -4202,7 +4267,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } } @@ -4295,7 +4360,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4320,7 +4385,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4520,13 +4585,13 @@ static int leaks_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } name = cachep->name; if (n[0] == n[1]) { -- cgit v0.10.2 From 114d00ccbef28753d824ff304771741dfd99fd74 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:43 -0500 Subject: mm: More lock breaks in slab.c Handle __free_pages outside of the locked regions. This reduces the lock contention on the percpu slab locks in -rt significantly. 
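The core of the patch is a deferred-free scheme: pages released while the per-CPU slab lock is held are parked on a per-CPU list and only handed back to the page allocator after the lock is dropped. A condensed sketch of that idea (the real code below uses slab_free_list, free_delayed() and unlock_l3_and_free_delayed()):

	static void unlock_and_free_delayed(spinlock_t *lock, struct list_head *deferred)
	{
		LIST_HEAD(tmp);

		list_splice_init(deferred, &tmp);	/* grab the batch while still locked */
		spin_unlock(lock);			/* drop the contended lock first ... */

		while (!list_empty(&tmp)) {		/* ... then do the expensive frees */
			struct page *page = list_first_entry(&tmp, struct page, lru);

			list_del(&page->lru);
			__free_pages(page, page->index);	/* order was stashed in page->index */
		}
	}
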
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index e083470..6604ced 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -697,6 +697,7 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) #endif static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); +static DEFINE_PER_CPU(struct list_head, slab_free_list); static DEFINE_LOCAL_IRQ_LOCK(slab_lock); #ifndef CONFIG_PREEMPT_RT_BASE @@ -729,6 +730,34 @@ static void unlock_slab_on(unsigned int cpu) } #endif +static void free_delayed(struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +static void unlock_l3_and_free_delayed(spinlock_t *list_lock) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_spin_unlock_irq(slab_lock, list_lock); + free_delayed(&tmp); +} + +static void unlock_slab_and_free_delayed(unsigned long flags) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_unlock_irqrestore(slab_lock, flags); + free_delayed(&tmp); +} + static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; @@ -1345,7 +1374,7 @@ static void __cpuinit cpuup_canceled(long cpu) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); goto free_array_cache; } @@ -1359,7 +1388,7 @@ static void __cpuinit cpuup_canceled(long cpu) alien = l3->alien; l3->alien = NULL; - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); kfree(shared); if (alien) { @@ -1652,6 +1681,8 @@ void __init kmem_cache_init(void) use_alien_caches = 0; local_irq_lock_init(slab_lock); + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(slab_free_list, i)); for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); @@ -1953,12 +1984,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr, bool delayed) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); + struct page *page, *basepage = virt_to_page(addr); const unsigned long nr_freed = i; + page = basepage; + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1977,7 +2010,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + if (!delayed) { + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + } else { + basepage->index = cachep->gfporder; + list_add(&basepage->lru, &__get_cpu_var(slab_free_list)); + } } static void kmem_rcu_free(struct rcu_head *head) @@ -1985,7 +2023,7 @@ static void kmem_rcu_free(struct rcu_head *head) struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->addr, false); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -2204,7 +2242,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp, + bool delayed) { void *addr = slabp->s_mem - slabp->colouroff; @@ -2217,7 +2256,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); + kmem_freepages(cachep, addr, delayed); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slabp); } @@ -2628,9 +2667,15 @@ static void do_drain(void *arg) __do_drain(arg, smp_processor_id()); } #else -static void do_drain(void *arg, int this_cpu) +static void do_drain(void *arg, int cpu) { - __do_drain(arg, this_cpu); + LIST_HEAD(tmp); + + lock_slab_on(cpu); + __do_drain(arg, cpu); + list_splice_init(&per_cpu(slab_free_list, cpu), &tmp); + unlock_slab_on(cpu); + free_delayed(&tmp); } #endif @@ -2688,7 +2733,7 @@ static int drain_freelist(struct kmem_cache *cache, */ l3->free_objects -= cache->num; local_spin_unlock_irq(slab_lock, &l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, false); nr_freed++; } out: @@ -3003,7 +3048,7 @@ static int cache_grow(struct kmem_cache *cachep, spin_unlock(&l3->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, objp, false); failed: if (local_flags & __GFP_WAIT) local_lock_irq(slab_lock); @@ -3684,7 +3729,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. 
*/ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, true); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3952,7 +3997,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, cachep->object_size); local_lock_irqsave(slab_lock, flags); __cache_free(cachep, objp, _RET_IP_); - local_unlock_irqrestore(slab_lock, flags); + unlock_slab_and_free_delayed(flags); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3983,7 +4028,7 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, c->object_size); local_lock_irqsave(slab_lock, flags); __cache_free(c, (void *)objp, _RET_IP_); - local_unlock_irqrestore(slab_lock, flags); + unlock_slab_and_free_delayed(flags); } EXPORT_SYMBOL(kfree); @@ -4033,7 +4078,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); + kfree(shared); free_alien_cache(new_alien); continue; @@ -4141,8 +4187,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, local_spin_lock_irq(slab_lock, &cachep->nodelists[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - local_spin_unlock_irq(slab_lock, - &cachep->nodelists[cpu_to_mem(i)]->list_lock); + + unlock_l3_and_free_delayed(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); -- cgit v0.10.2 From 0542d396ca8aef0c4d15e9c39780026d9fbc8afd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:37 -0500 Subject: mm: page_alloc: rt-friendly per-cpu pages rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method into a preemptible, explicit-per-cpu-locks method. 
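The conversion pattern can be read out of the diff below, but a minimal, self-contained sketch may help: on !PREEMPT_RT the local lock degrades to plain local_irq_save(), while on PREEMPT_RT it becomes an explicit per-CPU lock so the critical section stays preemptible. The sketch assumes the -rt local-lock primitives (DEFINE_LOCAL_IRQ_LOCK, local_lock_irqsave) used by the patch; the per-CPU counter and function names are invented for illustration.

/*
 * Illustrative sketch, not part of the patch: converting an irqs-off
 * per-CPU section into an explicit, preemptible per-CPU local lock.
 */
static DEFINE_PER_CPU(unsigned long, pcp_event_count);
static DEFINE_LOCAL_IRQ_LOCK(pcp_lock);

static void pcp_account_event(unsigned long nr)
{
        unsigned long flags;

        /*
         * Before: local_irq_save(flags); ... local_irq_restore(flags);
         * After:  the same section under pcp_lock.  On !RT this still
         * compiles down to interrupt disabling; on RT it takes a per-CPU
         * rtmutex-based lock, so the section can be preempted.
         */
        local_lock_irqsave(pcp_lock, flags);
        __this_cpu_add(pcp_event_count, nr);
        local_unlock_irqrestore(pcp_lock, flags);
}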
Contains fixes from: Peter Zijlstra Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f61bc0f..e93a0d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -219,6 +220,18 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static DEFINE_LOCAL_IRQ_LOCK(pa_lock); + +#ifdef CONFIG_PREEMPT_RT_BASE +# define cpu_lock_irqsave(cpu, flags) \ + spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) +# define cpu_unlock_irqrestore(cpu, flags) \ + spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) +#else +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) +#endif + int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -722,12 +735,12 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (!free_pages_prepare(page, order)) return; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_events(PGFREE, 1 << order); migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, order, migratetype); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } /* @@ -1169,7 +1182,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else @@ -1178,7 +1191,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #endif @@ -1198,7 +1211,7 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - local_irq_save(flags); + cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; @@ -1206,7 +1219,7 @@ static void drain_pages(unsigned int cpu) free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; } - local_irq_restore(flags); + cpu_unlock_irqrestore(cpu, flags); } } @@ -1259,7 +1272,12 @@ void drain_all_pages(void) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } +#ifndef CONFIG_PREEMPT_RT_BASE on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); +#else + for_each_cpu(cpu, &cpus_with_pcps) + drain_pages(cpu); +#endif } #ifdef CONFIG_HIBERNATION @@ -1314,7 +1332,7 @@ void free_hot_cold_page(struct page *page, int cold) migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_event(PGFREE); /* @@ -1344,7 +1362,7 @@ void free_hot_cold_page(struct page *page, int cold) } out: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } /* @@ -1473,7 +1491,7 @@ again: struct per_cpu_pages *pcp; struct list_head *list; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; if (list_empty(list)) { @@ -1505,18 +1523,20 @@ again: */ WARN_ON_ONCE(order > 1); } - spin_lock_irqsave(&zone->lock, flags); + local_spin_lock_irqsave(pa_lock, &zone->lock, flags); page = __rmqueue(zone, order, migratetype); - spin_unlock(&zone->lock); - if (!page) + if (!page) { + spin_unlock(&zone->lock); goto 
failed; + } __mod_zone_freepage_state(zone, -(1 << order), get_pageblock_migratetype(page)); + spin_unlock(&zone->lock); } __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1524,7 +1544,7 @@ again: return page; failed: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); return NULL; } @@ -5133,6 +5153,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, void __init page_alloc_init(void) { hotcpu_notifier(page_alloc_cpu_notify, 0); + local_irq_lock_init(pa_lock); } /* @@ -5965,12 +5986,12 @@ static int __meminit __zone_pcp_update(void *data) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); + cpu_lock_irqsave(cpu, flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); drain_zonestat(zone, pset); setup_pageset(pset, batch); - local_irq_restore(flags); + cpu_unlock_irqrestore(cpu, flags); } return 0; } @@ -5988,7 +6009,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -5997,7 +6018,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v0.10.2 From 2f4d12d4ec247b0b1baaf510e8f469db603c221e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:37 -0500 Subject: mm: page_alloc reduce lock sections further Split out the pages which are to be freed into a separate list and call free_pages_bulk() outside of the percpu page allocator locks. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e93a0d8..9122f48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -625,7 +625,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a number of pages from the PCP lists + * Frees a number of pages which have been collected from the pcp lists. * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -636,16 +636,50 @@ static inline int free_pages_check(struct page *page) * pinned" detection logic. 
*/ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct list_head *list) { - int migratetype = 0; - int batch_free = 0; int to_free = count; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; + while (!list_empty(list)) { + struct page *page = list_first_entry(list, struct page, lru); + int mt; /* migratetype of the to-be-freed page */ + + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + + mt = get_freepage_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + } + + to_free--; + } + WARN_ON(to_free != 0); + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Moves a number of pages from the PCP lists to free list which + * is freed outside of the locked region. + * + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + */ +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src, + struct list_head *dst) +{ + int migratetype = 0, batch_free = 0; + while (to_free) { struct page *page; struct list_head *list; @@ -661,7 +695,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; - list = &pcp->lists[migratetype]; + list = &src->lists[migratetype]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. */ @@ -669,36 +703,26 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } + list_add(&page->lru, dst); } while (--to_free && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); if (unlikely(migratetype != MIGRATE_ISOLATE)) __mod_zone_freepage_state(zone, 1 << order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static bool free_pages_prepare(struct page *page, unsigned int order) @@ -1180,6 +1204,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; + LIST_HEAD(dst); int to_drain; local_lock_irqsave(pa_lock, flags); @@ -1188,10 +1213,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) else to_drain = pcp->count; if (to_drain > 0) { - free_pcppages_bulk(zone, to_drain, pcp); + isolate_pcp_pages(to_drain, pcp, 
&dst); pcp->count -= to_drain; } local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, to_drain, &dst); } #endif @@ -1210,16 +1236,21 @@ static void drain_pages(unsigned int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); + int count; cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); + count = pcp->count; + if (count) { + isolate_pcp_pages(count, pcp, &dst); pcp->count = 0; } cpu_unlock_irqrestore(cpu, flags); + if (count) + free_pcppages_bulk(zone, count, &dst); } } @@ -1357,8 +1388,15 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); + LIST_HEAD(dst); + int count; + + isolate_pcp_pages(pcp->batch, pcp, &dst); pcp->count -= pcp->batch; + count = pcp->batch; + local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, count, &dst); + return; } out: @@ -5977,20 +6015,22 @@ static int __meminit __zone_pcp_update(void *data) { struct zone *zone = data; int cpu; - unsigned long batch = zone_batchsize(zone), flags; + unsigned long flags; for_each_possible_cpu(cpu) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; cpu_lock_irqsave(cpu, flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count > 0) { + isolate_pcp_pages(pcp->count, pcp, &dst); + free_pcppages_bulk(zone, pcp->count, &dst); + } drain_zonestat(zone, pset); - setup_pageset(pset, batch); cpu_unlock_irqrestore(cpu, flags); } return 0; -- cgit v0.10.2 From 6fc5d7bf91443bec83058748e29a9f6ebc1107c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 16:47:49 +0200 Subject: mm-page-alloc-fix.patch Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9122f48..364c3c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2204,8 +2204,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct page *page; /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + drain_pages(get_cpu_light()); + put_cpu_light(); page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, -- cgit v0.10.2 From 94433990dd57664abe2cfa11cb9db077b487b30c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:51 -0500 Subject: mm: convert swap to percpu locked Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/mm/swap.c b/mm/swap.c index 6310dc2..5812f96 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" @@ -40,6 +41,9 @@ static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock); +static DEFINE_LOCAL_IRQ_LOCK(swapvec_lock); + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. 
@@ -354,11 +358,11 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; page_cache_get(page); - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pvec = &__get_cpu_var(lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } } @@ -403,12 +407,13 @@ static void activate_page_drain(int cpu) void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec = &get_locked_var(swapvec_lock, + activate_page_pvecs); page_cache_get(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + put_locked_var(swapvec_lock, activate_page_pvecs); } } @@ -456,13 +461,13 @@ EXPORT_SYMBOL(mark_page_accessed); */ void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvecs); + put_locked_var(swapvec_lock, lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -597,9 +602,9 @@ void lru_add_drain_cpu(int cpu) unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } pvec = &per_cpu(lru_deactivate_pvecs, cpu); @@ -627,18 +632,19 @@ void deactivate_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_locked_var(swapvec_lock, + lru_deactivate_pvecs); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + put_locked_var(swapvec_lock, lru_deactivate_pvecs); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + lru_add_drain_cpu(local_lock_cpu(swapvec_lock)); + local_unlock_cpu(swapvec_lock); } static void lru_add_drain_per_cpu(struct work_struct *dummy) -- cgit v0.10.2 From 0dde5d59b276577e49402410a27c5c896c6a1633 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:13 -0500 Subject: mm: make vmstat -rt aware Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a13291f..839806b 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -29,7 +29,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { + preempt_disable_rt(); __this_cpu_inc(vm_event_states.event[item]); + preempt_enable_rt(); } static inline void count_vm_event(enum vm_event_item item) @@ -39,7 +41,9 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { + preempt_disable_rt(); __this_cpu_add(vm_event_states.event[item], delta); + preempt_enable_rt(); } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306..a4dbd77 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -216,6 +216,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + 
preempt_disable_rt(); x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -225,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + preempt_enable_rt(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -257,6 +259,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -265,6 +268,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + preempt_enable_rt(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -279,6 +283,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -287,6 +292,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + preempt_enable_rt(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -- cgit v0.10.2 From 5a1f4ace1e081c102cccfd5332137c1de73f65da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:54 -0500 Subject: mm: shrink the page frame to !-rt size He below is a boot-tested hack to shrink the page frame size back to normal. Should be a net win since there should be many less PTE-pages than page-frames. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/include/linux/mm.h b/include/linux/mm.h index 9568b90..e3b3a15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1259,27 +1259,59 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT_FULL + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* !PREEMPT_RT_FULL */ + +/* + * On PREEMPT_RT_FULL the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages. + * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. + */ + +#define __pte_lockptr(page) ((page)->ptl) + +extern struct page *pte_lock_init(struct page *page); +extern void pte_lock_deinit(struct page *page); + +#endif /* PREEMPT_RT_FULL */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. 
*/ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8f5162..8b53bdf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,7 +141,11 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTLOCKS +# ifndef CONFIG_PREEMPT_RT_FULL spinlock_t ptl; +# else + spinlock_t *ptl; +# endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ diff --git a/mm/memory.c b/mm/memory.c index cb7094f..7417ebc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4312,3 +4312,35 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0) +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif -- cgit v0.10.2 From 6ba80eb0b94e6c15c397d58c770b958f56676fa9 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Sat, 1 Oct 2011 18:58:13 -0700 Subject: ARM: Initialize ptl->lock for vector page Without this patch, ARM can not use SPLIT_PTLOCK_CPUS if PREEMPT_RT_FULL=y because vectors_user_mapping() creates a VM_ALWAYSDUMP mapping of the vector page (address 0xffff0000), but no ptl->lock has been allocated for the page. An attempt to coredump that page will result in a kernel NULL pointer dereference when follow_page() attempts to lock the page. The call tree to the NULL pointer dereference is: do_notify_resume() get_signal_to_deliver() do_coredump() elf_core_dump() get_dump_page() __get_user_pages() follow_page() pte_offset_map_lock() <----- a #define ... rt_spin_lock() The underlying problem is exposed by mm-shrink-the-page-frame-to-rt-size.patch. Signed-off-by: Frank Rowand Cc: Frank Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E87C535.2030907@am.sony.com Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c6dec5f..43ac178 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -459,6 +459,31 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } #ifdef CONFIG_MMU + +/* + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not + * initialized by pgtable_page_ctor() then a coredump of the vector page will + * fail. 
+ */ +static int __init vectors_user_mapping_init_page(void) +{ + struct page *page; + unsigned long addr = 0xffff0000; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_k(addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + page = pmd_page(*(pmd)); + + pgtable_page_ctor(page); + + return 0; +} +late_initcall(vectors_user_mapping_init_page); + /* * The vectors page is always readable from user space for the * atomic helpers and the signal restart code. Insert it into the -- cgit v0.10.2 From d3d5f5bdc1fe6dc0f7197dd3e04d37de781faa33 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:03 -0500 Subject: mm: Allow only slab on RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index be8b7f5..fbc0785 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1493,6 +1493,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + depends on !PREEMPT_RT_FULL help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). @@ -1504,6 +1505,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT_FULL help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but -- cgit v0.10.2 From dee67a2c4f475ade5afe3f25acd6ade469ad96b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:33:18 +0200 Subject: radix-tree-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c..7ddfbf9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -230,7 +230,13 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); + +#ifndef CONFIG_PREEMPT_RT_FULL int radix_tree_preload(gfp_t gfp_mask); +#else +static inline int radix_tree_preload(gfp_t gm) { return 0; } +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -255,7 +261,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { - preempt_enable(); + preempt_enable_nort(); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e796429..63bac7d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -215,12 +215,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = &get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); @@ -255,6 +256,7 @@ radix_tree_node_free(struct radix_tree_node *node) call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. 
On @@ -289,6 +291,7 @@ out: return ret; } EXPORT_SYMBOL(radix_tree_preload); +#endif /* * Return the maximum key which can be store into a -- cgit v0.10.2 From 08ad733098910707b827df4fe6fc7cdcc1cbae05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Apr 2013 03:17:05 -0500 Subject: panic: skip get_random_bytes for RT_FULL in init_oops_id diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822..5dc4381 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -371,9 +371,11 @@ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; -- cgit v0.10.2 From db5b3ba5c764f61ccbabf414a2a497eec0f8228b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:12 -0500 Subject: ipc: Make the ipc code -rt aware RT serializes the code with the (rt)spinlock but keeps preemption enabled. Some parts of the code need to be atomic nevertheless. Protect it with preempt_disable/enable_rt pairts. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f3f40dc..94f5c9e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -921,12 +921,17 @@ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable_rt(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() diff --git a/ipc/msg.c b/ipc/msg.c index fede1d0..0b60596 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -259,12 +259,20 @@ static void expunge_all(struct msg_queue *msq, int res) while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = ERR_PTR(res); + + preempt_enable_rt(); } } @@ -614,6 +622,12 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; @@ -627,9 +641,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = msg; + preempt_enable_rt(); return 1; } + preempt_enable_rt(); } } return 0; -- cgit v0.10.2 From c089a44d176689206765e2bf241993093490a8c8 Mon Sep 17 00:00:00 2001 From: KOBAYASHI Yoshitake Date: Sat, 23 Jul 2011 11:57:36 +0900 Subject: ipc/mqueue: Add a critical section to avoid a deadlock (Repost for v3.0-rt1 and changed the distination addreses) I have tested the following patch on v3.0-rt1 with PREEMPT_RT_FULL. In POSIX message queue, if a sender process uses SCHED_FIFO and has a higher priority than a receiver process, the sender will be stuck at ipc/mqueue.c:452 452 while (ewp->state == STATE_PENDING) 453 cpu_relax(); Description of the problem (receiver process) 1. receiver changes sender's state to STATE_PENDING (mqueue.c:846) 2. 
wake up sender process and "switch to sender" (mqueue.c:847) Note: This context switch only happens in PREEMPT_RT_FULL kernel. (sender process) 3. sender check the own state in above loop (mqueue.c:452-453) *. receiver will never wake up and cannot change sender's state to STATE_READY because sender has higher priority Signed-off-by: Yoshitake Kobayashi Cc: viro@zeniv.linux.org.uk Cc: dchinner@redhat.com Cc: npiggin@kernel.dk Cc: hch@lst.de Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/4E2A38A0.1090601@toshiba.co.jp Signed-off-by: Thomas Gleixner diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 94f5c9e..4f7d959 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -945,13 +945,18 @@ static inline void pipelined_receive(struct mqueue_inode_info *info) wake_up_interruptible(&info->wait_q); return; } - if (msg_insert(sender->msg, info)) - return; - list_del(&sender->list); - sender->state = STATE_PENDING; - wake_up_process(sender->task); - smp_wmb(); - sender->state = STATE_READY; + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); + if (!msg_insert(sender->msg, info)) { + list_del(&sender->list); + sender->state = STATE_PENDING; + wake_up_process(sender->task); + smp_wmb(); + sender->state = STATE_READY; + } + preempt_enable_rt(); } SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, -- cgit v0.10.2 From 77e93d3d16d9e995b5ce7d14a4134def6c2b5cfc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:07 -0500 Subject: relay: fix timer madness remove timer calls (!!!) from deep within the tracing infrastructure. This was totally bogus code that can cause lockups and worse. Poll the buffer every 2 jiffies for now. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/relay.c b/kernel/relay.c index e8cd202..56ba44f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -340,6 +340,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -357,6 +361,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; -- cgit v0.10.2 From f2c1bf3187143f48a519b4b25ac4865d9c398d94 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: timers: prepare for full preemption When softirqs can be preempted we need to make sure that cancelling the timer from the active thread can not deadlock vs. a running timer callback. Add a waitqueue to resolve that. 
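Before the diff, a condensed sketch of the mechanism may help: rather than spinning with cpu_relax() until the handler returns - which can live-lock once the canceller is allowed to preempt the softirq thread running the callback - the canceller sleeps on a waitqueue in the timer base and is woken when the callback completes. Field and function names below are simplified for illustration.

/*
 * Illustrative sketch, not the patch itself.
 */
struct tvec_base_sketch {
        spinlock_t              lock;
        struct timer_list       *running_timer;
        wait_queue_head_t       wait_for_running_timer;
};

/* del_timer_sync() retry path on PREEMPT_RT: sleep instead of spinning. */
static void wait_for_running_timer_sketch(struct tvec_base_sketch *base,
                                          struct timer_list *timer)
{
        wait_event(base->wait_for_running_timer,
                   base->running_timer != timer);
}

/*
 * The expiry path clears base->running_timer after the callback returns and
 * then calls wake_up(&base->wait_for_running_timer) so waiters can re-check.
 */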
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197..5fcd72c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) extern int del_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) diff --git a/kernel/timer.c b/kernel/timer.c index 367d008..10fcbae 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -76,6 +76,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -735,12 +736,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); + preempt_disable_rt(); cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif + preempt_enable_rt(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -941,6 +945,29 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * Wait for a running timer + */ +static void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_tunning_timer) +#else +static inline void wait_for_running_timer(struct timer_list *timer) +{ + cpu_relax(); +} + +# define wakeup_timer_waiters(b) do { } while (0) +#endif + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -998,7 +1025,7 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. 
* @timer: the timer to be deactivated @@ -1058,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } EXPORT_SYMBOL(del_timer_sync); @@ -1175,15 +1202,17 @@ static inline void __run_timers(struct tvec_base *base) if (irqsafe) { spin_unlock(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock(&base->lock); } else { spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock_irq(&base->lock); } } } - base->running_timer = NULL; + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1684,6 +1713,7 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v0.10.2 From 23fcc2f167e1254ed211beb98ec6f27eb0b45cfb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:20 -0500 Subject: timers: preempt-rt support Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 10fcbae..9b51abd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1352,7 +1352,17 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (cpu_is_offline(smp_processor_id())) return expires; +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (!spin_trylock(&base->lock)) + return now + 1; +#else spin_lock(&base->lock); +#endif if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1362,7 +1372,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (time_before_eq(expires, now)) return now; - return cmp_next_hrtimer_event(now, expires); } #endif @@ -1752,7 +1761,7 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + new_base = get_local_var(tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
@@ -1773,7 +1782,7 @@ static void __cpuinit migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); + put_local_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From 16a774f642ef5647139c72a33c087bd5d88bb6a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:30 -0500 Subject: timers: mov printk_tick to soft interrupt Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/kernel/timer.c b/kernel/timer.c index 9b51abd..f4b3b68 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1389,7 +1389,6 @@ void update_process_times(int user_tick) account_process_tick(p, user_tick); run_local_timers(); rcu_check_callbacks(cpu, user_tick); - printk_tick(); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); @@ -1405,6 +1404,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); + printk_tick(); hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v0.10.2 From b979771caccf04bbc2096575dc7d5ac9115ecb7f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 11:56:45 +0200 Subject: timer: delay waking softirqs from the jiffy tick People were complaining about broken balancing with the recent -rt series. A look at /proc/sched_debug yielded: cpu#0, 2393.874 MHz .nr_running : 0 .load : 0 .cpu_load[0] : 177522 .cpu_load[1] : 177522 .cpu_load[2] : 177522 .cpu_load[3] : 177522 .cpu_load[4] : 177522 cpu#1, 2393.874 MHz .nr_running : 4 .load : 4096 .cpu_load[0] : 181618 .cpu_load[1] : 180850 .cpu_load[2] : 180274 .cpu_load[3] : 179938 .cpu_load[4] : 179758 Which indicated the cpu_load computation was hosed, the 177522 value indicates that there is one RT task runnable. Initially I thought the old problem of calculating the cpu_load from a softirq had re-surfaced, however looking at the code shows its being done from scheduler_tick(). [ we really should fix this RT/cfs interaction some day... ] A few trace_printk()s later: sirq-timer/1-19 [001] 174.289744: 19: 50:S ==> [001] 0:140:R -0 [001] 174.290724: enqueue_task_rt: adding task: 19/sirq-timer/1 with load: 177522 -0 [001] 174.290725: 0:140:R + [001] 19: 50:S sirq-timer/1 -0 [001] 174.290730: scheduler_tick: current load: 177522 -0 [001] 174.290732: scheduler_tick: current: 0/swapper -0 [001] 174.290736: 0:140:R ==> [001] 19: 50:R sirq-timer/1 sirq-timer/1-19 [001] 174.290741: dequeue_task_rt: removing task: 19/sirq-timer/1 with load: 177522 sirq-timer/1-19 [001] 174.290743: 19: 50:S ==> [001] 0:140:R We see that we always raise the timer softirq before doing the load calculation. Avoid this by re-ordering the scheduler_tick() call in update_process_times() to occur before we deal with timers. This lowers the load back to sanity and restores regular load-balancing behaviour. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index f4b3b68..828ed8f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1387,13 +1387,13 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. 
*/ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); #endif - scheduler_tick(); run_posix_cpu_timers(p); } -- cgit v0.10.2 From 19c3ee0f6321fcbddf786b2cc3999e7834b17d83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 15:23:39 +0200 Subject: timers: Avoid the switch timers base set to NULL trick on RT On RT that code is preemptible, so we cannot assign NULL to timers base as a preempter would spin forever in lock_timer_base(). Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 828ed8f..5470c3e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,6 +717,36 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&old->lock); + spin_lock(&new->lock); + timer_set_base(timer, new); + return new; +} +#else +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* + * We cannot do the above because we might be preempted and + * then the preempter would see NULL and loop forever. + */ + if (spin_trylock(&new->lock)) { + timer_set_base(timer, new); + spin_unlock(&old->lock); + return new; + } + return old; +} +#endif + static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only, int pinned) @@ -755,14 +785,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } + if (likely(base->running_timer != timer)) + base = switch_timer_base(timer, base, new_base); } timer->expires = expires; -- cgit v0.10.2 From c08fefbd77c94c1ff135c96efdbbe0c15656ffd4 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:45 +0800 Subject: printk: Don't call printk_tick in printk_needs_cpu() on RT printk_tick() can't be called in atomic context when RT is enabled, otherwise below warning will show: [ 117.597095] BUG: sleeping function called from invalid context at kernel/rtmutex.c:645 [ 117.597102] in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: kworker/0:0 [ 117.597111] Pid: 0, comm: kworker/0:0 Not tainted 3.0.6-rt17-00284-gb76d419-dirty #7 [ 117.597116] Call Trace: [ 117.597131] [] ? printk+0x1d/0x24 [ 117.597142] [] __might_sleep+0xe6/0x110 [ 117.597151] [] rt_spin_lock+0x1c/0x30 [ 117.597158] [] __wake_up+0x26/0x60 [ 117.597166] [] printk_tick+0x3e/0x40 [ 117.597173] [] printk_needs_cpu+0x24/0x30 [ 117.597181] [] tick_nohz_stop_sched_tick+0x2e8/0x410 [ 117.597191] [] ? sched_clock_idle_wakeup_event+0x1a/0x20 [ 117.597201] [] cpu_idle+0x4a/0xb0 [ 117.597209] [] start_secondary+0xd3/0xd7 Now this is a really rare case and it's very unlikely that we starve an logbuf waiter that way. 
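The underlying rule, in hedged sketch form (names invented, they do not correspond to kernel symbols): code running with interrupts disabled on PREEMPT_RT may read and clear per-CPU state, but any wake_up() - which now ends up in a sleeping rt_spin_lock() - has to be left to a context that is allowed to sleep, here the threaded softirq.

/*
 * Illustrative sketch only: query/clear in atomic context, wake waiters
 * from a preemptible context.
 */
static DEFINE_PER_CPU(int, output_pending);
static DECLARE_WAIT_QUEUE_HEAD(output_wait);

static int output_needs_cpu(int cpu)    /* idle/tick path, irqs disabled */
{
        if (unlikely(cpu_is_offline(cpu)))
                __this_cpu_write(output_pending, 0);    /* no wake_up() here */
        return __this_cpu_read(output_pending);
}

static void output_tick(void)           /* timer softirq, may sleep on RT */
{
        if (__this_cpu_xchg(output_pending, 0))
                wake_up_interruptible(&output_wait);
}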
Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-4-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index 98aad49..df4b56b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2030,8 +2030,8 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { - if (cpu_is_offline(cpu)) - printk_tick(); + if (unlikely(cpu_is_offline(cpu))) + __this_cpu_write(printk_pending, 0); return __this_cpu_read(printk_pending); } -- cgit v0.10.2 From 0768d71c9682ab7ba855e79679291f702aa7b5c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: hrtimers: prepare full preemption Make cancellation of a running callback in softirq context safe against preemption. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5b9e789..ff5680c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -192,6 +192,9 @@ struct hrtimer_cpu_base { unsigned long nr_hangs; ktime_t max_hang_time; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + wait_queue_head_t wait; +#endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; @@ -385,6 +388,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_RT_BASE + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 05b3452..49a5596 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -845,6 +845,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(hrtimer_forward); +#ifdef CONFIG_PREEMPT_RT_BASE +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. 
+ */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -1095,7 +1121,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1510,6 +1536,8 @@ void hrtimer_run_queues(void) } raw_spin_unlock(&cpu_base->lock); } + + wake_up_timer_waiters(cpu_base); } /* @@ -1670,6 +1698,9 @@ static void __cpuinit init_hrtimers_cpu(int cpu) } hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_RT_BASE + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/itimer.c b/kernel/itimer.c index 8d262b4..d051390 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -213,6 +213,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4b7183c..1d5e435 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -773,6 +773,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } +/* + * Protected by RCU! + */ +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + if (kc->timer_set == common_timer_set) + hrtimer_wait_for_timer(&timr->it.real.timer); + else + /* FIXME: Whacky hack for posix-cpu-timers */ + schedule_timeout(1); +#endif +} + /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ static int @@ -850,6 +864,7 @@ retry: if (!timr) return -EINVAL; + rcu_read_lock(); kc = clockid_to_kclock(timr->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -858,9 +873,12 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + timer_wait_for_callback(kc, timr); rtn = NULL; // We already got the old time... 
+ rcu_read_unlock(); goto retry; } + rcu_read_unlock(); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) @@ -898,10 +916,15 @@ retry_delete: if (!timer) return -EINVAL; + rcu_read_lock(); if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } + rcu_read_unlock(); spin_lock(¤t->sighand->siglock); list_del(&timer->list); @@ -927,8 +950,18 @@ static void itimer_delete(struct k_itimer *timer) retry_delete: spin_lock_irqsave(&timer->it_lock, flags); + /* On RT we can race with a deletion */ + if (!timer->it_signal) { + unlock_timer(timer, flags); + return; + } + if (timer_delete_hook(timer) == TIMER_RETRY) { + rcu_read_lock(); unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } list_del(&timer->list); -- cgit v0.10.2 From 3bf9ce9ce086a43eea49b4ff67b8f0596d5285fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:31 -0500 Subject: hrtimer: fixup hrtimer callback changes for preempt-rt In preempt-rt we can not call the callbacks which take sleeping locks from the timer interrupt context. Bring back the softirq split for now, until we fixed the signal delivery problem for real. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index ff5680c..113bcf1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,8 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; + struct list_head cb_entry; + int irqsafe; #ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST ktime_t praecox; #endif @@ -150,6 +152,7 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; + struct list_head expired; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49a5596..f39d778 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -590,8 +590,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * reprogramming is handled at the end of the hrtimer_interrupt. */ if (hrtimer_callback_running(timer)) return 0; @@ -626,6 +625,9 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); +static int hrtimer_rt_defer(struct hrtimer *timer); + /* * Initialize the high resolution related parts of cpu_base */ @@ -642,9 +644,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. 
*/ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); + if (!(base->cpu_base->hres_active && hrtimer_reprogram(timer, base))) + return 0; + if (!wakeup) + return -ETIME; +#ifdef CONFIG_PREEMPT_RT_BASE + if (!hrtimer_rt_defer(timer)) + return -ETIME; +#endif + return 1; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -725,12 +736,18 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -862,9 +879,9 @@ void hrtimer_wait_for_timer(const struct hrtimer *timer) { struct hrtimer_clock_base *base = timer->base; - if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + if (base && base->cpu_base && !timer->irqsafe) wait_event(base->cpu_base->wait, - !(timer->state & HRTIMER_STATE_CALLBACK)); + !(timer->state & HRTIMER_STATE_CALLBACK)); } #else @@ -914,6 +931,11 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; + if (unlikely(!list_empty(&timer->cb_entry))) { + list_del_init(&timer->cb_entry); + goto out; + } + next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (&timer->node == next_timer) { @@ -1021,9 +1043,19 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { - if (wakeup) { + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) { + ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (ret < 0) { + /* + * In case we failed to reprogram the timer (mostly + * because out current timer is already elapsed), + * remove it again and report a failure. This avoids + * stale base->first entries. + */ + debug_deactivate(timer); + __remove_hrtimer(timer, new_base, + timer->state & HRTIMER_STATE_CALLBACK, 0); + } else if (ret > 0) { /* * We need to drop cpu_base->lock to avoid a * lock ordering issue vs. rq->lock. 
@@ -1031,9 +1063,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, raw_spin_unlock(&new_base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 0; } } @@ -1200,6 +1230,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; + INIT_LIST_HEAD(&timer->cb_entry); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1283,10 +1314,128 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); +#ifdef CONFIG_PREEMPT_RT_BASE +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * Note, we clear the callback flag before we requeue the + * timer otherwise we trigger the callback_running() check + * in hrtimer_reprogram(). + */ + timer->state &= ~HRTIMER_STATE_CALLBACK; + + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, if it's the leftmost timer then + * we need to reprogram it. + */ + if (!enqueue_hrtimer(timer, base)) + return; + +#ifndef CONFIG_HIGH_RES_TIMERS + } +#else + if (base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (&timer->node == base->active.next && + base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + } + return; + +requeue: + /* + * Timer is expired. Thus move it from tree to pending list + * again. + */ + __remove_hrtimer(timer, base, timer->state, 0); + list_add_tail(&timer->cb_entry, &base->expired); +#endif +} + +/* + * The changes in mainline which removed the callback modes from + * hrtimer are not yet working with -rt. The non wakeup_process() + * based callbacks which involve sleeping locks need to be treated + * seperately. + */ +static void hrtimer_rt_run_pending(void) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; + struct hrtimer *timer; + int index, restart; + + local_irq_disable(); + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + raw_spin_lock(&cpu_base->lock); + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + + while (!list_empty(&base->expired)) { + timer = list_first_entry(&base->expired, + struct hrtimer, cb_entry); + + /* + * Same as the above __run_hrtimer function + * just we run with interrupts enabled. 
+ */ + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + raw_spin_unlock_irq(&cpu_base->lock); + restart = fn(timer); + raw_spin_lock_irq(&cpu_base->lock); + + hrtimer_rt_reprogram(restart, timer, base); + } + } + + raw_spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); +} + +static int hrtimer_rt_defer(struct hrtimer *timer) +{ + if (timer->irqsafe) + return 0; + + __remove_hrtimer(timer, timer->base, timer->state, 0); + list_add_tail(&timer->cb_entry, &timer->base->expired); + return 1; +} + +#else + +static inline void hrtimer_rt_run_pending(void) +{ + hrtimer_peek_ahead_timers(); +} + +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } + +#endif + +#ifdef CONFIG_HIGH_RES_TIMERS + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1295,7 +1444,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; + int i, retries = 0, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1362,7 +1511,10 @@ retry: break; } - __run_hrtimer(timer, &basenow); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &basenow); + else + raise = 1; } } @@ -1377,6 +1529,10 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return; } @@ -1457,24 +1613,26 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + static void run_hrtimer_softirq(struct softirq_action *h) { +#ifdef CONFIG_HIGH_RES_TIMERS struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); if (cpu_base->clock_was_set) { cpu_base->clock_was_set = 0; clock_was_set(); } +#endif - hrtimer_peek_ahead_timers(); + hrtimer_rt_run_pending(); } -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ - /* * Called from timer softirq every jiffy, expire hrtimers: * @@ -1507,7 +1665,7 @@ void hrtimer_run_queues(void) struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - int index, gettime = 1; + int index, gettime = 1, raise = 0; if (hrtimer_hres_active()) return; @@ -1532,12 +1690,16 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer, &base->softirq_time); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &base->softirq_time); + else + raise = 1; } raw_spin_unlock(&cpu_base->lock); } - wake_up_timer_waiters(cpu_base); + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1559,6 +1721,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; + sl->timer.irqsafe = 1; sl->task = task; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1695,6 +1858,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); } 
hrtimer_init_hres(cpu_base); @@ -1813,9 +1977,7 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 643549c..3485672 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -489,6 +489,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4f02b28..46faf69 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -41,6 +41,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3365223..5fe688c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -867,6 +867,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 608d2fe..7bbc18a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -358,6 +358,7 @@ static void watchdog_enable(unsigned int cpu) /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer->irqsafe = 1; if (!watchdog_enabled) { kthread_park(current); -- cgit v0.10.2 From e7d610052e9dc976d0745fe696f4aed5dd9382a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jan 2012 11:08:40 +0100 Subject: timer-fd: Prevent live lock If hrtimer_try_to_cancel() requires a retry, then depending on the priority setting te retry loop might prevent timer callback completion on RT. Prevent that by waiting for completion on RT, no change for a non RT kernel. Reported-by: Sankara Muthukrishnan Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/fs/timerfd.c b/fs/timerfd.c index d03822b..522aeb8 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -311,7 +311,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) break; spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + hrtimer_wait_for_timer(&ctx->tmr); } /* -- cgit v0.10.2 From 88689333414d07054da9bb27f0cf5cf3344c92a9 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: posix-timers: thread posix-cpu-timers on -rt posix-cpu-timer code takes non -rt safe locks in hard irq context. Move it to a thread. 
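As a compact illustration of the queueing scheme this introduces (a per-CPU, self-terminated singly linked list of tasks that a dedicated kthread drains), here is a stand-alone user-space sketch. The list tail points to itself rather than NULL because a NULL pointer already means "not queued"; the walk mirrors the save-next/clear/compare logic of the patch below, but every type, helper and the main() driver are made up for illustration and are not kernel code.

    /* Illustrative sketch, not kernel code: self-terminated per-CPU task list. */
    #include <stdio.h>

    struct task {
        int pid;
        struct task *posix_timer_list;   /* NULL: not queued; points to self: list tail */
    };

    static struct task *tasklist;        /* stands in for per_cpu(posix_timer_tasklist) */

    static void queue_task(struct task *tsk)
    {
        if (tsk->posix_timer_list)       /* already queued */
            return;
        tsk->posix_timer_list = tasklist ? tasklist : tsk;  /* self-terminate if first */
        tasklist = tsk;
    }

    static void run_queued(void)         /* what the per-CPU kthread does with the list */
    {
        struct task *tsk = tasklist, *next;

        tasklist = NULL;                 /* grab the whole list in one go */
        while (tsk) {
            next = tsk->posix_timer_list;        /* save next before clearing */
            printf("expire timers for pid %d\n", tsk->pid);
            tsk->posix_timer_list = NULL;
            if (next == tsk)             /* self-pointer marks the last entry */
                break;
            tsk = next;
        }
    }

    int main(void)
    {
        struct task a = { .pid = 1 }, b = { .pid = 2 };

        queue_task(&a);
        queue_task(&b);
        run_queued();
        return 0;
    }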
[ 3.0 fixes from Peter Zijlstra ] Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6d087c5..10f32ab 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -141,6 +141,12 @@ extern struct task_group root_task_group; # define INIT_PERF_EVENTS(tsk) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define INIT_TIMER_LIST .posix_timer_list = NULL, +#else +# define INIT_TIMER_LIST +#endif + #define INIT_TASK_COMM "swapper" /* @@ -196,6 +202,7 @@ extern struct task_group root_task_group; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + INIT_TIMER_LIST \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index cb81412..4b4b06a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1378,6 +1378,9 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *posix_timer_list; +#endif /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task diff --git a/init/main.c b/init/main.c index cee4b5c..a6766db 100644 --- a/init/main.c +++ b/init/main.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 6b22be6..fae9ab2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1112,6 +1112,9 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { +#ifdef CONFIG_PREEMPT_RT_BASE + tsk->posix_timer_list = NULL; +#endif tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 942ca27..15c4821 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -661,7 +661,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -1177,7 +1177,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); arm_timer(timer); spin_unlock(&p->sighand->siglock); @@ -1241,10 +1241,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sig = tsk->signal; if (sig->cputimer.running) { struct task_cputime group_sample; + unsigned long flags; - raw_spin_lock(&sig->cputimer.lock); + raw_spin_lock_irqsave(&sig->cputimer.lock, flags); group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1258,13 +1259,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. 
*/ -void run_posix_cpu_timers(struct task_struct *tsk) +static void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1322,6 +1323,175 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#ifdef CONFIG_PREEMPT_RT_BASE +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task, cpu)); +} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posix_cpu_timers/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(posix_timer_task, cpu), + cpumask_any(cpu_online_mask)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *hcpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_possible_cpu(cpu) + per_cpu(posix_timer_tasklist, cpu) = NULL; + + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); +#else /* CONFIG_PREEMPT_RT_BASE */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + __run_posix_cpu_timers(tsk); +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. -- cgit v0.10.2 From 1f519cf1f8e8d93c787c62083e57f2bcfe08e8e0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: posix-timers: Shorten posix_cpu_timers/ kernel thread names Shorten the softirq kernel thread names because they always overflow the limited comm length, appearing as "posix_cpu_timer" CPU# times. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 15c4821..578189a 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1430,7 +1430,7 @@ static int posix_cpu_thread_call(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: p = kthread_create(posix_cpu_timers_thread, hcpu, - "posix_cpu_timers/%d",cpu); + "posixcputmr/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; -- cgit v0.10.2 From b62ddb1e986f08c8ee7b56fdeb8fca890efa32ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:44 -0500 Subject: posix-timers: Avoid wakeups when no timers are active Waking the thread even when no timers are scheduled is useless. 
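The patch below adds a __fastpath_timer_check() and only wakes the per-CPU worker when either the per-thread or the process-wide expiry set is armed. A minimal stand-alone sketch of that check-before-wake pattern follows; the structures and values are illustrative, not the kernel's task_cputime handling.

    /* Illustrative sketch: skip the wakeup when no expiry is armed. */
    #include <stdbool.h>
    #include <stdio.h>

    struct cputime_expiries { unsigned long long prof, virt, sched; };

    static bool expiries_zero(const struct cputime_expiries *e)
    {
        return !e->prof && !e->virt && !e->sched;
    }

    /* Mirrors the fast-path test: exiting tasks never need the worker,
     * otherwise wake it only if thread or process expiries are set. */
    static bool needs_wakeup(const struct cputime_expiries *thread_exp,
                             const struct cputime_expiries *process_exp,
                             bool exiting)
    {
        if (exiting)
            return false;
        return !expiries_zero(thread_exp) || !expiries_zero(process_exp);
    }

    int main(void)
    {
        struct cputime_expiries thread = { 0 }, process = { .prof = 100 };

        printf("wake worker: %d\n", needs_wakeup(&thread, &process, false)); /* 1 */
        process.prof = 0;
        printf("wake worker: %d\n", needs_wakeup(&thread, &process, false)); /* 0 */
        return 0;
    }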
Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 578189a..06692e8 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1387,6 +1387,21 @@ wait_to_die: return 0; } +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + void run_posix_cpu_timers(struct task_struct *tsk) { unsigned long cpu = smp_processor_id(); @@ -1399,7 +1414,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) tasklist = per_cpu(posix_timer_tasklist, cpu); /* check to see if we're already queued */ - if (!tsk->posix_timer_list) { + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { get_task_struct(tsk); if (tasklist) { tsk->posix_timer_list = tasklist; @@ -1411,9 +1426,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) tsk->posix_timer_list = tsk; } per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); } - /* XXX signal the thread somehow */ - wake_up_process(per_cpu(posix_timer_task, cpu)); } /* -- cgit v0.10.2 From 0f55d8d52915623f31ecbbef6cc84fd29bdd5fa5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 May 2011 16:59:16 +0200 Subject: sched-delay-put-task.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b4b06a..a7cfab5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1624,6 +1624,9 @@ struct task_struct { #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head put_rcu; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ @@ -1814,6 +1817,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->put_rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1821,6 +1833,7 @@ static inline void put_task_struct(struct task_struct *t) if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/kernel/fork.c b/kernel/fork.c index fae9ab2..c2e0120 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,9 @@ static inline void put_signal_struct(struct signal_struct *sig) if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } - +#ifdef CONFIG_PREEMPT_RT_BASE +static +#endif void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -244,7 +246,18 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +#ifndef CONFIG_PREEMPT_RT_BASE EXPORT_SYMBOL_GPL(__put_task_struct); +#else +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); + + __put_task_struct(tsk); + +} +EXPORT_SYMBOL_GPL(__put_task_struct_cb); +#endif void __init __weak arch_task_cache_init(void) { } -- cgit v0.10.2 From 0691325b833c54ba379e9ab9e2efee6e4242445e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:12:51 +0200 Subject: sched-limit-nr-migrate.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3485672..b589aa1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -272,7 +272,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured -- cgit v0.10.2 From 42c5570ed402bac32b3f9f8b70dea48b311f3504 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:20:33 +0200 Subject: sched-mmdrop-delayed.patch Needs thread context (pgd_lock) -> ifdeffed. 
workqueues wont work with RT Signed-off-by: Thomas Gleixner diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b53bdf..6270199 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -440,6 +441,9 @@ struct mm_struct { int first_nid; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head delayed_drop; +#endif }; /* first nid will either be a valid NID or one of these values */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a7cfab5..9d3432b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2355,12 +2355,24 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ diff --git a/kernel/fork.c b/kernel/fork.c index c2e0120..591d2bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -616,6 +616,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT_BASE +/* + * RCU callback for delayed mm drop. Not strictly rcu, but we don't + * want another facility to make this work. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + /* * Decrement the use count and release all resources for an mm. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b589aa1..f051fde 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1847,8 +1847,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -4835,6 +4839,8 @@ static int migration_cpu_stop(void *data) #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4847,7 +4853,12 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - mmdrop(mm); + + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. 
+ */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -5164,6 +5175,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: calc_load_migrate(rq); + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } break; #endif } -- cgit v0.10.2 From 57b370130892062eb0074a0e0dc3feee98f758d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jun 2011 09:21:04 +0200 Subject: sched-rt-mutex-wakeup.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 9d3432b..c5cb61d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1063,6 +1063,7 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x04 /* internal use, task got migrated */ +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 @@ -1239,6 +1240,7 @@ enum perf_event_task_context { struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + volatile long saved_state; /* saved state for "spinlock sleepers" */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ @@ -2251,6 +2253,7 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f051fde..1cd82b7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1438,8 +1438,25 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) + p->saved_state = TASK_RUNNING; + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1535,6 +1552,18 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. 
+ */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); -- cgit v0.10.2 From 199e15a475d952f3716681780d4deecd3b7b310a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 09:19:06 +0200 Subject: sched-might-sleep-do-not-account-rcu-depth.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 275aa3f1..d870976 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -182,6 +182,11 @@ void synchronize_rcu(void); * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) +#ifndef CONFIG_PREEMPT_RT_FULL +#define sched_rcu_preempt_depth() rcu_preempt_depth() +#else +static inline int sched_rcu_preempt_depth(void) { return 0; } +#endif #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -205,6 +210,8 @@ static inline int rcu_preempt_depth(void) return 0; } +#define sched_rcu_preempt_depth() rcu_preempt_depth() + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1cd82b7..f5bf3e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7060,7 +7060,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v0.10.2 From 43d0233353a056234a5554ea4543d8ba97ee9614 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 11:25:03 +0200 Subject: sched-cond-resched.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5bf3e8..aba0f8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4317,9 +4317,17 @@ static inline int should_resched(void) static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + do { + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. 
+ */ + barrier(); + + } while (need_resched()); } int __sched _cond_resched(void) -- cgit v0.10.2 From e014ae12ba339725a5f2177d3bf09a9f6e0a3c07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jul 2011 09:56:44 +0200 Subject: cond-resched-softirq-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index c5cb61d..180bcf7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2731,12 +2731,16 @@ extern int __cond_resched_lock(spinlock_t *lock); __cond_resched_lock(lock); \ }) +#ifndef CONFIG_PREEMPT_RT_FULL extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +#else +# define cond_resched_softirq() cond_resched() +#endif /* * Does a critical section need to be broken due to another diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aba0f8b..bebd66f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4368,6 +4368,7 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -4381,6 +4382,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. -- cgit v0.10.2 From 62e8751c45f0d8d15e5cee683d995a1ce1666a08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:51:33 +0200 Subject: cond-resched-lock-rt-tweak.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 180bcf7..302687b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2720,7 +2720,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT_COUNT +#if defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT_FULL) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 -- cgit v0.10.2 From 21d3cc14b519fae9a4f17d73630baa7a8611063b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Sep 2011 16:42:35 +0200 Subject: sched-disable-ttwu-queue.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b..e5de4db 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -57,11 +57,15 @@ SCHED_FEAT(OWNER_SPIN, true) */ SCHED_FEAT(NONTASK_POWER, true) +#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#else +SCHED_FEAT(TTWU_QUEUE, false) +#endif SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) -- cgit v0.10.2 From 9560fb5ae185db9163db09a49f2b45cb501ce190 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:03:52 +0200 Subject: sched: Disable CONFIG_RT_GROUP_SCHED on RT Carsten reported problems when running: taskset 01 chrt -f 1 sleep 1 from within rc.local on a F15 machine. The task stays running and never gets on the run queue because some of the run queues have rt_throttled=1 which does not go away. Works nice from a ssh login shell. Disabling CONFIG_RT_GROUP_SCHED solves that as well. 
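For reference, the reported trigger amounts to pinning a SCHED_FIFO task to a single CPU and letting it sleep. A rough stand-alone C equivalent of "taskset 01 chrt -f 1 sleep 1" is sketched below; it is illustrative only and needs the usual privileges for setting a real-time policy.

    /* Illustrative reproducer: pin to CPU 0, go SCHED_FIFO prio 1, sleep 1s. */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        cpu_set_t set;
        struct sched_param sp = { .sched_priority = 1 };

        CPU_ZERO(&set);
        CPU_SET(0, &set);                               /* taskset 01 */
        if (sched_setaffinity(0, sizeof(set), &set))
            perror("sched_setaffinity");

        if (sched_setscheduler(0, SCHED_FIFO, &sp))     /* chrt -f 1 */
            perror("sched_setscheduler");

        sleep(1);                                       /* sleep 1 */
        return 0;
    }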
Signed-off-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index fbc0785..f6a406d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -946,6 +946,7 @@ config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on EXPERIMENTAL depends on CGROUP_SCHED + depends on !PREEMPT_RT_FULL default n help This feature lets you explicitly allocate real CPU bandwidth -- cgit v0.10.2 From 0b812ad982efccb406e4ddd87718195e1255b16a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Dec 2011 21:42:19 +0100 Subject: sched: ttwu: Return success when only changing the saved_state value When a task blocks on a rt lock, it saves the current state in p->saved_state, so a lock related wake up will not destroy the original state. When a real wakeup happens, while the task is running due to a lock wakeup already, we update p->saved_state to TASK_RUNNING, but we do not return success, which might cause another wakeup in the waitqueue code and the task remains in the waitqueue list. Return success in that case as well. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bebd66f..7328dc9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1445,8 +1445,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * if the wakeup condition is true. */ if (!(wake_flags & WF_LOCK_SLEEPER)) { - if (p->saved_state & state) + if (p->saved_state & state) { p->saved_state = TASK_RUNNING; + success = 1; + } } goto out; } -- cgit v0.10.2 From d5e058bf35dfdf1c6f612433f817266568ccb37c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:27 -0500 Subject: stop_machine: convert stop_machine_run() to PREEMPT_RT Instead of playing with non-preemption, introduce explicit startup serialization. This is more robust and cleaner as well. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e9..61779f8 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -135,6 +135,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); +static DEFINE_MUTEX(stopper_lock); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, @@ -153,15 +154,14 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, } /* - * Disable preemption while queueing to avoid getting - * preempted by a stopper which might wait for other stoppers - * to enter @fn which can lead to deadlock. + * Make sure that all work is queued on all cpus before we + * any of the cpus can execute it. 
*/ - preempt_disable(); + mutex_lock(&stopper_lock); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); - preempt_enable(); + mutex_unlock(&stopper_lock); } static int __stop_cpus(const struct cpumask *cpumask, @@ -275,6 +275,16 @@ repeat: __set_current_state(TASK_RUNNING); + /* + * Wait until the stopper finished scheduling on all + * cpus + */ + mutex_lock(&stopper_lock); + /* + * Let other cpu threads continue as well + */ + mutex_unlock(&stopper_lock); + /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); -- cgit v0.10.2 From db1e28acdedefa12bc8fdab194e7a63c53f35e2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:53:19 +0200 Subject: stomp-machine-mark-stomper-thread.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 302687b..9530df4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1860,6 +1860,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_STOMPER 0x00080000 /* I am a stomp machine thread */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 61779f8..484a335 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -327,6 +327,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); + p->flags |= PF_STOMPER; kthread_bind(p, cpu); sched_set_stop_task(cpu, p); stopper->thread = p; -- cgit v0.10.2 From e4eedbcb4dd8354081e3cefed9132c565a556b08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 11:01:51 +0200 Subject: stomp-machine-raw-lock.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 484a335..561ba3a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -29,12 +29,12 @@ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ bool executed; /* actually executed? */ int ret; /* collected return value */ - struct completion completion; /* fired if nr_todo reaches 0 */ + struct task_struct *waiter; /* woken when nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? 
*/ struct list_head works; /* list of pending works */ struct task_struct *thread; /* stopper thread */ @@ -47,7 +47,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); - init_completion(&done->completion); + done->waiter = current; } /* signal completion unless @done is NULL */ @@ -56,8 +56,10 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) if (done) { if (executed) done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); + if (atomic_dec_and_test(&done->nr_todo)) { + wake_up_process(done->waiter); + done->waiter = NULL; + } } } @@ -67,7 +69,7 @@ static void cpu_stop_queue_work(struct cpu_stopper *stopper, { unsigned long flags; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); @@ -75,7 +77,23 @@ static void cpu_stop_queue_work(struct cpu_stopper *stopper, } else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); +} + +static void wait_for_stop_done(struct cpu_stop_done *done) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (atomic_read(&done->nr_todo)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + /* + * We need to wait until cpu_stop_signal_done() has cleared + * done->waiter. + */ + while (done->waiter) + cpu_relax(); + set_current_state(TASK_RUNNING); } /** @@ -109,7 +127,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -171,7 +189,7 @@ static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_init_done(&done, cpumask_weight(cpumask)); queue_stop_cpus_work(cpumask, fn, arg, &done); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -259,13 +277,13 @@ repeat: } work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -299,7 +317,13 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); + /* + * Make sure that the wakeup and setting done->waiter + * to NULL is atomic. 
+ */ + local_irq_disable(); cpu_stop_signal_done(done, true); + local_irq_enable(); } else schedule(); @@ -337,9 +361,9 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); stopper->enabled = true; - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); break; #ifdef CONFIG_HOTPLUG_CPU @@ -352,11 +376,11 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); list_for_each_entry(work, &stopper->works, list) cpu_stop_signal_done(work->done, false); stopper->enabled = false; - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); /* release the stopper */ put_task_struct(stopper->thread); stopper->thread = NULL; @@ -387,7 +411,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } @@ -581,7 +605,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, ret = stop_machine_cpu_stop(&smdata); /* Busy wait for completion. */ - while (!completion_done(&done.completion)) + while (atomic_read(&done.nr_todo)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); -- cgit v0.10.2 From 1faf28a75df0943f5ed9a87bcb26d0a106786147 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 12:36:06 +0200 Subject: hotplug: Lightweight get online cpus get_online_cpus() is a heavy weight function which involves a global mutex. migrate_disable() wants a simpler construct which prevents only a CPU from going doing while a task is in a migrate disabled section. Implement a per cpu lockless mechanism, which serializes only in the real unplug case on a global mutex. That serialization affects only tasks on the cpu which should be brought down. Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ce7a074..7781c9e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -175,6 +175,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void pin_current_cpu(void); +extern void unpin_current_cpu(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -198,6 +200,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +static inline void pin_current_cpu(void) { } +static inline void unpin_current_cpu(void) { } #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. 
*/ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a50..05e0381 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,6 +63,102 @@ static struct { .refcount = 0, }; +struct hotplug_pcp { + struct task_struct *unplug; + int refcount; + struct completion synced; +}; + +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); + +/** + * pin_current_cpu - Prevent the current cpu from being unplugged + * + * Lightweight version of get_online_cpus() to prevent cpu from being + * unplugged when code runs in a migration disabled region. + * + * Must be called with preemption disabled (preempt_count = 1)! + */ +void pin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + +retry: + if (!hp->unplug || hp->refcount || preempt_count() > 1 || + hp->unplug == current || (current->flags & PF_STOMPER)) { + hp->refcount++; + return; + } + preempt_enable(); + mutex_lock(&cpu_hotplug.lock); + mutex_unlock(&cpu_hotplug.lock); + preempt_disable(); + goto retry; +} + +/** + * unpin_current_cpu - Allow unplug of current cpu + * + * Must be called with preemption or interrupts disabled! + */ +void unpin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + + WARN_ON(hp->refcount <= 0); + + /* This is safe. sync_unplug_thread is pinned to this cpu */ + if (!--hp->refcount && hp->unplug && hp->unplug != current && + !(current->flags & PF_STOMPER)) + wake_up_process(hp->unplug); +} + +/* + * FIXME: Is this really correct under all circumstances ? + */ +static int sync_unplug_thread(void *data) +{ + struct hotplug_pcp *hp = data; + + preempt_disable(); + hp->unplug = current; + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + preempt_enable(); + complete(&hp->synced); + return 0; +} + +/* + * Start the sync_unplug_thread on the target cpu and wait for it to + * complete. + */ +static int cpu_unplug_begin(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + struct task_struct *tsk; + + init_completion(&hp->synced); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + if (IS_ERR(tsk)) + return (PTR_ERR(tsk)); + kthread_bind(tsk, cpu); + wake_up_process(tsk); + wait_for_completion(&hp->synced); + return 0; +} + +static void cpu_unplug_done(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + hp->unplug = NULL; +} + void get_online_cpus(void) { might_sleep(); @@ -260,13 +356,14 @@ static int __ref take_cpu_down(void *_param) /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err, nr_calls = 0; + int mycpu, err, nr_calls = 0; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { .mod = mod, .hcpu = hcpu, }; + cpumask_var_t cpumask; if (num_online_cpus() == 1) return -EBUSY; @@ -274,7 +371,20 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - cpu_hotplug_begin(); + /* Move the downtaker off the unplug cpu */ + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); + set_cpus_allowed_ptr(current, cpumask); + free_cpumask_var(cpumask); + preempt_disable(); + mycpu = smp_processor_id(); + if (mycpu == cpu) { + printk(KERN_ERR "Yuck! 
Still on unplug CPU\n!"); + preempt_enable(); + return -EBUSY; + } + preempt_enable(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -282,7 +392,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __func__, cpu); - goto out_release; + goto out_cancel; + } + + cpu_hotplug_begin(); + err = cpu_unplug_begin(cpu); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + printk("cpu_unplug_begin(%d) failed\n", cpu); + goto out_cancel; } smpboot_park_threads(cpu); @@ -314,6 +433,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_release: + cpu_unplug_done(cpu); +out_cancel: cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v0.10.2 From e05ccd629425ac28b605f24e958494b14ad0359d Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:43 +0800 Subject: hotplug: sync_unplug: No "\n" in task name Otherwise the output will look a little odd. Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 05e0381..b06757c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -143,7 +143,7 @@ static int cpu_unplug_begin(unsigned int cpu) struct task_struct *tsk; init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); if (IS_ERR(tsk)) return (PTR_ERR(tsk)); kthread_bind(tsk, cpu); -- cgit v0.10.2 From 5e7cf3d622d2e908a6cd13d33ecf3e053bd2c816 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 28 Jul 2011 11:16:00 +0800 Subject: hotplug: Reread hotplug_pcp on pin_current_cpu() retry When retry happens, it's likely that the task has been migrated to another cpu (except unplug failed), but it still derefernces the original hotplug_pcp per cpu data. Update the pointer to hotplug_pcp in the retry path, so it points to the current cpu. 
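The underlying pattern is generic: any pointer into per-CPU data that was fetched before a blocking retry is stale once the task has migrated, so it has to be re-read at the top of the retry loop. The stand-alone sketch below simulates that with ordinary arrays and made-up names; it is not the kernel implementation.

    /* Illustrative sketch: re-read per-CPU state on every retry. */
    #include <stdio.h>

    #define NR_CPUS 2

    struct hotplug_state { int unplug_pending; int refcount; };

    static struct hotplug_state hotplug_state[NR_CPUS];
    static int current_cpu;                      /* stands in for smp_processor_id() */

    static struct hotplug_state *this_cpu_state(void)
    {
        return &hotplug_state[current_cpu];
    }

    static void block_until_unplug_done(void)
    {
        /* Simulate "we slept and woke up on another CPU". */
        current_cpu = (current_cpu + 1) % NR_CPUS;
    }

    static void pin_cpu_sketch(void)
    {
        struct hotplug_state *hp;

    retry:
        hp = this_cpu_state();                   /* re-read: we may have migrated */
        if (hp->unplug_pending) {
            block_until_unplug_done();           /* sleeping here can move us */
            goto retry;                          /* the old hp would be the wrong CPU's */
        }
        hp->refcount++;                          /* safe: hp matches the CPU we run on */
    }

    int main(void)
    {
        hotplug_state[0].unplug_pending = 1;     /* CPU 0 is being unplugged */
        pin_cpu_sketch();
        printf("pinned on cpu %d, refcount %d\n",
               current_cpu, this_cpu_state()->refcount);
        return 0;
    }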
Signed-off-by: Yong Zhang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20110728031600.GA338@windriver.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index b06757c..3231d1c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -81,9 +81,11 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); */ void pin_current_cpu(void) { - struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + struct hotplug_pcp *hp; retry: + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->unplug || hp->refcount || preempt_count() > 1 || hp->unplug == current || (current->flags & PF_STOMPER)) { hp->refcount++; -- cgit v0.10.2 From ce68ff4bcd3bca2c86d6dfd868261df6702272fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jun 2011 13:26:08 +0200 Subject: sched-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index de0c8af..9cde8c3 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -114,6 +114,14 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef CONFIG_SMP +extern void migrate_disable(void); +extern void migrate_enable(void); +#else +# define migrate_disable() barrier() +# define migrate_enable() barrier() +#endif + #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() diff --git a/include/linux/sched.h b/include/linux/sched.h index 9530df4..63f34f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1280,6 +1280,7 @@ struct task_struct { #endif unsigned int policy; + int migrate_disable; int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -1631,9 +1632,6 @@ struct task_struct { #endif }; -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) - #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int node, int pages, bool migrated); extern void set_numabalancing_state(bool enabled); @@ -2813,6 +2811,15 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) +{ + if (p->migrate_disable) + return cpumask_of(task_cpu(p)); + + return &p->cpus_allowed; +} + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7328dc9..feef295 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4736,11 +4736,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - + if (!p->migrate_disable) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -4791,7 +4792,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4810,6 +4811,83 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +void migrate_disable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + if (unlikely(!scheduler_running)) { + p->migrate_disable = 1; + preempt_enable(); + return; + } + rq = task_rq_lock(p, &flags); + p->migrate_disable = 1; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + task_rq_unlock(rq, p, &flags); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (p->migrate_disable > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(!scheduler_running)) { + p->migrate_disable = 0; + unpin_current_cpu(); + preempt_enable(); + return; + } + + rq = task_rq_lock(p, &flags); + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + + task_rq_unlock(rq, p, &flags); + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); + /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e5..0a846e7 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -39,9 +39,9 @@ notrace unsigned int debug_smp_processor_id(void) if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] " - "code: %s/%d\n", - preempt_count() - 1, current->comm, current->pid); + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x %08x] " + "code: %s/%d\n", preempt_count() - 1, + current->migrate_disable, current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); -- cgit v0.10.2 From 01cc374c49ada0b0383b049e0111ed479b1afa70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:35:29 +0200 Subject: hotplug-use-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 3231d1c..3a931bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -379,14 +379,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); set_cpus_allowed_ptr(current, cpumask); free_cpumask_var(cpumask); - preempt_disable(); + migrate_disable(); mycpu = smp_processor_id(); if (mycpu == cpu) { printk(KERN_ERR "Yuck! 
Still on unplug CPU\n!"); - preempt_enable(); + migrate_enable(); return -EBUSY; } - preempt_enable(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -437,6 +436,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_unplug_done(cpu); out_cancel: + migrate_enable(); cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v0.10.2 From 7da58acedd5a4efa3a91993a79bc87b78f72aea2 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:44 +0800 Subject: hotplug: Call cpu_unplug_begin() before DOWN_PREPARE cpu_unplug_begin() should be called before CPU_DOWN_PREPARE, because at CPU_DOWN_PREPARE cpu_active is cleared and sched_domain is rebuilt. Otherwise the 'sync_unplug' thread will be running on the cpu on which it's created and not bound on the cpu which is about to go down. I found that by an incorrect warning on smp_processor_id() called by sync_unplug/1, and trace shows below: (echo 1 > /sys/device/system/cpu/cpu1/online) bash-1664 [000] 83.136620: _cpu_down: Bind sync_unplug to cpu 1 bash-1664 [000] 83.136623: sched_wait_task: comm=sync_unplug/1 pid=1724 prio=120 bash-1664 [000] 83.136624: _cpu_down: Wake sync_unplug bash-1664 [000] 83.136629: sched_wakeup: comm=sync_unplug/1 pid=1724 prio=120 success=1 target_cpu=000 Wants to be folded back.... Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-3-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 3a931bb..7b5f4cb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -387,22 +387,20 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EBUSY; } - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + cpu_hotplug_begin(); + err = cpu_unplug_begin(cpu); if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + printk("cpu_unplug_begin(%d) failed\n", cpu); goto out_cancel; } - cpu_hotplug_begin(); - err = cpu_unplug_begin(cpu); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("cpu_unplug_begin(%d) failed\n", cpu); - goto out_cancel; + printk("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + goto out_release; } smpboot_park_threads(cpu); -- cgit v0.10.2 From 9e6c41f1d2496551143e78bf5dff1446364812b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:56:42 +0200 Subject: ftrace-migrate-disable-tracing.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index a3d4895..806c02b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -49,7 +49,8 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int padding; + unsigned short migrate_disable; + unsigned short padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fe1d581..3240f24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1178,6 +1178,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + + entry->migrate_disable = (tsk) ? 
tsk->migrate_disable & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -2035,9 +2037,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| / _--=> migrate-disable\n"); + seq_puts(m, "# ||||| / delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_array *tr, struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d..dbd6ae1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,6 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned short, migrate_disable); __common_field(int, padding); return ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d796..e36737f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -593,6 +593,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) + ret = trace_seq_printf(s, "%x", entry->migrate_disable); + else + ret = trace_seq_putc(s, '.'); + return ret; } -- cgit v0.10.2 From 713d6754b0267ecdd2eb4d0a182f91671671d4cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Nov 2011 13:19:35 -0500 Subject: tracing: Show padding as unsigned short RT added two bytes to trace migrate disable counting to the trace events and used two bytes of the padding to make the change. The structures and all were updated correctly, but the display in the event formats was not: cat /debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 51 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned short common_migrate_disable; offset:8; size:2; signed:0; field:int common_padding; offset:10; size:2; signed:0; The field for common_padding has the correct size and offset, but the use of "int" might confuse some parsers (and people that are reading it). This needs to be changed to "unsigned short". 
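For anyone decoding the binary ring buffer by hand, the corrected common header maps onto a plain C struct. The sketch below is illustrative only (the struct and its name are not part of the patch); offsets and sizes are taken from the format dump above and assume a typical ABI:

struct common_trace_header {			/* hypothetical name */
	unsigned short	common_type;		/* offset 0,  size 2 */
	unsigned char	common_flags;		/* offset 2,  size 1 */
	unsigned char	common_preempt_count;	/* offset 3,  size 1 */
	int		common_pid;		/* offset 4,  size 4 */
	unsigned short	common_migrate_disable;	/* offset 8,  size 2 */
	unsigned short	common_padding;		/* offset 10, size 2 */
};

Declaring the padding as "int" while reporting size 2 is exactly what trips a parser that derives the field width from the declared type instead of the size attribute.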
Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/1321467575.4181.36.camel@frodo Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index dbd6ae1..a45a22d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -117,7 +117,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, preempt_count); __common_field(int, pid); __common_field(unsigned short, migrate_disable); - __common_field(int, padding); + __common_field(unsigned short, padding); return ret; } -- cgit v0.10.2 From c1f36eb11c8c51114917514e687485034922c1b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:48:20 +0200 Subject: migrate-disable-rt-variant.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 9cde8c3..6446e22 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -127,11 +127,15 @@ extern void migrate_enable(void); # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() barrier() # define preempt_enable_nort() barrier() +# define migrate_disable_rt() migrate_disable() +# define migrate_enable_rt() migrate_enable() #else # define preempt_disable_rt() barrier() # define preempt_enable_rt() barrier() # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() +# define migrate_disable_rt() barrier() +# define migrate_enable_rt() barrier() #endif #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v0.10.2 From 343961d5fcf9dec48be46bf1939998e60d04a29c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:03:35 +0200 Subject: sched: Optimize migrate_disable Change from task_rq_lock() to raw_spin_lock(&rq->lock) to avoid a few atomic ops. See comment on why it should be safe. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-cbz6hkl5r5mvwtx5s3tor2y6@git.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feef295..4d57761 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4831,7 +4831,19 @@ void migrate_disable(void) preempt_enable(); return; } - rq = task_rq_lock(p, &flags); + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * it current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Taking rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 1; mask = tsk_cpus_allowed(p); @@ -4842,7 +4854,7 @@ void migrate_disable(void) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); preempt_enable(); } EXPORT_SYMBOL(migrate_disable); @@ -4870,7 +4882,11 @@ void migrate_enable(void) return; } - rq = task_rq_lock(p, &flags); + /* + * See comment in migrate_disable(). 
+ */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 0; mask = tsk_cpus_allowed(p); @@ -4882,7 +4898,7 @@ void migrate_enable(void) p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); unpin_current_cpu(); preempt_enable(); } -- cgit v0.10.2 From 55aaf7e2796eb45139d472c2377aa2969754b8a7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:14:58 +0200 Subject: sched: Generic migrate_disable Make migrate_disable() be a preempt_disable() for !rt kernels. This allows generic code to use it but still enforces that these code sections stay relatively small. A preemptible migrate_disable() accessible for general use would allow people growing arbitrary per-cpu crap instead of clean these things up. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-275i87sl8e1jcamtchmehonm@git.kernel.org diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 6446e22..c28f5d6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -114,28 +114,25 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ -#ifdef CONFIG_SMP -extern void migrate_disable(void); -extern void migrate_enable(void); -#else -# define migrate_disable() barrier() -# define migrate_enable() barrier() -#endif - #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() barrier() # define preempt_enable_nort() barrier() -# define migrate_disable_rt() migrate_disable() -# define migrate_enable_rt() migrate_enable() +# ifdef CONFIG_SMP + extern void migrate_disable(void); + extern void migrate_enable(void); +# else /* CONFIG_SMP */ +# define migrate_disable() barrier() +# define migrate_enable() barrier() +# endif /* CONFIG_SMP */ #else # define preempt_disable_rt() barrier() # define preempt_enable_rt() barrier() # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() -# define migrate_disable_rt() barrier() -# define migrate_enable_rt() barrier() +# define migrate_disable() preempt_disable() +# define migrate_enable() preempt_enable() #endif #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/sched.h b/include/linux/sched.h index 63f34f4..dbe0ffe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1280,7 +1280,9 @@ struct task_struct { #endif unsigned int policy; +#ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +#endif int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -2811,11 +2813,22 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline int __migrate_disabled(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + return p->migrate_disable; +#else + return 0; +#endif +} + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RT_FULL if (p->migrate_disable) return cpumask_of(task_cpu(p)); +#endif return &p->cpus_allowed; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 8624321..9e3255b 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -218,13 +218,8 @@ static inline void kick_all_cpus_sync(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#ifndef CONFIG_PREEMPT_RT_FULL -# define get_cpu_light() get_cpu() -# define put_cpu_light() put_cpu() -#else -# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) -# define put_cpu_light() migrate_enable() -#endif +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +#define put_cpu_light() migrate_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d57761..ae90b25 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4736,7 +4736,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!p->migrate_disable) { + if (!__migrate_disabled(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4792,7 +4792,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4811,6 +4811,7 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +#ifdef CONFIG_PREEMPT_RT_FULL void migrate_disable(void) { struct task_struct *p = current; @@ -4903,6 +4904,7 @@ void migrate_enable(void) preempt_enable(); } EXPORT_SYMBOL(migrate_enable); +#endif /* CONFIG_PREEMPT_RT_FULL */ /* * Move (not current) task off this cpu, onto dest cpu. We're doing diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3240f24..298e5ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,7 +1179,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); - entry->migrate_disable = (tsk) ? tsk->migrate_disable & 0xFF : 0; + entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 0a846e7..dbb1570 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -41,7 +41,7 @@ notrace unsigned int debug_smp_processor_id(void) printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x %08x] " "code: %s/%d\n", preempt_count() - 1, - current->migrate_disable, current->comm, current->pid); + __migrate_disabled(current), current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); -- cgit v0.10.2 From bdb79e831ee9e66dabfecddcf9760049119c1b90 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 23 Aug 2011 16:12:43 +0200 Subject: sched, rt: Fix migrate_enable() thinko Assigning mask = tsk_cpus_allowed(p) after p->migrate_disable = 0 ensures that we won't see a mask change.. no push/pull, we stack tasks on one CPU. Also add a couple fields to sched_debug for the next guy. [ Build fix from Stratos Psomadakis ] Signed-off-by: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1314108763.6689.4.camel@marge.simson.net Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae90b25..3e2159d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4888,12 +4888,14 @@ void migrate_enable(void) */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 0; mask = tsk_cpus_allowed(p); + p->migrate_disable = 0; WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); if (!cpumask_equal(&p->cpus_allowed, mask)) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c..2cac500 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -253,6 +253,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) P(rt_throttled); PN(rt_time); PN(rt_runtime); +#ifdef CONFIG_SMP + P(rt_nr_migratory); +#endif #undef PN #undef P @@ -507,6 +510,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); P(policy); P(prio); +#ifdef CONFIG_PREEMPT_RT_FULL + P(migrate_disable); +#endif + P(nr_cpus_allowed); #undef PN #undef __PN #undef P -- cgit v0.10.2 From 3ff0e900d0126bc2300552dde5147970967e315a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:27 +0200 Subject: sched: teach migrate_disable about atomic contexts [] spin_bug+0x94/0xa8 [] do_raw_spin_lock+0x43/0xea [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? migrate_disable+0x75/0x12d [] ? pin_current_cpu+0x36/0xb0 [] migrate_disable+0x75/0x12d [] pagefault_disable+0xe/0x1f [] copy_from_user_nmi+0x74/0xe6 [] perf_callchain_user+0xf3/0x135 Now clearly we can't go around taking locks from NMI context, cure this by short-circuiting migrate_disable() when we're in an atomic context already. 
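A minimal sketch of the cure; this illustrates the idea only, the exact hunks applied below also keep a debug counter so unbalanced calls can be warned about:

#include <linux/hardirq.h>	/* in_atomic() */

void migrate_disable(void)
{
	/*
	 * An atomic context (NMI, hardirq, preempt-disabled region)
	 * cannot be migrated anyway, so bail out before any CPU
	 * pinning or rq->lock acquisition is attempted.
	 */
	if (in_atomic())
		return;

	/* ... normal path: pin_current_cpu(), count, update affinity ... */
}

migrate_enable() gets the symmetric guard so the pair stays balanced when both calls happen in atomic context.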
Add some extra debugging to avoid things like: preempt_disable() migrate_disable(); preempt_enable(); migrate_enable(); Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967297.1301.14.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-wbot4vsmwhi8vmbf83hsclk6@git.kernel.org diff --git a/include/linux/sched.h b/include/linux/sched.h index dbe0ffe..5639b15 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1282,6 +1282,9 @@ struct task_struct { unsigned int policy; #ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +# ifdef CONFIG_SCHED_DEBUG + int migrate_disable_atomic; +# endif #endif int nr_cpus_allowed; cpumask_t cpus_allowed; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e2159d..97a6a49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4819,6 +4819,17 @@ void migrate_disable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + preempt_disable(); if (p->migrate_disable) { p->migrate_disable++; @@ -4867,6 +4878,16 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif WARN_ON_ONCE(p->migrate_disable <= 0); preempt_disable(); -- cgit v0.10.2 From cf8cc907e5d711e8cc94c2cc7b62ca985d7225b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Sep 2011 08:40:23 -0400 Subject: sched: Postpone actual migration disalbe to schedule The migrate_disable() can cause a bit of a overhead to the RT kernel, as changing the affinity is expensive to do at every lock encountered. As a running task can not migrate, the actual disabling of migration does not need to occur until the task is about to schedule out. In most cases, a task that disables migration will enable it before it schedules making this change improve performance tremendously. [ Frank Rowand: UP compile fix ] Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124422.779693167@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97a6a49..ba35921 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2850,6 +2850,135 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) + +static inline void update_migrate_disable(struct task_struct *p) +{ + const struct cpumask *mask; + + if (likely(!p->migrate_disable)) + return; + + /* Did we already update affinity? */ + if (unlikely(migrate_disabled_updated(p))) + return; + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * is current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. 
+ * + * Having rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; + } +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (migrate_disable_count(p) > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(migrate_disabled_updated(p))) { + /* + * See comment in update_migrate_disable() about locking. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); + mask = tsk_cpus_allowed(p); + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. + */ + p->migrate_disable = 0; + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else + p->migrate_disable = 0; + + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); +#else +static inline void update_migrate_disable(struct task_struct *p) { } +#define migrate_disabled_updated(p) 0 +#endif + static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) @@ -2943,6 +3072,8 @@ need_resched: raw_spin_lock_irq(&rq->lock); + update_migrate_disable(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -4736,7 +4867,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!__migrate_disabled(p)) { + if (!migrate_disabled_updated(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4811,124 +4942,6 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -#ifdef CONFIG_PREEMPT_RT_FULL -void migrate_disable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - 
WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - - preempt_disable(); - if (p->migrate_disable) { - p->migrate_disable++; - preempt_enable(); - return; - } - - pin_current_cpu(); - if (unlikely(!scheduler_running)) { - p->migrate_disable = 1; - preempt_enable(); - return; - } - - /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * it current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. - * - * Taking rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 1; - mask = tsk_cpus_allowed(p); - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_disable); - -void migrate_enable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - WARN_ON_ONCE(p->migrate_disable <= 0); - - preempt_disable(); - if (p->migrate_disable > 1) { - p->migrate_disable--; - preempt_enable(); - return; - } - - if (unlikely(!scheduler_running)) { - p->migrate_disable = 0; - unpin_current_cpu(); - preempt_enable(); - return; - } - - /* - * See comment in migrate_disable(). - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); - p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - - raw_spin_unlock_irqrestore(&rq->lock, flags); - unpin_current_cpu(); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_enable); -#endif /* CONFIG_PREEMPT_RT_FULL */ - /* * Move (not current) task off this cpu, onto dest cpu. 
We're doing * this because either it can't run here any more (set_cpus_allowed() -- cgit v0.10.2 From aee204d087878493d154cc7944095861bc8d6117 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:24 -0400 Subject: sched: Do not compare cpu masks in scheduler Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.128129033@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba35921..151afb8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2878,16 +2878,12 @@ static inline void update_migrate_disable(struct task_struct *p) */ mask = tsk_cpus_allowed(p); - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; - } + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; } void migrate_disable(void) -- cgit v0.10.2 From a2aed26ab9ecc7f7b8dd199791cd8e21441c8426 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:25 -0400 Subject: sched: Have migrate_disable ignore bounded threads Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.567944215@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 151afb8..9efa58e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2890,7 +2890,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_THREAD_BOUND) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2921,7 +2921,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_THREAD_BOUND) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -2942,26 +2942,21 @@ void migrate_enable(void) if (unlikely(migrate_disabled_updated(p))) { /* - * See comment in update_migrate_disable() about locking. + * Undo whatever update_migrate_disable() did, also see there + * about locking. */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); + /* * Clearing migrate_disable causes tsk_cpus_allowed to * show the tasks original cpu affinity. 
*/ p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); raw_spin_unlock_irqrestore(&rq->lock, flags); } else p->migrate_disable = 0; -- cgit v0.10.2 From 6810eb7aad751a8f34bf31e37a3e11bb1613fbcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Nov 2011 20:48:36 +0100 Subject: sched-clear-pf-thread-bound-on-fallback-rq.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9efa58e..070f5e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1192,6 +1192,12 @@ out: } } + /* + * Clear PF_THREAD_BOUND, otherwise we wreckage + * migrate_disable/enable. See optimization for + * PF_THREAD_BOUND tasks there. + */ + p->flags &= ~PF_THREAD_BOUND; return dest_cpu; } -- cgit v0.10.2 From 0e694b9537749141d43a390bf32630e36a861b3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 16:29:27 +0200 Subject: net-netif_rx_ni-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index f6bc659..b741130 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3114,11 +3114,11 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); + migrate_disable(); err = netif_rx(skb); if (local_softirq_pending()) thread_do_softirq(); - preempt_enable(); + migrate_enable(); return err; } -- cgit v0.10.2 From 5988848ea9d8f35bc3407b56fc0e05493708ab68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 13:16:38 -0500 Subject: softirq: Sanitize softirq pending for NOHZ/RT Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6fb89c4..89f1075 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -455,6 +455,8 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void softirq_check_pending_idle(void); + /* This is the worklist that queues up per-cpu softirq work. * * send_remote_sendirq() adds work to these lists, and diff --git a/kernel/softirq.c b/kernel/softirq.c index 18784bb..46bf4f2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -62,6 +62,69 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_NO_HZ +# ifdef CONFIG_PREEMPT_RT_FULL +/* + * On preempt-rt a softirq might be blocked on a lock. There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefor we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. 
+ */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending; + + if (rate_limit >= 10) + return; + + pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + if (pending) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd); + /* + * The wakeup code in rtmutex.c wakes up the task + * _before_ it sets pi_blocked_on to NULL under + * tsk->pi_lock. So we need to check for both: state + * and pi_blocked_on. + */ + raw_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) + warnpending = 1; + + raw_spin_unlock(&tsk->pi_lock); + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + pending); + rate_limit++; + } +} +# else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} +# endif +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5fe688c..626b320f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -481,14 +481,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10 && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + softirq_check_pending_idle(); return false; } -- cgit v0.10.2 From f7baa13bd1c9117291da1419adf71405dbd16566 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 18:51:23 +0200 Subject: lockdep-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 37b13c4..a52b35d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -25,8 +25,6 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -39,9 +37,15 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) +# define INIT_TRACE_IRQFLAGS +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +#else # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) -# define INIT_TRACE_IRQFLAGS #endif #if defined(CONFIG_IRQSOFF_TRACER) || \ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b..7f03801 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3534,6 +3534,7 @@ static void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT_FULL /* * We dont accurately track softirq state in e.g. 
* hardirq contexts (such as on 4KSTACKS), so only @@ -3548,6 +3549,7 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); -- cgit v0.10.2 From 3c40a3cf0a36831fffd36158ba4d34b6c154d487 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:51:45 +0200 Subject: mutex-no-spin-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d1..f60af90 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -222,4 +222,4 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES + depends on SMP && !DEBUG_MUTEXES && !PREEMPT_RT_FULL -- cgit v0.10.2 From 919ed4b5116d248ac27fa04c7e492cc983426b44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:57:18 +0200 Subject: softirq-local-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 624ef3f..2ae0a9d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,7 +61,11 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#ifndef CONFIG_PREEMPT_RT_FULL +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#else +# define SOFTIRQ_DISABLE_OFFSET (0) +#endif #ifndef PREEMPT_ACTIVE #define PREEMPT_ACTIVE_BITS 1 @@ -74,10 +78,17 @@ #endif #define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | NMI_MASK)) +#ifndef CONFIG_PREEMPT_RT_FULL +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +#else +# define softirq_count() (0U) +extern int in_serving_softirq(void); +#endif + /* * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? @@ -87,7 +98,6 @@ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? 
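One practical consequence of this split: on PREEMPT_RT_FULL the preempt count no longer encodes softirq state, so checks based on softirq_count()/in_softirq() read zero even while a softirq handler is running in its thread. Code that needs to know must use in_serving_softirq() instead. A hedged sketch, the helper below is hypothetical and not from the patch:

#include <linux/hardirq.h>

static void my_completion_path(void)	/* hypothetical driver helper */
{
	if (in_serving_softirq()) {
		/* running the softirq handler (a ksoftirqd thread on RT) */
	} else {
		/* plain task context */
	}
	/*
	 * Note: on PREEMPT_RT_FULL, in_softirq() and softirq_count()
	 * evaluate to 0 here in both branches.
	 */
}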
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 89f1075..77ed66d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -447,7 +447,13 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); + +#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } +#else +extern void thread_do_softirq(void); +#endif + extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); @@ -634,6 +640,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) tasklet_kill(&ttimer->tasklet); } +#ifdef CONFIG_PREEMPT_RT_FULL +extern void softirq_early_init(void); +#else +static inline void softirq_early_init(void) { } +#endif + /* * Autoprobing for irqs: * diff --git a/include/linux/sched.h b/include/linux/sched.h index 5639b15..56c1337 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1634,6 +1634,7 @@ struct task_struct { #endif #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; + int softirq_nestcnt; #endif }; diff --git a/init/main.c b/init/main.c index a6766db..4df397d 100644 --- a/init/main.c +++ b/init/main.c @@ -493,6 +493,7 @@ asmlinkage void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ + softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 46bf4f2..4cdca2e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -168,6 +169,7 @@ static void handle_pending_softirqs(u32 pending, int cpu) local_irq_disable(); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -360,6 +362,162 @@ asmlinkage void do_softirq(void) #endif +static inline void local_bh_disable_nort(void) { local_bh_disable(); } +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } + +#else /* !PREEMPT_RT_FULL */ + +/* + * On RT we serialize softirq execution with a cpu local lock + */ +static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); + +static void __do_softirq(void); + +void __init softirq_early_init(void) +{ + local_irq_lock_init(local_softirq_lock); +} + +void local_bh_disable(void) +{ + migrate_disable(); + current->softirq_nestcnt++; +} +EXPORT_SYMBOL(local_bh_disable); + +void local_bh_enable(void) +{ + if (WARN_ON(current->softirq_nestcnt == 0)) + return; + + if ((current->softirq_nestcnt == 1) && + local_softirq_pending() && + local_trylock(local_softirq_lock)) { + + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); + local_unlock(local_softirq_lock); + WARN_ON(current->softirq_nestcnt != 1); + } + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ + local_bh_enable(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + +/* For tracing */ +int notrace __in_softirq(void) +{ + if (__get_cpu_var(local_softirq_lock).owner == current) + return __get_cpu_var(local_softirq_lock).nestcnt; + return 0; +} + +int in_serving_softirq(void) +{ + int res; + + preempt_disable(); + res = __get_cpu_var(local_softirq_runner) == current; + preempt_enable(); + return res; +} + +/* + * Called with bh and 
local interrupts disabled. For full RT cpu must + * be pinned. + */ +static void __do_softirq(void) +{ + u32 pending = local_softirq_pending(); + int cpu = smp_processor_id(); + + current->softirq_nestcnt++; + + /* Reset the pending bitmask before enabling irqs */ + set_softirq_pending(0); + + __get_cpu_var(local_softirq_runner) = current; + + lockdep_softirq_enter(); + + handle_pending_softirqs(pending, cpu); + + pending = local_softirq_pending(); + if (pending) + wakeup_softirqd(); + + lockdep_softirq_exit(); + __get_cpu_var(local_softirq_runner) = NULL; + + current->softirq_nestcnt--; +} + +static int __thread_do_softirq(int cpu) +{ + /* + * Prevent the current cpu from going offline. + * pin_current_cpu() can reenable preemption and block on the + * hotplug mutex. When it returns, the current cpu is + * pinned. It might be the wrong one, but the offline check + * below catches that. + */ + pin_current_cpu(); + /* + * If called from ksoftirqd (cpu >= 0) we need to check + * whether we are on the wrong cpu due to cpu offlining. If + * called via thread_do_softirq() no action required. + */ + if (cpu >= 0 && cpu_is_offline(cpu)) { + unpin_current_cpu(); + return -1; + } + preempt_enable(); + local_lock(local_softirq_lock); + local_irq_disable(); + /* + * We cannot switch stacks on RT as we want to be able to + * schedule! + */ + if (local_softirq_pending()) + __do_softirq(); + local_unlock(local_softirq_lock); + unpin_current_cpu(); + preempt_disable(); + local_irq_enable(); + return 0; +} + +/* + * Called from netif_rx_ni(). Preemption enabled. + */ +void thread_do_softirq(void) +{ + if (!in_serving_softirq()) { + preempt_disable(); + __thread_do_softirq(-1); + preempt_enable(); + } +} + +static int ksoftirqd_do_softirq(int cpu) +{ + return __thread_do_softirq(cpu); +} + +static inline void local_bh_disable_nort(void) { } +static inline void _local_bh_enable_nort(void) { } + +#endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. */ @@ -373,9 +531,9 @@ void irq_enter(void) * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. */ - local_bh_disable(); + local_bh_disable_nort(); tick_check_idle(cpu); - _local_bh_enable(); + _local_bh_enable_nort(); } __irq_enter(); @@ -383,6 +541,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) { #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED __do_softirq(); @@ -395,6 +554,9 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } +#else + wakeup_softirqd(); +#endif } /* -- cgit v0.10.2 From dfc0aea2042676fef748ef3f6757e727324cbe45 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 14 Nov 2011 02:44:43 +0100 Subject: softirq: Export in_serving_softirq() ERROR: "in_serving_softirq" [net/sched/cls_cgroup.ko] undefined! The above can be fixed by exporting in_serving_softirq Signed-off-by: John Kacur Cc: Paul McKenney Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/1321235083-21756-2-git-send-email-jkacur@redhat.com Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 4cdca2e..bca3e64 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -431,6 +431,7 @@ int in_serving_softirq(void) preempt_enable(); return res; } +EXPORT_SYMBOL(in_serving_softirq); /* * Called with bh and local interrupts disabled. 
For full RT cpu must -- cgit v0.10.2 From f3377324b04dcaeed499879fda1e172520c37d0c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 13 Oct 2011 17:19:09 +0800 Subject: hardirq.h: Define softirq_count() as OUL to kill build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/lockdep.c: In function ‘print_bad_irq_dependency’: kernel/lockdep.c:1476:3: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 7 has type ‘unsigned int’ kernel/lockdep.c: In function ‘print_usage_bug’: kernel/lockdep.c:2193:3: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 7 has type ‘unsigned int’ kernel/lockdep.i show this: printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, ((current_thread_info()->preempt_count) & (((1UL << (10))-1) << ((0 + 8) + 8))) >> ((0 + 8) + 8), curr->softirq_context, (0U) >> (0 + 8), curr->hardirqs_enabled, curr->softirqs_enabled); Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/20111013091909.GA32739@zhy Signed-off-by: Thomas Gleixner diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 2ae0a9d..dfa97de 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -85,7 +85,7 @@ # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #else -# define softirq_count() (0U) +# define softirq_count() (0UL) extern int in_serving_softirq(void); #endif -- cgit v0.10.2 From 72527ecfe165ee2c1e7b712b7f826886f46d6a89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 13:59:17 +0200 Subject: softirq-disable-softirq-stacks-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 71413f4..bb73a2e 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -584,6 +584,7 @@ void irq_ctx_init(void) } } +#ifndef CONFIG_PREEMPT_RT_FULL static inline void do_softirq_onstack(void) { struct thread_info *curtp, *irqtp; @@ -620,6 +621,7 @@ void do_softirq(void) local_irq_restore(flags); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 19e096b..313078b 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -36,6 +36,7 @@ .text +#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) @@ -46,6 +47,7 @@ _GLOBAL(call_do_softirq) lwz r0,4(r1) mtlr r0 blr +#endif _GLOBAL(call_handle_irq) mflr r0 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 5cfa800..39b77e8 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -29,6 +29,7 @@ .text +#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 std r0,16(r1) @@ -39,6 +40,7 @@ _GLOBAL(call_do_softirq) ld r0,16(r1) mtlr r0 blr +#endif _GLOBAL(call_handle_irq) ld r8,0(r6) diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 063af10..ae4b141 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void) { unsigned long flags; @@ -191,6 +192,7 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } +#endif #else static inline void handle_one_irq(unsigned int irq) { diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 9bcbbe2..1bc5cd8 100644 --- 
a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -698,6 +698,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq(void) { unsigned long flags; @@ -723,6 +724,7 @@ void do_softirq(void) local_irq_restore(flags); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cb3c591..3c06da6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1337,6 +1337,7 @@ bad_gs: jmp 2b .previous +#ifndef CONFIG_PREEMPT_RT_FULL /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC @@ -1356,6 +1357,7 @@ ENTRY(call_softirq) ret CFI_ENDPROC END(call_softirq) +#endif #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 344faf8..f60ecc0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -149,6 +149,7 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void) { unsigned long flags; @@ -179,6 +180,7 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } +#endif bool handle_irq(unsigned irq, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d04d3ec..831f247 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -88,7 +88,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - +#ifndef CONFIG_PREEMPT_RT_FULL extern void call_softirq(void); asmlinkage void do_softirq(void) @@ -108,3 +108,4 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } +#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 77ed66d..a103ef8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -445,10 +445,9 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); - -#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } #else extern void thread_do_softirq(void); -- cgit v0.10.2 From c2b4e6390470e99d32502ceaf0e5691145a77a1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:06:43 +0200 Subject: softirq-make-fifo.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index bca3e64..7a19af0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -364,6 +364,8 @@ asmlinkage void do_softirq(void) static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } +static void ksoftirqd_set_sched_params(unsigned int cpu) { } +static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ @@ -518,6 +520,20 @@ static int ksoftirqd_do_softirq(int cpu) static inline void local_bh_disable_nort(void) { } static inline void _local_bh_enable_nort(void) { } +static inline void ksoftirqd_set_sched_params(unsigned int cpu) +{ + struct sched_param param = { .sched_priority = 1 }; + + sched_setscheduler(current, SCHED_FIFO, ¶m); +} + +static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_NORMAL, ¶m); +} + #endif /* PREEMPT_RT_FULL */ 
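The intended effect on an RT kernel is that every ksoftirqd/N thread comes up as SCHED_FIFO priority 1 (and is reset in the cleanup callback). A small userspace sketch to check this, not part of the patch; pass the pid of a ksoftirqd/N thread:

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid;
	struct sched_param sp;
	int policy;

	if (argc < 2)
		return 1;
	pid = (pid_t)atoi(argv[1]);		/* pid of ksoftirqd/N */
	if (sched_getparam(pid, &sp))
		return 1;
	policy = sched_getscheduler(pid);
	printf("%s priority=%d\n",
	       policy == SCHED_FIFO ? "SCHED_FIFO" : "other",
	       sp.sched_priority);
	return 0;
}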
/* * Enter an interrupt context. @@ -1065,6 +1081,8 @@ static struct notifier_block __cpuinitdata cpu_nfb = { static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, + .setup = ksoftirqd_set_sched_params, + .cleanup = ksoftirqd_clr_sched_params, .thread_should_run = ksoftirqd_should_run, .thread_fn = run_ksoftirqd, .thread_comm = "ksoftirqd/%u", -- cgit v0.10.2 From 6fee25cc5f7e7a53c3d1739f49504d24442461eb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Nov 2011 20:18:22 -0500 Subject: tasklet: Prevent tasklets from going into infinite spin in RT When CONFIG_PREEMPT_RT_FULL is enabled, tasklets run as threads, and spinlocks turn are mutexes. But this can cause issues with tasks disabling tasklets. A tasklet runs under ksoftirqd, and if a tasklets are disabled with tasklet_disable(), the tasklet count is increased. When a tasklet runs, it checks this counter and if it is set, it adds itself back on the softirq queue and returns. The problem arises in RT because ksoftirq will see that a softirq is ready to run (the tasklet softirq just re-armed itself), and will not sleep, but instead run the softirqs again. The tasklet softirq will still see that the count is non-zero and will not execute the tasklet and requeue itself on the softirq again, which will cause ksoftirqd to run it again and again and again. It gets worse because ksoftirqd runs as a real-time thread. If it preempted the task that disabled tasklets, and that task has migration disabled, or can't run for other reasons, the tasklet softirq will never run because the count will never be zero, and ksoftirqd will go into an infinite loop. As an RT task, it this becomes a big problem. This is a hack solution to have tasklet_disable stop tasklets, and when a tasklet runs, instead of requeueing the tasklet softirqd it delays it. When tasklet_enable() is called, and tasklets are waiting, then the tasklet_enable() will kick the tasklets to continue. This prevents the lock up from ksoftirq going into an infinite loop. [ rostedt@goodmis.org: ported to 3.0-rt ] Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a103ef8..8784459 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -502,8 +502,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu, to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
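To make the behavioural change concrete, here is a sketch of the driver pattern the commit message describes; the handler and names are hypothetical, only the tasklet API calls are real:

#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)	/* hypothetical handler */
{
	/* runs in softirq context (a ksoftirqd thread on PREEMPT_RT_FULL) */
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static void my_reconfigure(void)		/* hypothetical path */
{
	tasklet_disable(&my_tasklet);
	/*
	 * An interrupt may still call tasklet_schedule(&my_tasklet) here.
	 * With this patch the disabled tasklet is parked with the PENDING
	 * state bit instead of being requeued to ksoftirqd over and over.
	 */
	tasklet_enable(&my_tasklet);
	/* tasklet_enable() now re-kicks a tasklet that went PENDING above. */
}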
@@ -528,27 +529,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -597,17 +607,8 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index 7a19af0..fcf6997 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -648,15 +649,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. 
Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -667,10 +698,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } @@ -678,50 +706,119 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_hi_schedule(t); } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. 
+ * We only allow the STATE_RUN -> 0 transition here. + */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -732,29 +829,7 @@ static void tasklet_hi_action(struct softirq_action *a) __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } @@ -777,7 +852,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - yield(); + msleep(1); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -983,6 +1058,23 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT_FULL + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); +#endif + static int ksoftirqd_should_run(unsigned int cpu) { return local_softirq_pending(); -- cgit v0.10.2 From 37e62b063a65fc8b2f9b046f739e75df69d15273 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2012 13:01:27 +0100 Subject: genirq: Allow disabling of softirq processing in irq thread context The processing of softirqs in irq thread context is a performance gain for the non-rt workloads of a system, but it's counterproductive for interrupts which are explicitely related to the realtime workload. Allow such interrupts to prevent softirq processing in their thread context. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8784459..11bdb1e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -58,6 +58,7 @@ * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. 
+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT) */ #define IRQF_DISABLED 0x00000020 #define IRQF_SHARED 0x00000080 @@ -71,6 +72,7 @@ #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 +#define IRQF_NO_SOFTIRQ_CALL 0x00040000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/include/linux/irq.h b/include/linux/irq.h index fdf2c4a..3929bbe 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -70,6 +70,7 @@ typedef void (*irq_preflow_handler_t)(struct irq_data *data); * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_TRHEAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT) */ enum { IRQ_TYPE_NONE = 0x00000000, @@ -94,12 +95,14 @@ enum { IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), + IRQ_NO_SOFTIRQ_CALL = (1 << 18), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ - IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID) + IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ + IRQ_NO_SOFTIRQ_CALL) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 902f880..5bc2f41 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -782,7 +782,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action); - local_bh_enable(); + /* + * Interrupts which have real time requirements can be set up + * to avoid softirq processing in the thread handler. This is + * safe as these interrupts do not raise soft interrupts. 
+ */ + if (irq_settings_no_softirq_call(desc)) + _local_bh_enable(); + else + local_bh_enable(); return ret; } @@ -1127,6 +1135,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } + if (new->flags & IRQF_NO_SOFTIRQ_CALL) + irq_settings_set_no_softirq_call(desc); + /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f10..0d2c381 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -26,6 +27,7 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -36,6 +38,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL; +} + +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL; +} + static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU; diff --git a/kernel/softirq.c b/kernel/softirq.c index fcf6997..f69c069 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -417,6 +417,13 @@ void local_bh_enable_ip(unsigned long ip) } EXPORT_SYMBOL(local_bh_enable_ip); +void _local_bh_enable(void) +{ + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(_local_bh_enable); + /* For tracing */ int notrace __in_softirq(void) { -- cgit v0.10.2 From 866bd3beca553c73b46d7f0a55ff9c60317121f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 20:42:16 +0200 Subject: local-vars-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/percpu.h b/include/linux/percpu.h index bf69fd1..12b394f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,10 +48,30 @@ preempt_enable(); \ } while (0) -#define get_local_var(var) get_cpu_var(var) -#define put_local_var(var) put_cpu_var(var) -#define get_local_ptr(var) get_cpu_ptr(var) -#define put_local_ptr(var) put_cpu_ptr(var) +#ifndef CONFIG_PREEMPT_RT_FULL +# define get_local_var(var) get_cpu_var(var) +# define put_local_var(var) put_cpu_var(var) +# define get_local_ptr(var) get_cpu_ptr(var) +# define put_local_ptr(var) put_cpu_ptr(var) +#else +# define get_local_var(var) (*({ \ + migrate_disable(); \ + &__get_cpu_var(var); })) + +# define put_local_var(var) do { \ + (void)&(var); \ + migrate_enable(); \ +} while (0) + +# define get_local_ptr(var) ({ \ + migrate_disable(); \ + this_cpu_ptr(var); }) + +# define put_local_ptr(var) do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#endif /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v0.10.2 From ada4122b3bf43ff48abed296bd8ef7d3df045d31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Apr 2010 16:51:31 +0200 Subject: md: raid5: Make raid5_percpu handling RT aware __raid_run_ops() disables preemption with get_cpu() around the 
access to the raid5_percpu variables. That causes scheduling while atomic spews on RT. Serialize the access to the percpu data with a lock and keep the code preemptible. Reported-by: Udo van den Heuvel Signed-off-by: Thomas Gleixner Tested-by: Udo van den Heuvel diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94ce78e..0089cb1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1418,8 +1418,9 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -1471,7 +1472,8 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } #ifdef CONFIG_MULTICORE_RAID456 @@ -5139,6 +5141,7 @@ static int raid5_alloc_percpu(struct r5conf *conf) break; } per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); } #ifdef CONFIG_HOTPLUG_CPU conf->cpu_notify.notifier_call = raid456_cpu_notify; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 050a334..8a57647 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -429,6 +429,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address -- cgit v0.10.2 From 24fcc18bf1908e5b6da1072a6f1a48e58b1e1a4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:04:15 +0200 Subject: rtmutex-futex-prepare-rt.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index 8879430..bcf86f7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1442,6 +1442,16 @@ retry_private: requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; + } else if (ret == -EAGAIN) { + /* + * Waiter was woken by timeout or + * signal and has set pi_blocked_on to + * PI_WAKEUP_INPROGRESS before we + * tried to enqueue it on the rtmutex. + */ + this->pi_state = NULL; + free_pi_state(pi_state); + continue; } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; @@ -2286,7 +2296,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -2333,20 +2343,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; + /* + * On RT we must avoid races with requeue and trying to block + * on two mutexes (hb->lock and uaddr2's rtmutex) by + * serializing access to pi_blocked_on with pi_lock. + */ + raw_spin_lock_irq(¤t->pi_lock); + if (current->pi_blocked_on) { + /* + * We have been requeued or are in the process of + * being requeued. 
+ */ + raw_spin_unlock_irq(¤t->pi_lock); + } else { + /* + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS + * prevents a concurrent requeue from moving us to the + * uaddr2 rtmutex. After that we can safely acquire + * (and possibly block on) hb->lock. + */ + current->pi_blocked_on = PI_WAKEUP_INPROGRESS; + raw_spin_unlock_irq(¤t->pi_lock); + + spin_lock(&hb->lock); + + /* + * Clean up pi_blocked_on. We might leak it otherwise + * when we succeeded with the hb->lock in the fast + * path. + */ + raw_spin_lock_irq(¤t->pi_lock); + current->pi_blocked_on = NULL; + raw_spin_unlock_irq(¤t->pi_lock); + + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + } /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * In order to be here, we have either been requeued, are in + * the process of being requeued, or requeue successfully + * acquired uaddr2 on our behalf. If pi_blocked_on was + * non-null above, we may be racing with a requeue. Do not + * rely on q->lock_ptr to be hb2->lock until after blocking on + * hb->lock or hb2->lock. The futex_requeue dropped our key1 + * reference and incremented our key2 reference count. */ + hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { @@ -2355,9 +2400,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - spin_unlock(q.lock_ptr); + spin_unlock(&hb2->lock); } } else { /* @@ -2370,7 +2416,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e69..f1a6137 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -67,6 +67,11 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) clear_rt_mutex_waiters(lock); } +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) +{ + return waiter && waiter != PI_WAKEUP_INPROGRESS; +} + /* * We can speed up the acquire/release, if the architecture * supports cmpxchg and if there's no debugging state to be set up @@ -196,7 +201,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter) + if (!rt_mutex_real_waiter(waiter)) goto out_unlock_pi; /* @@ -399,6 +404,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, int chain_walk = 0, res; raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * In the case of futex requeue PI, this will be a proxy + * lock. The task will wake unaware that it is enqueueed on + * this lock. Avoid blocking on two locks and corrupting + * pi_blocked_on via the PI_WAKEUP_INPROGRESS + * flag. 
futex_wait_requeue_pi() sets this when it wakes up + * before requeue (due to a signal or timeout). Do not enqueue + * the task if PI_WAKEUP_INPROGRESS is set. + */ + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return -EAGAIN; + } + + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); + __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -423,7 +445,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } @@ -517,7 +539,7 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -551,7 +573,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!rt_mutex_real_waiter(waiter) || + waiter->list_entry.prio == task->prio) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 53a66c8..b43d832 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -103,6 +103,8 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) /* * PI-futex support (proxy locking functions, etc.): */ +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -- cgit v0.10.2 From 6b7baa9c617def9a04a8fc62dba265c8f7fa7604 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 30 Apr 2013 03:17:15 -0500 Subject: futex: Fix bug on when a requeued RT task times out Requeue with timeout causes a bug with PREEMPT_RT_FULL. The bug comes from a timed out condition. TASK 1 TASK 2 ------ ------ futex_wait_requeue_pi() futex_wait_queue_me() double_lock_hb(); raw_spin_lock(pi_lock); if (current->pi_blocked_on) { } else { current->pi_blocked_on = PI_WAKE_INPROGRESS; run_spin_unlock(pi_lock); spin_lock(hb->lock); <-- blocked! plist_for_each_entry_safe(this) { rt_mutex_start_proxy_lock(); task_blocks_on_rt_mutex(); BUG_ON(task->pi_blocked_on)!!!! The BUG_ON() actually has a check for PI_WAKE_INPROGRESS, but the problem is that, after TASK 1 sets PI_WAKE_INPROGRESS, it then tries to grab the hb->lock, which it fails to do so. As the hb->lock is a mutex, it will block and set the "pi_blocked_on" to the hb->lock. When TASK 2 goes to requeue it, the check for PI_WAKE_INPROGESS fails because the task1's pi_blocked_on is no longer set to that, but instead, set to the hb->lock. The fix: When calling rt_mutex_start_proxy_lock() a check is made to see if the proxy tasks pi_blocked_on is set. If so, exit out early. Otherwise set it to a new flag PI_REQUEUE_INPROGRESS, which notifies the proxy task that it is being requeued, and will handle things appropriately. 
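A distilled sketch of the check this patch adds to rt_mutex_start_proxy_lock(), paraphrasing the hunk below; all names (task, lock, PI_REQUEUE_INPROGRESS) are the patch's own, and the caller's handling of the early return is elided:

	/*
	 * Sketch: runs with lock->wait_lock held, before enqueueing the
	 * proxy waiter. If the task already set pi_blocked_on (it woke via
	 * timeout/signal and may now be blocking on hb->lock), skip it.
	 */
	raw_spin_lock_irq(&task->pi_lock);
	if (task->pi_blocked_on) {
		raw_spin_unlock_irq(&task->pi_lock);
		raw_spin_unlock(&lock->wait_lock);
		return -EAGAIN;	/* requeue code drops this waiter and continues */
	}
	/* Tell the woken task that a requeue is in flight. */
	task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
	raw_spin_unlock_irq(&task->pi_lock);
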
Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index f1a6137..a8a1cd0 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -69,7 +69,8 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) { - return waiter && waiter != PI_WAKEUP_INPROGRESS; + return waiter && waiter != PI_WAKEUP_INPROGRESS && + waiter != PI_REQUEUE_INPROGRESS; } /* @@ -981,6 +982,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * In PREEMPT_RT there's an added race. + * If the task, that we are about to requeue, times out, + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue + * to skip this task. But right after the task sets + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then + * block on the spin_lock(&hb->lock), which in RT is an rtmutex. + * This will replace the PI_WAKEUP_INPROGRESS with the actual + * lock that it blocks on. We *must not* place this task + * on this proxy lock in that case. + * + * To prevent this race, we first take the task's pi_lock + * and check if it has updated its pi_blocked_on. If it has, + * we assume that it woke up and we return -EAGAIN. + * Otherwise, we set the task's pi_blocked_on to + * PI_REQUEUE_INPROGRESS, so that if the task is waking up + * it will know that we are in the process of requeuing it. + */ + raw_spin_lock_irq(&task->pi_lock); + if (task->pi_blocked_on) { + raw_spin_unlock_irq(&task->pi_lock); + raw_spin_unlock(&lock->wait_lock); + return -EAGAIN; + } + task->pi_blocked_on = PI_REQUEUE_INPROGRESS; + raw_spin_unlock_irq(&task->pi_lock); +#endif + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); if (ret && !rt_mutex_owner(lock)) { diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index b43d832..47290ec 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -104,6 +104,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) * PI-futex support (proxy locking functions, etc.): */ #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, -- cgit v0.10.2 From a9cd8167ce35945c046541b849f9c21c6886121b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2013 11:17:42 +0100 Subject: futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock In exit_pi_state_list() we have the following locking construct: spin_lock(&hb->lock); raw_spin_lock_irq(&curr->pi_lock); ... spin_unlock(&hb->lock); In !RT this works, but on RT the migrate_enable() function which is called from spin_unlock() sees atomic context due to the held pi_lock and just decrements the migrate_disable_atomic counter of the task. Now the next call to migrate_disable() sees the counter being negative and issues a warning. That check should be in migrate_enable() already. Fix this by dropping pi_lock before unlocking hb->lock and reaquire pi_lock after that again. This is safe as the loop code reevaluates head again under the pi_lock. 
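The corrected ordering, distilled from the hunk below into a short sketch (the surrounding list walk in exit_pi_state_list() is elided; names are taken from the patch):

	/*
	 * Sketch: on RT, hb->lock is a sleeping lock, so release it only
	 * after dropping the raw, irq-disabled pi_lock, then retake
	 * pi_lock before the loop re-reads the list head.
	 */
	if (head->next != next) {
		raw_spin_unlock_irq(&curr->pi_lock);
		spin_unlock(&hb->lock);
		raw_spin_lock_irq(&curr->pi_lock);
		continue;
	}
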
Reported-by: Yong Zhang Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior diff --git a/kernel/futex.c b/kernel/futex.c index bcf86f7..1d74b85 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -568,7 +568,9 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); + raw_spin_lock_irq(&curr->pi_lock); continue; } -- cgit v0.10.2 From b62b25f16f060d88f7ea0712c6205a960df74961 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jun 2011 11:43:52 +0200 Subject: rtmutex-lock-killable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index de17134..3561eb2 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -90,6 +90,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock, int detect_deadlock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a8a1cd0..0d129f9 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -815,12 +815,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, @@ -834,17 +834,38 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ -- cgit v0.10.2 From cfbd74247d49422cf6807fbadb0b1a815e17d402 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:21:25 +0200 Subject: rt-mutex-add-sleeping-spinlocks-support.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 3561eb2..928d93e 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -29,9 +29,10 @@ struct rt_mutex { raw_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; 
-#ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; +#ifdef CONFIG_DEBUG_RT_MUTEXES + const char *file; + const char *name; int line; void *magic; #endif @@ -56,19 +57,39 @@ struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ , .name = #mutexname, .file = __FILE__, .line = __LINE__ -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__) + +# define rt_mutex_init(mutex) \ + do { \ + raw_spin_lock_init(&(mutex)->wait_lock); \ + __rt_mutex_init(mutex, #mutex); \ + } while (0) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) + +# define rt_mutex_init(mutex) \ + do { \ + raw_spin_lock_init(&(mutex)->wait_lock); \ + __rt_mutex_init(mutex, #mutex); \ + } while (0) + # define rt_mutex_debug_task_free(t) do { } while (0) #endif -#define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + __DEBUG_RT_MUTEX_INITIALIZER(mutexname) + + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) } + +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + , .save_state = 1 } #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) diff --git a/kernel/futex.c b/kernel/futex.c index 1d74b85..473c3c4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2323,8 +2323,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter, false); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0d129f9..b562de3 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -8,6 +8,12 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt + * * See Documentation/rt-mutex-design.txt for details. 
*/ #include @@ -96,6 +102,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.node_list.prev)) + plist_head_init(&lock->wait_list); +} + /* * Calculate task priority from the waiter list priority * @@ -142,6 +154,14 @@ static void rt_mutex_adjust_prio(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); } +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) +{ + if (waiter->savestate) + wake_up_lock_sleeper(waiter->task); + else + wake_up_process(waiter->task); +} + /* * Max number of times we'll walk the boosting chain: */ @@ -253,13 +273,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (!rt_mutex_owner(lock)) { + struct rt_mutex_waiter *lock_top_waiter; + /* * If the requeue above changed the top waiter, then we need * to wake the new top waiter up to try to get the lock. */ - - if (top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + lock_top_waiter = rt_mutex_top_waiter(lock); + if (top_waiter != lock_top_waiter) + rt_mutex_wake_waiter(lock_top_waiter); raw_spin_unlock(&lock->wait_lock); goto out_put_task; } @@ -304,6 +326,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } + +#define STEAL_NORMAL 0 +#define STEAL_LATERAL 1 + +/* + * Note that RT tasks are excluded from lateral-steals to prevent the + * introduction of an unbounded latency + */ +static inline int lock_is_stealable(struct task_struct *task, + struct task_struct *pendowner, int mode) +{ + if (mode == STEAL_NORMAL || rt_task(task)) { + if (task->prio >= pendowner->prio) + return 0; + } else if (task->prio > pendowner->prio) + return 0; + return 1; +} + /* * Try to take an rt-mutex * @@ -313,8 +354,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @task: the task which wants to acquire the lock * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int +__try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter, int mode) { /* * We have to be careful here if the atomic speedups are @@ -347,12 +389,14 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } + struct task_struct *pown = rt_mutex_top_waiter(lock)->task; + + if (task != pown && !lock_is_stealable(task, pown, mode)) + return 0; } + /* We got the lock. */ + if (waiter || rt_mutex_has_waiters(lock)) { unsigned long flags; struct rt_mutex_waiter *top; @@ -377,7 +421,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock_irqrestore(&task->pi_lock, flags); } - /* We got the lock. 
*/ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task); @@ -387,6 +430,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, return 1; } +static inline int +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -501,7 +551,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - wake_up_process(waiter->task); + rt_mutex_wake_waiter(waiter); } /* @@ -580,18 +630,315 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * preemptible spin_lock functions: + */ +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + might_sleep(); + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +#ifdef CONFIG_SMP +/* + * Note that owner is a speculative pointer and dereferencing relies + * on rcu_read_lock() and the check against the lock owner. + */ +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *owner) +{ + int res = 0; + + rcu_read_lock(); + for (;;) { + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that owner->on_cpu is dereferenced _after_ + * checking the above to be valid. + */ + barrier(); + if (!owner->on_cpu) { + res = 1; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +# define pi_lock(lock) raw_spin_lock_irq(lock) +# define pi_unlock(lock) raw_spin_unlock_irq(lock) + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * We store the current state under p->pi_lock in p->saved_state and + * the try_to_wake_up() code handles this accordingly. + */ +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct task_struct *lock_owner, *self = current; + struct rt_mutex_waiter waiter, *top_waiter; + int ret; + + rt_mutex_init_waiter(&waiter, true); + + raw_spin_lock(&lock->wait_lock); + init_lists(lock); + + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) { + raw_spin_unlock(&lock->wait_lock); + return; + } + + BUG_ON(rt_mutex_owner(lock) == self); + + /* + * We save whatever state the task is in and we'll restore it + * after acquiring the lock taking real wakeups into account + * as well. We are serialized via pi_lock against wakeups. See + * try_to_wake_up(). + */ + pi_lock(&self->pi_lock); + self->saved_state = self->state; + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + + ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0); + BUG_ON(ret); + + for (;;) { + /* Try to acquire the lock again. 
*/ + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL)) + break; + + top_waiter = rt_mutex_top_waiter(lock); + lock_owner = rt_mutex_owner(lock); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) + schedule_rt_mutex(lock); + + raw_spin_lock(&lock->wait_lock); + + pi_lock(&self->pi_lock); + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + } + + /* + * Restore the task state to current->saved_state. We set it + * to the original state above and the try_to_wake_up() code + * has possibly updated it when a real (non-rtmutex) wakeup + * happened while we were blocked. Clear saved_state so + * try_to_wakeup() does not get confused. + */ + pi_lock(&self->pi_lock); + __set_current_state(self->saved_state); + self->saved_state = TASK_RUNNING; + pi_unlock(&self->pi_lock); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock)); + BUG_ON(!plist_node_empty(&waiter.list_entry)); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret; + + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) { 
+ migrate_disable(); + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } else + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + migrate_disable(); + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + migrate_enable(); + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif /* PREEMPT_RT_FULL */ + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * @@ -647,9 +994,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); + rt_mutex_init_waiter(&waiter, false); raw_spin_lock(&lock->wait_lock); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -702,6 +1050,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) int ret = 0; raw_spin_lock(&lock->wait_lock); + init_lists(lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -934,12 +1283,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } -EXPORT_SYMBOL_GPL(__rt_mutex_init); +EXPORT_SYMBOL(__rt_mutex_init); /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a @@ -954,7 +1302,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + rt_mutex_init(lock); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 47290ec..6ec3dc1 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -49,6 +49,7 @@ struct rt_mutex_waiter { struct plist_node pi_list_entry; struct task_struct *task; struct rt_mutex *lock; + bool savestate; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; @@ -126,4 +127,12 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, # include "rtmutex.h" #endif +static inline void +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) +{ + debug_rt_mutex_init_waiter(waiter); + waiter->task = NULL; + waiter->savestate = savestate; +} + #endif -- cgit v0.10.2 From 
1cca3f564c4ac367af0f3b69d4a74d19739cce3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:34:01 +0200 Subject: spinlock-types-separate-raw.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e..5317cd9 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -1,6 +1,10 @@ #ifndef __LINUX_RWLOCK_TYPES_H #define __LINUX_RWLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) +# error "Do not include directly, include spinlock_types.h" +#endif + /* * include/linux/rwlock_types.h - generic rwlock type definitions * and initializers diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb..5c8664d 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,79 +9,9 @@ * Released under the General Public License (GPL). */ -#if defined(CONFIG_SMP) -# include -#else -# include -#endif +#include -#include - -typedef struct raw_spinlock { - arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_cpu; - void *owner; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_spinlock_t; - -#define SPINLOCK_MAGIC 0xdead4ead - -#define SPINLOCK_OWNER_INIT ((void *)-1L) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } -#else -# define SPIN_DEP_MAP_INIT(lockname) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK -# define SPIN_DEBUG_INIT(lockname) \ - .magic = SPINLOCK_MAGIC, \ - .owner_cpu = -1, \ - .owner = SPINLOCK_OWNER_INIT, -#else -# define SPIN_DEBUG_INIT(lockname) -#endif - -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ - { \ - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ - SPIN_DEBUG_INIT(lockname) \ - SPIN_DEP_MAP_INIT(lockname) } - -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) - -typedef struct spinlock { - union { - struct raw_spinlock rlock; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) - struct { - u8 __padding[LOCK_PADSIZE]; - struct lockdep_map dep_map; - }; -#endif - }; -} spinlock_t; - -#define __SPIN_LOCK_INITIALIZER(lockname) \ - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } - -#define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#include #include diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h new file mode 100644 index 0000000..f1dac1f --- /dev/null +++ b/include/linux/spinlock_types_nort.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H +#define __LINUX_SPINLOCK_TYPES_NORT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. 
Include spinlock_types.h instead" +#endif + +/* + * The non RT version maps spinlocks to raw_spinlocks + */ +typedef struct spinlock { + union { + struct raw_spinlock rlock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) + struct { + u8 __padding[LOCK_PADSIZE]; + struct lockdep_map dep_map; + }; +#endif + }; +} spinlock_t; + +#define __SPIN_LOCK_INITIALIZER(lockname) \ + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } + +#define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) + +#endif diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h new file mode 100644 index 0000000..edffc4d --- /dev/null +++ b/include/linux/spinlock_types_raw.h @@ -0,0 +1,56 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H +#define __LINUX_SPINLOCK_TYPES_RAW_H + +#if defined(CONFIG_SMP) +# include +#else +# include +#endif + +#include + +typedef struct raw_spinlock { + arch_spinlock_t raw_lock; +#ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} raw_spinlock_t; + +#define SPINLOCK_MAGIC 0xdead4ead + +#define SPINLOCK_OWNER_INIT ((void *)-1L) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define SPIN_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK +# define SPIN_DEBUG_INIT(lockname) \ + .magic = SPINLOCK_MAGIC, \ + .owner_cpu = -1, \ + .owner = SPINLOCK_OWNER_INIT, +#else +# define SPIN_DEBUG_INIT(lockname) +#endif + +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ + { \ + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ + SPIN_DEBUG_INIT(lockname) \ + SPIN_DEP_MAP_INIT(lockname) } + +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) + +#endif -- cgit v0.10.2 From 1392baf4fa8f4a81b936315e4e6db5d4f91357e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:06:39 +0200 Subject: rtmutex-avoid-include-hell.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 928d93e..5ebd0bb 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -14,7 +14,7 @@ #include #include -#include +#include extern int max_lock_depth; /* for sysctl */ -- cgit v0.10.2 From d48a04a80519628cf88831f84db0e1f552e05c67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:43:35 +0200 Subject: rt-add-rt-spinlocks.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h new file mode 100644 index 0000000..b138321 --- /dev/null +++ b/include/linux/rwlock_types_rt.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_RWLOCK_TYPES_RT_H +#define __LINUX_RWLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. 
Include spinlock_types.h instead" +#endif + +/* + * rwlocks - rtmutex which allows single reader recursion + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#define __RW_LOCK_UNLOCKED(name) \ + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#endif diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 5c8664d..10bac71 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -11,8 +11,13 @@ #include -#include - -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +# include +#else +# include +# include +# include +#endif #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h new file mode 100644 index 0000000..9fd4319 --- /dev/null +++ b/include/linux/spinlock_types_rt.h @@ -0,0 +1,51 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RT_H +#define __LINUX_SPINLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +#include + +/* + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__ , \ + } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + } +#endif + +/* +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) +*/ + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name.lock), \ + SPIN_DEP_MAP_INIT(name) } + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#endif -- cgit v0.10.2 From f1de2067095fb6e97e59242a9e445a07b8e1b29d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:56:22 +0200 Subject: rt-add-rt-to-mutex-headers.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 9121595..bdf1da2 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -17,6 +17,17 @@ #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else + /* * Simple, straightforward mutexes with strict semantics: * @@ -95,13 +106,6 @@ do { \ static inline void mutex_destroy(struct mutex *lock) {} #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -167,6 +171,9 @@ 
extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); + +#endif /* !PREEMPT_RT_FULL */ + extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h new file mode 100644 index 0000000..c38a44b --- /dev/null +++ b/include/linux/mutex_rt.h @@ -0,0 +1,84 @@ +#ifndef __LINUX_MUTEX_RT_H +#define __LINUX_MUTEX_RT_H + +#ifndef __LINUX_MUTEX_H +#error "Please include mutex.h" +#endif + +#include + +/* FIXME: Just for __lockfunc */ +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) + +# define mutex_lock_nest_lock(lock, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +} while (0) + +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), #mutex, &__key); \ +} while (0) + +# define __mutex_init(mutex, name, key) \ +do { \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), name, key); \ +} while (0) + +#endif -- cgit v0.10.2 From b24bd2c86c14ffd2e7a99e51184bf95b369d8b3d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 21:02:53 +0200 Subject: rwsem-add-rt-variant.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d6..f994bd3 100644 --- a/include/linux/rwsem.h +++ 
b/include/linux/rwsem.h @@ -16,6 +16,10 @@ #include +#ifdef CONFIG_PREEMPT_RT_FULL +#include +#else /* PREEMPT_RT_FULL */ + struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -139,4 +143,6 @@ do { \ # define down_write_nested(sem, subclass) down_write(sem) #endif +#endif /* !PREEMPT_RT_FULL */ + #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h new file mode 100644 index 0000000..e94d945 --- /dev/null +++ b/include/linux/rwsem_rt.h @@ -0,0 +1,128 @@ +#ifndef _LINUX_RWSEM_RT_H +#define _LINUX_RWSEM_RT_H + +#ifndef _LINUX_RWSEM_H +#error "Include rwsem.h" +#endif + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well. + */ + +#include + +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key); + +#define __rt_init_rwsem(sem, name, key) \ + do { \ + rt_mutex_init(&(sem)->lock); \ + __rt_rwsem_init((sem), (name), (key));\ + } while (0) + +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key) + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_init_rwsem((sem), #sem, &__key); \ +} while (0) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +#define init_rwsem(sem) rt_init_rwsem(sem) +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock) + +static inline void down_read(struct rw_semaphore *sem) +{ + rt_down_read(sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return rt_down_read_trylock(sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + rt_down_write(sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return rt_down_write_trylock(sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + rt_up_read(sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + rt_up_write(sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + rt_downgrade_write(sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + return rt_down_read_nested(sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_write_nested(sem, subclass); +} +#ifdef CONFIG_DEBUG_LOCK_ALLOC 
+static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, &nest_lock->dep_map); +} + +#else + +static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, NULL); +} +#endif +#endif diff --git a/lib/Makefile b/lib/Makefile index 02ed6c0..7e961f1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,8 +38,11 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o + +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +endif lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) -- cgit v0.10.2 From eff404912cbe53ecb8ca1fd31b019183249ec8e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 19:39:56 +0200 Subject: rt: Add the preempt-rt lock replacement APIs Map spinlocks, rwlocks, rw_semaphores and semaphores to the rt_mutex based locking functions for preempt-rt. Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h new file mode 100644 index 0000000..853ee36 --- /dev/null +++ b/include/linux/rwlock_rt.h @@ -0,0 +1,123 @@ +#ifndef __LINUX_RWLOCK_RT_H +#define __LINUX_RWLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(rwl)->lock); \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags)) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_read_lock_irqsave(lock); \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_write_lock_irqsave(lock); \ + } while (0) + +#define read_lock(lock) \ + do { \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_irq(lock) read_lock(lock) + +#define write_lock(lock) \ + do { \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define write_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define 
write_lock_irq(lock) write_lock(lock) + +#define read_unlock(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define read_unlock_bh(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define read_unlock_irq(lock) read_unlock(lock) + +#define write_unlock(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_bh(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define write_unlock_irq(lock) write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7d537ce..0c11a7c 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -254,7 +254,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) /* Include rwlock functions */ -#include +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else +# include +#endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -265,6 +269,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # include #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else /* PREEMPT_RT_FULL */ + /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ @@ -394,4 +402,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +#endif /* !PREEMPT_RT_FULL */ + #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51df117..3f68f50 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -191,6 +191,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) return 0; } -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h new file mode 100644 index 0000000..a9ea392 --- /dev/null +++ b/include/linux/spinlock_rt.h @@ -0,0 +1,158 @@ +#ifndef __LINUX_SPINLOCK_RT_H +#define __LINUX_SPINLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. 
Use spinlock.h +#endif + +#include + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +#define spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. + */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#define spin_lock_local(lock) rt_spin_lock(lock) +#define spin_unlock_local(lock) rt_spin_unlock(lock) + +#define spin_lock(lock) \ + do { \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_irq(lock) spin_lock(lock) + +#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + do { \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) +#else +# define spin_lock_nested(lock, subclass) spin_lock(lock) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) +#endif + +#define spin_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) + +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock) +{ + unsigned long flags = 0; +#ifdef CONFIG_TRACE_IRQFLAGS + flags = rt_spin_lock_trace_flags(lock); +#else + spin_lock(lock); /* lock_local */ +#endif + return flags; +} + +/* FIXME: we need rt_spin_lock_nest_lock */ +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + +#define spin_unlock(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define spin_unlock_bh(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define spin_unlock_irq(lock) spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + spin_unlock(lock); \ + } while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) +#define spin_trylock_irq(lock) spin_trylock(lock) + +#define spin_trylock_irqsave(lock, flags) \ + rt_spin_trylock_irqsave(lock, &(flags)) + +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) + +#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define 
spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + +#define atomic_dec_and_lock(atomic, lock) \ + atomic_dec_and_spin_lock(atomic, lock) + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6..0e12fc2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,8 +7,8 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ + hrtimer.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ async.o range.o groups.o lglock.o smpboot.o @@ -32,7 +32,11 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-y += rwsem.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -44,6 +48,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/rt.c b/kernel/rt.c new file mode 100644 index 0000000..92a16e1 --- /dev/null +++ b/kernel/rt.c @@ -0,0 +1,442 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. 
+ * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +/* + * struct mutex functions + */ +void __mutex_do_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map(&mutex->dep_map, name, key, 0); +#endif + mutex->lock.save_state = 0; +} +EXPORT_SYMBOL(__mutex_do_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nest_lock); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + 
migrate_disable(); + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_write_trylock(rwlock); + if (!ret) + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the lock, + * but not when read_depth == 0 which means that the lock is + * write locked. + */ + migrate_disable(); + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(lock); + else if (!rwlock->read_depth) + ret = 0; + + if (ret) { + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + } else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + + /* + * recursive read locks succeed when current owns the lock + */ + if (rt_mutex_owner(lock) != current) + __rt_spin_lock(lock); + rwlock->read_depth++; +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + + /* Release the lock only when read_depth is down to 0 */ + if (--rwlock->read_depth == 0) + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + rwlock->lock.save_state = 1; + rwlock->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG_ON(rt_mutex_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, 
_RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. + */ + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + struct rt_mutex *lock = &rwsem->lock; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + if (rt_mutex_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + rwsem->read_depth = 0; + rwsem->lock.save_state = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd806..da9775b 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -110,8 +110,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT_FULL BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -195,6 +198,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT_FULL + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -339,6 +344,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !PREEMPT_RT_FULL */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c 
index 0374a59..9497033 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT_FULL void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT_FULL static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif -- cgit v0.10.2 From ef59d457465ff1d84a3b94ff25092a6a2debaca5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Apr 2013 16:08:46 +0200 Subject: percpu-rwsem: compile fix The shortcut on mainline skip lockdep. No idea why this is a good thing. Signed-off-by: Sebastian Andrzej Siewior diff --git a/lib/percpu-rwsem.c b/lib/percpu-rwsem.c index 652a8ee..2db0f42 100644 --- a/lib/percpu-rwsem.c +++ b/lib/percpu-rwsem.c @@ -84,8 +84,12 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); +#ifdef CONFIG_PREEMPT_RT_FULL + up_read(&brw->rw_sem); +#else /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); +#endif } void percpu_up_read(struct percpu_rw_semaphore *brw) -- cgit v0.10.2 From 511cbb26f021d12777da60b05a6a4f6d8e08c4f7 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 19 Sep 2011 11:09:27 +0200 Subject: rwlocks: Fix section mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following build error for the preempt-rt kernel. make kernel/fork.o CC kernel/fork.o kernel/fork.c:90: error: section of ¡tasklist_lock¢ conflicts with previous declaration make[2]: *** [kernel/fork.o] Error 1 make[1]: *** [kernel/fork.o] Error 2 The rt kernel cache aligns the RWLOCK in DEFINE_RWLOCK by default. The non-rt kernels explicitly cache align only the tasklist_lock in kernel/fork.c That can create a build conflict. This fixes the build problem by making the non-rt kernels cache align RWLOCKs by default. The side effect is that the other RWLOCKs are also cache aligned for non-rt. This is a short term solution for rt only. The longer term solution would be to push the cache aligned DEFINE_RWLOCK to mainline. If there are objections, then we could create a DEFINE_RWLOCK_CACHE_ALIGNED or something of that nature. Comments? Objections? 
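For illustration, the suggested opt-in variant could look roughly like this (hypothetical sketch, not part of this patch; DEFINE_RWLOCK itself would stay unchanged for mainline):

        #define DEFINE_RWLOCK_CACHE_ALIGNED(name) \
                rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)

kernel/fork.c could then keep its alignment explicit via DEFINE_RWLOCK_CACHE_ALIGNED(tasklist_lock) while every other rwlock keeps its current layout.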
Signed-off-by: John Kacur Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.00.1109191104010.23118@localhost6.localdomain6 Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index 5317cd9..d0da966 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -47,6 +47,7 @@ typedef struct { RW_DEP_MAP_INIT(lockname) } #endif -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 591d2bd..b814a02 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,7 +93,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) -- cgit v0.10.2 From 2cfd0417a86827cb80dfacbe72948ced92511733 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:08:38 +0200 Subject: timer-handle-idle-trylock-in-get-next-timer-irq.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index a9ea392..0618387 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -53,7 +53,17 @@ extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); #define spin_lock_irq(lock) spin_lock(lock) -#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock(lock) \ +({ \ + int __locked; \ + migrate_disable(); \ + __locked = spin_do_trylock(lock); \ + if (!__locked) \ + migrate_enable(); \ + __locked; \ +}) #ifdef CONFIG_LOCKDEP # define spin_lock_nested(lock, subclass) \ diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index b562de3..3bff726 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -861,15 +861,10 @@ EXPORT_SYMBOL(rt_spin_unlock_wait); int __lockfunc rt_spin_trylock(spinlock_t *lock) { - int ret; + int ret = rt_mutex_trylock(&lock->lock); - migrate_disable(); - ret = rt_mutex_trylock(&lock->lock); if (ret) spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - else - migrate_enable(); - return ret; } EXPORT_SYMBOL(rt_spin_trylock); diff --git a/kernel/timer.c b/kernel/timer.c index 5470c3e..8af44de 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1380,9 +1380,10 @@ unsigned long get_next_timer_interrupt(unsigned long now) /* * On PREEMPT_RT we cannot sleep here. If the trylock does not * succeed then we return the worst-case 'expires in 1 tick' - * value: + * value. We use the rt functions here directly to avoid a + * migrate_disable() call. 
*/ - if (!spin_trylock(&base->lock)) + if (!spin_do_trylock(&base->lock)) return now + 1; #else spin_lock(&base->lock); @@ -1392,7 +1393,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) base->next_timer = __next_timer_interrupt(base); expires = base->next_timer; } +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock(&base->lock); +#else spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; -- cgit v0.10.2 From bde69a035b37e2a6c7b351e77afa376eacbd708f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Aug 2011 00:23:17 +0200 Subject: rcu: Frob softirq test With RT_FULL we get the below wreckage: [ 126.060484] ======================================================= [ 126.060486] [ INFO: possible circular locking dependency detected ] [ 126.060489] 3.0.1-rt10+ #30 [ 126.060490] ------------------------------------------------------- [ 126.060492] irq/24-eth0/1235 is trying to acquire lock: [ 126.060495] (&(lock)->wait_lock#2){+.+...}, at: [] rt_mutex_slowunlock+0x16/0x55 [ 126.060503] [ 126.060504] but task is already holding lock: [ 126.060506] (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060511] [ 126.060511] which lock already depends on the new lock. [ 126.060513] [ 126.060514] [ 126.060514] the existing dependency chain (in reverse order) is: [ 126.060516] [ 126.060516] -> #1 (&p->pi_lock){-...-.}: [ 126.060519] [] lock_acquire+0x145/0x18a [ 126.060524] [] _raw_spin_lock_irqsave+0x4b/0x85 [ 126.060527] [] task_blocks_on_rt_mutex+0x36/0x20f [ 126.060531] [] rt_mutex_slowlock+0xd1/0x15a [ 126.060534] [] rt_mutex_lock+0x2d/0x2f [ 126.060537] [] rcu_boost+0xad/0xde [ 126.060541] [] rcu_boost_kthread+0x7d/0x9b [ 126.060544] [] kthread+0x99/0xa1 [ 126.060547] [] kernel_thread_helper+0x4/0x10 [ 126.060551] [ 126.060552] -> #0 (&(lock)->wait_lock#2){+.+...}: [ 126.060555] [] __lock_acquire+0x1157/0x1816 [ 126.060558] [] lock_acquire+0x145/0x18a [ 126.060561] [] _raw_spin_lock+0x40/0x73 [ 126.060564] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060566] [] rt_mutex_unlock+0x27/0x29 [ 126.060569] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060573] [] __rcu_read_unlock+0x48/0x89 [ 126.060576] [] select_task_rq_rt+0xc7/0xd5 [ 126.060580] [] try_to_wake_up+0x175/0x429 [ 126.060583] [] wake_up_process+0x15/0x17 [ 126.060585] [] wakeup_softirqd+0x24/0x26 [ 126.060590] [] irq_exit+0x49/0x55 [ 126.060593] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060597] [] apic_timer_interrupt+0x13/0x20 [ 126.060600] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060603] [] irq_thread+0xde/0x1af [ 126.060606] [] kthread+0x99/0xa1 [ 126.060608] [] kernel_thread_helper+0x4/0x10 [ 126.060611] [ 126.060612] other info that might help us debug this: [ 126.060614] [ 126.060615] Possible unsafe locking scenario: [ 126.060616] [ 126.060617] CPU0 CPU1 [ 126.060619] ---- ---- [ 126.060620] lock(&p->pi_lock); [ 126.060623] lock(&(lock)->wait_lock); [ 126.060625] lock(&p->pi_lock); [ 126.060627] lock(&(lock)->wait_lock); [ 126.060629] [ 126.060629] *** DEADLOCK *** [ 126.060630] [ 126.060632] 1 lock held by irq/24-eth0/1235: [ 126.060633] #0: (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060638] [ 126.060638] stack backtrace: [ 126.060641] Pid: 1235, comm: irq/24-eth0 Not tainted 3.0.1-rt10+ #30 [ 126.060643] Call Trace: [ 126.060644] [] print_circular_bug+0x289/0x29a [ 126.060651] [] __lock_acquire+0x1157/0x1816 [ 126.060655] [] ? trace_hardirqs_off_caller+0x1f/0x99 [ 126.060658] [] ? 
rt_mutex_slowunlock+0x16/0x55 [ 126.060661] [] lock_acquire+0x145/0x18a [ 126.060664] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060668] [] _raw_spin_lock+0x40/0x73 [ 126.060671] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060674] [] ? rcu_report_qs_rsp+0x87/0x8c [ 126.060677] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060680] [] ? rcu_read_unlock_special+0x9b/0x1c4 [ 126.060683] [] rt_mutex_unlock+0x27/0x29 [ 126.060687] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060690] [] __rcu_read_unlock+0x48/0x89 [ 126.060693] [] select_task_rq_rt+0xc7/0xd5 [ 126.060696] [] ? select_task_rq_rt+0x27/0xd5 [ 126.060701] [] ? clockevents_program_event+0x8e/0x90 [ 126.060704] [] try_to_wake_up+0x175/0x429 [ 126.060708] [] ? tick_program_event+0x1f/0x21 [ 126.060711] [] wake_up_process+0x15/0x17 [ 126.060715] [] wakeup_softirqd+0x24/0x26 [ 126.060718] [] irq_exit+0x49/0x55 [ 126.060721] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060724] [] apic_timer_interrupt+0x13/0x20 [ 126.060726] [] ? migrate_disable+0x75/0x12d [ 126.060733] [] ? local_bh_disable+0xe/0x1f [ 126.060736] [] ? local_bh_disable+0x1d/0x1f [ 126.060739] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060742] [] ? _raw_spin_unlock_irq+0x3b/0x59 [ 126.060745] [] irq_thread+0xde/0x1af [ 126.060748] [] ? irq_thread_fn+0x3a/0x3a [ 126.060751] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060754] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060757] [] kthread+0x99/0xa1 [ 126.060761] [] kernel_thread_helper+0x4/0x10 [ 126.060764] [] ? finish_task_switch+0x87/0x10a [ 126.060768] [] ? retint_restore_args+0xe/0xe [ 126.060771] [] ? __init_kthread_worker+0x8c/0x8c [ 126.060774] [] ? gs_change+0xb/0xb Because irq_exit() does: void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); ... } Which triggers a wakeup, which uses RCU, now if the interrupted task has t->rcu_read_unlock_special set, the rcu usage from the wakeup will end up in rcu_read_unlock_special(). rcu_read_unlock_special() will test for in_irq(), which will fail as we just decremented preempt_count with IRQ_EXIT_OFFSET, and in_sering_softirq(), which for PREEMPT_RT_FULL reads: int in_serving_softirq(void) { int res; preempt_disable(); res = __get_cpu_var(local_softirq_runner) == current; preempt_enable(); return res; } Which will thus also fail, resulting in the above wreckage. The 'somewhat' ugly solution is to open-code the preempt_count() test in rcu_read_unlock_special(). Also, we're not at all sure how ->rcu_read_unlock_special gets set here... so this is very likely a bandaid and more thought is required. Cc: Paul E. McKenney Signed-off-by: Peter Zijlstra diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e1..68cdd0a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -351,7 +351,7 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { local_irq_restore(flags); return; } -- cgit v0.10.2 From 899c4bec3cb58535f8c827175b872ec299d6949f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Oct 2011 11:59:38 -0700 Subject: rcu: Merge RCU-bh into RCU-preempt The Linux kernel has long RCU-bh read-side critical sections that intolerably increase scheduling latency under mainline's RCU-bh rules, which include RCU-bh read-side critical sections being non-preemptible. 
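On RT the way out is to let the RCU-bh read-side primitives fall back to their preemptible RCU-preempt counterparts while keeping the local softirq exclusion; a condensed sketch of that mapping (abridged and simplified from the diff below, not a verbatim excerpt):

        #ifdef CONFIG_PREEMPT_RT_FULL
        # define call_rcu_bh            call_rcu
        # define synchronize_rcu_bh     synchronize_rcu
        # define rcu_barrier_bh         rcu_barrier

        static inline void rcu_read_lock_bh(void)
        {
                local_bh_disable();     /* still excludes local softirqs */
                rcu_read_lock();        /* the reader itself is now preemptible */
        }

        static inline void rcu_read_unlock_bh(void)
        {
                rcu_read_unlock();
                local_bh_enable();
        }
        #endif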
This patch therefore arranges for RCU-bh to be implemented in terms of RCU-preempt for CONFIG_PREEMPT_RT_FULL=y. This has the downside of defeating the purpose of RCU-bh, namely, handling the case where the system is subjected to a network-based denial-of-service attack that keeps at least one CPU doing full-time softirq processing. This issue will be fixed by a later commit. The current commit will need some work to make it appropriate for mainline use, for example, it needs to be extended to cover Tiny RCU. [ paulmck: Added a useful changelog ] Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20111005185938.GA20403@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d870976..76ba53a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -120,6 +120,9 @@ extern void call_rcu(struct rcu_head *head, #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_PREEMPT_RT_FULL +#define call_rcu_bh call_rcu +#else /** * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -143,6 +146,7 @@ extern void call_rcu(struct rcu_head *head, */ extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#endif /** * call_rcu_sched() - Queue an RCU for invocation after sched grace period. @@ -216,7 +220,13 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); + +#ifndef CONFIG_PREEMPT_RT_FULL extern void rcu_bh_qs(int cpu); +#else +static inline void rcu_bh_qs(int cpu) { } +#endif + extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); @@ -366,7 +376,14 @@ static inline int rcu_read_lock_held(void) * rcu_read_lock_bh_held() is defined out of line to avoid #include-file * hell. */ +#ifdef CONFIG_PREEMPT_RT_FULL +static inline int rcu_read_lock_bh_held(void) +{ + return rcu_read_lock_held(); +} +#else extern int rcu_read_lock_bh_held(void); +#endif /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
@@ -814,10 +831,14 @@ static inline void rcu_read_unlock(void) static inline void rcu_read_lock_bh(void) { local_bh_disable(); +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_lock(); +#else __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); rcu_lockdep_assert(!rcu_is_cpu_idle(), "rcu_read_lock_bh() used illegally while idle"); +#endif } /* @@ -827,10 +848,14 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_unlock(); +#else rcu_lockdep_assert(!rcu_is_cpu_idle(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); +#endif local_bh_enable(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 952b793..f1472a2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,7 +45,11 @@ static inline void rcu_virt_note_context_switch(int cpu) rcu_note_context_switch(cpu); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define synchronize_rcu_bh synchronize_rcu +#else extern void synchronize_rcu_bh(void); +#endif extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); @@ -73,20 +77,30 @@ static inline void synchronize_rcu_bh_expedited(void) } extern void rcu_barrier(void); +#ifdef CONFIG_PREEMPT_RT_FULL +# define rcu_barrier_bh rcu_barrier +#else extern void rcu_barrier_bh(void); +#endif extern void rcu_barrier_sched(void); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); extern void rcu_force_quiescent_state(void); -extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; +#ifndef CONFIG_PREEMPT_RT_FULL +extern void rcu_bh_force_quiescent_state(void); +extern long rcu_batches_completed_bh(void); +#else +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state +# define rcu_batches_completed_bh rcu_batches_completed +#endif + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf761..8906a79 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -156,6 +156,7 @@ int debug_lockdep_rcu_enabled(void) } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * @@ -182,6 +183,7 @@ int rcu_read_lock_bh_held(void) return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +#endif #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77..e6fae48 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -181,6 +181,7 @@ void rcu_sched_qs(int cpu) rdp->passed_quiesce = 1; } +#ifndef CONFIG_PREEMPT_RT_FULL void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); @@ -189,6 +190,7 @@ void rcu_bh_qs(int cpu) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; } +#endif /* * Note a context switch. This is a quiescent state for RCU-sched, @@ -242,6 +244,7 @@ long rcu_batches_completed_sched(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Return the number of RCU BH batches processed thus far for debug & stats. 
*/ @@ -259,6 +262,7 @@ void rcu_bh_force_quiescent_state(void) force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +#endif /* * Record the number of times rcutorture tests have been initiated and @@ -2183,6 +2187,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue an RCU callback for invocation after a quicker grace period. */ @@ -2191,6 +2196,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif /* * Because a context switch is a grace period for RCU-sched and RCU-bh, @@ -2268,6 +2274,7 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * @@ -2294,6 +2301,7 @@ void synchronize_rcu_bh(void) wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +#endif static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -2682,6 +2690,7 @@ static void _rcu_barrier(struct rcu_state *rsp) mutex_unlock(&rsp->barrier_mutex); } +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ @@ -2690,6 +2699,7 @@ void rcu_barrier_bh(void) _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); +#endif /** * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. -- cgit v0.10.2 From f372309b8d7f08ea3ae71546df28ee156d16efdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:57:54 +0100 Subject: rcu-more-fallout.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58..9bc2fe6 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -371,6 +371,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. @@ -380,3 +381,4 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif -- cgit v0.10.2 From 5f98d33865e920fe4279d316a011216dcba1f615 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Oct 2011 11:45:18 -0700 Subject: rcu: Make ksoftirqd do RCU quiescent states Implementing RCU-bh in terms of RCU-preempt makes the system vulnerable to network-based denial-of-service attacks. This patch therefore makes __do_softirq() invoke rcu_bh_qs(), but only when __do_softirq() is running in ksoftirqd context. A wrapper layer in interposed so that other calls to __do_softirq() avoid invoking rcu_bh_qs(). The underlying function __do_softirq_common() does the actual work. The reason that rcu_bh_qs() is bad in these non-ksoftirqd contexts is that there might be a local_bh_enable() inside an RCU-preempt read-side critical section. This local_bh_enable() can invoke __do_softirq() directly, so if __do_softirq() were to invoke rcu_bh_qs() (which just calls rcu_preempt_qs() in the PREEMPT_RT_FULL case), there would be an illegal RCU-preempt quiescent state in the middle of an RCU-preempt read-side critical section. Therefore, quiescent states can only happen in cases where __do_softirq() is invoked directly from ksoftirqd. Signed-off-by: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20111005184518.GA21601@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 76ba53a..33e1d2e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -220,13 +220,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); - -#ifndef CONFIG_PREEMPT_RT_FULL extern void rcu_bh_qs(int cpu); -#else -static inline void rcu_bh_qs(int cpu) { } -#endif - extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e6fae48..85fd9a7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -181,7 +181,14 @@ void rcu_sched_qs(int cpu) rdp->passed_quiesce = 1; } -#ifndef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_FULL +static void rcu_preempt_qs(int cpu); + +void rcu_bh_qs(int cpu) +{ + rcu_preempt_qs(cpu); +} +#else void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 68cdd0a..778f138 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1519,7 +1519,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) /* * Check to see if any future RCU-related work will need to be done @@ -1535,6 +1535,9 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) /* * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. @@ -1651,6 +1654,7 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1694,6 +1698,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) } return 0; } +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ /* * Handler for smp_call_function_single(). 
The only point of this diff --git a/kernel/softirq.c b/kernel/softirq.c index f69c069..f52cdbc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,7 +142,7 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending, int cpu) +static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) { struct softirq_action *h = softirq_vec; unsigned int prev_count = preempt_count(); @@ -165,7 +165,8 @@ static void handle_pending_softirqs(u32 pending, int cpu) prev_count, (unsigned int) preempt_count()); preempt_count() = prev_count; } - rcu_bh_qs(cpu); + if (need_rcu_bh_qs) + rcu_bh_qs(cpu); } local_irq_disable(); } @@ -325,7 +326,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - handle_pending_softirqs(pending, cpu); + handle_pending_softirqs(pending, cpu, 1); pending = local_softirq_pending(); if (pending && --max_restart) @@ -376,7 +377,12 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); -static void __do_softirq(void); +static void __do_softirq_common(int need_rcu_bh_qs); + +void __do_softirq(void) +{ + __do_softirq_common(0); +} void __init softirq_early_init(void) { @@ -447,7 +453,7 @@ EXPORT_SYMBOL(in_serving_softirq); * Called with bh and local interrupts disabled. For full RT cpu must * be pinned. */ -static void __do_softirq(void) +static void __do_softirq_common(int need_rcu_bh_qs) { u32 pending = local_softirq_pending(); int cpu = smp_processor_id(); @@ -461,7 +467,7 @@ static void __do_softirq(void) lockdep_softirq_enter(); - handle_pending_softirqs(pending, cpu); + handle_pending_softirqs(pending, cpu, need_rcu_bh_qs); pending = local_softirq_pending(); if (pending) @@ -500,7 +506,7 @@ static int __thread_do_softirq(int cpu) * schedule! 
*/ if (local_softirq_pending()) - __do_softirq(); + __do_softirq_common(cpu >= 0); local_unlock(local_softirq_lock); unpin_current_cpu(); preempt_disable(); -- cgit v0.10.2 From c99e11cbd6b6e9d18ad640bdd57aee0d682bea70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 11:02:21 +0200 Subject: lglocks-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0d24e93..d2c0d6d 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -42,22 +42,37 @@ #endif struct lglock { +#ifndef CONFIG_PREEMPT_RT_FULL arch_spinlock_t __percpu *lock; +#else + struct rt_mutex __percpu *lock; +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lock_key; struct lockdep_map lock_dep_map; #endif }; -#define DEFINE_LGLOCK(name) \ +#ifndef CONFIG_PREEMPT_RT_FULL +# define DEFINE_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ struct lglock name = { .lock = &name ## _lock } -#define DEFINE_STATIC_LGLOCK(name) \ +# define DEFINE_STATIC_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ static struct lglock name = { .lock = &name ## _lock } +#else + +# define DEFINE_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock); \ + struct lglock name = { .lock = &name ## _lock } + +# define DEFINE_STATIC_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock); \ + static struct lglock name = { .lock = &name ## _lock } +#endif void lg_lock_init(struct lglock *lg, char *name); void lg_local_lock(struct lglock *lg); diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a66..0bbf5d1 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -4,6 +4,15 @@ #include #include +#ifndef CONFIG_PREEMPT_RT_FULL +# define lg_lock_ptr arch_spinlock_t +# define lg_do_lock(l) arch_spin_lock(l) +# define lg_do_unlock(l) arch_spin_unlock(l) +#else +# define lg_lock_ptr struct rt_mutex +# define lg_do_lock(l) __rt_spin_lock(l) +# define lg_do_unlock(l) __rt_spin_unlock(l) +#endif /* * Note there is no uninit, so lglocks cannot be defined in * modules (but it's fine to use them from there) @@ -12,51 +21,60 @@ void lg_lock_init(struct lglock *lg, char *name) { +#ifdef CONFIG_PREEMPT_RT_FULL + int i; + + for_each_possible_cpu(i) { + struct rt_mutex *lock = per_cpu_ptr(lg->lock, i); + + rt_mutex_init(lock); + } +#endif LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); } EXPORT_SYMBOL(lg_lock_init); void lg_local_lock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + migrate_disable(); rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock); void lg_local_unlock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + migrate_enable(); } EXPORT_SYMBOL(lg_local_unlock); void lg_local_lock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + preempt_disable_nort(); rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock_cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; rwlock_release(&lg->lock_dep_map, 1, 
_RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_local_unlock_cpu); @@ -64,12 +82,12 @@ void lg_global_lock(struct lglock *lg) { int i; - preempt_disable(); + preempt_disable_nort(); rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); + lg_do_lock(lock); } } EXPORT_SYMBOL(lg_global_lock); @@ -80,10 +98,10 @@ void lg_global_unlock(struct lglock *lg) rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); + lg_do_unlock(lock); } - preempt_enable(); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_global_unlock); -- cgit v0.10.2 From 40db6e3bbe5fd5f08f161e6e872a8ee571f88df1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:01 -0500 Subject: serial: 8250: Clean up the locking for -rt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index 733f22c..c6cb88c 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -2900,14 +2900,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - local_irq_save(flags); - if (port->sysrq) { - /* serial8250_handle_irq() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&port->lock); - } else - spin_lock(&port->lock); + if (port->sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&port->lock, flags); + else + spin_lock_irqsave(&port->lock, flags); /* * First save the IER then disable the interrupts @@ -2939,8 +2935,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) serial8250_modem_status(up); if (locked) - spin_unlock(&port->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&port->lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) -- cgit v0.10.2 From 76db8fdf85c7f9340a6461a61de3dd42b5d35b9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:01 -0500 Subject: serial: 8250: Call flush_to_ldisc when the irq is threaded Signed-off-by: Ingo Molnar diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index cd1f861..7c021eb 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -566,10 +566,15 @@ void tty_flip_buffer_push(struct tty_struct *tty) buf->tail->commit = buf->tail->used; spin_unlock_irqrestore(&buf->lock, flags); +#ifndef CONFIG_PREEMPT_RT_FULL if (tty->low_latency) flush_to_ldisc(&buf->work); else schedule_work(&buf->work); +#else + flush_to_ldisc(&buf->work); +#endif + } EXPORT_SYMBOL(tty_flip_buffer_push); -- cgit v0.10.2 From 184b04e541f4b54bb12d04b7fbbb7818154ddb74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Jul 2011 13:32:57 +0200 Subject: drivers-tty-fix-omap-lock-crap.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 57d6b29..f33fa96 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1166,13 +1166,10 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_get_sync(up->dev); - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = 
spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -1201,8 +1198,7 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init -- cgit v0.10.2 From 67a5dafa24b0e4826a8e1ca6acc07242fdc843f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jan 2013 21:36:51 +0100 Subject: drivers-tty-pl011-irq-disable-madness.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 7fca402..d7294f7 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1779,13 +1779,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @@ -1807,8 +1813,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) writew(old_cr, uap->port.membase + UART011_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } -- cgit v0.10.2 From 27bcea12186fc98629b3bfab800f05a9114b4e5b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Dec 2011 13:05:54 +0100 Subject: rt: Improve the serial console PASS_LIMIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Beyond the warning: drivers/tty/serial/8250/8250.c:1613:6: warning: unused variable ‘pass_counter’ [-Wunused-variable] the solution of just looping infinitely was ugly - up it to 1 million to give it a chance to continue in some really ugly situation. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index c6cb88c..25a5451 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -80,7 +80,16 @@ static unsigned int skip_txen_test; /* force skip of txen test at init time */ #define DEBUG_INTR(fmt...) do { } while (0) #endif -#define PASS_LIMIT 512 +/* + * On -rt we can have a more delays, and legitimately + * so - so don't drop work spuriously and spam the + * syslog: + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define PASS_LIMIT 1000000 +#else +# define PASS_LIMIT 512 +#endif #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) -- cgit v0.10.2 From 0c577d12cd3c98517fd06e1fd5778a485e1f2acf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 08:44:27 -0500 Subject: fs: namespace preemption fix On RT we cannot loop with preemption disabled here as mnt_make_readonly() might have been preempted. We can safely enable preemption while waiting for MNT_WRITE_HOLD to be cleared. Safe on !RT as well. 
Signed-off-by: Thomas Gleixner diff --git a/fs/namespace.c b/fs/namespace.c index 5dd7709..c03d460 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -313,8 +313,11 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); cpu_relax(); + preempt_disable(); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until -- cgit v0.10.2 From 21179801fa92ab858b1c72c009f1b5ea4dc90ea1 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 15 May 2012 13:53:56 +0800 Subject: mm: Protect activate_mm() by preempt_[disable&enable]_rt() User preempt_*_rt instead of local_irq_*_rt or otherwise there will be warning on ARM like below: WARNING: at build/linux/kernel/smp.c:459 smp_call_function_many+0x98/0x264() Modules linked in: [] (unwind_backtrace+0x0/0xe4) from [] (warn_slowpath_common+0x4c/0x64) [] (warn_slowpath_common+0x4c/0x64) from [] (warn_slowpath_null+0x18/0x1c) [] (warn_slowpath_null+0x18/0x1c) from [](smp_call_function_many+0x98/0x264) [] (smp_call_function_many+0x98/0x264) from [] (smp_call_function+0x44/0x6c) [] (smp_call_function+0x44/0x6c) from [] (__new_context+0xbc/0x124) [] (__new_context+0xbc/0x124) from [] (flush_old_exec+0x460/0x5e4) [] (flush_old_exec+0x460/0x5e4) from [] (load_elf_binary+0x2e0/0x11ac) [] (load_elf_binary+0x2e0/0x11ac) from [] (search_binary_handler+0x94/0x2a4) [] (search_binary_handler+0x94/0x2a4) from [] (do_execve+0x254/0x364) [] (do_execve+0x254/0x364) from [] (sys_execve+0x34/0x54) [] (sys_execve+0x34/0x54) from [] (ret_fast_syscall+0x0/0x30) ---[ end trace 0000000000000002 ]--- The reason is that ARM need irq enabled when doing activate_mm(). According to mm-protect-activate-switch-mm.patch, actually preempt_[disable|enable]_rt() is sufficient. 
Inspired-by: Steven Rostedt Signed-off-by: Yong Zhang Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1337061236-1766-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/fs/exec.c b/fs/exec.c index 20df02c..db1c4ec 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -827,10 +827,12 @@ static int exec_mmap(struct mm_struct *mm) } } task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + preempt_enable_rt(); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4..1385e48 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -26,6 +26,7 @@ void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; if (active_mm != mm) { atomic_inc(&mm->mm_count); @@ -33,6 +34,7 @@ void use_mm(struct mm_struct *mm) } tsk->mm = mm; switch_mm(active_mm, mm, tsk); + preempt_enable_rt(); task_unlock(tsk); if (active_mm != mm) -- cgit v0.10.2 From e78852912d83dc1cf9cbdbaee6e87f8c313e2c4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jun 2011 17:05:09 +0200 Subject: fs-block-rt-support.patch Signed-off-by: Thomas Gleixner diff --git a/block/blk-core.c b/block/blk-core.c index 0ff1564..372ddb3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -241,7 +241,7 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); diff --git a/fs/file.c b/fs/file.c index 3906d95..328087b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -98,14 +98,14 @@ static void free_fdtable_rcu(struct rcu_head *rcu) kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + fddef = &per_cpu(fdtable_defer_list, get_cpu_light()); spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); + put_cpu_light(); } } -- cgit v0.10.2 From a4c07c6bec8f6e80cae08fe6443cf43c0bae0274 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 3 Jul 2009 08:44:12 -0500 Subject: fs: ntfs: disable interrupt only on !RT On Sat, 2007-10-27 at 11:44 +0200, Ingo Molnar wrote: > * Nick Piggin wrote: > > > > [10138.175796] [] show_trace+0x12/0x14 > > > [10138.180291] [] dump_stack+0x16/0x18 > > > [10138.184769] [] native_smp_call_function_mask+0x138/0x13d > > > [10138.191117] [] smp_call_function+0x1e/0x24 > > > [10138.196210] [] on_each_cpu+0x25/0x50 > > > [10138.200807] [] flush_tlb_all+0x1e/0x20 > > > [10138.205553] [] kmap_high+0x1b6/0x417 > > > [10138.210118] [] kmap+0x4d/0x4f > > > [10138.214102] [] ntfs_end_buffer_async_read+0x228/0x2f9 > > > [10138.220163] [] end_bio_bh_io_sync+0x26/0x3f > > > [10138.225352] [] bio_endio+0x42/0x6d > > > [10138.229769] [] __end_that_request_first+0x115/0x4ac > > > [10138.235682] [] end_that_request_chunk+0x8/0xa > > > [10138.241052] [] ide_end_request+0x55/0x10a > > > [10138.246058] [] ide_dma_intr+0x6f/0xac > > > [10138.250727] [] ide_intr+0x93/0x1e0 > > > [10138.255125] [] handle_IRQ_event+0x5c/0xc9 > > > > Looks like ntfs is kmap()ing from interrupt context. Should be using > > kmap_atomic instead, I think. > > it's not atomic interrupt context but irq thread context - and -rt > remaps kmap_atomic() to kmap() internally. Hm. 
Looking at the change to mm/bounce.c, perhaps I should do this instead? Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e4ca73..f5d4565 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -144,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); -- cgit v0.10.2 From 8235196b46b6267611f54bb6daf1d964867d26ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Dec 2010 16:33:39 +0100 Subject: x86: Convert mce timer to hrtimer mce_timer is started in atomic contexts of cpu bringup. This results in might_sleep() warnings on RT. Convert mce_timer to a hrtimer to avoid this. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 80dbda8..b57a6ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1259,7 +1260,7 @@ void mce_log_therm_throt_event(__u64 status) static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static DEFINE_PER_CPU(struct hrtimer, mce_timer); static unsigned long mce_adjust_timer_default(unsigned long interval) { @@ -1269,13 +1270,10 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void mce_timer_fn(unsigned long data) +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) { - struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; - WARN_ON(smp_processor_id() != data); - if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -1296,9 +1294,10 @@ static void mce_timer_fn(unsigned long data) __this_cpu_write(mce_next_interval, iv); /* Might have become 0 after CMCI storm subsided */ if (iv) { - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_usecs(iv))); + return HRTIMER_RESTART; } + return HRTIMER_NORESTART; } /* @@ -1306,28 +1305,37 @@ static void mce_timer_fn(unsigned long data) */ void mce_timer_kick(unsigned long interval) { - struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long when = jiffies + interval; + struct hrtimer *t = &__get_cpu_var(mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + if (hrtimer_active(t)) { + s64 exp; + s64 intv_us; + + intv_us = jiffies_to_usecs(interval); + exp = ktime_to_us(hrtimer_expires_remaining(t)); + if (intv_us < exp) { + hrtimer_cancel(t); + hrtimer_start_range_ns(t, + ns_to_ktime(intv_us * 1000), + 0, HRTIMER_MODE_REL_PINNED); + } } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); + hrtimer_start_range_ns(t, + ns_to_ktime(jiffies_to_usecs(interval) 
* 1000), + 0, HRTIMER_MODE_REL_PINNED); } if (interval < iv) __this_cpu_write(mce_next_interval, interval); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); + hrtimer_cancel(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1632,7 +1640,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(unsigned int cpu, struct hrtimer *t) { unsigned long iv = mce_adjust_timer(check_interval * HZ); @@ -1641,16 +1649,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) if (mca_cfg.ignore_ce || !iv) return; - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, smp_processor_id()); + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000), + 0, HRTIMER_MODE_REL_PINNED); } static void __mcheck_cpu_init_timer(void) { - struct timer_list *t = &__get_cpu_var(mce_timer); + struct hrtimer *t = &__get_cpu_var(mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = mce_timer_fn; mce_start_timer(cpu, t); } @@ -2307,6 +2316,8 @@ static void __cpuinit mce_disable_cpu(void *h) if (!mce_available(__this_cpu_ptr(&cpu_info))) return; + hrtimer_cancel(&__get_cpu_var(mce_timer)); + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < mca_cfg.banks; i++) { @@ -2333,6 +2344,7 @@ static void __cpuinit mce_reenable_cpu(void *h) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } + __mcheck_cpu_init_timer(); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -2340,7 +2352,6 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: @@ -2356,11 +2367,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); break; case CPU_DOWN_FAILED: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); break; } -- cgit v0.10.2 From bde85ab102f34bc1be14d0c151a59522941cadb9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Apr 2013 14:33:34 -0400 Subject: x86/mce: Defer mce wakeups to threads for PREEMPT_RT We had a customer report a lockup on a 3.0-rt kernel that had the following backtrace: [ffff88107fca3e80] rt_spin_lock_slowlock at ffffffff81499113 [ffff88107fca3f40] rt_spin_lock at ffffffff81499a56 [ffff88107fca3f50] __wake_up at ffffffff81043379 [ffff88107fca3f80] mce_notify_irq at ffffffff81017328 [ffff88107fca3f90] intel_threshold_interrupt at ffffffff81019508 [ffff88107fca3fa0] smp_threshold_interrupt at ffffffff81019fc1 [ffff88107fca3fb0] threshold_interrupt at ffffffff814a1853 It actually bugged because the lock was taken by the same owner that already had that lock. What happened was the thread that was setting itself on a wait queue had the lock when an MCE triggered. The MCE interrupt does a wake up on its wait list and grabs the same lock. 
NOTE: THIS IS NOT A BUG ON MAINLINE Sorry for yelling, but as I Cc'd mainline maintainers I want them to know that this is an PREEMPT_RT bug only. I only Cc'd them for advice. On PREEMPT_RT the wait queue locks are converted from normal "spin_locks" into an rt_mutex (see the rt_spin_lock_slowlock above). These are not to be taken by hard interrupt context. This usually isn't a problem as most all interrupts in PREEMPT_RT are converted into schedulable threads. Unfortunately that's not the case with the MCE irq. As wait queue locks are notorious for long hold times, we can not convert them to raw_spin_locks without causing issues with -rt. But Thomas has created a "simple-wait" structure that uses raw spin locks which may have been a good fit. Unfortunately, wait queues are not the only issue, as the mce_notify_irq also does a schedule_work(), which grabs the workqueue spin locks that have the exact same issue. Thus, this patch I'm proposing is to move the actual work of the MCE interrupt into a helper thread that gets woken up on the MCE interrupt and does the work in a schedulable context. NOTE: THIS PATCH ONLY CHANGES THE BEHAVIOR WHEN PREEMPT_RT IS SET Oops, sorry for yelling again, but I want to stress that I keep the same behavior of mainline when PREEMPT_RT is not set. Thus, this only changes the MCE behavior when PREEMPT_RT is configured. Signed-off-by: Steven Rostedt [bigeasy@linutronix: make mce_notify_work() a proper prototype, use kthread_run()] Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b57a6ed..332e133 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1345,6 +1346,63 @@ static void mce_do_trigger(struct work_struct *work) static DECLARE_WORK(mce_trigger_work, mce_do_trigger); +static void __mce_notify_work(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (mce_helper[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); +} + +#ifdef CONFIG_PREEMPT_RT_FULL +struct task_struct *mce_notify_helper; + +static int mce_notify_helper_thread(void *unused) +{ + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + if (kthread_should_stop()) + break; + __mce_notify_work(); + } + return 0; +} + +static int mce_notify_work_init(void) +{ + mce_notify_helper = kthread_run(mce_notify_helper_thread, NULL, + "mce-notify"); + if (!mce_notify_helper) + return -ENOMEM; + + return 0; +} + +static void mce_notify_work(void) +{ + wake_up_process(mce_notify_helper); +} +#else +static void mce_notify_work(void) +{ + __mce_notify_work(); +} +static inline int mce_notify_work_init(void) { return 0; } +#endif + /* * Notify the user(s) about new machine check events. 
* Can be called from interrupt context, but not from machine check/NMI @@ -1352,24 +1410,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_irq(void) { - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (mce_helper[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - + mce_notify_work(); return 1; } return 0; @@ -2431,6 +2473,8 @@ static __init int mcheck_init_device(void) /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); + err = mce_notify_work_init(); + return err; } device_initcall_sync(mcheck_init_device); -- cgit v0.10.2 From 3ad1be686c3bf19319380987ed71ddd70e7679e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 14:25:18 +0100 Subject: x86: stackprotector: Avoid random pool on rt CPU bringup calls into the random pool to initialize the stack canary. During boot that works nicely even on RT as the might sleep checks are disabled. During CPU hotplug the might sleep checks trigger. Making the locks in random raw is a major PITA, so avoid the call on RT is the only sensible solution. This is basically the same randomness which we get during boot where the random pool has no entropy and we rely on the TSC randomnness. Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 6a99859..64fb5cb 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -57,7 +57,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 uninitialized_var(canary); u64 tsc; #ifdef CONFIG_X86_64 @@ -68,8 +68,16 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of + * it. */ +#ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); +#endif tsc = __native_read_tsc(); canary += tsc + (tsc << 32UL); -- cgit v0.10.2 From 6a262313f6db79132b2cb4dbb09f596dee0c4fc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 02:21:32 +0200 Subject: x86: Use generic rwsem_spinlocks on -rt Simplifies the separation of anon_rw_semaphores and rw_semaphores for -rt. 
Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0694d09..8b866b2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -173,8 +173,11 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API +config RWSEM_GENERIC_SPINLOCK + def_bool PREEMPT_RT_FULL + config RWSEM_XCHGADD_ALGORITHM - def_bool y + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL config GENERIC_CALIBRATE_DELAY def_bool y -- cgit v0.10.2 From 4f1f56c94adce7d553d446238478ee173198d6fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Jul 2009 08:44:10 -0500 Subject: x86: Disable IST stacks for debug/int 3/stack fault for PREEMPT_RT Normally the x86-64 trap handlers for debug/int 3/stack fault run on a special interrupt stack to make them more robust when dealing with kernel code. The PREEMPT_RT kernel can sleep in locks even while allocating GFP_ATOMIC memory. When one of these trap handlers needs to send real time signals for ptrace it allocates memory and could then try to to schedule. But it is not allowed to schedule on a IST stack. This can cause warnings and hangs. This patch disables the IST stacks for these handlers for PREEMPT_RT kernel. Instead let them run on the normal process stack. The kernel only really needs the ISTs here to make kernel debuggers more robust in case someone sets a break point somewhere where the stack is invalid. But there are no kernel debuggers in the standard kernel that do this. It also means kprobes cannot be set in situations with invalid stack; but that sounds like a reasonable restriction. The stack fault change could minimally impact oops quality, but not very much because stack faults are fairly rare. A better solution would be to use similar logic as the NMI "paranoid" path: check if signal is for user space, if yes go back to entry.S, switch stack, call sync_regs, then do the signal sending etc. But this patch is much simpler and should work too with minimal impact. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb..65b85f4 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,21 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define STACKFAULT_STACK 0 +# define DOUBLEFAULT_STACK 1 +# define NMI_STACK 2 +# define DEBUG_STACK 0 +# define MCE_STACK 3 +# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ +#else +# define STACKFAULT_STACK 1 +# define DOUBLEFAULT_STACK 2 +# define NMI_STACK 3 +# define DEBUG_STACK 4 +# define MCE_STACK 5 +# define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#endif #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9c3ab43..2636e0f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1103,7 +1103,9 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = DEBUG_STKSZ +#endif }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b653675..f16c07b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -21,10 +21,14 @@ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { +#if DEBUG_STACK > 0 [ DEBUG_STACK-1 ] = "#DB", +#endif [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", +#if STACKFAULT_STACK > 0 [ STACKFAULT_STACK-1 ] = "#SS", +#endif [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... -- cgit v0.10.2 From 09c220bdd3bee8ae1ee0e3325438502dcdf3c318 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 16:35:35 +0200 Subject: epoll.patch Signed-off-by: Thomas Gleixner diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec183..343e14a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -497,12 +497,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) */ static void ep_poll_safewake(wait_queue_head_t *wq) { - int this_cpu = get_cpu(); + int this_cpu = get_cpu_light(); ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); - put_cpu(); + put_cpu_light(); } static void ep_remove_wait_queue(struct eppoll_entry *pwq) -- cgit v0.10.2 From 046949d7f5173e7d1ee5ebf35d3e3cad113f8b34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 11:39:36 +0200 Subject: mm-vmalloc.patch Signed-off-by: Thomas Gleixner diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a16..f4b4fee 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -782,7 +782,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; - int node, err; + int node, err, cpu; node = numa_node_id(); @@ -821,12 +821,13 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) BUG_ON(err); radix_tree_preload_end(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vb; } @@ -900,7 +901,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; + int purge = 0, cpu; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -916,7 +917,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) again: rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { int i; @@ -953,7 +955,7 @@ next: if (purge) purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); if (!addr) { -- cgit v0.10.2 From 1981f7205d944eddb8b4a4522ef92cddf0491a9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:42:26 +0200 Subject: Use local irq lock instead of irq disable regions Signed-off-by: Thomas Gleixner diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3a3a98f..11285e4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "workqueue_sched.h" @@ -278,6 +279,8 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); 
struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); + #define CREATE_TRACE_POINTS #include @@ -1092,7 +1095,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, { struct global_cwq *gcwq; - local_irq_save(*flags); + local_lock_irqsave(pendingb_lock, *flags); /* try to steal the timer if it exists */ if (is_dwork) { @@ -1151,7 +1154,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, } spin_unlock(&gcwq->lock); fail: - local_irq_restore(*flags); + local_unlock_irqrestore(pendingb_lock, *flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); @@ -1246,7 +1249,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); debug_work_activate(work); @@ -1336,14 +1339,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(pendingb_lock,flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1451,14 +1454,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, unsigned long flags; /* read the comment in __queue_work() */ - local_irq_save(flags); + local_lock_irqsave(pendingb_lock, flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL_GPL(queue_delayed_work_on); @@ -1508,7 +1511,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ @@ -2936,7 +2939,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); flush_work(work); clear_work_data(work); @@ -2981,11 +2984,11 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { - local_irq_disable(); + local_lock_irq(pendingb_lock); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); - local_irq_enable(); + local_unlock_irq(pendingb_lock); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); @@ -3015,7 +3018,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) return false; set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v0.10.2 From 2879be1b9a6e400bafca4f4f741c957093cb586a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:41:35 +0200 Subject: debugobjects-rt.patch Signed-off-by: Thomas Gleixner diff --git a/lib/debugobjects.c b/lib/debugobjects.c index d11808c..cf5f02f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -309,7 +309,10 @@ 
__debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) struct debug_obj *obj; unsigned long flags; - fill_pool(); +#ifdef CONFIG_PREEMPT_RT_FULL + if (preempt_count() == 0 && !irqs_disabled()) +#endif + fill_pool(); db = get_bucket((unsigned long) addr); -- cgit v0.10.2 From a1b185d834708a0c166ad5bea613e87dbbb6c2c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 11:03:16 +0200 Subject: jump-label-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0976fc4..40c876b 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -50,7 +50,8 @@ #include #include -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) && \ + !defined(CONFIG_PREEMPT_BASE) struct static_key { atomic_t enabled; -- cgit v0.10.2 From e36d844ce8ae91f99576f282ffb09c0f0359d5de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 15:38:34 +0200 Subject: skbufhead-raw-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ef07d0..b705426 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1783,6 +1783,7 @@ struct softnet_data { unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + struct sk_buff_head tofree_queue; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 98399e2..0ad6aff 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,7 @@ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @@ -1008,6 +1009,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { diff --git a/net/core/dev.c b/net/core/dev.c index b741130..4693901 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); + raw_spin_lock(&sd->input_pkt_queue.raw_lock); #endif } static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); #endif } @@ -3533,7 +3533,7 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } @@ -3542,10 +3542,13 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } + + if (!skb_queue_empty(&sd->tofree_queue)) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } static int napi_gro_complete(struct sk_buff *skb) @@ -4050,10 +4053,17 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = &__get_cpu_var(softnet_data); unsigned long time_limit = jiffies + 2; int budget = 
netdev_budget; + struct sk_buff *skb; void *have; local_irq_disable(); + while ((skb = __skb_dequeue(&sd->tofree_queue))) { + local_irq_enable(); + kfree_skb(skb); + local_irq_disable(); + } + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -6535,6 +6545,9 @@ static int dev_cpu_callback(struct notifier_block *nfb, netif_rx(skb); input_queue_head_incr(oldsd); } + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) { + kfree_skb(skb); + } return NOTIFY_OK; } @@ -6807,8 +6820,9 @@ static int __init net_dev_init(void) struct softnet_data *sd = &per_cpu(softnet_data, i); memset(sd, 0, sizeof(*sd)); - skb_queue_head_init(&sd->input_pkt_queue); - skb_queue_head_init(&sd->process_queue); + skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init_raw(&sd->process_queue); + skb_queue_head_init_raw(&sd->tofree_queue); sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); sd->output_queue = NULL; -- cgit v0.10.2 From 3ad6871af4d3ccf510e229c386d36e757160070e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 14:05:05 +0200 Subject: x86-no-perf-irq-work-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703..129b8bb 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -18,6 +18,7 @@ void smp_irq_work_interrupt(struct pt_regs *regs) irq_exit(); } +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -28,3 +29,4 @@ void arch_irq_work_raise(void) apic_wait_icr_idle(); #endif } +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b..170c2ea 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -107,8 +107,10 @@ void irq_work_run(void) if (llist_empty(this_list)) return; +#ifndef CONFIG_PREEMPT_RT_FULL BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); +#endif llnode = llist_del_all(this_list); while (llnode != NULL) { diff --git a/kernel/timer.c b/kernel/timer.c index 8af44de..374e7b1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1419,7 +1419,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) if (in_irq()) irq_work_run(); #endif @@ -1433,6 +1433,10 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) + irq_work_run(); +#endif + printk_tick(); hrtimer_run_pending(); -- cgit v0.10.2 From 0d489fd0024fb6829af2dd4528622943e852a6c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Sep 2012 14:50:37 +0200 Subject: printk-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index df4b56b..2609082 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1315,6 +1315,7 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!console_drivers) return; + migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1327,6 +1328,7 @@ static void call_console_drivers(int level, const char *text, size_t len) continue; con->write(con, text, len); } + migrate_enable(); } /* @@ -1386,12 +1388,18 @@ static inline int can_use_console(unsigned int cpu) * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. 
*/ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(unsigned int cpu, unsigned long flags) __releases(&logbuf_lock) { int retval = 0, wake = 0; +#ifdef CONFIG_PREEMPT_RT_FULL + int lock = !early_boot_irqs_disabled && !irqs_disabled_flags(flags) && + (preempt_count() <= 1); +#else + int lock = 1; +#endif - if (console_trylock()) { + if (lock && console_trylock()) { retval = 1; /* @@ -1670,8 +1678,15 @@ asmlinkage int vprintk_emit(int facility, int level, * The console_trylock_for_printk() function will release 'logbuf_lock' * regardless of whether it actually gets the console semaphore or not. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk(this_cpu, flags)) { +#ifndef CONFIG_PREEMPT_RT_FULL + console_unlock(); +#else + raw_local_irq_restore(flags); console_unlock(); + raw_local_irq_save(flags); +#endif + } lockdep_on(); out_restore_irqs: @@ -2060,11 +2075,16 @@ static void console_cont_flush(char *text, size_t size) goto out; len = cont_print_text(text, size); +#ifndef CONFIG_PREEMPT_RT_FULL raw_spin_unlock(&logbuf_lock); stop_critical_timings(); call_console_drivers(cont.level, text, len); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(cont.level, text, len); +#endif return; out: raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -2147,12 +2167,17 @@ skip: console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; - raw_spin_unlock(&logbuf_lock); +#ifndef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(level, text, len); +#endif } console_locked = 0; -- cgit v0.10.2 From db4240cfbba9488a62db004caa2871105e5fd668 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Apr 2013 03:17:21 -0500 Subject: Powerpc: Use generic rwsem on RT Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dabe429..d2bf89e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT config RWSEM_GENERIC_SPINLOCK bool + default y if PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM bool - default y + default y if !PREEMPT_RT_FULL config GENERIC_LOCKBREAK bool -- cgit v0.10.2 From 7aedd38755e7fb0f7f7ff06ed4928cc4bfd79feb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:08:34 +0200 Subject: power-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d2bf89e..bd0885eb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -291,7 +291,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" - depends on PPC32 + depends on PPC32 && !PREEMPT_RT_FULL source kernel/Kconfig.hz source kernel/Kconfig.preempt -- cgit v0.10.2 From 3f4308548643954575f0ec7200c8a6785cb1c167 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:09:28 +0200 Subject: arm-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4890796..772c2d3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1752,7 +1752,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" - depends on MMU + depends on MMU && !PREEMPT_RT_FULL help The address space of 
ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address -- cgit v0.10.2 From e84782ec09dbc4c34424916851cca1b83cbf7739 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 May 2010 18:29:35 +0200 Subject: ARM: at91: tclib: Default to tclib timer for RT RT is not too happy about the shared timer interrupt in AT91 devices. Default to tclib timer for RT. Signed-off-by: Thomas Gleixner diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 25ffcfd..bb07512 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -63,6 +63,7 @@ config ATMEL_PWM config ATMEL_TCLIB bool "Atmel AT32/AT91 Timer/Counter Library" depends on (AVR32 || ARCH_AT91) + default y if PREEMPT_RT_FULL help Select this if you want a library to allocate the Timer/Counter blocks found on many Atmel processors. This facilitates using @@ -95,7 +96,7 @@ config ATMEL_TCB_CLKSRC_BLOCK config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK bool "TC Block use 32 KiHz clock" depends on ATMEL_TCB_CLKSRC - default y + default y if !PREEMPT_RT_FULL help Select this to use 32 KiHz base clock rate as TC block clock source for clock events. -- cgit v0.10.2 From 2cfff562ec7225e5cfe2e069723aff98814301f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:10:12 +0200 Subject: mips-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2ac626a..1c355a1 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2102,7 +2102,7 @@ config CPU_R4400_WORKAROUNDS # config HIGHMEM bool "High Memory Support" - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !PREEMPT_RT_FULL config CPU_SUPPORTS_HIGHMEM bool -- cgit v0.10.2 From 511239788448a25b2b79a1bccbab89c51775ade5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Oct 2011 10:48:39 -0400 Subject: net: Avoid livelock in net_tx_action() on RT qdisc_lock is taken w/o disabling interrupts or bottom halfs. So code holding a qdisc_lock() can be interrupted and softirqs can run on the return of interrupt in !RT. The spin_trylock() in net_tx_action() makes sure, that the softirq does not deadlock. When the lock can't be acquired q is requeued and the NET_TX softirq is raised. That causes the softirq to run over and over. That works in mainline as do_softirq() has a retry loop limit and leaves the softirq processing in the interrupt return path and schedules ksoftirqd. The task which holds qdisc_lock cannot be preempted, so the lock is released and either ksoftirqd or the next softirq in the return from interrupt path can proceed. Though it's a bit strange to actually run MAX_SOFTIRQ_RESTART (10) loops before it decides to bail out even if it's clear in the first iteration :) On RT all softirq processing is done in a FIFO thread and we don't have a loop limit, so ksoftirqd preempts the lock holder forever and unqueues and requeues until the reset button is hit. Due to the forced threading of ksoftirqd on RT we actually cannot deadlock on qdisc_lock because it's a "sleeping lock". So it's safe to replace the spin_trylock() with a spin_lock(). When contended, ksoftirqd is scheduled out and the lock holder can proceed. [ tglx: Massaged changelog and code comments ] Solved-by: Thomas Gleixner Signed-off-by: Steven Rostedt Tested-by: Carsten Emde Cc: Clark Williams Cc: John Kacur Cc: Luis Claudio R. 
Goncalves Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index 4693901..9d691dc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3124,6 +3124,36 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT runs ksoftirqd as a real time thread and the root_lock is a + * "sleeping spinlock". If the trylock fails then we can go into an + * infinite loop when ksoftirqd preempted the task which actually + * holds the lock, because we requeue q and raise NET_TX softirq + * causing ksoftirqd to loop forever. + * + * It's safe to use spin_lock on RT here as softirqs run in thread + * context and cannot deadlock against the thread which is holding + * root_lock. + * + * On !RT the trylock might fail, but there we bail out from the + * softirq loop after 10 attempts which we can't do on RT. And the + * task holding root_lock cannot be preempted, so the only downside of + * that trylock is that we need 10 loops to decide that we should have + * given up in the first one :) + */ +static inline int take_root_lock(spinlock_t *lock) +{ + spin_lock(lock); + return 1; +} +#else +static inline int take_root_lock(spinlock_t *lock) +{ + return spin_trylock(lock); +} +#endif + static void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = &__get_cpu_var(softnet_data); @@ -3162,7 +3192,7 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { + if (take_root_lock(root_lock)) { smp_mb__before_clear_bit(); clear_bit(__QDISC_STATE_SCHED, &q->state); -- cgit v0.10.2 From 42b7a151bb6effb667a2f5d72b167bf38a88ce37 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:51:17 +0100 Subject: net: sysrq via icmp There are (probably rare) situations when a system crashed and the system console becomes unresponsive but the network icmp layer still is alive. Wouldn't it be wonderful, if we then could submit a sysreq command via ping? This patch provides this facility. Please consult the updated documentation Documentation/sysrq.txt for details. Signed-off-by: Carsten Emde diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 2a4cdda..67ac78c 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -57,10 +57,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - , On other - If you know of the key combos for other architectures, please let me know so I can add them to this section. -On all - write a character to /proc/sysrq-trigger. e.g.: - +On all - write a character to /proc/sysrq-trigger, e.g.: echo t > /proc/sysrq-trigger +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g. + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq + Send an ICMP echo request with this pattern plus the particular + SysRq command key. Example: + # ping -c1 -s57 -p0102030468 + will trigger the SysRq-H (help) command. + + * What are the 'command' keys? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 'b' - Will immediately reboot the system without syncing or unmounting diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2ae2b83..17e815d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -56,6 +56,7 @@ struct netns_ipv4 { int sysctl_icmp_echo_ignore_all; int sysctl_icmp_echo_ignore_broadcasts; + int sysctl_icmp_echo_sysrq; int sysctl_icmp_ignore_bogus_error_responses; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3ac5dff..d8bbe94 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -768,6 +769,30 @@ static void icmp_redirect(struct sk_buff *skb) } /* + * 32bit and 64bit have different timestamp length, so we check for + * the cookie at offset 20 and verify it is repeated at offset 50 + */ +#define CO_POS0 20 +#define CO_POS1 50 +#define CO_SIZE sizeof(int) +#define ICMP_SYSRQ_SIZE 57 + +/* + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie + * pattern and if it matches send the next byte as a trigger to sysrq. + */ +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb) +{ + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq); + char *p = skb->data; + + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) && + !memcmp(&cookie, p + CO_POS1, CO_SIZE) && + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE]) + handle_sysrq(p[CO_POS0 + CO_SIZE]); +} + +/* * Handle ICMP_ECHO ("ping") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo @@ -794,6 +819,11 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); + + if (skb->len == ICMP_SYSRQ_SIZE && + net->ipv4.sysctl_icmp_echo_sysrq) { + icmp_check_sysrq(net, skb); + } } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d84400b..44bf3b0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -815,6 +815,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_echo_sysrq", + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(int), -- cgit v0.10.2 From 2b6b102f6cd8648295a8a7395705ce8db87b0227 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jul 2011 12:42:23 -0500 Subject: kgdb/serial: Short term workaround On 07/27/2011 04:37 PM, Thomas Gleixner wrote: > - KGDB (not yet disabled) is reportedly unusable on -rt right now due > to missing hacks in the console locking which I dropped on purpose. > To work around this in the short term you can use this patch, in addition to the clocksource watchdog patch that Thomas brewed up. Comments are welcome of course. Ultimately the right solution is to change separation between the console and the HW to have a polled mode + work queue so as not to introduce any kind of latency. Thanks, Jason. 
diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index 25a5451..f318c96 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_SPARC #include #endif @@ -2909,7 +2910,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - if (port->sysrq || oops_in_progress) + if (port->sysrq || oops_in_progress || in_kdb_printk()) locked = spin_trylock_irqsave(&port->lock, flags); else spin_lock_irqsave(&port->lock, flags); diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 7f6fe6e..680ad23 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -115,7 +115,7 @@ extern int kdb_trap_printk; extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); - +#define in_kdb_printk() (kdb_trap_printk) extern void kdb_init(int level); /* Access to kdb specific polling devices */ @@ -150,6 +150,7 @@ extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, extern int kdb_unregister(char *); #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } +#define in_kdb_printk() (0) static inline void kdb_init(int level) {} static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, char *help, short minlen) { return 0; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff484..399dba6 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -554,7 +554,6 @@ int vkdb_printf(const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int got_printf_lock = 0; int retlen = 0; int fnd, len; @@ -565,8 +564,6 @@ int vkdb_printf(const char *fmt, va_list ap) unsigned long uninitialized_var(flags); preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, @@ -833,7 +830,6 @@ kdb_print_out: } else { __release(kdb_printf_lock); } - kdb_trap_printk = saved_trap_printk; preempt_enable(); return retlen; } @@ -843,9 +839,11 @@ int kdb_printf(const char *fmt, ...) va_list ap; int r; + kdb_trap_printk++; va_start(ap, fmt); r = vkdb_printf(fmt, ap); va_end(ap); + kdb_trap_printk--; return r; } -- cgit v0.10.2 From 2e2fe95b0c337676cab94834b16c22a0aa7b9c7e Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Sat, 30 Jul 2011 21:55:53 -0500 Subject: add /sys/kernel/realtime entry Add a /sys/kernel entry to indicate that the kernel is a realtime kernel. Clark says that he needs this for udev rules, udev needs to evaluate if its a PREEMPT_RT kernel a few thousand times and parsing uname output is too slow or so. Are there better solutions? Should it exist and return 0 on !-rt? 
Signed-off-by: Clark Williams Signed-off-by: Peter Zijlstra diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c..1c991e3 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -132,6 +132,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ +#if defined(CONFIG_PREEMPT_RT_FULL) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -197,6 +206,9 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, +#ifdef CONFIG_PREEMPT_RT_FULL + &realtime_attr.attr, +#endif NULL }; -- cgit v0.10.2 From de25e14b97f187200bd4e9d6fab736d0e3d1d884 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jul 2011 10:43:51 +0200 Subject: mm, rt: kmap_atomic scheduling In fact, with migrate_disable() existing one could play games with kmap_atomic. You could save/restore the kmap_atomic slots on context switch (if there are any in use of course), this should be esp easy now that we have a kmap_atomic stack. Something like the below.. it wants replacing all the preempt_disable() stuff with pagefault_disable() && migrate_disable() of course, but then you can flip kmaps around like below. Signed-off-by: Peter Zijlstra [dvhart@linux.intel.com: build fix] Link: http://lkml.kernel.org/r/1311842631.5890.208.camel@twins [tglx@linutronix.de: Get rid of the per cpu variable and store the idx and the pte content right away in the task struct. Shortens the context switch code. ] diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5a8905..139ad27 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -216,6 +217,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_PREEMPT_RT_FULL +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + pte_t *ptep = kmap_pte - idx; + + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_pte(kmap_pte - idx, next_p->kmap_pte[i]); + } +} +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * switch_to(x,y) should switch tasks from x to y. @@ -295,6 +325,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + switch_kmaps(prev_p, next_p); + /* * Leave lazy mode, flushing any hypercalls made here. 
* This must be done before restoring TLS segments so diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 6f31ee5..01f7c99 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(kunmap); */ void *kmap_atomic_prot(struct page *page, pgprot_t prot) { + pte_t pte = mk_pte(page, prot); unsigned long vaddr; int idx, type; @@ -44,7 +45,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte-idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -87,6 +91,9 @@ void __kunmap_atomic(void *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. */ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); arch_flush_lazy_mmu_mode(); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b4..0c953e3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { + pte_t pte = pfn_pte(pfn, prot); unsigned long vaddr; int idx, type; @@ -64,7 +65,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte - idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -110,6 +114,9 @@ iounmap_atomic(void __iomem *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. 
*/ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ef788b5..4ec3d97 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -85,32 +85,51 @@ static inline void __kunmap_atomic(void *addr) #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#ifndef CONFIG_PREEMPT_RT_FULL DECLARE_PER_CPU(int, __kmap_atomic_idx); +#endif static inline int kmap_atomic_idx_push(void) { +#ifndef CONFIG_PREEMPT_RT_FULL int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; -#ifdef CONFIG_DEBUG_HIGHMEM +# ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); BUG_ON(idx > KM_TYPE_NR); -#endif +# endif return idx; +#else + current->kmap_idx++; + BUG_ON(current->kmap_idx > KM_TYPE_NR); + return current->kmap_idx - 1; +#endif } static inline int kmap_atomic_idx(void) { +#ifndef CONFIG_PREEMPT_RT_FULL return __this_cpu_read(__kmap_atomic_idx) - 1; +#else + return current->kmap_idx - 1; +#endif } static inline void kmap_atomic_idx_pop(void) { -#ifdef CONFIG_DEBUG_HIGHMEM +#ifndef CONFIG_PREEMPT_RT_FULL +# ifdef CONFIG_DEBUG_HIGHMEM int idx = __this_cpu_dec_return(__kmap_atomic_idx); BUG_ON(idx < 0); -#else +# else __this_cpu_dec(__kmap_atomic_idx); +# endif +#else + current->kmap_idx--; +# ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(current->kmap_idx < 0); +# endif #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 56c1337..fbb7e73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -23,6 +23,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -1636,6 +1637,12 @@ struct task_struct { struct rcu_head put_rcu; int softirq_nestcnt; #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 + int kmap_idx; + pte_t kmap_pte[KM_TYPE_NR]; +# endif +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/highmem.c b/mm/highmem.c index b32b70c..b1c7d43 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,10 +29,11 @@ #include #include - +#ifndef CONFIG_PREEMPT_RT_FULL #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif +#endif /* * Virtual_count is not a pure "count". @@ -47,8 +48,9 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); - +#ifndef CONFIG_PREEMPT_RT_FULL EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +#endif unsigned int nr_free_highpages (void) { diff --git a/mm/memory.c b/mm/memory.c index 7417ebc..23b82ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3720,6 +3720,7 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { + migrate_disable(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault @@ -3737,6 +3738,7 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; + migrate_enable(); } EXPORT_SYMBOL(pagefault_enable); #endif -- cgit v0.10.2 From 68facae6555d7a19402a05a3bbee1443b1c50d7f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 17:09:55 +0100 Subject: x86/highmem: add a "already used pte" check This is a copy from kmap_atomic_prot(). 
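The added check asserts that the fixmap slot handed out by kmap_atomic_idx_push() is still empty before a new pte is written into it, matching what kmap_atomic_prot() already does. A rough userspace analogue of that invariant (illustrative only; the slot array and helper names are invented for the example):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define NR_SLOTS 4

static void *slot[NR_SLOTS];
static int slot_idx;

static int slot_push(void *p)
{
        int idx = slot_idx++;

        assert(idx < NR_SLOTS);
        assert(slot[idx] == NULL);      /* mirrors WARN_ON(!pte_none(...)) */
        slot[idx] = p;
        return idx;
}

static void slot_pop(void)
{
        slot[--slot_idx] = NULL;
}

int main(void)
{
        int x = 42, idx = slot_push(&x);

        printf("installed at slot %d\n", idx);
        slot_pop();
        return 0;
}

A non-empty slot at install time points at a missing teardown somewhere else, which is exactly what the new WARN_ON is meant to catch early.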
Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 0c953e3..62377d6 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -65,6 +65,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + WARN_ON(!pte_none(*(kmap_pte - idx))); + #ifdef CONFIG_PREEMPT_RT_FULL current->kmap_pte[type] = pte; #endif -- cgit v0.10.2 From d47e9434da11ea97849ef062dc964604b3c15eae Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 21:37:27 +0100 Subject: arm/highmem: flush tlb on unmap The tlb should be flushed on unmap and thus make the mapping entry invalid. This is only done in the non-debug case which does not look right. Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 21b9e1b..3688c8b 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -95,10 +95,10 @@ void __kunmap_atomic(void *kvaddr) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - set_top_pte(vaddr, __pte(0)); #else (void) idx; /* to kill a warning */ #endif + set_top_pte(vaddr, __pte(0)); kmap_atomic_idx_pop(); } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { /* this address was obtained through kmap_high_get() */ -- cgit v0.10.2 From 1c7c1918135fe55e3e879cc0a45073d1f86ee396 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Feb 2013 11:03:11 +0100 Subject: arm-enable-highmem-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 772c2d3..4890796 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1752,7 +1752,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" - depends on MMU && !PREEMPT_RT_FULL + depends on MMU help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h index fa09e6b..fbd0ba7 100644 --- a/arch/arm/include/asm/switch_to.h +++ b/arch/arm/include/asm/switch_to.h @@ -3,6 +3,14 @@ #include +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + + /* * switch_to(prev, next) should switch from task `prev' to `next' * `prev' will never be the same as `next'. schedule() itself @@ -12,6 +20,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info #define switch_to(prev,next,last) \ do { \ + switch_kmaps(prev, next); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 3688c8b..bd41dd8 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -38,6 +38,7 @@ EXPORT_SYMBOL(kunmap); void *kmap_atomic(struct page *page) { + pte_t pte = mk_pte(page, kmap_prot); unsigned int idx; unsigned long vaddr; void *kmap; @@ -76,7 +77,10 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_top_pte(vaddr, mk_pte(page, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_top_pte(vaddr, pte); return (void *)vaddr; } @@ -93,6 +97,9 @@ void __kunmap_atomic(void *kvaddr) if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); #else @@ -110,6 +117,7 @@ EXPORT_SYMBOL(__kunmap_atomic); void *kmap_atomic_pfn(unsigned long pfn) { + pte_t pte = pfn_pte(pfn, kmap_prot); unsigned long vaddr; int idx, type; @@ -121,7 +129,10 @@ void *kmap_atomic_pfn(unsigned long pfn) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(get_top_pte(vaddr))); #endif - set_top_pte(vaddr, pfn_pte(pfn, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_top_pte(vaddr, pte); return (void *)vaddr; } @@ -135,3 +146,29 @@ struct page *kmap_atomic_to_page(const void *ptr) return pte_page(get_top_pte(vaddr)); } + +#if defined CONFIG_PREEMPT_RT_FULL +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + set_top_pte(__fix_to_virt(FIX_KMAP_BEGIN + idx), __pte(0)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_top_pte(__fix_to_virt(FIX_KMAP_BEGIN + idx), + next_p->kmap_pte[i]); + } +} +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4ec3d97..84223de 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -7,6 +7,7 @@ #include #include #include +#include #include -- cgit v0.10.2 From 467a334a45ccbdd173c9d7c64323ccf1cb3848aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Sep 2011 15:09:40 +0200 Subject: ipc/sem: Rework semaphore wakeups Current sysv sems have a weird ass wakeup scheme that involves keeping preemption disabled over a potential O(n^2) loop and busy waiting on that on other CPUs. Kill this and simply wake the task directly from under the sem_lock. 
This was discovered by a migrate_disable() debug feature that disallows: spin_lock(); preempt_disable(); spin_unlock() preempt_enable(); Cc: Manfred Spraul Suggested-by: Thomas Gleixner Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Manfred Spraul Link: http://lkml.kernel.org/r/1315994224.5040.1.camel@twins Signed-off-by: Thomas Gleixner diff --git a/ipc/sem.c b/ipc/sem.c index 58d31f1..d7bdd84 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -461,6 +461,13 @@ undo: static void wake_up_sem_queue_prepare(struct list_head *pt, struct sem_queue *q, int error) { +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *p = q->sleeper; + get_task_struct(p); + q->status = error; + wake_up_process(p); + put_task_struct(p); +#else if (list_empty(pt)) { /* * Hold preempt off so that we don't get preempted and have the @@ -472,6 +479,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, q->pid = error; list_add_tail(&q->simple_list, pt); +#endif } /** @@ -485,6 +493,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, */ static void wake_up_sem_queue_do(struct list_head *pt) { +#ifndef CONFIG_PREEMPT_RT_BASE struct sem_queue *q, *t; int did_something; @@ -497,6 +506,7 @@ static void wake_up_sem_queue_do(struct list_head *pt) } if (did_something) preempt_enable(); +#endif } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) -- cgit v0.10.2 From 36d9a4a1e1ac725b2986f7ece0693fb1f694255e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Nov 2011 12:26:18 +0100 Subject: x86-kvm-require-const-tsc-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a51121..fb485ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5238,6 +5238,13 @@ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT_FULL + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); + return -EOPNOTSUPP; + } +#endif + r = kvm_mmu_module_init(); if (r) goto out_free_percpu; -- cgit v0.10.2 From ec5e183b93006d418165791945d0b5ef9b561259 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Nov 2011 14:00:48 +0100 Subject: scsi-fcoe-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 666b7ac..61c1d2a 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1272,7 +1272,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) struct sk_buff *skb; #ifdef CONFIG_SMP struct fcoe_percpu_s *p0; - unsigned targ_cpu = get_cpu(); + unsigned targ_cpu = get_cpu_light(); #endif /* CONFIG_SMP */ FCOE_DBG("Destroying receive thread for CPU %d\n", cpu); @@ -1328,7 +1328,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) kfree_skb(skb); spin_unlock_bh(&p->fcoe_rx_list.lock); } - put_cpu(); + put_cpu_light(); #else /* * This a non-SMP scenario where the singular Rx thread is @@ -1546,11 +1546,11 @@ err2: static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @@ -1745,11 +1745,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) 
printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @@ -1825,13 +1825,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 4a909d7..69d36cc 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -792,7 +792,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index c772d8d..67ca13a 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -730,10 +730,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { -- cgit v0.10.2 From e8dcfa4f3ffee678a58a24cea24e160dfd639102 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2011 18:19:27 +0100 Subject: x86: crypto: Reduce preempt disabled regions Restrict the preempt disabled regions to the actual floating point operations and enable preemption for the administrative actions. This is necessary on RT to avoid that kfree and other operations are called with preemption disabled. 
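The shape of the change is the same for every cipher mode: the begin/end pair moves from around the whole blkcipher walk into the loop body, so only the actual block processing runs with preemption disabled while the walk bookkeeping (which may allocate or free) runs outside it. A minimal userspace sketch of that restructuring, with a mutex standing in for kernel_fpu_begin()/kernel_fpu_end() and invented helper names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fpu = PTHREAD_MUTEX_INITIALIZER;

static void do_block(int i)
{
        printf("block %d processed\n", i);
}

static int step_walk(int *remaining)
{
        return (*remaining)-- > 0;      /* bookkeeping; may sleep in the kernel case */
}

int main(void)
{
        int remaining = 4, i = 0;

        while (step_walk(&remaining)) {
                pthread_mutex_lock(&fpu);       /* was: taken once around the whole loop */
                do_block(i++);
                pthread_mutex_unlock(&fpu);
        }
        return 0;
}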
Reported-and-tested-by: Carsten Emde Signed-off-by: Peter Zijlstra Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1b9c22b..653314e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -250,14 +250,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); + nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -274,14 +274,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -298,14 +298,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -322,14 +322,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -362,18 +362,20 @@ static int ctr_crypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + kernel_fpu_begin(); aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } if (walk.nbytes) { + kernel_fpu_begin(); ctr_crypt_final(ctx, &walk); + kernel_fpu_end(); err = blkcipher_walk_done(desc, &walk, 0); } - kernel_fpu_end(); return err; } -- cgit v0.10.2 From ee8118c6c5d61b86a68704104e110f9d1e7198a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 23:06:09 +0100 Subject: dm: Make rt aware Use the BUG_ON_NORT variant for the irq_disabled() checks. RT has interrupts legitimately enabled here as we cant deadlock against the irq thread due to the "sleeping spinlocks" conversion. Reported-by: Luis Claudio R. 
Goncalves Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d8f086..0e1699e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1677,14 +1677,14 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); } goto out; requeued: - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); delay_and_out: -- cgit v0.10.2 From 2994fbc050be66c2d87efa70cf610d79adb0c66b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Dec 2011 01:03:49 +0100 Subject: cpumask: Disable CONFIG_CPUMASK_OFFSTACK for RT We can't deal with the cpumask allocations which happen in atomic context (see arch/x86/kernel/apic/io_apic.c) on RT right now. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b866b2..989e191 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -775,7 +775,7 @@ config IOMMU_HELPER config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL - select CPUMASK_OFFSTACK + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL ---help--- Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. diff --git a/lib/Kconfig b/lib/Kconfig index 75cdb77..7669d65 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -315,6 +315,7 @@ config CHECK_SIGNATURE config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + depends on !PREEMPT_RT_FULL help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids -- cgit v0.10.2 From 1a54260abbeef320c2d164018bf5918b84ece1eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Feb 2012 12:03:30 +0100 Subject: seqlock: Prevent rt starvation If a low prio writer gets preempted while holding the seqlock write locked, a high prio reader spins forever on RT. To prevent this let the reader grab the spinlock, so it blocks and eventually boosts the writer. This way the writer can proceed and endless spinning is prevented. For seqcount writers we disable preemption over the update code path. Thaanks to Al Viro for distangling some VFS code to make that possible. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1829905..939ea1a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -146,18 +146,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) * Sequence counter only version assumes that callers are using their * own mutexing. */ -static inline void write_seqcount_begin(seqcount_t *s) +static inline void __write_seqcount_begin(seqcount_t *s) { s->sequence++; smp_wmb(); } -static inline void write_seqcount_end(seqcount_t *s) +static inline void write_seqcount_begin(seqcount_t *s) +{ + preempt_disable_rt(); + __write_seqcount_begin(s); +} + +static inline void __write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; } +static inline void write_seqcount_end(seqcount_t *s) +{ + __write_seqcount_end(s); + preempt_enable_rt(); +} + /** * write_seqcount_barrier - invalidate in-progress read-side seq operations * @s: pointer to seqcount_t @@ -198,10 +210,33 @@ typedef struct { /* * Read side functions for starting and finalizing a read side section. 
*/ +#ifndef CONFIG_PREEMPT_RT_FULL static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } +#else +/* + * Starvation safe read side for RT + */ +static inline unsigned read_seqbegin(seqlock_t *sl) +{ + unsigned ret; + +repeat: + ret = ACCESS_ONCE(sl->seqcount.sequence); + if (unlikely(ret & 1)) { + /* + * Take the lock and let the writer proceed (i.e. evtl + * boost it), otherwise we could loop here forever. + */ + spin_lock(&sl->lock); + spin_unlock(&sl->lock); + goto repeat; + } + return ret; +} +#endif static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { @@ -216,36 +251,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock(&sl->lock); } static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_bh(&sl->lock); } static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_irq(&sl->lock); } @@ -254,7 +289,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); return flags; } @@ -264,7 +299,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_irqrestore(&sl->lock, flags); } diff --git a/include/net/dst.h b/include/net/dst.h index b3ebe17..446d7b1 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -392,7 +392,7 @@ static inline void dst_confirm(struct dst_entry *dst) static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, struct sk_buff *skb) { - const struct hh_cache *hh; + struct hh_cache *hh; if (dst->pending_confirm) { unsigned long now = jiffies; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 0dab173..f28b70c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -334,7 +334,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) } #endif -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq; int hh_len; @@ -389,7 +389,7 @@ struct neighbour_cb { #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n, const struct net_device *dev) { unsigned int seq; -- cgit v0.10.2 From 933d7b6f77b781d0e729cb4b32fbbc97460e473a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Mar 2012 13:55:33 -0500 Subject: sched/rt: Fix 
wait_task_interactive() to test rt_spin_lock state The wait_task_interactive() will have a task sleep waiting for another task to have a certain state. But it ignores the rt_spin_locks state and can return with an incorrect result if the task it is waiting for is blocked on a rt_spin_lock() and is waking up. The rt_spin_locks save the tasks state in the saved_state field and the wait_task_interactive() must also test that state. Signed-off-by: Steven Rostedt Cc: Carsten Emde Cc: John Kacur Cc: Peter Zijlstra Cc: Clark Williams Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/20120301190345.979435764@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 070f5e2..e9f12e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1041,7 +1041,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && unlikely(p->state != match_state) + && unlikely(p->saved_state != match_state)) return 0; cpu_relax(); } @@ -1056,7 +1057,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->on_rq; ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || p->state == match_state + || p->saved_state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From ab36176c72ff2db76f85983e2a817707e5e2dff0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Mar 2012 10:36:57 -0500 Subject: cpu: Make hotplug.lock a "sleeping" spinlock on RT Tasks can block on hotplug.lock in pin_current_cpu(), but their state might be != RUNNING. So the mutex wakeup will set the state unconditionally to RUNNING. That might cause spurious unexpected wakeups. We could provide a state preserving mutex_lock() function, but this is semantically backwards. So instead we convert the hotplug.lock() to a spinlock for RT, which has the state preserving semantics already. Signed-off-by: Steven Rostedt Cc: Carsten Emde Cc: John Kacur Cc: Peter Zijlstra Cc: Clark Williams Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/1330702617.25686.265.camel@gandalf.stny.rr.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 7b5f4cb..2942f54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -51,7 +51,12 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; +#ifdef CONFIG_PREEMPT_RT_FULL + /* Makes the lock keep the task's state */ + spinlock_t lock; +#else struct mutex lock; /* Synchronizes accesses to refcount, */ +#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
@@ -59,10 +64,22 @@ static struct { int refcount; } cpu_hotplug = { .active_writer = NULL, +#ifdef CONFIG_PREEMPT_RT_FULL + .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), +#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), +#endif .refcount = 0, }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) +# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) +#else +# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) +# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) +#endif + struct hotplug_pcp { struct task_struct *unplug; int refcount; @@ -92,8 +109,8 @@ retry: return; } preempt_enable(); - mutex_lock(&cpu_hotplug.lock); - mutex_unlock(&cpu_hotplug.lock); + hotplug_lock(); + hotplug_unlock(); preempt_disable(); goto retry; } @@ -166,9 +183,9 @@ void get_online_cpus(void) might_sleep(); if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -177,14 +194,14 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -216,11 +233,11 @@ static void cpu_hotplug_begin(void) cpu_hotplug.active_writer = current; for (;;) { - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); schedule(); } } @@ -228,7 +245,7 @@ static void cpu_hotplug_begin(void) static void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } #else /* #if CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From f913098da8458e0a147d569fa62882a368b978f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Nov 2011 17:17:09 +0100 Subject: softirq: Check preemption after reenabling interrupts raise_softirq_irqoff() disables interrupts and wakes the softirq daemon, but after reenabling interrupts there is no preemption check, so the execution of the softirq thread might be delayed arbitrarily. In principle we could add that check to local_irq_enable/restore, but that's overkill as the rasie_softirq_irqoff() sections are the only ones which show this behaviour. 
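In other words, the wakeup is still issued with interrupts off, and the explicit check after re-enabling is what gives the woken softirq thread a chance to run immediately instead of waiting for the next natural preemption point. A loose userspace analogue of the pattern (illustrative only; pending_work and the helper names are invented):

#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending_work;

static void raise_work_locked(void)
{
        /* runs in the "interrupts disabled" region: only mark work pending */
        atomic_store(&pending_work, 1);
}

static void preempt_check_resched(void)
{
        /* after re-enabling: hand the CPU over if work was made runnable */
        if (atomic_exchange(&pending_work, 0))
                sched_yield();
}

int main(void)
{
        raise_work_locked();            /* irq-off section */
        /* local_irq_restore() equivalent happens here */
        preempt_check_resched();
        printf("done\n");
        return 0;
}

As the preempt.h hunk below shows, preempt_check_resched_rt() is a plain barrier() on !RT, so the extra call sites only change behaviour on RT.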
Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 58916af..f7ca9b4 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -38,6 +38,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(blk_iopoll_sched); @@ -135,6 +136,7 @@ static void blk_iopoll_softirq(struct softirq_action *h) __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } /** @@ -204,6 +206,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, &__get_cpu_var(blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 467c8de..3fe2368 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -51,6 +51,7 @@ static void trigger_softirq(void *data) raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } /* @@ -93,6 +94,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, &__get_cpu_var(blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; @@ -150,6 +152,7 @@ do_local: goto do_local; local_irq_restore(flags); + preempt_check_resched_rt(); } /** diff --git a/include/linux/preempt.h b/include/linux/preempt.h index c28f5d6..9978ec5 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -56,8 +56,10 @@ do { \ #ifndef CONFIG_PREEMPT_RT_BASE # define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() barrier() #else # define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() #endif #define preempt_enable() \ @@ -111,6 +113,7 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #endif /* CONFIG_PREEMPT_COUNT */ diff --git a/net/core/dev.c b/net/core/dev.c index 9d691dc..e1b5ef2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1945,6 +1945,7 @@ static inline void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @@ -1966,6 +1967,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb) sd->completion_queue = skb; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } } EXPORT_SYMBOL(dev_kfree_skb_irq); @@ -3051,6 +3053,7 @@ enqueue: rps_unlock(sd); local_irq_restore(flags); + preempt_check_resched_rt(); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -3937,6 +3940,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static int process_backlog(struct napi_struct *napi, int quota) @@ -4009,6 +4013,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @@ -6565,6 +6570,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, 
raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { -- cgit v0.10.2 From 9b85c157bec6a59bdf1bc69321837879d9cce5d3 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Fri, 27 Apr 2012 12:48:46 +0200 Subject: scsi: qla2xxx: Use local_irq_save_nort() in qla2x00_poll RT triggers the following: [ 11.307652] [] __might_sleep+0xe7/0x110 [ 11.307663] [] rt_spin_lock+0x24/0x60 [ 11.307670] [] ? rt_spin_lock_slowunlock+0x78/0x90 [ 11.307703] [] qla24xx_intr_handler+0x63/0x2d0 [qla2xxx] [ 11.307736] [] qla2x00_poll+0x67/0x90 [qla2xxx] Function qla2x00_poll does local_irq_save() before calling qla24xx_intr_handler which has a spinlock. Since spinlocks are sleepable on rt, it is not allowed to call them with interrupts disabled. Therefore we use local_irq_save_nort() instead which saves flags without disabling interrupts. This fix needs to be applied to v3.0-rt, v3.2-rt and v3.4-rt Suggested-by: Thomas Gleixner Signed-off-by: John Kacur Cc: Steven Rostedt Cc: David Sommerseth Link: http://lkml.kernel.org/r/1335523726-10024-1-git-send-email-jkacur@redhat.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h index c0462c0..41192aa 100644 --- a/drivers/scsi/qla2xxx/qla_inline.h +++ b/drivers/scsi/qla2xxx/qla_inline.h @@ -36,12 +36,12 @@ qla2x00_poll(struct rsp_que *rsp) { unsigned long flags; struct qla_hw_data *ha = rsp->hw; - local_irq_save(flags); + local_irq_save_nort(flags); if (IS_QLA82XX(ha)) qla82xx_poll(0, rsp); else ha->isp_ops->intr_handler(0, rsp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } static inline uint8_t * -- cgit v0.10.2 From a4cc7bd7a9c8e6e7638722cc1e1d85a5ceb58b6c Mon Sep 17 00:00:00 2001 From: Priyanka Jain Date: Thu, 17 May 2012 09:35:11 +0530 Subject: net,RT:REmove preemption disabling in netif_rx() 1)enqueue_to_backlog() (called from netif_rx) should be bind to a particluar CPU. This can be achieved by disabling migration. No need to disable preemption 2)Fixes crash "BUG: scheduling while atomic: ksoftirqd" in case of RT. If preemption is disabled, enqueue_to_backog() is called in atomic context. And if backlog exceeds its count, kfree_skb() is called. But in RT, kfree_skb() might gets scheduled out, so it expects non atomic context. 3)When CONFIG_PREEMPT_RT_FULL is not defined, migrate_enable(), migrate_disable() maps to preempt_enable() and preempt_disable(), so no change in functionality in case of non-RT. 
-Replace preempt_enable(), preempt_disable() with migrate_enable(), migrate_disable() respectively -Replace get_cpu(), put_cpu() with get_cpu_light(), put_cpu_light() respectively Signed-off-by: Priyanka Jain Acked-by: Rajan Srivastava Cc: Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index e1b5ef2..aa529f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3091,7 +3091,7 @@ int netif_rx(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); + migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -3101,13 +3101,13 @@ int netif_rx(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); + migrate_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); + put_cpu_light(); } return ret; } -- cgit v0.10.2 From 85d4f5b1567a6e440f3e000220dddba10b5bf401 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 20:51:03 +0100 Subject: rt: Introduce cpu_chill() Retry loops on RT might loop forever when the modifying side was preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill() defaults to cpu_relax() for non RT. On RT it puts the looping task to sleep for a tick so the preempted task can make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/delay.h b/include/linux/delay.h index a6ecb34..e23a7c0 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define cpu_chill() msleep(1) +#else +# define cpu_chill() cpu_relax() +#endif + #endif /* defined(_LINUX_DELAY_H) */ -- cgit v0.10.2 From dd8046f8df91251e97e624f1075d05ddf1dccb5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:00:34 +0100 Subject: fs: dcache: Use cpu_chill() in trylock loops Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. 
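A minimal userspace sketch of the resulting trylock/retry shape (illustrative only, with nanosleep() standing in for the msleep(1) behind cpu_chill() on RT):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t d_lock = PTHREAD_MUTEX_INITIALIZER;

static void cpu_chill(void)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };      /* ~1ms, like msleep(1) */

        nanosleep(&ts, NULL);
}

int main(void)
{
        while (pthread_mutex_trylock(&d_lock) != 0)
                cpu_chill();            /* was: cpu_relax() */

        printf("lock taken\n");
        pthread_mutex_unlock(&d_lock);
        return 0;
}

If the lock holder was preempted, a plain cpu_relax()-style spin at higher priority would starve it, while the short sleep lets it run and drop the lock. On !RT cpu_chill() stays cpu_relax(), so these trylock loops keep their current behaviour there.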
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b785e77..b0f12d7 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 01443ce..5928f92 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -166,7 +166,7 @@ again: parent = p->d_parent; if (!spin_trylock(&parent->d_lock)) { spin_unlock(&p->d_lock); - cpu_relax(); + cpu_chill(); goto relock; } spin_unlock(&p->d_lock); diff --git a/fs/dcache.c b/fs/dcache.c index c3bbf85..5bf12b1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -470,7 +471,7 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) if (inode && !spin_trylock(&inode->i_lock)) { relock: spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -852,7 +853,7 @@ relock: if (!spin_trylock(&dentry->d_lock)) { spin_unlock(&dcache_lru_lock); - cpu_relax(); + cpu_chill(); goto relock; } @@ -2084,7 +2085,7 @@ again: if (dentry->d_count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; diff --git a/fs/namespace.c b/fs/namespace.c index c03d460..859a026 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include /* fsnotify_vfsmount_delete */ #include #include +#include #include "pnode.h" #include "internal.h" @@ -315,7 +316,7 @@ int __mnt_want_write(struct vfsmount *m) smp_mb(); while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { preempt_enable(); - cpu_relax(); + cpu_chill(); preempt_disable(); } /* -- cgit v0.10.2 From d7c98725fdbf40ca72c8a1a00bda3260e8332c1f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:10:04 +0100 Subject: net: Use cpu_chill() instead of cpu_relax() Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c111bd0..92a2359 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -88,6 +88,7 @@ #include #include #include +#include #ifdef CONFIG_INET #include @@ -553,7 +554,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) if (BLOCK_NUM_PKTS(pbd)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } @@ -807,7 +808,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, if (!(status & TP_STATUS_BLK_TMO)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... 
*/ - cpu_relax(); + cpu_chill(); } } prb_close_block(pkc, pbd, po, status); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e8fdb17..5a44c6e 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "rds.h" #include "ib.h" @@ -286,7 +287,7 @@ static inline void wait_clean_list_grace(void) for_each_online_cpu(cpu) { flag = &per_cpu(clean_list_grace, cpu); while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); + cpu_chill(); } } -- cgit v0.10.2 From 5850427538e1dcb21d4c99c0e026aff56e363a6c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 15:01:55 +0800 Subject: lockdep: Selftest: convert spinlock to raw spinlock spinlock is sleepable on -rt and can not be used in interrupt context. Signed-off-by: Yong Zhang Cc: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 7aae0f2..c3eb261 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -47,10 +47,10 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose); * Normal standalone locks, for the circular and irq-context * dependency tests: */ -static DEFINE_SPINLOCK(lock_A); -static DEFINE_SPINLOCK(lock_B); -static DEFINE_SPINLOCK(lock_C); -static DEFINE_SPINLOCK(lock_D); +static DEFINE_RAW_SPINLOCK(lock_A); +static DEFINE_RAW_SPINLOCK(lock_B); +static DEFINE_RAW_SPINLOCK(lock_C); +static DEFINE_RAW_SPINLOCK(lock_D); static DEFINE_RWLOCK(rwlock_A); static DEFINE_RWLOCK(rwlock_B); @@ -73,12 +73,12 @@ static DECLARE_RWSEM(rwsem_D); * but X* and Y* are different classes. We do this so that * we do not trigger a real lockup: */ -static DEFINE_SPINLOCK(lock_X1); -static DEFINE_SPINLOCK(lock_X2); -static DEFINE_SPINLOCK(lock_Y1); -static DEFINE_SPINLOCK(lock_Y2); -static DEFINE_SPINLOCK(lock_Z1); -static DEFINE_SPINLOCK(lock_Z2); +static DEFINE_RAW_SPINLOCK(lock_X1); +static DEFINE_RAW_SPINLOCK(lock_X2); +static DEFINE_RAW_SPINLOCK(lock_Y1); +static DEFINE_RAW_SPINLOCK(lock_Y2); +static DEFINE_RAW_SPINLOCK(lock_Z1); +static DEFINE_RAW_SPINLOCK(lock_Z2); static DEFINE_RWLOCK(rwlock_X1); static DEFINE_RWLOCK(rwlock_X2); @@ -107,10 +107,10 @@ static DECLARE_RWSEM(rwsem_Z2); */ #define INIT_CLASS_FUNC(class) \ static noinline void \ -init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \ - struct rw_semaphore *rwsem) \ +init_class_##class(raw_spinlock_t *lock, rwlock_t *rwlock, \ + struct mutex *mutex, struct rw_semaphore *rwsem)\ { \ - spin_lock_init(lock); \ + raw_spin_lock_init(lock); \ rwlock_init(rwlock); \ mutex_init(mutex); \ init_rwsem(rwsem); \ @@ -168,10 +168,10 @@ static void init_shared_classes(void) * Shortcuts for lock/unlock API variants, to keep * the testcases compact: */ -#define L(x) spin_lock(&lock_##x) -#define U(x) spin_unlock(&lock_##x) +#define L(x) raw_spin_lock(&lock_##x) +#define U(x) raw_spin_unlock(&lock_##x) #define LU(x) L(x); U(x) -#define SI(x) spin_lock_init(&lock_##x) +#define SI(x) raw_spin_lock_init(&lock_##x) #define WL(x) write_lock(&rwlock_##x) #define WU(x) write_unlock(&rwlock_##x) @@ -911,7 +911,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #define I2(x) \ do { \ - spin_lock_init(&lock_##x); \ + raw_spin_lock_init(&lock_##x); \ rwlock_init(&rwlock_##x); \ mutex_init(&mutex_##x); \ init_rwsem(&rwsem_##x); \ -- cgit v0.10.2 From 068bdce2fbb5b12ab6abcc8e483be82f4f402219 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 
15:01:56 +0800 Subject: lockdep: Selftest: Only do hardirq context test for raw spinlock On -rt there is no softirq context any more and rwlock is sleepable, disable softirq context test and rwlock+irq test. Signed-off-by: Yong Zhang Cc: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-3-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index c3eb261..23b8564 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1175,6 +1175,7 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); +#ifndef CONFIG_PREEMPT_RT_FULL /* * irq-context testcases: */ @@ -1187,6 +1188,28 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#else + /* On -rt, we only do hardirq context test for raw spinlock */ + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); -- cgit v0.10.2 From 3a76d0338b892af98274f5498f7fa64f891c8abb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Jul 2012 22:05:20 +0000 Subject: fs, jbd: pull your plug when waiting for space With an -rt kernel, and a heavy sync IO load, tasks can jam up on journal locks without unplugging, which can lead to terminal IO starvation. Unplug and schedule when waiting for space. Signed-off-by: Mike Galbraith Cc: Steven Rostedt Cc: Theodore Tso Link: http://lkml.kernel.org/r/1341812414.7370.73.camel@marge.simpson.net Signed-off-by: Thomas Gleixner diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 08c0304..95debd7 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -129,6 +129,8 @@ void __log_wait_for_space(journal_t *journal) if (journal->j_flags & JFS_ABORT) return; spin_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); mutex_lock(&journal->j_checkpoint_mutex); /* -- cgit v0.10.2 From 7acb30ffc1db18a22fec1bcafc716bd3fc296bec Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 11 Jul 2012 22:05:21 +0000 Subject: perf: Make swevent hrtimer run in irq instead of softirq Otherwise we get a deadlock like below: [ 1044.042749] BUG: scheduling while atomic: ksoftirqd/21/141/0x00010003 [ 1044.042752] INFO: lockdep is turned off. 
[ 1044.042754] Modules linked in: [ 1044.042757] Pid: 141, comm: ksoftirqd/21 Tainted: G W 3.4.0-rc2-rt3-23676-ga723175-dirty #29 [ 1044.042759] Call Trace: [ 1044.042761] [] __schedule_bug+0x65/0x80 [ 1044.042770] [] __schedule+0x83c/0xa70 [ 1044.042775] [] ? prepare_to_wait+0x32/0xb0 [ 1044.042779] [] schedule+0x2e/0xa0 [ 1044.042782] [] hrtimer_wait_for_timer+0x6d/0xb0 [ 1044.042786] [] ? wake_up_bit+0x40/0x40 [ 1044.042790] [] hrtimer_cancel+0x20/0x40 [ 1044.042794] [] perf_swevent_cancel_hrtimer+0x3c/0x50 [ 1044.042798] [] task_clock_event_stop+0x11/0x40 [ 1044.042802] [] task_clock_event_del+0xe/0x10 [ 1044.042805] [] event_sched_out+0x118/0x1d0 [ 1044.042809] [] group_sched_out+0x29/0x90 [ 1044.042813] [] __perf_event_disable+0x18e/0x200 [ 1044.042817] [] remote_function+0x63/0x70 [ 1044.042821] [] generic_smp_call_function_single_interrupt+0xce/0x120 [ 1044.042826] [] smp_call_function_single_interrupt+0x27/0x40 [ 1044.042831] [] call_function_single_interrupt+0x6c/0x80 [ 1044.042833] [] ? perf_event_overflow+0x20/0x20 [ 1044.042840] [] ? _raw_spin_unlock_irq+0x30/0x70 [ 1044.042844] [] ? _raw_spin_unlock_irq+0x36/0x70 [ 1044.042848] [] run_hrtimer_softirq+0xc2/0x200 [ 1044.042853] [] ? perf_event_overflow+0x20/0x20 [ 1044.042857] [] __do_softirq_common+0xf5/0x3a0 [ 1044.042862] [] __thread_do_softirq+0x15d/0x200 [ 1044.042865] [] run_ksoftirqd+0xfa/0x210 [ 1044.042869] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042873] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042877] [] kthread+0xb6/0xc0 [ 1044.042881] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042886] [] kernel_thread_helper+0x4/0x10 [ 1044.042889] [] ? finish_task_switch+0x8c/0x110 [ 1044.042894] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042897] [] ? retint_restore_args+0xe/0xe [ 1044.042900] [] ? kthreadd+0x1e0/0x1e0 [ 1044.042902] [] ? gs_change+0xb/0xb Signed-off-by: Yong Zhang Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1341476476-5666-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt diff --git a/kernel/events/core.c b/kernel/events/core.c index 0600d3b..45f7b3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5638,6 +5638,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; + hwc->hrtimer.irqsafe = 1; /* * Since hrtimers have a fixed rate, we can do a static freq->period -- cgit v0.10.2 From f1a32eddf445d38d086ebbdfb7201f9dfafcda11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Jul 2012 08:07:43 +0000 Subject: cpu/rt: Rework cpu down for PREEMPT_RT Bringing a CPU down is a pain with the PREEMPT_RT kernel because tasks can be preempted in many more places than in non-RT. In order to handle per_cpu variables, tasks may be pinned to a CPU for a while, and even sleep. But these tasks need to be off the CPU if that CPU is going down. Several synchronization methods have been tried, but when stressed they failed. This is a new approach. A sync_tsk thread is still created and tasks may still block on a lock when the CPU is going down, but how that works is a bit different. When cpu_down() starts, it will create the sync_tsk and wait on it to inform that current tasks that are pinned on the CPU are no longer pinned. But new tasks that are about to be pinned will still be allowed to do so at this time. Then the notifiers are called. 
Several notifiers will bring down tasks that will enter these locations. Some of these tasks will take locks of other tasks that are on the CPU. If we don't let those other tasks continue, but make them block until CPU down is done, the tasks that the notifiers are waiting on will never complete as they are waiting for the locks held by the tasks that are blocked. Thus we still let the task pin the CPU until the notifiers are done. After the notifiers run, we then make new tasks entering the pinned CPU sections grab a mutex and wait. This mutex is now a per CPU mutex in the hotplug_pcp descriptor. To help things along, a new function in the scheduler code is created called migrate_me(). This function will try to migrate the current task off the CPU this is going down if possible. When the sync_tsk is created, all tasks will then try to migrate off the CPU going down. There are several cases that this wont work, but it helps in most cases. After the notifiers are called and if a task can't migrate off but enters the pin CPU sections, it will be forced to wait on the hotplug_pcp mutex until the CPU down is complete. Then the scheduler will force the migration anyway. Also, I found that THREAD_BOUND need to also be accounted for in the pinned CPU, and the migrate_disable no longer treats them special. This helps fix issues with ksoftirqd and workqueue that unbind on CPU down. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index fbb7e73..c002f80 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1978,6 +1978,10 @@ extern void do_set_cpus_allowed(struct task_struct *p, extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +int migrate_me(void); +void tell_sched_cpu_down_begin(int cpu); +void tell_sched_cpu_down_done(int cpu); + #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) @@ -1990,6 +1994,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return -EINVAL; return 0; } +static inline int migrate_me(void) { return 0; } +static inline void tell_sched_cpu_down_begin(int cpu) { } +static inline void tell_sched_cpu_down_done(int cpu) { } #endif #ifdef CONFIG_NO_HZ diff --git a/kernel/cpu.c b/kernel/cpu.c index 2942f54..d44dea3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -51,12 +51,7 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; -#ifdef CONFIG_PREEMPT_RT_FULL - /* Makes the lock keep the task's state */ - spinlock_t lock; -#else struct mutex lock; /* Synchronizes accesses to refcount, */ -#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
@@ -64,28 +59,46 @@ static struct { int refcount; } cpu_hotplug = { .active_writer = NULL, -#ifdef CONFIG_PREEMPT_RT_FULL - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), -#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#endif .refcount = 0, }; -#ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) -#else -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) -#endif - +/** + * hotplug_pcp - per cpu hotplug descriptor + * @unplug: set when pin_current_cpu() needs to sync tasks + * @sync_tsk: the task that waits for tasks to finish pinned sections + * @refcount: counter of tasks in pinned sections + * @grab_lock: set when the tasks entering pinned sections should wait + * @synced: notifier for @sync_tsk to tell cpu_down it's finished + * @mutex: the mutex to make tasks wait (used when @grab_lock is true) + * @mutex_init: zero if the mutex hasn't been initialized yet. + * + * Although @unplug and @sync_tsk may point to the same task, the @unplug + * is used as a flag and still exists after @sync_tsk has exited and + * @sync_tsk set to NULL. + */ struct hotplug_pcp { struct task_struct *unplug; + struct task_struct *sync_tsk; int refcount; + int grab_lock; struct completion synced; +#ifdef CONFIG_PREEMPT_RT_FULL + spinlock_t lock; +#else + struct mutex mutex; +#endif + int mutex_init; }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) +#else +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex) +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) +#endif + static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); /** @@ -99,18 +112,40 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); void pin_current_cpu(void) { struct hotplug_pcp *hp; + int force = 0; retry: hp = &__get_cpu_var(hotplug_pcp); - if (!hp->unplug || hp->refcount || preempt_count() > 1 || + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 || hp->unplug == current || (current->flags & PF_STOMPER)) { hp->refcount++; return; } - preempt_enable(); - hotplug_lock(); - hotplug_unlock(); + + if (hp->grab_lock) { + preempt_enable(); + hotplug_lock(hp); + hotplug_unlock(hp); + } else { + preempt_enable(); + /* + * Try to push this task off of this CPU. + */ + if (!migrate_me()) { + preempt_disable(); + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->grab_lock) { + /* + * Just let it continue it's already pinned + * or about to sleep. + */ + force = 1; + goto retry; + } + preempt_enable(); + } + } preempt_disable(); goto retry; } @@ -132,26 +167,84 @@ void unpin_current_cpu(void) wake_up_process(hp->unplug); } -/* - * FIXME: Is this really correct under all circumstances ? - */ +static void wait_for_pinned_cpus(struct hotplug_pcp *hp) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } +} + static int sync_unplug_thread(void *data) { struct hotplug_pcp *hp = data; preempt_disable(); hp->unplug = current; + wait_for_pinned_cpus(hp); + + /* + * This thread will synchronize the cpu_down() with threads + * that have pinned the CPU. When the pinned CPU count reaches + * zero, we inform the cpu_down code to continue to the next step. 
+ */ set_current_state(TASK_UNINTERRUPTIBLE); - while (hp->refcount) { - schedule_preempt_disabled(); + preempt_enable(); + complete(&hp->synced); + + /* + * If all succeeds, the next step will need tasks to wait till + * the CPU is offline before continuing. To do this, the grab_lock + * is set and tasks going into pin_current_cpu() will block on the + * mutex. But we still need to wait for those that are already in + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop() + * will kick this thread out. + */ + while (!hp->grab_lock && !kthread_should_stop()) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + /* Make sure grab_lock is seen before we see a stale completion */ + smp_mb(); + + /* + * Now just before cpu_down() enters stop machine, we need to make + * sure all tasks that are in pinned CPU sections are out, and new + * tasks will now grab the lock, keeping them from entering pinned + * CPU sections. + */ + if (!kthread_should_stop()) { + preempt_disable(); + wait_for_pinned_cpus(hp); + preempt_enable(); + complete(&hp->synced); + } + + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } set_current_state(TASK_RUNNING); - preempt_enable(); - complete(&hp->synced); + + /* + * Force this thread off this CPU as it's going down and + * we don't want any more work on this CPU. + */ + current->flags &= ~PF_THREAD_BOUND; + do_set_cpus_allowed(current, cpu_present_mask); + migrate_me(); return 0; } +static void __cpu_unplug_sync(struct hotplug_pcp *hp) +{ + wake_up_process(hp->sync_tsk); + wait_for_completion(&hp->synced); +} + /* * Start the sync_unplug_thread on the target cpu and wait for it to * complete. @@ -159,23 +252,83 @@ static int sync_unplug_thread(void *data) static int cpu_unplug_begin(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - struct task_struct *tsk; + int err; + + /* Protected by cpu_hotplug.lock */ + if (!hp->mutex_init) { +#ifdef CONFIG_PREEMPT_RT_FULL + spin_lock_init(&hp->lock); +#else + mutex_init(&hp->mutex); +#endif + hp->mutex_init = 1; + } + + /* Inform the scheduler to migrate tasks off this CPU */ + tell_sched_cpu_down_begin(cpu); init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); - if (IS_ERR(tsk)) - return (PTR_ERR(tsk)); - kthread_bind(tsk, cpu); - wake_up_process(tsk); - wait_for_completion(&hp->synced); + + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); + if (IS_ERR(hp->sync_tsk)) { + err = PTR_ERR(hp->sync_tsk); + hp->sync_tsk = NULL; + return err; + } + kthread_bind(hp->sync_tsk, cpu); + + /* + * Wait for tasks to get out of the pinned sections, + * it's still OK if new tasks enter. Some CPU notifiers will + * wait for tasks that are going to enter these sections and + * we must not have them block. + */ + __cpu_unplug_sync(hp); + return 0; } +static void cpu_unplug_sync(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + init_completion(&hp->synced); + /* The completion needs to be initialzied before setting grab_lock */ + smp_wmb(); + + /* Grab the mutex before setting grab_lock */ + hotplug_lock(hp); + hp->grab_lock = 1; + + /* + * The CPU notifiers have been completed. + * Wait for tasks to get out of pinned CPU sections and have new + * tasks block until the CPU is completely down. 
+ */ + __cpu_unplug_sync(hp); + + /* All done with the sync thread */ + kthread_stop(hp->sync_tsk); + hp->sync_tsk = NULL; +} + static void cpu_unplug_done(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); hp->unplug = NULL; + /* Let all tasks know cpu unplug is finished before cleaning up */ + smp_wmb(); + + if (hp->sync_tsk) + kthread_stop(hp->sync_tsk); + + if (hp->grab_lock) { + hotplug_unlock(hp); + /* protected by cpu_hotplug.lock */ + hp->grab_lock = 0; + } + tell_sched_cpu_down_done(cpu); } void get_online_cpus(void) @@ -183,9 +336,9 @@ void get_online_cpus(void) might_sleep(); if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -195,14 +348,13 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - hotplug_unlock(); - + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -233,11 +385,11 @@ static void cpu_hotplug_begin(void) cpu_hotplug.active_writer = current; for (;;) { - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); schedule(); } } @@ -245,7 +397,7 @@ static void cpu_hotplug_begin(void) static void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } #else /* #if CONFIG_HOTPLUG_CPU */ @@ -421,6 +573,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } smpboot_park_threads(cpu); + /* Notifiers are done. Don't let any more tasks pin this CPU. */ + cpu_unplug_sync(cpu); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9f12e0..e33174a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2898,7 +2898,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2929,7 +2929,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -4874,6 +4874,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) cpumask_copy(&p->cpus_allowed, new_mask); } +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. 
+ * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or THREAD_BOUND) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. + */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_THREAD_BOUND || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = &__get_cpu_var(sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; +} + /* * This is how migration works: * -- cgit v0.10.2 From 05bc115a8136cb828a012ec9457d5725a1498ffa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Aug 2012 20:38:50 +0200 Subject: random: Make it work on rt Delegate the random insertion to the forced threaded interrupt handler. Store the return IP of the hard interrupt handler in the irq descriptor and feed it into the random generator as a source of entropy. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/drivers/char/random.c b/drivers/char/random.c index 50007e1..9d6c416 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -742,18 +742,16 @@ EXPORT_SYMBOL_GPL(add_input_randomness); static DEFINE_PER_CPU(struct fast_pool, irq_randomness); -void add_interrupt_randomness(int irq, int irq_flags) +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) { struct entropy_store *r; struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); - struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; __u32 input[4], cycles = get_cycles(); input[0] = cycles ^ jiffies; input[1] = irq; - if (regs) { - __u64 ip = instruction_pointer(regs); + if (ip) { input[2] = ip; input[3] = ip >> 32; } @@ -767,7 +765,11 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_pool->last = now; r = nonblocking_pool.initialized ? 
&input_pool : &nonblocking_pool; +#ifndef CONFIG_PREEMPT_RT_FULL __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); +#else + mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); +#endif /* * If we don't have a valid cycle counter, and we see * back-to-back timer interrupts, then skip giving credit for diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 623325e..a7edc47 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -52,6 +52,7 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; + u64 random_ip; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP diff --git a/include/linux/random.h b/include/linux/random.h index d984608..f975382 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -12,7 +12,7 @@ extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip); extern void get_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca17..7f50c55 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,8 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { + struct pt_regs *regs = get_irq_regs(); + u64 ip = regs ? instruction_pointer(regs) : 0; irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; @@ -172,7 +174,11 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - add_interrupt_randomness(irq, flags); +#ifndef CONFIG_PREEMPT_RT_FULL + add_interrupt_randomness(irq, flags, ip); +#else + desc->random_ip = ip; +#endif if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5bc2f41..1fba5cb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -879,6 +879,12 @@ static int irq_thread(void *data) if (!noirqdebug) note_interrupt(action->irq, desc, action_ret); +#ifdef CONFIG_PREEMPT_RT_FULL + migrate_disable(); + add_interrupt_randomness(action->irq, 0, + desc->random_ip ^ (unsigned long) action); + migrate_enable(); +#endif wake_threads_waitq(desc); } -- cgit v0.10.2 From b5294fd5ea3ea1499ca352b9eda11f7464d3a67d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Oct 2012 11:02:04 -0400 Subject: softirq: Init softirq local lock after per cpu section is set up I discovered this bug when booting 3.4-rt on my powerpc box. It crashed with the following report: ------------[ cut here ]------------ kernel BUG at /work/rt/stable-rt.git/kernel/rtmutex_common.h:75! 
Oops: Exception in kernel mode, sig: 5 [#1] PREEMPT SMP NR_CPUS=64 NUMA PA Semi PWRficient Modules linked in: NIP: c0000000004aa03c LR: c0000000004aa01c CTR: c00000000009b2ac REGS: c00000003e8d7950 TRAP: 0700 Not tainted (3.4.11-test-rt19) MSR: 9000000000029032 CR: 24000082 XER: 20000000 SOFTE: 0 TASK = c00000003e8fdcd0[11] 'ksoftirqd/1' THREAD: c00000003e8d4000 CPU: 1 GPR00: 0000000000000001 c00000003e8d7bd0 c000000000d6cbb0 0000000000000000 GPR04: c00000003e8fdcd0 0000000000000000 0000000024004082 c000000000011454 GPR08: 0000000000000000 0000000080000001 c00000003e8fdcd1 0000000000000000 GPR12: 0000000024000084 c00000000fff0280 ffffffffffffffff 000000003ffffad8 GPR16: ffffffffffffffff 000000000072c798 0000000000000060 0000000000000000 GPR20: 0000000000642741 000000000072c858 000000003ffffaf0 0000000000000417 GPR24: 000000000072dcd0 c00000003e7ff990 0000000000000000 0000000000000001 GPR28: 0000000000000000 c000000000792340 c000000000ccec78 c000000001182338 NIP [c0000000004aa03c] .wakeup_next_waiter+0x44/0xb8 LR [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 Call Trace: [c00000003e8d7bd0] [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 (unreliable) [c00000003e8d7c60] [c0000000004a0320] .rt_spin_lock_slowunlock+0x8c/0xe4 [c00000003e8d7ce0] [c0000000004a07cc] .rt_spin_unlock+0x54/0x64 [c00000003e8d7d60] [c0000000000636bc] .__thread_do_softirq+0x130/0x174 [c00000003e8d7df0] [c00000000006379c] .run_ksoftirqd+0x9c/0x1a4 [c00000003e8d7ea0] [c000000000080b68] .kthread+0xa8/0xb4 [c00000003e8d7f90] [c00000000001c2f8] .kernel_thread+0x54/0x70 Instruction dump: 60000000 e86d01c8 38630730 4bff7061 60000000 ebbf0008 7c7c1b78 e81d0040 7fe00278 7c000074 7800d182 68000001 <0b000000> e88d01c8 387d0010 38840738 The rtmutex_common.h:75 is: rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list_entry); BUG_ON(w->lock != lock); return w; } Where the waiter->lock is corrupted. I saw various other random bugs that all had to with the softirq lock and plist. As plist needs to be initialized before it is used I investigated how this lock is initialized. It's initialized with: void __init softirq_early_init(void) { local_irq_lock_init(local_softirq_lock); } Where: #define local_irq_lock_init(lvar) \ do { \ int __cpu; \ for_each_possible_cpu(__cpu) \ spin_lock_init(&per_cpu(lvar, __cpu).lock); \ } while (0) As the softirq lock is a local_irq_lock, which is a per_cpu lock, the initialization is done to all per_cpu versions of the lock. But lets look at where the softirq_early_init() is called from. In init/main.c: start_kernel() /* * Interrupts are still disabled. Do necessary setups, then * enable them */ softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ One of the first things that is called is the initialization of the softirq lock. But if you look further down, we see the per_cpu areas have not been set up yet. Thus initializing a local_irq_lock() before the per_cpu section is set up, may not work as it is initializing the per cpu locks before the per cpu exists. By moving the softirq_early_init() right after setup_per_cpu_areas(), the kernel boots fine. 
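As an illustration only (not part of the patch): a minimal userspace sketch of the ordering requirement, with the per-CPU area modelled as a calloc'd array and the lock-init loop that is only safe once that array exists. All names here are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define NR_CPUS 4

struct local_irq_lock { pthread_mutex_t lock; };

static struct local_irq_lock *per_cpu_area;     /* NULL until "setup" runs */

static void setup_per_cpu_areas(void)
{
        per_cpu_area = calloc(NR_CPUS, sizeof(*per_cpu_area));
}

static void softirq_early_init(void)
{
        int cpu;

        /* Touches per_cpu_area: if this runs before setup_per_cpu_areas(),
         * it initializes locks in memory that does not exist yet. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pthread_mutex_init(&per_cpu_area[cpu].lock, NULL);
}

int main(void)
{
        setup_per_cpu_areas();  /* must come first ...             */
        softirq_early_init();   /* ... then the lock init is safe  */
        printf("per-CPU softirq locks initialized after the areas exist\n");
        return 0;
}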
Signed-off-by: Steven Rostedt Cc: Clark Williams Cc: John Kacur Cc: Carsten Emde Cc: vomlehn@texas.net Link: http://lkml.kernel.org/r/1349362924.6755.18.camel@gandalf.local.home Signed-off-by: Thomas Gleixner diff --git a/init/main.c b/init/main.c index 4df397d..2d80a11 100644 --- a/init/main.c +++ b/init/main.c @@ -493,7 +493,6 @@ asmlinkage void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ - softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); @@ -504,6 +503,7 @@ asmlinkage void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + softirq_early_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); -- cgit v0.10.2 From b72b8c58821f73a2d1b1a3941afbf1adfaeffa07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Sep 2012 11:11:46 +0200 Subject: mm: page_alloc: Use local_lock_on() instead of plain spinlock The plain spinlock while sufficient does not update the local_lock internals. Use a proper local_lock function instead to ease debugging. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/locallock.h b/include/linux/locallock.h index 0161fbb..f1804a3 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -137,6 +137,12 @@ static inline int __local_lock_irqsave(struct local_irq_lock *lv) _flags = __get_cpu_var(lvar).flags; \ } while (0) +#define local_lock_irqsave_on(lvar, _flags, cpu) \ + do { \ + __local_lock_irqsave(&per_cpu(lvar, cpu)); \ + _flags = per_cpu(lvar, cpu).flags; \ + } while (0) + static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, unsigned long flags) { @@ -156,6 +162,11 @@ static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, put_local_var(lvar); \ } while (0) +#define local_unlock_irqrestore_on(lvar, flags, cpu) \ + do { \ + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \ + } while (0) + #define local_spin_trylock_irq(lvar, lock) \ ({ \ int __locked; \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 364c3c6..1734913 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -224,9 +224,9 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock); #ifdef CONFIG_PREEMPT_RT_BASE # define cpu_lock_irqsave(cpu, flags) \ - spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) + local_lock_irqsave_on(pa_lock, flags, cpu) # define cpu_unlock_irqrestore(cpu, flags) \ - spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) + local_unlock_irqrestore_on(pa_lock, flags, cpu) #else # define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) # define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) -- cgit v0.10.2 From 71b9a567f055848e9c35dac3abac5c091e41e903 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Sep 2012 10:49:42 +0100 Subject: rt: rwsem/rwlock: lockdep annotations rwlocks and rwsems on RT do not allow multiple readers. Annotate the lockdep acquire functions accordingly. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/rt.c b/kernel/rt.c index 92a16e1..433ae42 100644 --- a/kernel/rt.c +++ b/kernel/rt.c @@ -216,15 +216,17 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) * write locked. 
*/ migrate_disable(); - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { ret = rt_mutex_trylock(lock); - else if (!rwlock->read_depth) + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + } else if (!rwlock->read_depth) { ret = 0; + } - if (ret) { + if (ret) rwlock->read_depth++; - rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); - } else + else migrate_enable(); return ret; @@ -242,13 +244,13 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) { struct rt_mutex *lock = &rwlock->lock; - rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); - /* * recursive read locks succeed when current owns the lock */ - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); + } rwlock->read_depth++; } @@ -264,11 +266,11 @@ EXPORT_SYMBOL(rt_write_unlock); void __lockfunc rt_read_unlock(rwlock_t *rwlock) { - rwlock_release(&rwlock->dep_map, 1, _RET_IP_); - /* Release the lock only when read_depth is down to 0 */ - if (--rwlock->read_depth == 0) + if (--rwlock->read_depth == 0) { + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); __rt_spin_unlock(&rwlock->lock); + } } EXPORT_SYMBOL(rt_read_unlock); @@ -315,9 +317,10 @@ EXPORT_SYMBOL(rt_up_write); void rt_up_read(struct rw_semaphore *rwsem) { - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - if (--rwsem->read_depth == 0) + if (--rwsem->read_depth == 0) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); rt_mutex_unlock(&rwsem->lock); + } } EXPORT_SYMBOL(rt_up_read); @@ -356,6 +359,13 @@ void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) } EXPORT_SYMBOL(rt_down_write_nested); +void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest) +{ + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} + int rt_down_read_trylock(struct rw_semaphore *rwsem) { struct rt_mutex *lock = &rwsem->lock; @@ -366,15 +376,16 @@ int rt_down_read_trylock(struct rw_semaphore *rwsem) * but not when read_depth == 0 which means that the rwsem is * write locked. */ - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { ret = rt_mutex_trylock(&rwsem->lock); - else if (!rwsem->read_depth) + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } else if (!rwsem->read_depth) { ret = 0; + } - if (ret) { + if (ret) rwsem->read_depth++; - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - } return ret; } EXPORT_SYMBOL(rt_down_read_trylock); @@ -383,10 +394,10 @@ static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) { struct rt_mutex *lock = &rwsem->lock; - rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); - - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); rt_mutex_lock(&rwsem->lock); + } rwsem->read_depth++; } -- cgit v0.10.2 From c8780171406409d60ee0986159c3744e7bc04aec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Oct 2012 08:56:15 +0100 Subject: sched: Better debug output for might sleep might sleep can tell us where interrupts have been disabled, but we have no idea what disabled preemption. Add some debug infrastructure. 
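For illustration, a minimal userspace sketch of the bookkeeping this adds: remember the caller address the first time the preempt count goes non-zero, and print it from the might-sleep style check. The function names are invented; only the pattern mirrors the patch.

#include <stdio.h>

static int preempt_count;
static void *preempt_disable_ip;

static __attribute__((noinline)) void my_preempt_disable(void)
{
        /* Record the caller the first time the count goes non-zero. */
        if (preempt_count++ == 0)
                preempt_disable_ip = __builtin_return_address(0);
}

static void my_preempt_enable(void)
{
        preempt_count--;
}

static void my_might_sleep(void)
{
        if (preempt_count)
                fprintf(stderr, "Preemption disabled at: %p\n",
                        preempt_disable_ip);
}

static void buggy_path(void)
{
        my_preempt_disable();
        my_might_sleep();       /* prints an address inside buggy_path() */
        my_preempt_enable();
}

int main(void)
{
        buggy_path();
        return 0;
}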
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index c002f80..62be398 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1643,6 +1643,9 @@ struct task_struct { pte_t kmap_pte[KM_TYPE_NR]; # endif #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e33174a..49079e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2791,8 +2791,13 @@ void __kprobes add_preempt_count(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(add_preempt_count); @@ -2835,6 +2840,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN); } @@ -7310,6 +7322,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); -- cgit v0.10.2 From 14f2f4ee58996ae7104b6e615c54c0a89c1873d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Oct 2012 17:21:53 +0100 Subject: stomp_machine: Use mutex_trylock when called from inactive cpu If the stop machinery is called from inactive CPU we cannot use mutex_lock, because some other stomp machine invokation might be in progress and the mutex can be contended. We cannot schedule from this context, so trylock and loop. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 561ba3a..e98c70b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -158,7 +158,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_done *done) + struct cpu_stop_done *done, bool inactive) { struct cpu_stop_work *work; unsigned int cpu; @@ -175,7 +175,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, * Make sure that all work is queued on all cpus before we * any of the cpus can execute it. */ - mutex_lock(&stopper_lock); + if (!inactive) { + mutex_lock(&stopper_lock); + } else { + while (!mutex_trylock(&stopper_lock)) + cpu_relax(); + } for_each_cpu(cpu, cpumask) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); @@ -188,7 +193,7 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + queue_stop_cpus_work(cpumask, fn, arg, &done, false); wait_for_stop_done(&done); return done.executed ? 
done.ret : -ENOENT; } @@ -601,7 +606,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, set_state(&smdata, STOPMACHINE_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, - &done); + &done, true); ret = stop_machine_cpu_stop(&smdata); /* Busy wait for completion. */ -- cgit v0.10.2 From e5f8c864024c3e505734674985e037b678671eda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Sep 2012 16:21:08 +0200 Subject: net: Another local_irq_disable/kmalloc headache Replace it by a local lock. Though that's pretty inefficient :( Signed-off-by: Thomas Gleixner diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 32443eb..39b45c0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock); #define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) #define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) @@ -359,7 +361,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) int order; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(netdev_alloc_lock, flags); nc = &__get_cpu_var(netdev_alloc_cache); if (unlikely(!nc->frag.page)) { refill: @@ -393,7 +395,7 @@ recycle: nc->frag.offset += fragsz; nc->pagecnt_bias--; end: - local_irq_restore(flags); + local_unlock_irqrestore(netdev_alloc_lock, flags); return data; } -- cgit v0.10.2 From 57692614f8f45b98d692007513a86b93eb456283 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Oct 2012 17:12:35 +0100 Subject: net: Use get_cpu_light() in ip_send_unicast_reply() Signed-off-by: Thomas Gleixner diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e98ed2..253692b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1508,7 +1508,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + get_cpu_light(); + inet = &__get_cpu_var(unicast_sock); inet->tos = arg->tos; sk = &inet->sk; @@ -1532,7 +1533,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ip_push_pending_frames(sk, &fl4); } - put_cpu_var(unicast_sock); + put_cpu_light(); ip_rt_put(rt); } -- cgit v0.10.2 From bfbe8537ee92986779697b3c0389992827877713 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Oct 2012 09:03:24 +0100 Subject: crypto: Convert crypto notifier chain to SRCU The crypto notifier deadlocks on RT. Though this can be a real deadlock on mainline as well due to fifo fair rwsems. The involved parties here are: [ 82.172678] swapper/0 S 0000000000000001 0 1 0 0x00000000 [ 82.172682] ffff88042f18fcf0 0000000000000046 ffff88042f18fc80 ffffffff81491238 [ 82.172685] 0000000000011cc0 0000000000011cc0 ffff88042f18c040 ffff88042f18ffd8 [ 82.172688] 0000000000011cc0 0000000000011cc0 ffff88042f18ffd8 0000000000011cc0 [ 82.172689] Call Trace: [ 82.172697] [] ? _raw_spin_unlock_irqrestore+0x6c/0x7a [ 82.172701] [] schedule+0x64/0x66 [ 82.172704] [] schedule_timeout+0x27/0xd0 [ 82.172708] [] ? unpin_current_cpu+0x1a/0x6c [ 82.172713] [] ? migrate_enable+0x12f/0x141 [ 82.172716] [] wait_for_common+0xbb/0x11f [ 82.172719] [] ? 
try_to_wake_up+0x182/0x182 [ 82.172722] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.172726] [] crypto_wait_for_test+0x49/0x6b [ 82.172728] [] crypto_register_alg+0x53/0x5a [ 82.172730] [] crypto_register_algs+0x33/0x72 [ 82.172734] [] ? aes_init+0x12/0x12 [ 82.172737] [] aesni_init+0x64/0x66 [ 82.172741] [] do_one_initcall+0x7f/0x13b [ 82.172744] [] kernel_init+0x199/0x22c [ 82.172747] [] ? loglevel+0x31/0x31 [ 82.172752] [] kernel_thread_helper+0x4/0x10 [ 82.172755] [] ? retint_restore_args+0x13/0x13 [ 82.172759] [] ? start_kernel+0x3ca/0x3ca [ 82.172761] [] ? gs_change+0x13/0x13 [ 82.174186] cryptomgr_test S 0000000000000001 0 41 2 0x00000000 [ 82.174189] ffff88042c971980 0000000000000046 ffffffff81d74830 0000000000000292 [ 82.174192] 0000000000011cc0 0000000000011cc0 ffff88042c96eb80 ffff88042c971fd8 [ 82.174195] 0000000000011cc0 0000000000011cc0 ffff88042c971fd8 0000000000011cc0 [ 82.174195] Call Trace: [ 82.174198] [] schedule+0x64/0x66 [ 82.174201] [] schedule_timeout+0x27/0xd0 [ 82.174204] [] ? unpin_current_cpu+0x1a/0x6c [ 82.174206] [] ? migrate_enable+0x12f/0x141 [ 82.174209] [] wait_for_common+0xbb/0x11f [ 82.174212] [] ? try_to_wake_up+0x182/0x182 [ 82.174215] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.174218] [] cryptomgr_notify+0x280/0x385 [ 82.174221] [] notifier_call_chain+0x6b/0x98 [ 82.174224] [] ? rt_down_read+0x10/0x12 [ 82.174227] [] __blocking_notifier_call_chain+0x70/0x8d [ 82.174230] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174234] [] crypto_probing_notify+0x24/0x50 [ 82.174236] [] crypto_alg_mod_lookup+0x3e/0x74 [ 82.174238] [] crypto_alloc_base+0x36/0x8f [ 82.174241] [] cryptd_alloc_ablkcipher+0x6e/0xb5 [ 82.174243] [] ? kzalloc.clone.5+0xe/0x10 [ 82.174246] [] ablk_init_common+0x1d/0x38 [ 82.174249] [] ablk_ecb_init+0x15/0x17 [ 82.174251] [] __crypto_alloc_tfm+0xc7/0x114 [ 82.174254] [] ? crypto_lookup_skcipher+0x1f/0xe4 [ 82.174256] [] crypto_alloc_ablkcipher+0x60/0xa5 [ 82.174258] [] alg_test_skcipher+0x24/0x9b [ 82.174261] [] ? finish_task_switch+0x3f/0xfa [ 82.174263] [] alg_test+0x16f/0x1d7 [ 82.174267] [] ? cryptomgr_probe+0xac/0xac [ 82.174269] [] cryptomgr_test+0x2c/0x47 [ 82.174272] [] kthread+0x7e/0x86 [ 82.174275] [] ? finish_task_switch+0xaf/0xfa [ 82.174278] [] kernel_thread_helper+0x4/0x10 [ 82.174281] [] ? retint_restore_args+0x13/0x13 [ 82.174284] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174287] [] ? gs_change+0x13/0x13 [ 82.174329] cryptomgr_probe D 0000000000000002 0 47 2 0x00000000 [ 82.174332] ffff88042c991b70 0000000000000046 ffff88042c991bb0 0000000000000006 [ 82.174335] 0000000000011cc0 0000000000011cc0 ffff88042c98ed00 ffff88042c991fd8 [ 82.174338] 0000000000011cc0 0000000000011cc0 ffff88042c991fd8 0000000000011cc0 [ 82.174338] Call Trace: [ 82.174342] [] schedule+0x64/0x66 [ 82.174344] [] __rt_mutex_slowlock+0x85/0xbe [ 82.174347] [] rt_mutex_slowlock+0xec/0x159 [ 82.174351] [] rt_mutex_fastlock.clone.8+0x29/0x2f [ 82.174353] [] rt_mutex_lock+0x33/0x37 [ 82.174356] [] __rt_down_read+0x50/0x5a [ 82.174358] [] ? rt_down_read+0x10/0x12 [ 82.174360] [] rt_down_read+0x10/0x12 [ 82.174363] [] __blocking_notifier_call_chain+0x58/0x8d [ 82.174366] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174369] [] crypto_probing_notify+0x24/0x50 [ 82.174372] [] crypto_wait_for_test+0x22/0x6b [ 82.174374] [] crypto_register_instance+0xb4/0xc0 [ 82.174377] [] cryptd_create+0x378/0x3b6 [ 82.174379] [] ? __crypto_lookup_template+0x5b/0x63 [ 82.174382] [] cryptomgr_probe+0x45/0xac [ 82.174385] [] ? 
crypto_alloc_pcomp+0x1b/0x1b [ 82.174388] [] kthread+0x7e/0x86 [ 82.174391] [] ? finish_task_switch+0xaf/0xfa [ 82.174394] [] kernel_thread_helper+0x4/0x10 [ 82.174398] [] ? retint_restore_args+0x13/0x13 [ 82.174401] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174403] [] ? gs_change+0x13/0x13 cryptomgr_test spawns the cryptomgr_probe thread from the notifier call. The probe thread fires the same notifier as the test thread and deadlocks on the rwsem on RT. Now this is a potential deadlock in mainline as well, because we have fifo fair rwsems. If another thread blocks with a down_write() on the notifier chain before the probe thread issues the down_read() it will block the probe thread and the whole party is dead locked. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/crypto/algapi.c b/crypto/algapi.c index c3b9bfe..3574066 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -683,13 +683,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&crypto_chain, nb); + return srcu_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&crypto_chain, nb); + return srcu_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); diff --git a/crypto/api.c b/crypto/api.c index 033a714..8ff072c 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); -BLOCKING_NOTIFIER_HEAD(crypto_chain); +SRCU_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) @@ -237,10 +237,10 @@ int crypto_probing_notify(unsigned long val, void *v) { int ok; - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); } return ok; diff --git a/crypto/internal.h b/crypto/internal.h index 9ebedae..8cbe3dc 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -48,7 +48,7 @@ struct crypto_larval { extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; -extern struct blocking_notifier_head crypto_chain; +extern struct srcu_notifier_head crypto_chain; #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); @@ -136,7 +136,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg) static inline void crypto_notify(unsigned long val, void *v) { - blocking_notifier_call_chain(&crypto_chain, val, v); + srcu_notifier_call_chain(&crypto_chain, val, v); } #endif /* _CRYPTO_INTERNAL_H */ -- cgit v0.10.2 From df25caf4fa02ed364b357a7ed9116525ca133b7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 13:32:46 +0100 Subject: x86: perf: Deal with kfree from atomic contexts The x86 perf code allocates memory upfront because it might need it. The detection that it is not needed happens in atomic context and calls kfree from there. RT cant do that. Use kfree_rcu instead. 
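The pattern, sketched in kernel-style C (illustrative only, not buildable outside a kernel tree; struct foo is a placeholder, not a real kernel type): give the object an rcu_head and let kfree_rcu() defer the actual free past a grace period instead of calling kfree() from atomic context.

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
        int data;
        struct rcu_head rcu;    /* added so the free can be deferred */
};

static void drop_foo_atomic(struct foo *p)
{
        /* No direct kfree() with preemption/irqs off: defer the free to
         * a grace period instead, which is safe on RT. */
        kfree_rcu(p, rcu);
}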
Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 115c1ea..f50cca1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -108,6 +108,7 @@ struct intel_shared_regs { struct er_account regs[EXTRA_REG_MAX]; int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ + struct rcu_head rcu; }; #define MAX_LBR_ENTRIES 16 diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 70602f8..0632237 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1721,7 +1721,7 @@ static void intel_pmu_cpu_dying(int cpu) pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); + kfree_rcu(pc, rcu); cpuc->shared_regs = NULL; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index b43200d..85f64b3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2636,7 +2636,7 @@ static void __cpuinit uncore_cpu_dying(int cpu) box = *per_cpu_ptr(pmu->box, cpu); *per_cpu_ptr(pmu->box, cpu) = NULL; if (box && atomic_dec_and_test(&box->refcnt)) - kfree(box); + kfree_rcu(box, rcu); } } } @@ -2666,7 +2666,8 @@ static int __cpuinit uncore_cpu_starting(int cpu) if (exist && exist->phys_id == phys_id) { atomic_inc(&exist->refcnt); *per_cpu_ptr(pmu->box, cpu) = exist; - kfree(box); + if (box) + kfree_rcu(box, rcu); box = NULL; break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index e68a455..c4e1028 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -421,6 +421,7 @@ struct intel_uncore_box { struct hrtimer hrtimer; struct list_head list; struct intel_uncore_extra_reg shared_regs[0]; + struct rcu_head rcu; }; #define UNCORE_BOX_FLAG_INITIATED 0 -- cgit v0.10.2 From 9d61e5f91b0faf8426e816b7a91b25c4906c56a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:30:25 +0100 Subject: softirq: Make serving softirqs a task flag Avoid the percpu softirq_runner pointer magic by using a task flag. 
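Reduced to a userspace model (all names invented for the example): the per-CPU "runner" pointer comparison becomes a simple bit test on the current task's own flag word.

#include <stdio.h>

#define PF_IN_SOFTIRQ 0x00000001u

struct task { unsigned int flags; };

static _Thread_local struct task current_task;  /* stand-in for "current" */

static int in_serving_softirq(void)
{
        return current_task.flags & PF_IN_SOFTIRQ;      /* plain bit test */
}

static void do_softirq_body(void)
{
        current_task.flags |= PF_IN_SOFTIRQ;
        printf("inside handler:  %d\n", in_serving_softirq());  /* 1 */
        current_task.flags &= ~PF_IN_SOFTIRQ;
}

int main(void)
{
        do_softirq_body();
        printf("outside handler: %d\n", in_serving_softirq());  /* 0 */
        return 0;
}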
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 62be398..a3047e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1857,6 +1857,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/softirq.c b/kernel/softirq.c index f52cdbc..0a810e4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -375,7 +375,6 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } * On RT we serialize softirq execution with a cpu local lock */ static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); -static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); static void __do_softirq_common(int need_rcu_bh_qs); @@ -430,22 +429,9 @@ void _local_bh_enable(void) } EXPORT_SYMBOL(_local_bh_enable); -/* For tracing */ -int notrace __in_softirq(void) -{ - if (__get_cpu_var(local_softirq_lock).owner == current) - return __get_cpu_var(local_softirq_lock).nestcnt; - return 0; -} - int in_serving_softirq(void) { - int res; - - preempt_disable(); - res = __get_cpu_var(local_softirq_runner) == current; - preempt_enable(); - return res; + return current->flags & PF_IN_SOFTIRQ; } EXPORT_SYMBOL(in_serving_softirq); @@ -463,7 +449,7 @@ static void __do_softirq_common(int need_rcu_bh_qs) /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - __get_cpu_var(local_softirq_runner) = current; + current->flags |= PF_IN_SOFTIRQ; lockdep_softirq_enter(); @@ -474,7 +460,7 @@ static void __do_softirq_common(int need_rcu_bh_qs) wakeup_softirqd(); lockdep_softirq_exit(); - __get_cpu_var(local_softirq_runner) = NULL; + current->flags &= ~PF_IN_SOFTIRQ; current->softirq_nestcnt--; } -- cgit v0.10.2 From 4ab4e5b7f1759389f4af22b93dc975070a2e6e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 15:33:53 +0100 Subject: softirq: Split handling function Split out the inner handling function, so RT can reuse it. 
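The shape of the refactor in plain C (illustrative; the handler and names are made up): the per-vector body moves into its own function so a caller that wants exactly one vector can reuse it without going through the bitmask loop.

#include <stdio.h>

#define NR_SOFTIRQS 10

static void (*softirq_vec[NR_SOFTIRQS])(void);

static void handle_softirq(unsigned int vec_nr)
{
        if (softirq_vec[vec_nr])
                softirq_vec[vec_nr]();          /* one vector, callable on its own */
}

static void handle_pending_softirqs(unsigned int pending)
{
        unsigned int vec_nr;

        for (vec_nr = 0; pending; vec_nr++, pending >>= 1)
                if (pending & 1)
                        handle_softirq(vec_nr); /* the loop only walks the bitmask */
}

static void timer_action(void) { puts("timer softirq"); }

int main(void)
{
        softirq_vec[1] = timer_action;
        handle_pending_softirqs(1u << 1);       /* whole-bitmask caller */
        handle_softirq(1);                      /* single-vector caller (the RT reuse) */
        return 0;
}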
Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 0a810e4..e9167de 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,31 +142,34 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) { - struct softirq_action *h = softirq_vec; + struct softirq_action *h = softirq_vec + vec_nr; unsigned int prev_count = preempt_count(); - local_irq_enable(); - for ( ; pending; h++, pending >>= 1) { - unsigned int vec_nr = h - softirq_vec; + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); - if (!(pending & 1)) - continue; + if (unlikely(prev_count != preempt_count())) { + pr_err("softirq %u %s %p preempt count leak: %08x -> %08x\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + if (need_rcu_bh_qs) + rcu_bh_qs(cpu); +} - kstat_incr_softirqs_this_cpu(vec_nr); - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR - "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, (unsigned int) preempt_count()); - preempt_count() = prev_count; - } - if (need_rcu_bh_qs) - rcu_bh_qs(cpu); +static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +{ + unsigned int vec_nr; + + local_irq_enable(); + for (vec_nr = 0; pending; vec_nr++, pending >>= 1) { + if (pending & 1) + handle_softirq(vec_nr, cpu, need_rcu_bh_qs); } local_irq_disable(); } -- cgit v0.10.2 From 68a15d43a442eee66cd79ab7b6c3545e3c87b347 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:20:47 +0100 Subject: softirq: Split softirq locks The 3.x RT series removed the split softirq implementation in favour of pushing softirq processing into the context of the thread which raised it. Though this prevents us from handling the various softirqs at different priorities. Now instead of reintroducing the split softirq threads we split the locks which serialize the softirq processing. If a softirq is raised in context of a thread, then the softirq is noted on a per thread field, if the thread is in a bh disabled region. If the softirq is raised from hard interrupt context, then the bit is set in the flag field of ksoftirqd and ksoftirqd is invoked. When a thread leaves a bh disabled region, then it tries to execute the softirqs which have been raised in its own context. It acquires the per softirq / per cpu lock for the softirq and then checks, whether the softirq is still pending in the per cpu local_softirq_pending() field. If yes, it runs the softirq. If no, then some other task executed it already. This allows for zero config softirq elevation in the context of user space tasks or interrupt threads. 
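A very reduced userspace model of the scheme (illustrative only: the real locks are per CPU as well as per softirq, and raises from hard interrupt context are delegated to ksoftirqd, neither of which is modelled here): raising sets a bit on the raising task, and leaving the bh-disabled region runs each pending vector under its own lock.

#include <stdio.h>
#include <pthread.h>

#define NR_SOFTIRQS 10

static pthread_mutex_t softirq_locks[NR_SOFTIRQS];      /* one lock per vector */
static _Thread_local unsigned int softirqs_raised;      /* per-"task" pending bits */

static void raise_softirq(unsigned int nr)
{
        softirqs_raised |= 1u << nr;    /* noted on the raising task, run later */
}

static void run_vector(unsigned int nr)
{
        printf("running softirq %u\n", nr);
}

static void local_bh_enable(void)
{
        while (softirqs_raised) {
                unsigned int nr = __builtin_ctz(softirqs_raised);

                softirqs_raised &= ~(1u << nr);
                pthread_mutex_lock(&softirq_locks[nr]);         /* serialize per vector */
                run_vector(nr);
                pthread_mutex_unlock(&softirq_locks[nr]);
        }
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < NR_SOFTIRQS; i++)
                pthread_mutex_init(&softirq_locks[i], NULL);

        raise_softirq(1);
        raise_softirq(3);
        local_bh_enable();      /* runs vectors 1 and 3, each under its own lock */
        return 0;
}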
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index a3047e1..63f4ad6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1636,6 +1636,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; int softirq_nestcnt; + unsigned int softirqs_raised; #endif #ifdef CONFIG_PREEMPT_RT_FULL # if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 diff --git a/kernel/softirq.c b/kernel/softirq.c index e9167de..53e37a9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -162,6 +162,12 @@ static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) rcu_bh_qs(cpu); } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline int ksoftirqd_softirq_pending(void) +{ + return local_softirq_pending(); +} + static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) { unsigned int vec_nr; @@ -174,7 +180,19 @@ static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) local_irq_disable(); } -#ifndef CONFIG_PREEMPT_RT_FULL +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (ksoftirqd_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; + } + local_irq_enable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -367,6 +385,32 @@ asmlinkage void do_softirq(void) #endif +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } static void ksoftirqd_set_sched_params(unsigned int cpu) { } @@ -375,20 +419,78 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ /* - * On RT we serialize softirq execution with a cpu local lock + * On RT we serialize softirq execution with a cpu local lock per softirq */ -static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); + +void __init softirq_early_init(void) +{ + int i; -static void __do_softirq_common(int need_rcu_bh_qs); + for (i = 0; i < NR_SOFTIRQS; i++) + local_irq_lock_init(local_softirq_locks[i]); +} -void __do_softirq(void) +static void lock_softirq(int which) { - __do_softirq_common(0); + __local_lock(&__get_cpu_var(local_softirq_locks[which])); } -void __init softirq_early_init(void) +static void unlock_softirq(int which) +{ + __local_unlock(&__get_cpu_var(local_softirq_locks[which])); +} + +static void do_single_softirq(int which, int need_rcu_bh_qs) { - local_irq_lock_init(local_softirq_lock); + unsigned long old_flags = current->flags; + + current->flags &= ~PF_MEMALLOC; + vtime_account(current); + current->flags |= PF_IN_SOFTIRQ; + lockdep_softirq_enter(); + local_irq_enable(); + handle_softirq(which, smp_processor_id(), need_rcu_bh_qs); + local_irq_disable(); + lockdep_softirq_exit(); + current->flags &= ~PF_IN_SOFTIRQ; + vtime_account(current); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); +} + +/* + * Called with interrupts disabled. Process softirqs which were raised + * in current context (or on behalf of ksoftirqd). + */ +static void do_current_softirqs(int need_rcu_bh_qs) +{ + while (current->softirqs_raised) { + int i = __ffs(current->softirqs_raised); + unsigned int pending, mask = (1U << i); + + current->softirqs_raised &= ~mask; + local_irq_enable(); + + /* + * If the lock is contended, we boost the owner to + * process the softirq or leave the critical section + * now. + */ + lock_softirq(i); + local_irq_disable(); + /* + * Check with the local_softirq_pending() bits, + * whether we need to process this still or if someone + * else took care of it. + */ + pending = local_softirq_pending(); + if (pending & mask) { + set_softirq_pending(pending & ~mask); + do_single_softirq(i, need_rcu_bh_qs); + } + unlock_softirq(i); + WARN_ON(current->softirq_nestcnt != 1); + } } void local_bh_disable(void) @@ -403,17 +505,11 @@ void local_bh_enable(void) if (WARN_ON(current->softirq_nestcnt == 0)) return; - if ((current->softirq_nestcnt == 1) && - local_softirq_pending() && - local_trylock(local_softirq_lock)) { + local_irq_disable(); + if (current->softirq_nestcnt == 1 && current->softirqs_raised) + do_current_softirqs(1); + local_irq_enable(); - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - local_unlock(local_softirq_lock); - WARN_ON(current->softirq_nestcnt != 1); - } current->softirq_nestcnt--; migrate_enable(); } @@ -438,86 +534,82 @@ int in_serving_softirq(void) } EXPORT_SYMBOL(in_serving_softirq); -/* - * Called with bh and local interrupts disabled. For full RT cpu must - * be pinned. 
- */ -static void __do_softirq_common(int need_rcu_bh_qs) +/* Called with preemption disabled */ +static void run_ksoftirqd(unsigned int cpu) { - u32 pending = local_softirq_pending(); - int cpu = smp_processor_id(); - + local_irq_disable(); current->softirq_nestcnt++; - - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - current->flags |= PF_IN_SOFTIRQ; - - lockdep_softirq_enter(); - - handle_pending_softirqs(pending, cpu, need_rcu_bh_qs); - - pending = local_softirq_pending(); - if (pending) - wakeup_softirqd(); - - lockdep_softirq_exit(); - current->flags &= ~PF_IN_SOFTIRQ; - + do_current_softirqs(1); current->softirq_nestcnt--; + rcu_note_context_switch(cpu); + local_irq_enable(); } -static int __thread_do_softirq(int cpu) +/* + * Called from netif_rx_ni(). Preemption enabled, but migration + * disabled. So the cpu can't go away under us. + */ +void thread_do_softirq(void) { + if (!in_serving_softirq() && current->softirqs_raised) { + current->softirq_nestcnt++; + do_current_softirqs(0); + current->softirq_nestcnt--; + } +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); + /* - * Prevent the current cpu from going offline. - * pin_current_cpu() can reenable preemption and block on the - * hotplug mutex. When it returns, the current cpu is - * pinned. It might be the wrong one, but the offline check - * below catches that. + * If we are not in a hard interrupt and inside a bh disabled + * region, we simply raise the flag on current. local_bh_enable() + * will make sure that the softirq is executed. Otherwise we + * delegate it to ksoftirqd. */ - pin_current_cpu(); + if (!in_irq() && current->softirq_nestcnt) + current->softirqs_raised |= (1U << nr); + else if (__this_cpu_read(ksoftirqd)) + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); +} + +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + /* - * If called from ksoftirqd (cpu >= 0) we need to check - * whether we are on the wrong cpu due to cpu offlining. If - * called via thread_do_softirq() no action required. + * If we're in an hard interrupt we let irq return code deal + * with the wakeup of ksoftirqd. */ - if (cpu >= 0 && cpu_is_offline(cpu)) { - unpin_current_cpu(); - return -1; - } - preempt_enable(); - local_lock(local_softirq_lock); - local_irq_disable(); + if (in_irq()) + return; + /* - * We cannot switch stacks on RT as we want to be able to - * schedule! + * If we are in thread context but outside of a bh disabled + * region, we need to wake ksoftirqd as well. + * + * CHECKME: Some of the places which do that could be wrapped + * into local_bh_disable/enable pairs. Though it's unclear + * whether this is worth the effort. To find those places just + * raise a WARN() if the condition is met. */ - if (local_softirq_pending()) - __do_softirq_common(cpu >= 0); - local_unlock(local_softirq_lock); - unpin_current_cpu(); - preempt_disable(); - local_irq_enable(); - return 0; + if (!current->softirq_nestcnt) + wakeup_softirqd(); } -/* - * Called from netif_rx_ni(). Preemption enabled. 
- */ -void thread_do_softirq(void) +void do_raise_softirq_irqoff(unsigned int nr) { - if (!in_serving_softirq()) { - preempt_disable(); - __thread_do_softirq(-1); - preempt_enable(); - } + raise_softirq_irqoff(nr); } -static int ksoftirqd_do_softirq(int cpu) +static inline int ksoftirqd_softirq_pending(void) { - return __thread_do_softirq(cpu); + return current->softirqs_raised; } static inline void local_bh_disable_nort(void) { } @@ -528,6 +620,10 @@ static inline void ksoftirqd_set_sched_params(unsigned int cpu) struct sched_param param = { .sched_priority = 1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); + /* Take over all pending softirqs when starting */ + local_irq_disable(); + current->softirqs_raised = local_softirq_pending(); + local_irq_enable(); } static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) @@ -574,8 +670,14 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } -#else +#else /* PREEMPT_RT_FULL */ + unsigned long flags; + + local_irq_save(flags); + if (__this_cpu_read(ksoftirqd) && + __this_cpu_read(ksoftirqd)->softirqs_raised) wakeup_softirqd(); + local_irq_restore(flags); #endif } @@ -599,26 +701,6 @@ void irq_exit(void) sched_preempt_enable_no_resched(); } -/* - * This function must run with irqs disabled! - */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); -} - void raise_softirq(unsigned int nr) { unsigned long flags; @@ -628,12 +710,6 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} - void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; @@ -1079,20 +1155,7 @@ EXPORT_SYMBOL(tasklet_unlock_wait); static int ksoftirqd_should_run(unsigned int cpu) { - return local_softirq_pending(); -} - -static void run_ksoftirqd(unsigned int cpu) -{ - local_irq_disable(); - if (local_softirq_pending()) { - __do_softirq(); - rcu_note_context_switch(cpu); - local_irq_enable(); - cond_resched(); - return; - } - local_irq_enable(); + return ksoftirqd_softirq_pending(); } #ifdef CONFIG_HOTPLUG_CPU -- cgit v0.10.2 From 32ff6bc6303cd5ea69f69bdf7a89aeabe2e94c95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Oct 2012 18:36:51 +0200 Subject: rcu: rcutiny: Prevent RCU stall rcu_read_unlock_special() checks in_serving_softirq() and leaves early when true. On RT this is obviously wrong as softirq processing context can be preempted and therefor such a task can be on the gp_tasks list. Leaving early here will leave the task on the list and therefor block RCU processing forever. This cannot happen on mainline because softirq processing context cannot be preempted and therefor this can never happen at all. In fact this check looks quite questionable in general. Neither irq context nor softirq processing context in mainline can ever be preempted in mainline so the special unlock case should not ever be invoked in such context. Now the only explanation might be a rcu_read_unlock() being interrupted and therefor leave the rcu nest count at 0 before the special unlock bit has been cleared. 
That looks fragile. At least it's missing a big fat comment. Paul ???? See mainline commits: ec433f0c5 and 8762705a for further enlightment. Reported-by: Kristian Lehmann Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a..ec3446e 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -560,7 +560,7 @@ void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_cpu_qs(); /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { local_irq_restore(flags); return; } -- cgit v0.10.2 From 60bece6add8ca31ec0605a4998f2d4082a56762f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Oct 2012 10:32:35 +0100 Subject: mm: Enable SLUB for RT Make SLUB RT aware and remove the restriction in Kconfig. Signed-off-by: Thomas Gleixner diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9db4825..a58ad34 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -54,7 +54,7 @@ struct kmem_cache_cpu { }; struct kmem_cache_node { - spinlock_t list_lock; /* Protect partial list and nr_partial */ + raw_spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG diff --git a/init/Kconfig b/init/Kconfig index f6a406d..54b18f0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1494,7 +1494,6 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" - depends on !PREEMPT_RT_FULL help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 3d2308f..f8cfc46 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1070,7 +1070,7 @@ static noinline struct kmem_cache_node *free_debug_processing( { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - spin_lock_irqsave(&n->list_lock, *flags); + raw_spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1118,7 +1118,7 @@ out: fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); + raw_spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); return NULL; } @@ -1253,6 +1253,12 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) {} #endif /* CONFIG_SLUB_DEBUG */ +struct slub_free_list { + raw_spinlock_t lock; + struct list_head list; +}; +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); + /* * Slab allocation and freeing */ @@ -1277,7 +1283,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_enable(); flags |= s->allocflags; @@ -1317,7 +1327,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_disable(); if (!page) return NULL; @@ -1414,6 +1428,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_memcg_kmem_pages(page, order); } +static void free_delayed(struct kmem_cache *s, struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_slab(s, page); + } +} + #define 
need_reserve_slab_rcu \ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) @@ -1448,6 +1472,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) } call_rcu(head, rcu_free_slab); + } else if (irqs_disabled()) { + struct slub_free_list *f = &__get_cpu_var(slub_free_list); + + raw_spin_lock(&f->lock); + list_add(&page->lru, &f->list); + raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1549,7 +1579,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t; int available; @@ -1574,7 +1604,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return object; } @@ -1816,7 +1846,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } else { m = M_FULL; @@ -1827,7 +1857,7 @@ redo: * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } @@ -1862,7 +1892,7 @@ redo: goto redo; if (lock) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); @@ -1893,10 +1923,10 @@ static void unfreeze_partials(struct kmem_cache *s, n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); n = n2; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } do { @@ -1925,7 +1955,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -1961,14 +1991,21 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > s->cpu_partial) { + struct slub_free_list *f; unsigned long flags; + LIST_HEAD(tofree); /* * partial array is full. Move the existing * set to the per node partial list. 
*/ local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); oldpage = NULL; pobjects = 0; pages = 0; @@ -2031,7 +2068,22 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { + LIST_HEAD(tofree); + int cpu; + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + for_each_online_cpu(cpu) { + struct slub_free_list *f; + + if (!has_cpu_slab(cpu, s)) + continue; + + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock_irq(&f->lock); + free_delayed(s, &tofree); + } } /* @@ -2059,10 +2111,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return x; } @@ -2205,9 +2257,11 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { + struct slub_free_list *f; void *freelist; struct page *page; unsigned long flags; + LIST_HEAD(tofree); local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -2270,7 +2324,13 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); +out: + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); return freelist; new_slab: @@ -2288,9 +2348,7 @@ new_slab: if (unlikely(!freelist)) { if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); - - local_irq_restore(flags); - return NULL; + goto out; } page = c->page; @@ -2304,8 +2362,7 @@ new_slab: deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); - return freelist; + goto out; } /* @@ -2477,7 +2534,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, do { if (unlikely(n)) { - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; @@ -2507,7 +2564,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. 
*/ - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); } } @@ -2548,7 +2605,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2562,7 +2619,7 @@ slab_empty: /* Slab must be on the full list */ remove_full(s, page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2764,7 +2821,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - spin_lock_init(&n->list_lock); + raw_spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3451,7 +3508,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); /* * Build lists indexed by the items in use in each slab. @@ -3472,7 +3529,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ list_for_each_entry_safe(page, t, slabs_by_inuse, lru) @@ -3642,6 +3699,12 @@ void __init kmem_cache_init(void) boot_kmem_cache_node; int i; int caches = 2; + int cpu; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4033,7 +4096,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { validate_slab_slab(s, page, map); @@ -4056,7 +4119,7 @@ static int validate_slab_node(struct kmem_cache *s, atomic_long_read(&n->nr_slabs)); out: - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4246,12 +4309,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) process_slab(&t, s, page, alloc, map); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { -- cgit v0.10.2 From d9d589cc96b6aabd8c5c1fb1c61a824aaef4d385 Mon Sep 17 00:00:00 2001 From: Watanabe Date: Sun, 28 Oct 2012 11:13:44 +0100 Subject: hrtimer: Raise softirq if hrtimer irq stalled When the hrtimer stall detection hits the softirq is not raised. 
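The fix below funnels every exit path of the hrtimer interrupt through one common out: label so the pending raise can no longer be skipped on the stall path. As a purely illustrative aside (not part of the patch; all names are invented), the control-flow pattern in miniature:

#include <stdbool.h>
#include <stdio.h>

static void raise_hrtimer_softirq(void)
{
    printf("HRTIMER_SOFTIRQ raised\n");
}

/* Before: only the fast path honoured 'raise'; the stall path lost it. */
static void hrtimer_irq_old(bool reprogram_ok, bool raise)
{
    if (reprogram_ok) {
        if (raise)
            raise_hrtimer_softirq();
        return;
    }
    /* ... stall/hang handling ... */
    /* bug: returns without honouring 'raise' */
}

/* After: both paths funnel through a single exit label. */
static void hrtimer_irq_new(bool reprogram_ok, bool raise)
{
    if (reprogram_ok)
        goto out;
    /* ... stall/hang handling ... */
out:
    if (raise)
        raise_hrtimer_softirq();
}

int main(void)
{
    hrtimer_irq_old(false, true);   /* prints nothing: the raise is lost */
    hrtimer_irq_new(false, true);   /* prints the raise */
    return 0;
}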
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f39d778..c2c58f9 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1529,11 +1529,7 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; - - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return; + goto out; } /* @@ -1577,6 +1573,9 @@ retry: tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); +out: + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* -- cgit v0.10.2 From b00e2f4427929a0cb6cd9e140309be249c65bcee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:26:09 +0000 Subject: rcu: Disable RCU_FAST_NO_HZ on RT This uses a timer_list timer from the irq disabled guts of the idle code. Disable it for now to prevent wreckage. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/init/Kconfig b/init/Kconfig index 54b18f0..d0590c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -579,7 +579,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on NO_HZ && SMP && !PREEMPT_RT_FULL default n help This option causes RCU to attempt to accelerate grace periods in -- cgit v0.10.2 From 46d2675b9bf494fc50e46463363e6f48a582b144 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 11:18:08 +0100 Subject: net: netfilter: Serialize xt_write_recseq sections on RT The netfilter code relies only on the implicit semantics of local_bh_disable() for serializing wt_write_recseq sections. RT breaks that and needs explicit serialization here. Reported-by: Peter LaDow Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/locallock.h b/include/linux/locallock.h index f1804a3..a5eea5d 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -25,6 +25,9 @@ struct local_irq_lock { DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \ + DECLARE_PER_CPU(struct local_irq_lock, lvar) + #define local_irq_lock_init(lvar) \ do { \ int __cpu; \ @@ -220,6 +223,7 @@ static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, #else /* PREEMPT_RT_BASE */ #define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar static inline void local_irq_lock_init(int lvar) { } diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index dd49566..7d083af 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include +#include #include /** @@ -284,6 +285,8 @@ extern void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock); + /** * xt_write_recseq_begin - start of a write section * @@ -298,6 +301,9 @@ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; + /* RT protection */ + local_lock(xt_write_lock); + /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). 
@@ -328,6 +334,7 @@ static inline void xt_write_recseq_end(unsigned int addend) /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); + local_unlock(xt_write_lock); } /* diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a9c488b..c646ec8 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -20,11 +20,17 @@ #include #include #include +#include #include #include #include "nf_internals.h" +#ifdef CONFIG_PREEMPT_RT_BASE +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock); +EXPORT_PER_CPU_SYMBOL(xt_write_lock); +#endif + static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; -- cgit v0.10.2 From 2ff985d671a2fbc200ca39500e68bb065b7f5d66 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:46:16 +0000 Subject: softirq: Adapt NOHZ softirq pending check to new RT scheme We can't rely on ksoftirqd anymore and we need to check the tasks which run a particular softirq and if such a task is pi blocked ignore the other pending bits of that task as well. Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 53e37a9..8447c8d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -66,46 +66,71 @@ char *softirq_to_name[NR_SOFTIRQS] = { #ifdef CONFIG_NO_HZ # ifdef CONFIG_PREEMPT_RT_FULL + +struct softirq_runner { + struct task_struct *runner[NR_SOFTIRQS]; +}; + +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); + +static inline void softirq_set_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = current; +} + +static inline void softirq_clr_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = NULL; +} + /* - * On preempt-rt a softirq might be blocked on a lock. There might be - * no other runnable task on this CPU because the lock owner runs on - * some other CPU. So we have to go into idle with the pending bit - * set. Therefor we need to check this otherwise we warn about false - * positives which confuses users and defeats the whole purpose of - * this test. + * On preempt-rt a softirq running context might be blocked on a + * lock. There might be no other runnable task on this CPU because the + * lock owner runs on some other CPU. So we have to go into idle with + * the pending bit set. Therefor we need to check this otherwise we + * warn about false positives which confuses users and defeats the + * whole purpose of this test. * * This code is called with interrupts disabled. */ void softirq_check_pending_idle(void) { static int rate_limit; - u32 warnpending = 0, pending; + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + u32 warnpending; + int i; if (rate_limit >= 10) return; - pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; - if (pending) { - struct task_struct *tsk; + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + for (i = 0; i < NR_SOFTIRQS; i++) { + struct task_struct *tsk = sr->runner[i]; - tsk = __get_cpu_var(ksoftirqd); /* * The wakeup code in rtmutex.c wakes up the task * _before_ it sets pi_blocked_on to NULL under * tsk->pi_lock. So we need to check for both: state * and pi_blocked_on. 
*/ - raw_spin_lock(&tsk->pi_lock); - - if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) - warnpending = 1; - - raw_spin_unlock(&tsk->pi_lock); + if (tsk) { + raw_spin_lock(&tsk->pi_lock); + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { + /* Clear all bits pending in that task */ + warnpending &= ~(tsk->softirqs_raised); + warnpending &= ~(1 << i); + } + raw_spin_unlock(&tsk->pi_lock); + } } if (warnpending) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - pending); + warnpending); rate_limit++; } } @@ -125,6 +150,10 @@ void softirq_check_pending_idle(void) } } # endif + +#else /* !NO_HZ */ +static inline void softirq_set_runner(unsigned int sirq) { } +static inline void softirq_clr_runner(unsigned int sirq) { } #endif /* @@ -478,6 +507,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) */ lock_softirq(i); local_irq_disable(); + softirq_set_runner(i); /* * Check with the local_softirq_pending() bits, * whether we need to process this still or if someone @@ -488,6 +518,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) set_softirq_pending(pending & ~mask); do_single_softirq(i, need_rcu_bh_qs); } + softirq_clr_runner(i); unlock_softirq(i); WARN_ON(current->softirq_nestcnt != 1); } @@ -558,7 +589,7 @@ void thread_do_softirq(void) } } -void __raise_softirq_irqoff(unsigned int nr) +static void do_raise_softirq_irqoff(unsigned int nr) { trace_softirq_raise(nr); or_softirq_pending(1UL << nr); @@ -575,12 +606,19 @@ void __raise_softirq_irqoff(unsigned int nr) __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); } +void __raise_softirq_irqoff(unsigned int nr) +{ + do_raise_softirq_irqoff(nr); + if (!in_irq() && !current->softirq_nestcnt) + wakeup_softirqd(); +} + /* * This function must run with irqs disabled! */ void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + do_raise_softirq_irqoff(nr); /* * If we're in an hard interrupt we let irq return code deal @@ -602,11 +640,6 @@ void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -void do_raise_softirq_irqoff(unsigned int nr) -{ - raise_softirq_irqoff(nr); -} - static inline int ksoftirqd_softirq_pending(void) { return current->softirqs_raised; -- cgit v0.10.2 From 3ef432f02b82cacea0d4a14363e4f60c9684d28a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 15:12:49 +0000 Subject: net: Use local_bh_disable in netif_rx_ni() This code triggers the new WARN in __raise_softirq_irqsoff() though it actually looks at the softirq pending bit and calls into the softirq code, but that fits not well with the context related softirq model of RT. It's correct on mainline though, but going through local_bh_disable/enable here is not going to hurt badly. Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index aa529f7..b5e6b50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3117,11 +3117,9 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - migrate_disable(); + local_bh_disable(); err = netif_rx(skb); - if (local_softirq_pending()) - thread_do_softirq(); - migrate_enable(); + local_bh_enable(); return err; } -- cgit v0.10.2 From 8bd8b048686c0bad0112c85ef0417078a2d64a06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Oct 2012 18:50:54 +0100 Subject: sched: Add support for lazy preemption It has become an obsession to mitigate the determinism vs. throughput loss of RT. Looking at the mainline semantics of preemption points gives a hint why RT sucks throughput wise for ordinary SCHED_OTHER tasks. 
One major issue is the wakeup of tasks which immediately preempt the waking task while the waking task holds a lock on which the woken task will block right after having preempted the waker. In mainline this is prevented by the implicit preemption disable of spin/rw_lock held regions. On RT this is not possible due to the fully preemptible nature of sleeping spinlocks. Though for a SCHED_OTHER task preempting another SCHED_OTHER task this is really not a correctness issue. RT folks are concerned about SCHED_FIFO/RR task preemption and not about the purely fairness driven SCHED_OTHER preemption latencies.

So I introduced a lazy preemption mechanism which only applies to SCHED_OTHER tasks preempting another SCHED_OTHER task. Aside from the existing preempt_count, each task now sports a preempt_lazy_count which is manipulated on lock acquisition and release. This is slightly incorrect, as for laziness reasons I coupled it to migrate_disable/enable, so some other mechanisms get the same treatment (e.g. get_cpu_light).

On the scheduler side, instead of setting NEED_RESCHED this sets NEED_RESCHED_LAZY in case of a SCHED_OTHER/SCHED_OTHER preemption and therefore allows the waking task to exit the lock-held region before the woken task preempts. That also works better for cross-CPU wakeups as the other side can stay in the adaptive spinning loop.

For RT class preemption there is no change. This simply sets NEED_RESCHED and forgoes the lazy preemption counter.

Initial tests do not expose any observable latency increase, but history shows that I've been proven wrong before :)

The lazy preemption mode is on by default, but with CONFIG_SCHED_DEBUG enabled it can be disabled via:

  # echo NO_PREEMPT_LAZY >/sys/kernel/debug/sched_features

and re-enabled via:

  # echo PREEMPT_LAZY >/sys/kernel/debug/sched_features

The test results so far are very machine and workload dependent, but there is a clear trend that it enhances the non-RT workload performance.
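For readers skimming the diff, here is a tiny userspace model of the decision logic the patch implements in preempt_schedule() and the architecture entry code; it is only a sketch and every name in it is made up:

#include <stdbool.h>
#include <stdio.h>

struct task_model {
    int preempt_count;          /* 0 => preemptible */
    int preempt_lazy_count;     /* bumped in migrate_disable() regions */
    bool need_resched;          /* TIF_NEED_RESCHED: RT-class wakeup */
    bool need_resched_lazy;     /* TIF_NEED_RESCHED_LAZY: SCHED_OTHER wakeup */
};

/* Would an interrupt return / preempt_schedule() switch tasks now? */
static bool should_preempt(const struct task_model *t)
{
    if (t->preempt_count)
        return false;                      /* hard preempt disable wins */
    if (t->need_resched)
        return true;                       /* RT preemption: always honoured */
    if (t->need_resched_lazy)
        return t->preempt_lazy_count == 0; /* deferred while lazy-disabled */
    return false;
}

int main(void)
{
    struct task_model t = { .preempt_lazy_count = 1, .need_resched_lazy = true };

    printf("lazy request inside migrate_disable(): %d\n", should_preempt(&t)); /* 0 */
    t.preempt_lazy_count = 0;
    printf("lazy request after migrate_enable():  %d\n", should_preempt(&t)); /* 1 */
    t.need_resched = true;
    t.preempt_lazy_count = 1;
    printf("RT request, lazy count ignored:       %d\n", should_preempt(&t)); /* 1 */
    return 0;
}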
Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 806c02b..16ad63d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -51,6 +51,7 @@ struct trace_entry { int pid; unsigned short migrate_disable; unsigned short padding; + unsigned char preempt_lazy_count; }; #define FTRACE_MAX_EVENT \ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 9978ec5..a7f4212 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -23,15 +23,38 @@ #define preempt_count() (current_thread_info()->preempt_count) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +# ifdef CONFIG_PREEMPT_LAZY +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY))) \ + preempt_schedule(); \ +} while (0) +# else #define preempt_check_resched() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0) +# endif #else /* !CONFIG_PREEMPT */ @@ -48,6 +71,12 @@ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ @@ -69,6 +98,13 @@ do { \ preempt_check_resched(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + /* For debugging and tracer internals only! 
*/ #define add_preempt_count_notrace(val) \ do { preempt_count() += (val); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 63f4ad6..199ef40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2692,6 +2692,52 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +static inline int need_resched(void) +{ + return test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_LAZY); +} +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +static inline int need_resched(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} +#endif + static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); @@ -2723,11 +2769,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. 
The return diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 885efbd..574a855 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -6,6 +6,12 @@ config PREEMPT_RT_BASE bool select PREEMPT +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL + choice prompt "Preemption Model" default PREEMPT_NONE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49079e3..82ae3c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -543,6 +543,37 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_task_lazy(struct task_struct *p) +{ + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_task(p); + return; + } + + assert_raw_spin_locked(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + if (test_tsk_need_resched_lazy(p)) + return; + + set_tsk_need_resched_lazy(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -659,6 +690,17 @@ void resched_task(struct task_struct *p) assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_task_lazy(struct task_struct *p) +{ + if (!sched_feat(PREEMPT_LAZY)) { + resched_task(p); + return; + } + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched_lazy(p); +} +#endif #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -1720,6 +1762,9 @@ void sched_fork(struct task_struct *p) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -2928,6 +2973,7 @@ void migrate_disable(void) return; } + preempt_lazy_disable(); pin_current_cpu(); p->migrate_disable = 1; preempt_enable(); @@ -2983,6 +3029,7 @@ void migrate_enable(void) unpin_current_cpu(); preempt_enable(); + preempt_lazy_enable(); } EXPORT_SYMBOL(migrate_enable); #else @@ -3117,6 +3164,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -3253,6 +3301,14 @@ asmlinkage void __sched notrace preempt_schedule(void) if (likely(ti->preempt_count || irqs_disabled())) return; +#ifdef CONFIG_PREEMPT_LAZY + /* + * Check for lazy preemption + */ + if (ti->preempt_lazy_count && !test_thread_flag(TIF_NEED_RESCHED)) + return; +#endif + do { add_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4864,7 +4920,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! 
*/ task_thread_info(idle)->preempt_count = 0; - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81fa536..392fcf3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1827,7 +1827,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -1851,7 +1851,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); } static void @@ -1971,7 +1971,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); return; } /* @@ -2160,7 +2160,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); } static __always_inline @@ -2745,7 +2745,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_task_lazy(p); return; } @@ -3577,7 +3577,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_task_lazy(curr); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -5772,7 +5772,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. 
*/ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_task_lazy(rq->curr); } se->vruntime -= cfs_rq->min_vruntime; @@ -5797,7 +5797,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_task_lazy(rq->curr); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e5de4db..771b529 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -65,6 +65,9 @@ SCHED_FEAT(NONTASK_POWER, true) SCHED_FEAT(TTWU_QUEUE, true) #else SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif #endif SCHED_FEAT(FORCE_SD_OVERLAP, false) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc88644..d055951 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -897,6 +897,15 @@ extern void init_sched_fair_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_task_lazy(struct task_struct *tsk); +#else +static inline void resched_task_lazy(struct task_struct *tsk) +{ + resched_task(tsk); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 298e5ca..dfe9201 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1167,6 +1167,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; + entry->preempt_lazy_count = preempt_lazy_count(); entry->pid = (tsk) ? tsk->pid : 0; entry->padding = 0; entry->flags = @@ -1177,7 +1178,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + (need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0); entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; } @@ -2032,15 +2034,17 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / _--=> migrate-disable\n"); - seq_puts(m, "# ||||| / delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n"); + seq_puts(m, "# / _-------=> irqs-off \n"); + seq_puts(m, "# | / _------=> need-resched \n"); + seq_puts(m, "# || / _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||| / _----=> hardirq/softirq \n"); + seq_puts(m, "# |||| / _---=> preempt-depth \n"); + seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n"); + seq_puts(m, "# |||||| / _-=> migrate-disable \n"); + seq_puts(m, "# ||||||| / delay \n"); + seq_puts(m, "# cmd pid |||||||| time | caller \n"); + seq_puts(m, "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct trace_array *tr, struct seq_file *m) @@ -2064,13 +2068,16 @@ static void print_func_help_header(struct trace_array *tr, struct seq_file *m) static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) { print_event_info(tr, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-------=> irqs-off \n"); + seq_puts(m, "# / _------=> need-resched \n"); + seq_puts(m, "# |/ _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||/ _----=> hardirq/softirq \n"); + seq_puts(m, "# |||/ _---=> preempt-depth \n"); + seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n"); + seq_puts(m, "# ||||| / _-=> migrate-disable \n"); + seq_puts(m, "# |||||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# ||||||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | ||||||| | |\n"); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 23f1d2c..15f4a31 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -116,6 +116,7 @@ struct uprobe_trace_entry_head { * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler + * NEED_RESCHED_LAZY - lazy reschedule is requested */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -123,6 +124,7 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x20, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e36737f..2b0aea4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -564,6 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -578,14 +579,17 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) '.'; need_resched = (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' : softirq ? 
's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) + if (!trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq)) return 0; if (entry->preempt_count) @@ -593,6 +597,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) ret = trace_seq_printf(s, "%x", entry->migrate_disable); else -- cgit v0.10.2 From a2bc9c09458eb21fe8cd28748e6805bff67c73b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 11:03:47 +0100 Subject: x86-preempt-lazy.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 989e191..585e236 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,6 +108,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_PREEMPT_LAZY select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select MODULES_USE_ELF_REL if X86_32 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2d946e6..6b0fc2e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,6 +31,8 @@ struct thread_info { __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => lazy preemptable, + <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -82,6 +84,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ @@ -107,6 +110,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) @@ -157,6 +161,8 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2861082..a36d9cf 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,6 +33,7 @@ void common(void) { OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6ed91d9..218e79a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -364,14 +364,22 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # 
non-zero preempt_count ? jnz restore_all -need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl + jnz 1f + + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? + jnz restore_all + testl $_TIF_NEED_RESCHED_LAZY, %ecx jz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + +1: testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - jmp need_resched + movl TI_flags(%ebp), %ecx # need_resched set ? + testl $_TIF_NEED_RESCHED_MASK, %ecx + jnz 1b + jmp restore_all END(resume_kernel) #endif CFI_ENDPROC @@ -607,7 +615,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jz work_notifysig work_resched: call schedule @@ -620,7 +628,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jnz work_resched work_notifysig: # deal with pending signals and diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3c06da6..0b01d8d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -673,8 +673,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -786,8 +786,8 @@ GLOBAL(int_with_check) /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $_TIF_NEED_RESCHED_MASK,%edx + jz int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1094,8 +1094,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1128,9 +1128,15 @@ retint_signal: ENTRY(retint_kernel) cmpl $0,TI_preempt_count(%rcx) jnz retint_restore_args - bt $TIF_NEED_RESCHED,TI_flags(%rcx) + bt $TIF_NEED_RESCHED,TI_flags(%rcx) + jc 1f + + cmpl $0,TI_preempt_lazy_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx) jnc retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + +1: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr @@ -1522,7 +1528,7 @@ paranoid_userspace: movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx + testl $_TIF_NEED_RESCHED_MASK,%ebx jnz paranoid_schedule movl %ebx,%edx /* arg3: thread flags */ TRACE_IRQS_ON -- cgit v0.10.2 From b49feb1c4ad0696073dc3c46e346b7c2a72499ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2012 12:04:11 +0100 Subject: arm-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4890796..2753534 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -47,6 +47,7 @@ config ARM select HAVE_MEMBLOCK select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_PERF_EVENTS + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index cddda1f..4acacb2 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -50,6 +50,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ @@ -148,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_NEED_RESCHED_LAZY 3 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 #define TIF_SYSCALL_TRACEPOINT 10 @@ -160,6 +162,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index cf10d18..8b1d153 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -50,6 +50,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0f82098..afa746c 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -216,11 +216,18 @@ __irq_svc: #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! 
+ + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -240,6 +247,8 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY moveq pc, r8 @ go again b 1b #endif diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 56f72d2..806416a 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -638,7 +638,8 @@ asmlinkage int do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) { do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) -- cgit v0.10.2 From 438cb0532b4377f1a598f4aaea0d700243a07fe3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2013 18:06:20 +0100 Subject: net: make devnet_rename_seq a mutex On RT write_seqcount_begin() disables preemption and device_rename() allocates memory with GFP_KERNEL and grabs later the sysfs_mutex mutex. Since I don't see a reason why this can't be a mutex, make it one. We probably don't have that much reads at the same time in the hot path. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b705426..041bc5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1579,7 +1579,7 @@ extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern rwlock_t dev_base_lock; /* Device list lock */ -extern seqcount_t devnet_rename_seq; /* Device rename seq */ +extern struct mutex devnet_rename_mutex; #define for_each_netdev(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index b5e6b50..dce8f51 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -203,7 +203,7 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -seqcount_t devnet_rename_seq; +DEFINE_MUTEX(devnet_rename_mutex); static inline void dev_base_seq_inc(struct net *net) { @@ -1093,10 +1093,11 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + + mutex_lock(&devnet_rename_mutex); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return 0; } @@ -1104,7 +1105,7 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return err; } @@ -1112,11 +1113,11 @@ rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return ret; } - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); @@ -1135,7 +1136,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; } else { @@ -4219,7 +4220,6 
@@ static int dev_ifname(struct net *net, struct ifreq __user *arg) { struct net_device *dev; struct ifreq ifr; - unsigned seq; /* * Fetch the caller's info block. @@ -4228,19 +4228,18 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { rcu_read_unlock(); + mutex_unlock(&devnet_rename_mutex); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; + mutex_unlock(&devnet_rename_mutex); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; diff --git a/net/core/sock.c b/net/core/sock.c index 45a50ee..2754c99 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -571,7 +571,6 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, struct net *net = sock_net(sk); struct net_device *dev; char devname[IFNAMSIZ]; - unsigned seq; if (sk->sk_bound_dev_if == 0) { len = 0; @@ -582,20 +581,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); rcu_read_lock(); dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); ret = -ENODEV; if (!dev) { rcu_read_unlock(); + mutex_unlock(&devnet_rename_mutex); goto out; } strcpy(devname, dev->name); rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; + mutex_unlock(&devnet_rename_mutex); len = strlen(devname) + 1; -- cgit v0.10.2 From 6d48e92e5577907312d903c0878fe3c633e89db7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 10:00:28 +0100 Subject: powerpc/fsl-msi: use a different locklcass for the cascade interrupt lockdep thinks that it might deadlock because it grabs a lock of the same class while calling the generic_irq_handler(). This annotation will inform lockdep that it will not. Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 6e53d97..b94b478 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -333,6 +333,8 @@ static int fsl_of_msi_remove(struct platform_device *ofdev) return 0; } +static struct lock_class_key fsl_msi_irq_class; + static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, int offset, int irq_index) { @@ -351,7 +353,7 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, dev_err(&dev->dev, "No memory for MSI cascade data\n"); return -ENOMEM; } - + irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class); msi->msi_virqs[irq_index] = virt_msir; cascade_data->index = offset; cascade_data->msi_data = msi; -- cgit v0.10.2 From c7199908b51d1f14a175a2b0d538024442b04f35 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 11:35:49 +0100 Subject: i2c/omap: drop the lock hard irq context The lock is taken while reading two registers. On RT the first lock is taken in hard irq where it might sleep and in the threaded irq. The threaded irq runs in oneshot mode so the hard irq does not run until the thread the completes so there is no reason to grab the lock. 
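The argument rests on the oneshot threaded-irq contract: with IRQF_ONESHOT the interrupt line stays masked after the hard handler until the thread function returns, so the two handlers can never run concurrently for the same device. A generic sketch of that registration pattern follows; it is illustrative only, the structure and register offset are made up and this is not the omap driver:

#include <linux/interrupt.h>
#include <linux/io.h>

/* Hypothetical device; not the omap i2c driver's real structure. */
struct foo_i2c_dev {
    void __iomem *base;
    /* no lock needed to serialize the two handlers below */
};

#define FOO_STAT_REG 0x08    /* made-up register offset */

static irqreturn_t foo_i2c_hardirq(int irq, void *dev_id)
{
    struct foo_i2c_dev *dev = dev_id;

    /* Read-only peek at the status; the real work is deferred to the thread. */
    if (!readl(dev->base + FOO_STAT_REG))
        return IRQ_NONE;
    return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_i2c_thread(int irq, void *dev_id)
{
    /*
     * The line is still masked while this runs (IRQF_ONESHOT), so
     * foo_i2c_hardirq() cannot be re-entered for this device until
     * we return IRQ_HANDLED.
     */
    return IRQ_HANDLED;
}

static int foo_i2c_setup_irq(struct foo_i2c_dev *dev, int irq)
{
    return request_threaded_irq(irq, foo_i2c_hardirq, foo_i2c_thread,
                                IRQF_ONESHOT, "foo-i2c", dev);
}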
Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 4cc2f05..bb51488 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -881,15 +881,12 @@ omap_i2c_isr(int irq, void *dev_id) u16 mask; u16 stat; - spin_lock(&dev->lock); - mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG); + mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); if (stat & mask) ret = IRQ_WAKE_THREAD; - spin_unlock(&dev->lock); - return ret; } -- cgit v0.10.2 From 65275eb1b73916c2d799d26d5c2acf1ad4001da0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 12:46:49 +0100 Subject: spi/omap-mcspi: check condition also after timeout It is possible that the handler gets interrupted after checking the status. After it resumes it the time out is due but the condition it was waiting for might be true. Therefore it is necessary to check the condition in case of an time out to be sure that the condition is not true _and_ the time passed by. Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index b610f52..56c4166 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -285,8 +285,12 @@ static int mcspi_wait_for_reg_bit(void __iomem *reg, unsigned long bit) timeout = jiffies + msecs_to_jiffies(1000); while (!(__raw_readl(reg) & bit)) { - if (time_after(jiffies, timeout)) - return -1; + if (time_after(jiffies, timeout)) { + if (!(__raw_readl(reg) & bit)) + return -ETIMEDOUT; + else + return 0; + } cpu_relax(); } return 0; -- cgit v0.10.2 From 876c673e89f5da0307803a4da4bdf9fdb6afcbee Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 19:01:05 +0100 Subject: HACK: printk: drop the logbuf_lock more often The lock is hold with irgs off. The latency drops 500us+ on my arm bugs with a "full" buffer after executing "dmesg" on the shell. 
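The patch below is an instance of a generic latency trick: bound the irqs-off hold time of logbuf_lock by dropping it every few records and restarting if the buffer moved underneath us. A rough userspace model of the same pattern, with a pthread mutex standing in for the raw spinlock and all names invented:

#include <pthread.h>
#include <stdio.h>

#define BATCH 5    /* records copied per lock-hold; illustrative */

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long first_seq;    /* oldest record still in the buffer */

static int dump_all(unsigned long start_seq, unsigned long end_seq)
{
    int attempts = 0;
    unsigned long seq, n;

again:
    if (++attempts > 10)
        return -1;    /* give up, like the -EBUSY in the patch */

    pthread_mutex_lock(&buf_lock);
    for (seq = start_seq, n = 0; seq < end_seq; seq++) {
        /* ... copy record 'seq' to the output buffer ... */
        if (++n == BATCH) {
            n = 0;
            /* Drop the lock so the writer (and irqs, on RT) can make progress. */
            pthread_mutex_unlock(&buf_lock);
            pthread_mutex_lock(&buf_lock);
            if (start_seq < first_seq) {
                /* Records were overwritten meanwhile: start over. */
                pthread_mutex_unlock(&buf_lock);
                start_seq = first_seq;
                goto again;
            }
        }
    }
    pthread_mutex_unlock(&buf_lock);
    printf("dumped %lu records in %d attempt(s)\n", end_seq - start_seq, attempts);
    return 0;
}

int main(void)
{
    return dump_all(0, 23) ? 1 : 0;
}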
Signed-off-by: Sebastian Andrzej Siewior diff --git a/kernel/printk.c b/kernel/printk.c index 2609082..6d52c34 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1072,6 +1072,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + int attempts = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) @@ -1083,7 +1084,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; enum log_flags prev; - + int num_msg; +try_again: + attempts++; + if (attempts > 10) { + len = -EBUSY; + goto out; + } + num_msg = 0; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ clear_seq = log_first_seq; @@ -1104,6 +1112,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* move first record forward until length fits into the buffer */ @@ -1117,6 +1133,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* last message fitting into this dump */ @@ -1158,6 +1182,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } +out: raw_spin_unlock_irq(&logbuf_lock); kfree(text); -- cgit v0.10.2 From fca358c56b98565dffeeffe62a6c9448138f2cb5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2013 12:43:30 +0200 Subject: fs/fscache: remove spin_lock() from the condition in while() The spinlock() within the condition in while() will cause a compile error if it is not a function. This is not a problem on mainline but it does not look pretty and there is no reason to do it that way. That patch writes it a little differently and avoids the double condition. Signed-off-by: Sebastian Andrzej Siewior diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ff000e5..c84696c 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -796,11 +796,13 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) _enter(""); - while (spin_lock(&cookie->stores_lock), - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG), - n > 0) { + spin_lock(&cookie->stores_lock); + while (1) { + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG); + if (n == 0) + break; for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); @@ -810,6 +812,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) for (i = n - 1; i >= 0; i--) page_cache_release(results[i]); + spin_lock(&cookie->stores_lock); } spin_unlock(&cookie->stores_lock); -- cgit v0.10.2 From b96563d4038e3007940e68d13d3890655150c824 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:17:31 -0500 Subject: gpu/i915: don't open code these things The opencode part is gone in 1f83fee0 ("drm/i915: clear up wedged transitions") the owner check is still there. 
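For reference, the two open-coded sequences removed below have direct equivalents in the completion API, which is what the patch switches to (sketch only, using the generic <linux/completion.h> interface):

#include <linux/completion.h>

static void demo(struct completion *x)
{
	/* replaces: lock x->wait.lock; x->done++; unlock */
	complete(x);

	/* replaces: lock x->wait.lock; recovery_complete = x->done > 0; unlock */
	if (completion_done(x)) {
		/* the error/reset handling has already signalled completion */
	}
}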
Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index de45b60..9b51712 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -91,7 +91,6 @@ i915_gem_wait_for_error(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct completion *x = &dev_priv->error_completion; - unsigned long flags; int ret; if (!atomic_read(&dev_priv->mm.wedged)) @@ -116,9 +115,7 @@ i915_gem_wait_for_error(struct drm_device *dev) * end up waiting upon a subsequent completion event that * will never happen. */ - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - spin_unlock_irqrestore(&x->wait.lock, flags); + complete(x); } return 0; } @@ -946,12 +943,9 @@ i915_gem_check_wedge(struct drm_i915_private *dev_priv, if (atomic_read(&dev_priv->mm.wedged)) { struct completion *x = &dev_priv->error_completion; bool recovery_complete; - unsigned long flags; /* Give the error handler a chance to run. */ - spin_lock_irqsave(&x->wait.lock, flags); - recovery_complete = x->done > 0; - spin_unlock_irqrestore(&x->wait.lock, flags); + recovery_complete = completion_done(x); /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ @@ -4366,7 +4360,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task) if (!mutex_is_locked(mutex)) return false; -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES) +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE) return mutex->owner == task; #else /* Since UP may be pre-empted, we cannot assume that we own the lock */ -- cgit v0.10.2 From cfc63cb5b37ac83e2e4171446e31057dcfcaefcc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Apr 2013 18:12:52 +0200 Subject: drm/i915: move i915_trace_irq_get() out of the tracing macro Reported-by: Joakim Hernberg [C.Emde@osadl.org: pull out seqno and define it so it compiled] Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index ba8805a..eabd3dd 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -814,6 +814,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, struct intel_ring_buffer *ring; u32 ctx_id = i915_execbuffer2_get_context_id(*args); u32 exec_start, exec_len; + u32 seqno; u32 mask; u32 flags; int ret, mode, i; @@ -1068,7 +1069,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, goto err; } - trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags); + seqno = intel_ring_get_seqno(ring); + trace_i915_gem_ring_dispatch(ring, seqno, flags); + i915_trace_irq_get(ring, seqno); i915_gem_execbuffer_move_to_active(&objects, ring); i915_gem_execbuffer_retire_commands(dev, file, ring); diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 3db4a68..29217db 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -244,7 +244,6 @@ TRACE_EVENT(i915_gem_ring_dispatch, __entry->ring = ring->id; __entry->seqno = seqno; __entry->flags = flags; - i915_trace_irq_get(ring, seqno); ), TP_printk("dev=%u, ring=%u, seqno=%u, flags=%x", diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 6af87cd..8b5e4ae 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ 
-232,8 +232,10 @@ static inline u32 intel_ring_get_seqno(struct intel_ring_buffer *ring) static inline void i915_trace_irq_get(struct intel_ring_buffer *ring, u32 seqno) { +#ifdef CONFIG_TRACEPOINTS if (ring->trace_irq_seqno == 0 && ring->irq_get(ring)) ring->trace_irq_seqno = seqno; +#endif } /* DRI warts */ -- cgit v0.10.2 From 9eb91590bf46ad0969eddeba6b35561878e4c2d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 10:14:11 +0100 Subject: powerpc-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bd0885eb..281256e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -142,6 +142,7 @@ config PPC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_PREEMPT_LAZY select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406b7b9..14c08f1 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,6 +43,8 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ @@ -97,7 +99,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_PERFMON_CTXSW 6 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ -#define TIF_MEMDIE 9 /* is terminating due to OOM killer */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @@ -106,6 +108,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation for stack store? */ +#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< Date: Mon, 12 Dec 2011 12:29:04 +0100 Subject: wait-simple: Simple waitqueue implementation wait_queue is a swiss army knife and in most of the cases the complexity is not needed. For RT waitqueues are a constant source of trouble as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. Provide a slim version, which allows RT to replace wait queues. This should go mainline as well, as it lowers memory consumption and runtime overhead. 
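A short usage sketch of the API this patch introduces (the names below are the ones defined in the new wait-simple.h; this is not a mainline interface at this point): one side sets a condition and wakes, the other sleeps until the condition holds.

#include <linux/wait-simple.h>

static DEFINE_SWAIT_HEAD(demo_wq);
static unsigned long demo_work_pending;

static void demo_producer(void)
{
	demo_work_pending = 1;
	swait_wake(&demo_wq);	/* wake waiter(s) blocked in swait_event*() */
}

static int demo_consumer(void)
{
	/* returns 0 once the condition is true, -ERESTARTSYS on a signal */
	return swait_event_interruptible(demo_wq, demo_work_pending != 0);
}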
Signed-off-by: Thomas Gleixner diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h new file mode 100644 index 0000000..19ac983 --- /dev/null +++ b/include/linux/wait-simple.h @@ -0,0 +1,231 @@ +#ifndef _LINUX_WAIT_SIMPLE_H +#define _LINUX_WAIT_SIMPLE_H + +#include +#include + +#include + +struct swaiter { + struct task_struct *task; + struct list_head node; +}; + +#define DEFINE_SWAITER(name) \ + struct swaiter name = { \ + .task = current, \ + .node = LIST_HEAD_INIT((name).node), \ + } + +struct swait_head { + raw_spinlock_t lock; + struct list_head list; +}; + +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .list = LIST_HEAD_INIT((name).list), \ + } + +extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); + +#define init_swait_head(swh) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_swait_head((swh), &__key); \ + } while (0) + +/* + * Waiter functions + */ +static inline bool swaiter_enqueued(struct swaiter *w) +{ + return w->task != NULL; +} + +extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish(struct swait_head *head, struct swaiter *w); + +/* + * Adds w to head->list. Must be called with head->lock locked. + */ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); +} + +/* + * Removes w from head->list. Must be called with head->lock locked. + */ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* + * Check whether a head has waiters enqueued + */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + return !list_empty(&h->list); +} + +/* + * Wakeup functions + */ +extern int __swait_wake(struct swait_head *head, unsigned int state); + +static inline int swait_wake(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_NORMAL) : 0; +} + +static inline int swait_wake_interruptible(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_INTERRUPTIBLE) : 0; +} + +/* + * Event API + */ + +#define __swait_event(wq, condition) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. 
+ */ +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_interruptible(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +#define __swait_event_interruptible_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_interruptible - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define swait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __swait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + int __ret = timeout; \ + if (!(condition)) \ + __swait_event_interruptible_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#define __swait_event_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_timeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if the @timeout elapsed, and the remaining + * jiffies if the condition evaluated to true before the timeout elapsed. 
+ */ +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!(condition)) \ + __swait_event_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 0e12fc2..2c7ab7f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o smpboot.o + async.o range.o groups.o lglock.o smpboot.o wait-simple.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/wait-simple.c b/kernel/wait-simple.c new file mode 100644 index 0000000..040d714 --- /dev/null +++ b/kernel/wait-simple.c @@ -0,0 +1,68 @@ +/* + * Simple waitqueues without fancy flags and callbacks + * + * (C) 2011 Thomas Gleixner + * + * Based on kernel/wait.c + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include +#include + +void __init_swait_head(struct swait_head *head, struct lock_class_key *key) +{ + raw_spin_lock_init(&head->lock); + lockdep_set_class(&head->lock, key); + INIT_LIST_HEAD(&head->list); +} +EXPORT_SYMBOL(__init_swait_head); + +void swait_prepare(struct swait_head *head, struct swaiter *w, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&head->lock, flags); + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); + set_current_state(state); + raw_spin_unlock_irqrestore(&head->lock, flags); +} +EXPORT_SYMBOL(swait_prepare); + +void swait_finish(struct swait_head *head, struct swaiter *w) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (w->task) { + raw_spin_lock_irqsave(&head->lock, flags); + __swait_dequeue(w); + raw_spin_unlock_irqrestore(&head->lock, flags); + } +} +EXPORT_SYMBOL(swait_finish); + +int __swait_wake(struct swait_head *head, unsigned int state) +{ + struct swaiter *curr, *next; + unsigned long flags; + int woken = 0; + + raw_spin_lock_irqsave(&head->lock, flags); + + list_for_each_entry_safe(curr, next, &head->list, node) { + if (wake_up_state(curr->task, state)) { + __swait_dequeue(curr); + curr->task = NULL; + woken++; + } + } + + raw_spin_unlock_irqrestore(&head->lock, flags); + return woken; +} +EXPORT_SYMBOL(__swait_wake); -- cgit v0.10.2 From 5477c6bae53d90541b8638b74dbffc137633319f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Dec 2012 16:25:21 +0100 Subject: rcutiny: Use simple waitqueue Simple waitqueues can be handled from interrupt disabled contexts. Signed-off-by: Thomas Gleixner diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index ec3446e..14cae12 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -26,6 +26,7 @@ #include #include #include +#include /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -260,7 +261,7 @@ static void show_tiny_preempt_stats(struct seq_file *m) /* Controls for rcu_kthread() kthread. 
*/ static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static DEFINE_SWAIT_HEAD(rcu_kthread_wq); static unsigned long have_rcu_kthread_work; /* @@ -713,7 +714,7 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static DEFINE_SWAIT_HEAD(sync_rcu_preempt_exp_wq); static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); @@ -735,7 +736,7 @@ static int rcu_preempted_readers_exp(void) */ static void rcu_report_exp_done(void) { - wake_up(&sync_rcu_preempt_exp_wq); + swait_wake(&sync_rcu_preempt_exp_wq); } /* @@ -787,8 +788,8 @@ void synchronize_rcu_expedited(void) } else { rcu_initiate_boost(); local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); + swait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); } /* Clean up and exit. */ @@ -858,7 +859,7 @@ static void invoke_rcu_callbacks(void) { have_rcu_kthread_work = 1; if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); + swait_wake(&rcu_kthread_wq); } #ifdef CONFIG_RCU_TRACE @@ -888,8 +889,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); + swait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; -- cgit v0.10.2 From 3a46393dd34285079b6468bb1a278d082b6a3964 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Apr 2013 16:09:57 +0200 Subject: kernel/treercu: use a simple waitqueue Signed-off-by: Sebastian Andrzej Siewior diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 85fd9a7..7ec834d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1319,7 +1319,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, rsp->gp_flags & RCU_GP_FLAG_INIT); if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && @@ -1338,7 +1338,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } for (;;) { rsp->jiffies_force_qs = jiffies + j; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), @@ -1423,7 +1423,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + swait_wake(&rsp->gp_wq); } /* @@ -1438,7 +1438,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + swait_wake(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -2003,7 +2003,8 @@ static void force_quiescent_state(struct rcu_state *rsp) } rsp->gp_flags |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + /* Memory barrier implied by wake_up() path. 
*/ + swait_wake(&rsp->gp_wq); } /* @@ -2999,7 +3000,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - init_waitqueue_head(&rsp->gp_wq); + init_swait_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291..5cfdff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -397,7 +397,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_head gp_wq; /* Where GP task waits. */ int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ -- cgit v0.10.2 From 54230934366da148914a9166e94722f1144b0450 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 14:58:00 +0100 Subject: sched: Adjust sched_reset_on_fork when nothing else changes If the policy and priority remain unchanged a possible modification of sched_reset_on_fork gets lost in the early exit path. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 82ae3c4..0aa6395 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4119,10 +4119,13 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed + * further, but store a possible modification of + * reset_on_fork. */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } -- cgit v0.10.2 From 3d9ce46e3ead71b653241c2c08fb501869cce80f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Dec 2012 08:56:41 +0100 Subject: sched: Queue RT tasks to head when prio drops The following scenario does not work correctly: Runqueue of CPUx contains two runnable and pinned tasks: T1: SCHED_FIFO, prio 80 T2: SCHED_FIFO, prio 80 T1 is on the cpu and executes the following syscalls (classic priority ceiling scenario): sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 90); ... sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 80); ... Now T1 gets preempted by T3 (SCHED_FIFO, prio 95). After T3 goes back to sleep the scheduler picks T2. Surprise! The same happens w/o actual preemption when T1 is forced into the scheduler due to a sporadic NEED_RESCHED event. The scheduler invokes pick_next_task() which returns T2. So T1 gets preempted and scheduled out. This happens because sched_setscheduler() dequeues T1 from the prio 90 list and then enqueues it on the tail of the prio 80 list behind T2. This violates the POSIX spec and surprises user space which relies on the guarantee that SCHED_FIFO tasks are not scheduled out unless they give the CPU up voluntarily or are preempted by a higher priority task. In the latter case the preempted task must get back on the CPU after the preempting task schedules out again. We fixed a similar issue already in commit 60db48c (sched: Queue a deboosted task to the head of the RT prio queue). The same treatment is necessary for sched_setscheduler(). So enqueue to head of the prio bucket list if the priority of the task is lowered. 
It might be possible that existing user space relies on the current behaviour, but it can be considered highly unlikely due to the corner case nature of the application scenario. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0aa6395..a17ce7f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4166,8 +4166,13 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From 2aa73067edf1f0379a268ae2aeb0a1e9224d917a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 15:13:49 +0100 Subject: sched: Consider pi boosting in setscheduler If a PI boosted task policy/priority is modified by a setscheduler() call we unconditionally dequeue and requeue the task if it is on the runqueue even if the new priority is lower than the current effective boosted priority. This can result in undesired reordering of the priority bucket list. If the new priority is less or equal than the current effective we just store the new parameters in the task struct and leave the scheduler class and the runqueue untouched. This is handled when the task deboosts itself. Only if the new priority is higher than the effective boosted priority we apply the change immediately. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/include/linux/sched.h b/include/linux/sched.h index 199ef40..8c4837b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2176,6 +2176,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern int rt_mutex_check_prio(struct task_struct *task, int newprio); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { @@ -2186,6 +2187,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } +static inline int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + return 0; +} # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3bff726..20742e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -124,6 +124,18 @@ int rt_mutex_getprio(struct task_struct *task) } /* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->pi_list_entry.prio <= newprio; +} + +/* * Adjust the priority of a task, after its pi_waiters got modified. * * This can be both boosting and unboosting. task->pi_lock must be held. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a17ce7f..747b71e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3766,7 +3766,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). 
* - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3989,20 +3990,25 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +static void __setscheduler_params(struct task_struct *p, int policy, int prio) { p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); + set_load_weight(p); +} + +/* Actually do priority change: must hold rq lock. */ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + __setscheduler_params(p, policy, prio); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - set_load_weight(p); } /* @@ -4024,6 +4030,7 @@ static bool check_same_owner(struct task_struct *p) static int __sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param, bool user) { + int newprio = MAX_RT_PRIO - 1 - param->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; const struct sched_class *prev_class; @@ -4151,6 +4158,25 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, policy, param->sched_priority); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -4158,9 +4184,6 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); @@ -4173,7 +4196,6 @@ recheck: */ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); } - check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From 3934e86c7008fdea21e424a3c295d3139fa31f8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 18:28:26 +0100 Subject: block: Use cpu_chill() for retry loops Retry loops on RT might loop forever when the modifying side was preempted. Steven also observed a live lock when there was a concurrent priority boosting going on. Use cpu_chill() instead of cpu_relax() to let the system make progress. 
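The loops being converted have the trylock-retry shape sketched below (illustrative only, not the block layer code): cpu_chill() is the RT tree's replacement for cpu_relax() in such loops and sleeps briefly, so a preempted lock holder can actually run and the loop can terminate.

#include <linux/spinlock.h>
#include <linux/delay.h>	/* cpu_chill() is provided here in the RT tree */

static void demo_retry(spinlock_t *outer, spinlock_t *inner)
{
	unsigned long flags;

retry:
	spin_lock_irqsave(outer, flags);
	if (!spin_trylock(inner)) {
		spin_unlock_irqrestore(outer, flags);
		cpu_chill();	/* was cpu_relax(): spin-waiting can live-lock on RT */
		goto retry;
	}
	/* ... work that needs both locks ... */
	spin_unlock(inner);
	spin_unlock_irqrestore(outer, flags);
}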
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fab4cdd..fb21ad5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include "blk.h" @@ -110,7 +111,7 @@ static void ioc_release_fn(struct work_struct *work) spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); spin_lock_irqsave_nested(&ioc->lock, flags, 1); } } @@ -188,7 +189,7 @@ retry: spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); goto retry; } } -- cgit v0.10.2 From 1cbf5b3c9f6f1c6df1e1ea887dd67743a2a8b8e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 10:33:09 +0100 Subject: mm: bounce: Use local_irq_save_nort kmap_atomic() is preemptible on RT. Signed-off-by: Thomas Gleixner diff --git a/mm/bounce.c b/mm/bounce.c index 0420867..1e78ef7 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -51,11 +51,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ -- cgit v0.10.2 From f42f0c56a0f15bf0757122e057397813854535f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:11:12 +0100 Subject: mmci: Remove bogus local_irq_save() On !RT interrupt runs with interrupts disabled. On RT it's in a thread, so no need to disable interrupts at all. Signed-off-by: Thomas Gleixner diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 1507723..724f478 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -930,15 +930,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) struct sg_mapping_iter *sg_miter = &host->sg_miter; struct variant_data *variant = host->variant; void __iomem *base = host->base; - unsigned long flags; u32 status; status = readl(base + MMCISTATUS); dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status); - local_irq_save(flags); - do { unsigned int remain, len; char *buffer; @@ -978,8 +975,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) sg_miter_stop(sg_miter); - local_irq_restore(flags); - /* * If we have less than the fifo 'half-full' threshold to transfer, * trigger a PIO interrupt as soon as any data is available. -- cgit v0.10.2 From cb36dde6a350f0d04789a182b4e6b04c2d972e05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:08:15 +0100 Subject: slub: Enable irqs for __GFP_WAIT SYSTEM_RUNNING might be too late for enabling interrupts. Allocations with GFP_WAIT can happen before that. So use this as an indicator. 
Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index f8cfc46..85299b8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1280,14 +1280,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + bool enableirqs; flags &= gfp_allowed_mask; + enableirqs = (flags & __GFP_WAIT) != 0; #ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) + enableirqs |= system_state == SYSTEM_RUNNING; #endif + if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1327,11 +1328,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } -#ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) -#endif + if (enableirqs) local_irq_disable(); if (!page) return NULL; -- cgit v0.10.2 From 529f0ed81dba5982443713e0585fb36bda6f4ae5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 30 Apr 2013 03:17:33 -0500 Subject: slub: delay ctor until the object is requested It seems that allocation of plenty objects causes latency on ARM since that code can not be preempted Signed-off-by: Sebastian Andrzej Siewior diff --git a/mm/slub.c b/mm/slub.c index 85299b8..f6871c5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1346,8 +1346,10 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); +#ifndef CONFIG_PREEMPT_RT_FULL if (unlikely(s->ctor)) s->ctor(object); +#endif } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2437,6 +2439,10 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); +#ifdef CONFIG_PREEMPT_RT_FULL + if (unlikely(s->ctor) && object) + s->ctor(object); +#endif slab_post_alloc_hook(s, gfpflags, object); -- cgit v0.10.2 From 45b77e96eb6516ad0437a8c2e0151ff3e01d834e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 23:03:29 +0100 Subject: sched: Init idle->on_rq in init_idle() Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 747b71e..60f66f3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4943,6 +4943,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif -- cgit v0.10.2 From 3caa9e7838966c92e7621cad5afb0f79477254b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 23:34:08 +0100 Subject: sched: Check for idle task in might_sleep() Idle is not allowed to call sleeping functions ever! Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60f66f3..4cfced6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7393,7 +7393,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- cgit v0.10.2 From 8d8302dd34c86b58aaf0395c030765aafe92efc6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Jan 2013 11:47:35 +0100 Subject: wait-simple: Rework for use with completions Signed-off-by: Thomas Gleixner diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h index 19ac983..4efba4d 100644 --- a/include/linux/wait-simple.h +++ b/include/linux/wait-simple.h @@ -22,12 +22,14 @@ struct swait_head { struct list_head list; }; -#define DEFINE_SWAIT_HEAD(name) \ - struct swait_head name = { \ +#define SWAIT_HEAD_INITIALIZER(name) { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .list = LIST_HEAD_INIT((name).list), \ } +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = SWAIT_HEAD_INITIALIZER(name) + extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); #define init_swait_head(swh) \ @@ -40,59 +42,25 @@ extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); /* * Waiter functions */ -static inline bool swaiter_enqueued(struct swaiter *w) -{ - return w->task != NULL; -} - +extern void swait_prepare_locked(struct swait_head *head, struct swaiter *w); extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish_locked(struct swait_head *head, struct swaiter *w); extern void swait_finish(struct swait_head *head, struct swaiter *w); /* - * Adds w to head->list. Must be called with head->lock locked. - */ -static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) -{ - list_add(&w->node, &head->list); -} - -/* - * Removes w from head->list. Must be called with head->lock locked. - */ -static inline void __swait_dequeue(struct swaiter *w) -{ - list_del_init(&w->node); -} - -/* - * Check whether a head has waiters enqueued - */ -static inline bool swait_head_has_waiters(struct swait_head *h) -{ - return !list_empty(&h->list); -} - -/* * Wakeup functions */ -extern int __swait_wake(struct swait_head *head, unsigned int state); - -static inline int swait_wake(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_NORMAL) : 0; -} +extern unsigned int __swait_wake(struct swait_head *head, unsigned int state, unsigned int num); +extern unsigned int __swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num); -static inline int swait_wake_interruptible(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_INTERRUPTIBLE) : 0; -} +#define swait_wake(head) __swait_wake(head, TASK_NORMAL, 1) +#define swait_wake_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 1) +#define swait_wake_all(head) __swait_wake(head, TASK_NORMAL, 0) +#define swait_wake_all_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 0) /* * Event API */ - #define __swait_event(wq, condition) \ do { \ DEFINE_SWAITER(__wait); \ diff --git a/kernel/wait-simple.c b/kernel/wait-simple.c index 040d714..4b9a0b5 100644 --- a/kernel/wait-simple.c +++ b/kernel/wait-simple.c @@ -12,6 +12,24 @@ #include #include +/* Adds w to head->list. Must be called with head->lock locked. 
*/ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); +} + +/* Removes w from head->list. Must be called with head->lock locked. */ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* Check whether a head has waiters enqueued */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + return !list_empty(&h->list); +} + void __init_swait_head(struct swait_head *head, struct lock_class_key *key) { raw_spin_lock_init(&head->lock); @@ -20,19 +38,31 @@ void __init_swait_head(struct swait_head *head, struct lock_class_key *key) } EXPORT_SYMBOL(__init_swait_head); +void swait_prepare_locked(struct swait_head *head, struct swaiter *w) +{ + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); +} + void swait_prepare(struct swait_head *head, struct swaiter *w, int state) { unsigned long flags; raw_spin_lock_irqsave(&head->lock, flags); - w->task = current; - if (list_empty(&w->node)) - __swait_enqueue(head, w); - set_current_state(state); + swait_prepare_locked(head, w); + __set_current_state(state); raw_spin_unlock_irqrestore(&head->lock, flags); } EXPORT_SYMBOL(swait_prepare); +void swait_finish_locked(struct swait_head *head, struct swaiter *w) +{ + __set_current_state(TASK_RUNNING); + if (w->task) + __swait_dequeue(w); +} + void swait_finish(struct swait_head *head, struct swaiter *w) { unsigned long flags; @@ -46,22 +76,43 @@ void swait_finish(struct swait_head *head, struct swaiter *w) } EXPORT_SYMBOL(swait_finish); -int __swait_wake(struct swait_head *head, unsigned int state) +unsigned int +__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num) { struct swaiter *curr, *next; - unsigned long flags; int woken = 0; - raw_spin_lock_irqsave(&head->lock, flags); - list_for_each_entry_safe(curr, next, &head->list, node) { if (wake_up_state(curr->task, state)) { __swait_dequeue(curr); + /* + * The waiting task can free the waiter as + * soon as curr->task = NULL is written, + * without taking any locks. A memory barrier + * is required here to prevent the following + * store to curr->task from getting ahead of + * the dequeue operation. + */ + smp_wmb(); curr->task = NULL; - woken++; + if (++woken == num) + break; } } + return woken; +} + +unsigned int +__swait_wake(struct swait_head *head, unsigned int state, unsigned int num) +{ + unsigned long flags; + int woken; + if (!swait_head_has_waiters(head)) + return 0; + + raw_spin_lock_irqsave(&head->lock, flags); + woken = __swait_wake_locked(head, state, num); raw_spin_unlock_irqrestore(&head->lock, flags); return woken; } -- cgit v0.10.2 From df14ee11aa405d60a33343e99796daf02316e531 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jan 2013 11:23:51 +0100 Subject: completion: Use simple wait queues Completions have no long lasting callbacks and therefor do not need the complex waitqueue variant. Use simple waitqueues which reduces the contention on the waitqueue lock. Signed-off-by: Thomas Gleixner diff --git a/include/linux/completion.h b/include/linux/completion.h index 51494e6..ebb6565 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -8,7 +8,7 @@ * See kernel/sched.c for details. 
*/ -#include +#include /* * struct completion - structure used to maintain state for a "completion" @@ -24,11 +24,11 @@ */ struct completion { unsigned int done; - wait_queue_head_t wait; + struct swait_head wait; }; #define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } + { 0, SWAIT_HEAD_INITIALIZER((work).wait) } #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) @@ -73,7 +73,7 @@ struct completion { static inline void init_completion(struct completion *x) { x->done = 0; - init_waitqueue_head(&x->wait); + init_swait_head(&x->wait); } extern void wait_for_completion(struct completion *); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f628a6..ef03e33 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -26,6 +26,7 @@ #include #include +#include struct vm_area_struct; struct mm_struct; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4cfced6..1ec9d1f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3489,10 +3489,10 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 1); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -3509,10 +3509,10 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 0); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -3520,20 +3520,20 @@ static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_SWAITER(wait); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + swait_prepare_locked(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + swait_finish_locked(&x->wait, &wait); if (!x->done) return timeout; } @@ -3546,9 +3546,9 @@ wait_for_common(struct completion *x, long timeout, int state) { might_sleep(); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); return timeout; } @@ -3679,12 +3679,12 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -3702,10 +3702,10 @@ bool completion_done(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - 
spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v0.10.2 From f81f9aa423539132ae84bd25fae9b4b87b0fd237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 12:11:43 +0200 Subject: kconfig-disable-a-few-options-rt.patch Disable stuff which is known to have issues on RT Signed-off-by: Thomas Gleixner diff --git a/arch/Kconfig b/arch/Kconfig index 7f8f281..4a93e44 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT_FULL select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 6a70184..0052e52 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -164,6 +164,7 @@ config VXLAN config NETCONSOLE tristate "Network console logging support" + depends on !PREEMPT_RT_FULL ---help--- If you want to log kernel messages over the network, enable this. See for details. diff --git a/mm/Kconfig b/mm/Kconfig index 278e3ab..c6ceefc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -353,7 +353,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v0.10.2 From 671649c5d30d05cea0f8672014140314418a9c4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 14:58:57 +0200 Subject: kconfig-preempt-rt-full.patch Signed-off-by: Thomas Gleixner diff --git a/init/Makefile b/init/Makefile index 7bc47ee..88cf473 100644 --- a/init/Makefile +++ b/init/Makefile @@ -33,4 +33,4 @@ silent_chk_compile.h = : include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)" diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 574a855..38cecfe 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -73,6 +73,14 @@ config PREEMPT_RTB enables changes which are preliminary for the full preemptiple RT kernel. +config PREEMPT_RT_FULL + bool "Fully Preemptible Kernel (RT)" + depends on IRQ_FORCED_THREADING + select PREEMPT_RT_BASE + select PREEMPT_RCU + help + All and everything + endchoice config PREEMPT_COUNT diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index f221ddf..5f44009 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -4,7 +4,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +RT=$5 +CC=$6 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length -- cgit v0.10.2