From f7b344595ee56ed4d9ae21873c96654ec60958f0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Jan 2013 21:45:47 +0000 Subject: FIX [1/2] slub: Do not dereference NULL pointer in node_match The variables accessed in slab_alloc are volatile and therefore the page pointer passed to node_match can be NULL. The processing of data in slab_alloc is tentative until either the cmpxhchg succeeds or the __slab_alloc slowpath is invoked. Both are able to perform the same allocation from the freelist. Check for the NULL pointer in node_match. A false positive will lead to a retry of the loop in __slab_alloc. Signed-off-by: Christoph Lameter Cc: Steven Rostedt Cc: Pekka Enberg Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index ba2ca53..0f270db 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2041,7 +2041,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && page_to_nid(page) != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; -- cgit v0.10.2 From 3bc3e6047e6c95b221a7b8643a26525c3c9b5e55 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Jan 2013 21:45:48 +0000 Subject: FIX [2/2] slub: Tid must be retrieved from the percpu area of the current processor As Steven Rostedt has pointer out: Rescheduling could occur on a differnet processor after the determination of the per cpu pointer and before the tid is retrieved. This could result in allocation from the wrong node in slab_alloc. The effect is much more severe in slab_free() where we could free to the freelist of the wrong page. The window for something like that occurring is pretty small but it is possible. Signed-off-by: Christoph Lameter Cc: Steven Rostedt Cc: Pekka Enberg Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index 0f270db..3d2308f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2331,13 +2331,13 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = memcg_kmem_get_cache(s, gfpflags); redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); /* @@ -2347,7 +2347,7 @@ redo: * linked list in between. */ tid = c->tid; - barrier(); + preempt_enable(); object = c->freelist; page = c->page; @@ -2594,10 +2594,11 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); tid = c->tid; - barrier(); + preempt_enable(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); -- cgit v0.10.2 From 9c160ce12b2214851730e37fbbf03c826ae1d93a Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). 
This happened probably when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index 267ce78..e698e80 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v0.10.2 From a583b4b19f4d475cc563516d035df8a836c33a4a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 16:59:47 +0200 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled we can end up with irqs on non boot cpus. Allow to restrict the default irq affinity mask. Signed-off-by: Thomas Gleixner diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 986614d..1ba0afe 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1182,6 +1182,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See comment before ip2_setup() in drivers/char/ip2/ip2base.c. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302..473b2b6 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -23,10 +23,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v0.10.2 From deadc0785e3cb9e41eed456b41560cf991506371 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Jan 2013 13:21:47 -0500 Subject: OF: Fixup resursive locking code paths There is no real reason to use a rwlock for devtree_lock. It even could be a mutex, but unfortunately it's locked from cpu hotplug paths which can't schedule :( So it needs to become a raw lock on rt as well. The devtree_lock would be the only user of a raw_rw_lock, so we are better off cleaning up the recursive locking paths which allows us to convert devtree_lock to a read_lock. Here we do the standard thing of introducing __foo() as the "raw" version of foo(), so that we can take better control of the locking. 
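For illustration, a minimal sketch of that __foo()/foo() split, using placeholder names and types rather than the real drivers/of symbols:

#include <linux/of.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct example_prop {
	const char *name;
	struct example_prop *next;
};

struct example_node {
	struct example_prop *props;
};

/* "Raw" helper: walks the data only; the caller must already hold devtree_lock. */
static struct example_prop *__example_find_prop(struct example_node *np,
						const char *name)
{
	struct example_prop *pp;

	for (pp = np->props; pp; pp = pp->next)
		if (!strcmp(pp->name, name))
			return pp;
	return NULL;
}

/* Public wrapper: takes the lock once, then delegates to the raw helper. */
struct example_prop *example_find_prop(struct example_node *np, const char *name)
{
	struct example_prop *pp;

	read_lock(&devtree_lock);	/* devtree_lock is still an rwlock at this stage */
	pp = __example_find_prop(np, name);
	read_unlock(&devtree_lock);
	return pp;
}
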
The "raw" versions are not exported and are for internal use within the file itself. Signed-off-by: Thomas Gleixner Signed-off-by: Paul Gortmaker Cc: devicetree-discuss@lists.ozlabs.org Cc: Grant Likely Cc: Rob Herring Link: http://lkml.kernel.org/r/1359138107-14159-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner diff --git a/drivers/of/base.c b/drivers/of/base.c index 2390ddb..21195a1 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -164,16 +164,14 @@ void of_node_put(struct device_node *node) EXPORT_SYMBOL(of_node_put); #endif /* CONFIG_OF_DYNAMIC */ -struct property *of_find_property(const struct device_node *np, - const char *name, - int *lenp) +static struct property *__of_find_property(const struct device_node *np, + const char *name, int *lenp) { struct property *pp; if (!np) return NULL; - read_lock(&devtree_lock); for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp) @@ -181,6 +179,18 @@ struct property *of_find_property(const struct device_node *np, break; } } + + return pp; +} + +struct property *of_find_property(const struct device_node *np, + const char *name, + int *lenp) +{ + struct property *pp; + + read_lock(&devtree_lock); + pp = __of_find_property(np, name, lenp); read_unlock(&devtree_lock); return pp; @@ -214,8 +224,20 @@ EXPORT_SYMBOL(of_find_all_nodes); * Find a property with a given name for a given node * and return the value. */ +static const void *__of_get_property(const struct device_node *np, + const char *name, int *lenp) +{ + struct property *pp = __of_find_property(np, name, lenp); + + return pp ? pp->value : NULL; +} + +/* + * Find a property with a given name for a given node + * and return the value. + */ const void *of_get_property(const struct device_node *np, const char *name, - int *lenp) + int *lenp) { struct property *pp = of_find_property(np, name, lenp); @@ -226,13 +248,13 @@ EXPORT_SYMBOL(of_get_property); /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ -int of_device_is_compatible(const struct device_node *device, - const char *compat) +static int __of_device_is_compatible(const struct device_node *device, + const char *compat) { const char* cp; int cplen, l; - cp = of_get_property(device, "compatible", &cplen); + cp = __of_get_property(device, "compatible", &cplen); if (cp == NULL) return 0; while (cplen > 0) { @@ -245,6 +267,20 @@ int of_device_is_compatible(const struct device_node *device, return 0; } + +/** Checks if the given "compat" string matches one of the strings in + * the device's "compatible" property + */ +int of_device_is_compatible(const struct device_node *device, + const char *compat) +{ + int res; + + read_lock(&devtree_lock); + res = __of_device_is_compatible(device, compat); + read_unlock(&devtree_lock); + return res; +} EXPORT_SYMBOL(of_device_is_compatible); /** @@ -518,7 +554,8 @@ struct device_node *of_find_compatible_node(struct device_node *from, if (type && !(np->type && (of_node_cmp(np->type, type) == 0))) continue; - if (of_device_is_compatible(np, compatible) && of_node_get(np)) + if (__of_device_is_compatible(np, compatible) && + of_node_get(np)) break; } of_node_put(from); @@ -562,15 +599,9 @@ out: } EXPORT_SYMBOL(of_find_node_with_property); -/** - * of_match_node - Tell if an device_node has a matching of_match structure - * @matches: array of of device match structures to search in - * @node: the of device structure to match against - * - * Low level utility function 
used by device matching. - */ -const struct of_device_id *of_match_node(const struct of_device_id *matches, - const struct device_node *node) +static +const struct of_device_id *__of_match_node(const struct of_device_id *matches, + const struct device_node *node) { if (!matches) return NULL; @@ -584,14 +615,32 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, match &= node->type && !strcmp(matches->type, node->type); if (matches->compatible[0]) - match &= of_device_is_compatible(node, - matches->compatible); + match &= __of_device_is_compatible(node, + matches->compatible); if (match) return matches; matches++; } return NULL; } + +/** + * of_match_node - Tell if an device_node has a matching of_match structure + * @matches: array of of device match structures to search in + * @node: the of device structure to match against + * + * Low level utility function used by device matching. + */ +const struct of_device_id *of_match_node(const struct of_device_id *matches, + const struct device_node *node) +{ + const struct of_device_id *match; + + read_lock(&devtree_lock); + match = __of_match_node(matches, node); + read_unlock(&devtree_lock); + return match; +} EXPORT_SYMBOL(of_match_node); /** @@ -619,7 +668,7 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, read_lock(&devtree_lock); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) { - if (of_match_node(matches, np) && of_node_get(np)) { + if (__of_match_node(matches, np) && of_node_get(np)) { if (match) *match = matches; break; -- cgit v0.10.2 From b89d5494de613c754c98b1fa85484d8217babac3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Feb 2013 15:30:56 -0500 Subject: OF: Convert devtree lock from rw_lock to raw spinlock With the locking cleanup in place (from "OF: Fixup resursive locking code paths"), we can now do the conversion from the rw_lock to a raw spinlock as required for preempt-rt. The previous cleanup and this conversion were originally separate since they predated when mainline got raw spinlock (in commit c2f21ce2e31286a "locking: Implement new raw_spinlock"). So, at that point in time, the cleanup was considered plausible for mainline, but not this conversion. In any case, we've kept them separate as it makes for easier review and better bisection. Signed-off-by: Thomas Gleixner Cc: Grant Likely Cc: Sam Ravnborg Cc: Cc: Rob Herring Link: http://lkml.kernel.org/r/1360182656-15898-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner [PG: taken from preempt-rt, update subject & add a commit log] Signed-off-by: Paul Gortmaker diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index 1303021..9f20566 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -64,7 +64,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len err = -ENODEV; mutex_lock(&of_set_property_mutex); - write_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); prevp = &dp->properties; while (*prevp) { struct property *prop = *prevp; @@ -91,7 +91,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len } prevp = &(*prevp)->next; } - write_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); mutex_unlock(&of_set_property_mutex); /* XXX Upate procfs if necessary... 
*/ diff --git a/drivers/of/base.c b/drivers/of/base.c index 21195a1..662e009 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -55,7 +55,7 @@ static DEFINE_MUTEX(of_aliases_mutex); /* use when traversing tree through the allnext, child, sibling, * or parent members of struct device_node. */ -DEFINE_RWLOCK(devtree_lock); +DEFINE_RAW_SPINLOCK(devtree_lock); int of_n_addr_cells(struct device_node *np) { @@ -188,10 +188,11 @@ struct property *of_find_property(const struct device_node *np, int *lenp) { struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); pp = __of_find_property(np, name, lenp); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } @@ -209,13 +210,13 @@ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); np = prev ? prev->allnext : of_allnodes; for (; np != NULL; np = np->allnext) if (of_node_get(np)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_all_nodes); @@ -274,11 +275,12 @@ static int __of_device_is_compatible(const struct device_node *device, int of_device_is_compatible(const struct device_node *device, const char *compat) { + unsigned long flags; int res; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_compatible(device, compat); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_compatible); @@ -340,13 +342,14 @@ EXPORT_SYMBOL(of_device_is_available); struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); @@ -365,14 +368,15 @@ EXPORT_SYMBOL(of_get_parent); struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); parent = of_node_get(node->parent); of_node_put(node); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } @@ -388,14 +392,15 @@ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) if (of_node_get(next)) break; of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); @@ -413,7 +418,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, { struct device_node *next; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); next = prev ? 
prev->sibling : node->child; for (; next; next = next->sibling) { if (!of_device_is_available(next)) @@ -422,7 +427,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, break; } of_node_put(prev); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return next; } EXPORT_SYMBOL(of_get_next_available_child); @@ -460,14 +465,15 @@ EXPORT_SYMBOL(of_get_child_by_name); struct device_node *of_find_node_by_path(const char *path) { struct device_node *np = of_allnodes; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); for (; np; np = np->allnext) { if (np->full_name && (of_node_cmp(np->full_name, path) == 0) && of_node_get(np)) break; } - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_path); @@ -487,15 +493,16 @@ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) if (np->name && (of_node_cmp(np->name, name) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); @@ -516,15 +523,16 @@ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) if (np->type && (of_node_cmp(np->type, type) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); @@ -547,8 +555,9 @@ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) { if (type @@ -559,7 +568,7 @@ struct device_node *of_find_compatible_node(struct device_node *from, break; } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); @@ -581,8 +590,9 @@ struct device_node *of_find_node_with_property(struct device_node *from, { struct device_node *np; struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : of_allnodes; for (; np; np = np->allnext) { for (pp = np->properties; pp; pp = pp->next) { @@ -594,7 +604,7 @@ struct device_node *of_find_node_with_property(struct device_node *from, } out: of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); @@ -635,10 +645,11 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *match; + unsigned long flags; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); match = __of_match_node(matches, node); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return match; } EXPORT_SYMBOL(of_match_node); @@ -661,11 +672,12 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, const struct of_device_id **match) { struct device_node *np; + unsigned long flags; if (match) *match = NULL; - read_lock(&devtree_lock); + raw_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : of_allnodes; for (; np; np = np->allnext) { if (__of_match_node(matches, np) && of_node_get(np)) { @@ -675,7 +687,7 @@ struct device_node *of_find_matching_node_and_match(struct device_node *from, } } of_node_put(from); - read_unlock(&devtree_lock); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node_and_match); @@ -718,12 +730,12 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np; - read_lock(&devtree_lock); + raw_spin_lock(&devtree_lock); for (np = of_allnodes; np; np = np->allnext) if (np->phandle == handle) break; of_node_get(np); - read_unlock(&devtree_lock); + raw_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); @@ -1195,18 +1207,18 @@ int of_add_property(struct device_node *np, struct property *prop) return rc; prop->next = NULL; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! 
don't insert it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return -1; } next = &(*next)->next; } *next = prop; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); #ifdef CONFIG_PROC_DEVICETREE /* try to add to proc as well if it was initialized */ @@ -1236,7 +1248,7 @@ int of_remove_property(struct device_node *np, struct property *prop) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == prop) { @@ -1249,7 +1261,7 @@ int of_remove_property(struct device_node *np, struct property *prop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1289,7 +1301,7 @@ int of_update_property(struct device_node *np, struct property *newprop) if (!oldprop) return of_add_property(np, newprop); - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == oldprop) { @@ -1303,7 +1315,7 @@ int of_update_property(struct device_node *np, struct property *newprop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1376,12 +1388,12 @@ int of_attach_node(struct device_node *np) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); np->sibling = np->parent->child; np->allnext = of_allnodes; np->parent->child = np; of_allnodes = np; - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); of_add_proc_dt_entry(np); return 0; @@ -1424,17 +1436,17 @@ int of_detach_node(struct device_node *np) if (rc) return rc; - write_lock_irqsave(&devtree_lock, flags); + raw_spin_lock_irqsave(&devtree_lock, flags); if (of_node_check_flag(np, OF_DETACHED)) { /* someone already detached it */ - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return rc; } parent = np->parent; if (!parent) { - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); return rc; } @@ -1461,7 +1473,7 @@ int of_detach_node(struct device_node *np) } of_node_set_flag(np, OF_DETACHED); - write_unlock_irqrestore(&devtree_lock, flags); + raw_spin_unlock_irqrestore(&devtree_lock, flags); of_remove_proc_dt_entry(np); return rc; diff --git a/include/linux/of.h b/include/linux/of.h index 5ebcc5c..bb35c42 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -92,7 +92,7 @@ static inline void of_node_put(struct device_node *node) { } extern struct device_node *of_allnodes; extern struct device_node *of_chosen; extern struct device_node *of_aliases; -extern rwlock_t devtree_lock; +extern raw_spinlock_t devtree_lock; static inline bool of_have_populated_dt(void) { -- cgit v0.10.2 From 68b2d25b1acd325f05be94bb795bc5f095022ef9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:25:03 +0200 Subject: locking-various-init-fixes.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/char/random.c b/drivers/char/random.c index 57d4b15..32a6c57 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -445,7 +445,7 @@ static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = 
__SPIN_LOCK_UNLOCKED(&input_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), .pool = input_pool_data }; @@ -454,7 +454,7 @@ static struct entropy_store blocking_pool = { .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock), .pool = blocking_pool_data }; @@ -462,7 +462,7 @@ static struct entropy_store nonblocking_pool = { .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), .pool = nonblocking_pool_data }; diff --git a/drivers/usb/chipidea/debug.c b/drivers/usb/chipidea/debug.c index 3bc244d..a62c4a4 100644 --- a/drivers/usb/chipidea/debug.c +++ b/drivers/usb/chipidea/debug.c @@ -222,7 +222,7 @@ static struct { } dbg_data = { .idx = 0, .tty = 0, - .lck = __RW_LOCK_UNLOCKED(lck) + .lck = __RW_LOCK_UNLOCKED(dbg_data.lck) }; /** diff --git a/fs/file.c b/fs/file.c index 2b3570b..3906d95 100644 --- a/fs/file.c +++ b/fs/file.c @@ -516,7 +516,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; /* diff --git a/include/linux/idr.h b/include/linux/idr.h index de7e190..e5eb125 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -136,7 +136,7 @@ struct ida { struct ida_bitmap *free_bitmap; }; -#define IDA_INIT(name) { .idr = IDR_INIT(name), .free_bitmap = NULL, } +#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); -- cgit v0.10.2 From 56dca525c5c342651371170c5814ecd153eca226 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 7 Dec 2011 12:48:42 +0100 Subject: intel_idle: Convert i7300_idle_lock to raw spinlock 24 core Intel box's first exposure to 3.0.12-rt30-rc3 didn't go well. [ 27.104159] i7300_idle: loaded v1.55 [ 27.104192] BUG: scheduling while atomic: swapper/2/0/0x00000002 [ 27.104309] Pid: 0, comm: swapper/2 Tainted: G N 3.0.12-rt30-rc3-rt #1 [ 27.104317] Call Trace: [ 27.104338] [] dump_trace+0x85/0x2e0 [ 27.104372] [] thread_return+0x12b/0x30b [ 27.104381] [] schedule+0x29/0xb0 [ 27.104389] [] rt_spin_lock_slowlock+0xc5/0x240 [ 27.104401] [] i7300_idle_notifier+0x3f/0x360 [i7300_idle] [ 27.104415] [] notifier_call_chain+0x37/0x70 [ 27.104426] [] __atomic_notifier_call_chain+0x48/0x70 [ 27.104439] [] cpu_idle+0x89/0xb0 [ 27.104449] bad: scheduling from the idle thread! 
Signed-off-by: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1323258522.5057.73.camel@marge.simson.net Signed-off-by: Thomas Gleixner diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index fa080eb..ffeebc7 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -75,7 +75,7 @@ static unsigned long past_skip; static struct pci_dev *fbd_dev; -static spinlock_t i7300_idle_lock; +static raw_spinlock_t i7300_idle_lock; static int i7300_idle_active; static u8 i7300_idle_thrtctl_saved; @@ -457,7 +457,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, idle_begin_time = ktime_get(); } - spin_lock_irqsave(&i7300_idle_lock, flags); + raw_spin_lock_irqsave(&i7300_idle_lock, flags); if (val == IDLE_START) { cpumask_set_cpu(smp_processor_id(), idle_cpumask); @@ -506,7 +506,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, } } end: - spin_unlock_irqrestore(&i7300_idle_lock, flags); + raw_spin_unlock_irqrestore(&i7300_idle_lock, flags); return 0; } @@ -548,7 +548,7 @@ struct debugfs_file_info { static int __init i7300_idle_init(void) { - spin_lock_init(&i7300_idle_lock); + raw_spin_lock_init(&i7300_idle_lock); total_us = 0; if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) -- cgit v0.10.2 From 507019f961aa4cb38db802200853b5a40fd4c20a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 11:14:55 +0200 Subject: ntp: Make ntp_lock raw. This needs to be revisited. Not sure whether we can avoid to make this lock raw, but it'd really like to. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4..bb1edfa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,7 +22,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -347,7 +347,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -361,7 +361,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -371,9 +371,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -394,7 +394,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. 
If in leap-insert state at the end of the @@ -478,7 +478,7 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } @@ -660,7 +660,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -702,7 +702,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +900,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +913,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -928,7 +928,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -945,7 +945,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v0.10.2 From c16f0b63926ae53e3ce2191a52f86d6cddb01eb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:38:22 +0200 Subject: seqlock: Remove unused functions Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 600060e2..cb0599c 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -69,17 +69,6 @@ static inline void write_sequnlock(seqlock_t *sl) spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) -{ - int ret = spin_trylock(&sl->lock); - - if (ret) { - ++sl->sequence; - smp_wmb(); - } - return ret; -} - /* Start of read calculation -- fetch last complete writer token */ static __always_inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -269,14 +258,4 @@ static inline void write_seqcount_barrier(seqcount_t *s) #define write_sequnlock_bh(lock) \ do { write_sequnlock(lock); local_bh_enable(); } while(0) -#define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) - -#define read_seqretry_irqrestore(lock, iv, flags) \ - ({ \ - int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ - ret; \ - }) - #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From 19a8cdb9c20842ef79b6cdf877c4a8f87e1713d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:40:26 +0200 Subject: seqlock: Use seqcount No point in having different implementations for the same thing. 
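The reader/writer pattern that both implementations serve stays the same after the rework; a minimal usage sketch (the example_* names are illustrative, not taken from this patch):

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(example_lock);
static u64 example_value;

/* Writer: takes the internal spinlock and bumps the sequence count. */
static void example_set(u64 v)
{
	write_seqlock(&example_lock);
	example_value = v;
	write_sequnlock(&example_lock);
}

/* Reader: lockless, retries if a writer was active during the read. */
static u64 example_get(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqbegin(&example_lock);
		v = example_value;
	} while (read_seqretry(&example_lock, seq));

	return v;
}
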
Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cb0599c..1829905 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -30,81 +30,12 @@ #include #include -typedef struct { - unsigned sequence; - spinlock_t lock; -} seqlock_t; - -/* - * These macros triggered gcc-3.x compile-time problems. We think these are - * OK now. Be cautious. - */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } - -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) - -#define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) - -/* Lock out other writers and update the count. - * Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. - */ -static inline void write_seqlock(seqlock_t *sl) -{ - spin_lock(&sl->lock); - ++sl->sequence; - smp_wmb(); -} - -static inline void write_sequnlock(seqlock_t *sl) -{ - smp_wmb(); - sl->sequence++; - spin_unlock(&sl->lock); -} - -/* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) -{ - unsigned ret; - -repeat: - ret = ACCESS_ONCE(sl->sequence); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - smp_rmb(); - - return ret; -} - -/* - * Test if reader processed invalid data. - * - * If sequence value changed then writer changed data while in section. - */ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) -{ - smp_rmb(); - - return unlikely(sl->sequence != start); -} - - /* * Version using sequence counter only. * This can be used when code has its own mutex protecting the * updating starting before the write_seqcountbeqin() and ending * after the write_seqcount_end(). */ - typedef struct seqcount { unsigned sequence; } seqcount_t; @@ -207,7 +138,6 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_retry(s, start); } @@ -241,21 +171,101 @@ static inline void write_seqcount_barrier(seqcount_t *s) s->sequence+=2; } +typedef struct { + struct seqcount seqcount; + spinlock_t lock; +} seqlock_t; + +/* + * These macros triggered gcc-3.x compile-time problems. We think these are + * OK now. Be cautious. + */ +#define __SEQLOCK_UNLOCKED(lockname) \ + { \ + .seqcount = SEQCNT_ZERO, \ + .lock = __SPIN_LOCK_UNLOCKED(lockname) \ + } + +#define seqlock_init(x) \ + do { \ + seqcount_init(&(x)->seqcount); \ + spin_lock_init(&(x)->lock); \ + } while (0) + +#define DEFINE_SEQLOCK(x) \ + seqlock_t x = __SEQLOCK_UNLOCKED(x) + +/* + * Read side functions for starting and finalizing a read side section. + */ +static inline unsigned read_seqbegin(const seqlock_t *sl) +{ + return read_seqcount_begin(&sl->seqcount); +} + +static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) +{ + return read_seqcount_retry(&sl->seqcount, start); +} + /* - * Possible sw/hw IRQ protected versions of the interfaces. + * Lock out other writers and update the count. + * Acts like a normal spin_lock/unlock. + * Don't need preempt_disable() because that is in the spin_lock already. 
*/ +static inline void write_seqlock(seqlock_t *sl) +{ + spin_lock(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock(&sl->lock); +} + +static inline void write_seqlock_bh(seqlock_t *sl) +{ + spin_lock_bh(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_bh(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_bh(&sl->lock); +} + +static inline void write_seqlock_irq(seqlock_t *sl) +{ + spin_lock_irq(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_irq(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irq(&sl->lock); +} + +static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + spin_lock_irqsave(&sl->lock, flags); + write_seqcount_begin(&sl->seqcount); + return flags; +} + #define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) -#define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) -#define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) + do { flags = __write_seqlock_irqsave(lock); } while (0) -#define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) -#define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) -#define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) +static inline void +write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irqrestore(&sl->lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From d0b25b2988e9b836b81a1a80d1fedf044b22b65e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: generic: Use raw local irq variant for generic cmpxchg No point in tracing those. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index 2533fdd..d8d4c89 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } -- cgit v0.10.2 From 10188d8cc7d3010388e5d0259fe26670228139d3 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Mon, 11 Feb 2013 14:15:32 -0700 Subject: of: fix recursive locking in of_get_next_available_child() of_get_next_available_child() acquires devtree_lock, then calls of_device_is_available() which calls of_get_property() which calls of_find_property() which tries to re-acquire devtree_lock, thus causing deadlock. 
To avoid this, create a new __of_device_is_available() which calls __of_get_property() instead, which calls __of_find_property(), which does not take the lock,. Update of_get_next_available_child() to call the new __of_device_is_available() since it already owns the lock. Signed-off-by: Stephen Warren Signed-off-by: Grant Likely diff --git a/drivers/of/base.c b/drivers/of/base.c index 662e009..ec2fd1f 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -307,19 +307,19 @@ int of_machine_is_compatible(const char *compat) EXPORT_SYMBOL(of_machine_is_compatible); /** - * of_device_is_available - check if a device is available for use + * __of_device_is_available - check if a device is available for use * - * @device: Node to check for availability + * @device: Node to check for availability, with locks already held * * Returns 1 if the status property is absent or set to "okay" or "ok", * 0 otherwise */ -int of_device_is_available(const struct device_node *device) +static int __of_device_is_available(const struct device_node *device) { const char *status; int statlen; - status = of_get_property(device, "status", &statlen); + status = __of_get_property(device, "status", &statlen); if (status == NULL) return 1; @@ -330,6 +330,26 @@ int of_device_is_available(const struct device_node *device) return 0; } + +/** + * of_device_is_available - check if a device is available for use + * + * @device: Node to check for availability + * + * Returns 1 if the status property is absent or set to "okay" or "ok", + * 0 otherwise + */ +int of_device_is_available(const struct device_node *device) +{ + unsigned long flags; + int res; + + raw_spin_lock_irqsave(&devtree_lock, flags); + res = __of_device_is_available(device); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + return res; + +} EXPORT_SYMBOL(of_device_is_available); /** @@ -421,7 +441,7 @@ struct device_node *of_get_next_available_child(const struct device_node *node, raw_spin_lock(&devtree_lock); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { - if (!of_device_is_available(next)) + if (!__of_device_is_available(next)) continue; if (of_node_get(next)) break; -- cgit v0.10.2 From 639c3b9809d2202a2fd4c212d32d137d80c49fed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 30 Sep 2011 20:03:37 +0200 Subject: x86: hpet: Disable MSI on Lenovo W510 MSI based per cpu timers lose interrupts when intel_idle() is enabled - independent of the c-state. With idle=poll the problem cannot be observed. We have no idea yet, whether this is a W510 specific issue or a general chipset oddity. Blacklist the known problem machine. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e28670f..5ce3c25 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -573,6 +574,30 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) #define RESERVE_TIMERS 0 #endif +static int __init dmi_disable_hpet_msi(const struct dmi_system_id *d) +{ + hpet_msi_disable = 1; + return 0; +} + +static struct dmi_system_id __initdata dmi_hpet_table[] = { + /* + * MSI based per cpu timers lose interrupts when intel_idle() + * is enabled - independent of the c-state. With idle=poll the + * problem cannot be observed. We have no idea yet, whether + * this is a W510 specific issue or a general chipset oddity. 
+ */ + { + .callback = dmi_disable_hpet_msi, + .ident = "Lenovo W510", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W510"), + }, + }, + {} +}; + static void hpet_msi_capability_lookup(unsigned int start_timer) { unsigned int id; @@ -580,6 +605,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + dmi_check_system(dmi_hpet_table); + if (hpet_msi_disable) return; -- cgit v0.10.2 From 90f73aa79422e1d1e69d2713a6153980f89e03c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jul 2011 11:04:08 +0200 Subject: early-printk-consolidate.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/early_printk.c b/arch/arm/kernel/early_printk.c index 85aa2b2..4307653 100644 --- a/arch/arm/kernel/early_printk.c +++ b/arch/arm/kernel/early_printk.c @@ -29,28 +29,17 @@ static void early_console_write(struct console *con, const char *s, unsigned n) early_write(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1, }; -asmlinkage void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_write(buf, n); - va_end(ap); -} - static int __init setup_early_printk(char *buf) { - register_console(&early_console); + early_console = &early_console_dev; + register_console(&early_console_dev); return 0; } diff --git a/arch/blackfin/kernel/early_printk.c b/arch/blackfin/kernel/early_printk.c index 84ed837..61fbd2d 100644 --- a/arch/blackfin/kernel/early_printk.c +++ b/arch/blackfin/kernel/early_printk.c @@ -25,8 +25,6 @@ extern struct console *bfin_earlyserial_init(unsigned int port, extern struct console *bfin_jc_early_init(void); #endif -static struct console *early_console; - /* Default console */ #define DEFAULT_PORT 0 #define DEFAULT_CFLAG CS8|B57600 diff --git a/arch/microblaze/kernel/early_printk.c b/arch/microblaze/kernel/early_printk.c index aba1f9a..b099a86 100644 --- a/arch/microblaze/kernel/early_printk.c +++ b/arch/microblaze/kernel/early_printk.c @@ -21,7 +21,6 @@ #include #include -static u32 early_console_initialized; static u32 base_addr; #ifdef CONFIG_SERIAL_UARTLITE_CONSOLE @@ -109,27 +108,11 @@ static struct console early_serial_uart16550_console = { }; #endif /* CONFIG_SERIAL_8250_CONSOLE */ -static struct console *early_console; - -void early_printk(const char *fmt, ...) 
-{ - char buf[512]; - int n; - va_list ap; - - if (early_console_initialized) { - va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); - } -} - int __init setup_early_printk(char *opt) { int version = 0; - if (early_console_initialized) + if (early_console) return 1; base_addr = of_early_console(&version); @@ -159,7 +142,6 @@ int __init setup_early_printk(char *opt) } register_console(early_console); - early_console_initialized = 1; return 0; } return 1; @@ -169,7 +151,7 @@ int __init setup_early_printk(char *opt) * only for early console because of performance degression */ void __init remap_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; printk(KERN_INFO "early_printk_console remapping from 0x%x to ", base_addr); @@ -195,9 +177,9 @@ void __init remap_early_printk(void) void __init disable_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; printk(KERN_WARNING "disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } diff --git a/arch/mips/kernel/early_printk.c b/arch/mips/kernel/early_printk.c index 9ae813e..86d325a 100644 --- a/arch/mips/kernel/early_printk.c +++ b/arch/mips/kernel/early_printk.c @@ -8,6 +8,7 @@ * written by Ralf Baechle (ralf@linux-mips.org) */ #include +#include #include #include @@ -25,20 +26,18 @@ early_console_write(struct console *con, const char *s, unsigned n) } } -static struct console early_console __initdata = { +static struct console early_console_prom = { .name = "early", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1 }; -static int early_console_initialized __initdata; - void __init setup_early_printk(void) { - if (early_console_initialized) + if (early_console) return; - early_console_initialized = 1; + early_console = &early_console_prom; - register_console(&early_console); + register_console(&early_console_prom); } diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index f974849..13b8670 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -156,15 +156,13 @@ static struct console udbg_console = { .index = 0, }; -static int early_console_initialized; - /* * Called by setup_system after ppc_md->probe and ppc_md->early_init. * Call it again after setting udbg_putc in ppc_md->setup_arch. 
*/ void __init register_early_udbg_console(void) { - if (early_console_initialized) + if (early_console) return; if (!udbg_putc) @@ -174,7 +172,7 @@ void __init register_early_udbg_console(void) printk(KERN_INFO "early console immortal !\n"); udbg_console.flags &= ~CON_BOOT; } - early_console_initialized = 1; + early_console = &udbg_console; register_console(&udbg_console); } diff --git a/arch/sh/kernel/sh_bios.c b/arch/sh/kernel/sh_bios.c index 47475cc..a5b51b9 100644 --- a/arch/sh/kernel/sh_bios.c +++ b/arch/sh/kernel/sh_bios.c @@ -144,8 +144,6 @@ static struct console bios_console = { .index = -1, }; -static struct console *early_console; - static int __init setup_early_printk(char *buf) { int keep_early = 0; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 38bf80a..f4fb00e 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -309,6 +309,7 @@ void __init setup_arch(char **cmdline_p) boot_flags_init(*cmdline_p); + early_console = &prom_early_console; register_console(&prom_early_console); printk("ARCH: "); diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index 0eaf005..ede7dc3 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -551,6 +551,12 @@ static void __init init_sparc64_elf_hwcap(void) pause_patch(); } +static inline void register_prom_console(void) +{ + early_console = &prom_early_console; + register_console(&prom_early_console); +} + void __init setup_arch(char **cmdline_p) { /* Initialize PROM console and command line. */ @@ -562,7 +568,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EARLYFB if (btext_find_display()) #endif - register_console(&prom_early_console); + register_prom_console(); if (tlb_type == hypervisor) printk("ARCH: SUN4V\n"); diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index afb9c9a..34d72a1 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -33,25 +34,8 @@ static struct console early_hv_console = { }; /* Direct interface for emergencies */ -static struct console *early_console = &early_hv_console; -static int early_console_initialized; static int early_console_complete; -static void early_vprintk(const char *fmt, va_list ap) -{ - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); -} - -void early_printk(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - void early_panic(const char *fmt, ...) 
{ va_list ap; @@ -69,14 +53,13 @@ static int __initdata keep_early; static int __init setup_early_printk(char *str) { - if (early_console_initialized) + if (early_console) return 1; if (str != NULL && strncmp(str, "keep", 4) == 0) keep_early = 1; early_console = &early_hv_console; - early_console_initialized = 1; register_console(early_console); return 0; @@ -85,12 +68,12 @@ static int __init setup_early_printk(char *str) void __init disable_early_printk(void) { early_console_complete = 1; - if (!early_console_initialized || !early_console) + if (!early_console) return; if (!keep_early) { early_printk("disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } else { early_printk("keeping early console\n"); } @@ -98,7 +81,7 @@ void __init disable_early_printk(void) void warn_early_printk(void) { - if (early_console_complete || early_console_initialized) + if (early_console_complete || early_console) return; early_printk("\ Machine shutting down before console output is fully initialized.\n\ diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c index 49480f0..4a0800b 100644 --- a/arch/um/kernel/early_printk.c +++ b/arch/um/kernel/early_printk.c @@ -16,7 +16,7 @@ static void early_console_write(struct console *con, const char *s, unsigned int um_early_printk(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_BOOT, @@ -25,8 +25,10 @@ static struct console early_console = { static int __init setup_early_printk(char *buf) { - register_console(&early_console); - + if (!early_console) { + early_console = &early_console_dev; + register_console(&early_console_dev); + } return 0; } diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c index 3922255..9be0d5d 100644 --- a/arch/unicore32/kernel/early_printk.c +++ b/arch/unicore32/kernel/early_printk.c @@ -33,21 +33,17 @@ static struct console early_ocd_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_ocd_console; - -static int __initdata keep_early; - static int __init setup_early_printk(char *buf) { - if (!buf) + int keep_early; + + if (!buf || early_console) return 0; if (strstr(buf, "keep")) keep_early = 1; - if (!strncmp(buf, "ocd", 3)) - early_console = &early_ocd_console; + early_console = &early_ocd_console; if (keep_early) early_console->flags &= ~CON_BOOT; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b9f18b..d15f575 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -169,25 +169,9 @@ static struct console early_serial_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_vga_console; -static int __initdata early_console_initialized; - -asmlinkage void early_printk(const char *fmt, ...) 
-{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); -} - static inline void early_console_register(struct console *con, int keep_early) { - if (early_console->index != -1) { + if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", con->name); return; @@ -207,9 +191,8 @@ static int __init setup_early_printk(char *buf) if (!buf) return 0; - if (early_console_initialized) + if (early_console) return 0; - early_console_initialized = 1; keep = (strstr(buf, "keep") != NULL); diff --git a/include/linux/console.h b/include/linux/console.h index 47b858c..4a6948a 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -141,6 +141,7 @@ struct console { for (con = console_drivers; con != NULL; con = con->next) extern int console_set_on_cmdline; +extern struct console *early_console; extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); diff --git a/include/linux/printk.h b/include/linux/printk.h index 9afc01e..8111ffa 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -95,8 +95,14 @@ int no_printk(const char *fmt, ...) return 0; } +#ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); +void early_vprintk(const char *fmt, va_list ap); +#else +static inline __printf(1, 2) __cold +void early_printk(const char *s, ...) { } +#endif extern int printk_needs_cpu(int cpu); extern void printk_tick(void); diff --git a/kernel/printk.c b/kernel/printk.c index e698e80..69270a6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -48,13 +48,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -756,6 +749,29 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ -- cgit v0.10.2 From 65727343ee193ac4955d409ad58fe4a1be252d95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2013 14:41:04 +0100 Subject: kernel/srcu: merge common code into a macro DEFINE_SRCU() and DEFINE_STATIC_SRCU() does the same thing except for the "static" attribute. This patch moves the common pieces into _DEFINE_SRCU() which is used by the the former macros either adding the static attribute or not. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 6eb691b..d04acb8 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,13 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. 
* dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define _DEFINE_SRCU(name, mod) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); + mod struct srcu_struct name = \ + __SRCU_STRUCT_INIT(name); -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); +#define DEFINE_SRCU(name) _DEFINE_SRCU(name, ) +#define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v0.10.2 From 389bebe00e6d8e7866924a4f93c8a9035abf9f7b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2013 14:44:30 +0100 Subject: kernel/SRCU: provide a static initializer There are static initializer macros for three of the four possible notifier types: ATOMIC_NOTIFIER_HEAD() BLOCKING_NOTIFIER_HEAD() RAW_NOTIFIER_HEAD() This patch provides a static initializer for the fourth type to make it complete. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d65746e..6bfd703 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -42,9 +42,7 @@ * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very - * often but notifier_blocks will seldom be removed. Also, SRCU notifier - * chains are slightly more difficult to use because they require special - * runtime initialization. + * often but notifier_blocks will seldom be removed.
*/ struct notifier_block { @@ -85,7 +83,7 @@ struct srcu_notifier_head { (name)->head = NULL; \ } while (0) -/* srcu_notifier_heads must be initialized and cleaned up dynamically */ +/* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); @@ -98,7 +96,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } -/* srcu_notifier_heads cannot be initialized statically */ + +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ + } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ @@ -110,6 +114,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) +#define _SRCU_NOTIFIER_HEAD(name, mod) \ + static DEFINE_PER_CPU(struct srcu_struct_array, \ + name##_head_srcu_array); \ + mod struct srcu_notifier_head name = \ + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array) + +#define SRCU_NOTIFIER_HEAD(name) \ + _SRCU_NOTIFIER_HEAD(name, ) + +#define SRCU_NOTIFIER_HEAD_STATIC(name) \ + _SRCU_NOTIFIER_HEAD(name, static) + #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d04acb8..fe9efd4 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp); void process_srcu(struct work_struct *work); -#define __SRCU_STRUCT_INIT(name) \ +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ { \ .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ + .per_cpu_ref = &pcpu_name, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .running = false, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ @@ -105,7 +105,7 @@ void process_srcu(struct work_struct *work); #define _DEFINE_SRCU(name, mod) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ mod struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name); + __SRCU_STRUCT_INIT(name, name##_srcu_array); #define DEFINE_SRCU(name) _DEFINE_SRCU(name, ) #define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static) -- cgit v0.10.2 From e0554a0a1e81220f896abce8f533af3f374bceec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Mar 2011 14:45:31 +0100 Subject: arm: Mark pmu interrupt IRQF_NO_THREAD PMU interrupts must not be threaded. Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 5f66206..aa1b171 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -118,7 +118,8 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) continue; } - err = request_irq(irq, handler, IRQF_NOBALANCING, "arm-pmu", + err = request_irq(irq, handler, + IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", cpu_pmu); if (err) { pr_err("unable to request IRQ%d for ARM PMU counters\n", -- cgit v0.10.2 From 7525e6654c87d85aad34ebb917256893b37eebb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 13:15:20 +0200 Subject: arm: Allow forced irq threading All timer interrupts and the perf interrupt are marked NO_THREAD, so it's safe to allow forced interrupt threading.
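With forced threading enabled, any handler that is not explicitly exempted runs in a kernel thread. A driver whose handler must stay in hard interrupt context opts out with IRQF_NO_THREAD when requesting the line, roughly as in this illustrative snippet (hypothetical device, not part of this series):

	#include <linux/interrupt.h>

	/* Hypothetical example: keep this handler in hard irq context */
	static irqreturn_t sample_hardirq_handler(int irq, void *dev_id)
	{
		/* must not sleep; runs with interrupts disabled */
		return IRQ_HANDLED;
	}

	static int sample_request(unsigned int irq, void *dev)
	{
		/* IRQF_NO_THREAD exempts this line from forced threading */
		return request_irq(irq, sample_hardirq_handler,
				   IRQF_NO_THREAD, "sample-dev", dev);
	}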
Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 67874b8..4890796 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -17,6 +17,7 @@ config ARM select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HARDIRQS_SW_RESEND + select IRQ_FORCED_THREADING select HAVE_AOUT select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB -- cgit v0.10.2 From 1ce0385a1c44be16b1176591b9bc4414f90c1a09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jun 2012 19:53:17 +0200 Subject: powerpc: Mark low level irq handlers NO_THREAD These low level handlers cannot be threaded. Mark them NO_THREAD Reported-by: leroy christophe Tested-by: leroy christophe Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 1e12108..806cbbd 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev) static struct irqaction tbint_irqaction = { .handler = timebase_interrupt, + .flags = IRQF_NO_THREAD, .name = "tbint", }; diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index d4fa03f..5e6ff38 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev) static struct irqaction cpm_error_irqaction = { .handler = cpm_error_interrupt, + .flags = IRQF_NO_THREAD, .name = "error", }; -- cgit v0.10.2 From ca65dcc57ce3b421afc6b21058b730859d26a0b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:12:36 +0100 Subject: timekeeping: Calc stuff once Calculate the cycle interval shifted value once. No functional change, just makes the code more readable. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb..06dc034 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1077,15 +1077,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->clock->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); -- cgit v0.10.2 From 7d986526cc1ba2ab8122604f6f334c02251c7be0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2013 22:38:07 +0100 Subject: timekeeping: Make jiffies_lock internal Nothing outside of the timekeeping core needs that lock. 
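Code outside the timekeeping core that needs a consistent 64-bit jiffies snapshot keeps using the existing accessor rather than the lock, e.g. (illustrative only):

	#include <linux/jiffies.h>
	#include <linux/types.h>

	/* Hypothetical helper: readers never touch jiffies_lock directly */
	static u64 sample_uptime_ticks(void)
	{
		return get_jiffies_64();
	}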
Signed-off-by: Thomas Gleixner diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 82ed068..8fb8edf 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -75,7 +75,6 @@ extern int register_refined_jiffies(long clock_tick_rate); */ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; -extern seqlock_t jiffies_lock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59e..f5c9207 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 06dc034..e2892da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include #include +#include "tick-internal.h" static struct timekeeper timekeeper; -- cgit v0.10.2 From 757a75343c7913007d47d6481d91c040a506bde7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:05:48 +0100 Subject: timekeeping: Move lock out of timekeeper struct Make the lock a separate entity. Preparatory patch for shadow timekeeper structure. Signed-off-by: Thomas Gleixner diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1d558e..851bc43 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -62,8 +62,6 @@ struct timekeeper { ktime_t offs_boot; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; }; static inline struct timespec tk_xtime(struct timekeeper *tk) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e2892da..37887dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,7 @@ #include "tick-internal.h" static struct timekeeper timekeeper; +static DEFINE_SEQLOCK(timekeeper_lock); /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -194,11 +195,11 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -212,13 +213,12 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -279,12 +279,12 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -300,11 +300,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq 
= read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -331,12 +331,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -364,7 +364,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -373,7 +373,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -413,7 +413,7 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -452,7 +452,7 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -469,7 +469,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -491,7 +491,7 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -502,7 +502,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -552,11 +552,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -572,11 +572,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } 
while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -591,11 +591,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -656,11 +656,9 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -679,7 +677,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -726,7 +724,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -734,7 +732,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -758,7 +756,7 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -769,7 +767,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -788,7 +786,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -811,7 +809,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1149,7 +1147,7 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1204,7 +1202,7 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } @@ -1252,13 +1250,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1317,10 +1315,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do 
{ - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return now; } @@ -1333,11 +1331,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1368,11 +1366,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1392,14 +1390,14 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1417,9 +1415,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return timespec_to_ktime(wtom); } -- cgit v0.10.2 From 43c9c4550a10638b3307eee9ca8c521d501eb253 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:03:17 +0100 Subject: timekeeping: Split timekeeper_lock into lock and seqcount We want to shorten the seqcount write hold time. So split the seqlock into a lock and a seqcount. Open code the seqwrite_lock in the places which matter and drop the sequence counter update where it's pointless. 
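The resulting pattern is the usual raw-spinlock-plus-seqcount split: writers serialize on the lock and publish under the seqcount, while readers only retry on the seqcount. A sketch on a hypothetical piece of state (not taken from this patch):

	#include <linux/seqlock.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	static DEFINE_RAW_SPINLOCK(sample_lock);	/* serializes writers */
	static seqcount_t sample_seq;			/* lets readers detect updates */
	static u64 sample_value;

	static void sample_update(u64 v)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&sample_lock, flags);
		write_seqcount_begin(&sample_seq);
		sample_value = v;
		write_seqcount_end(&sample_seq);
		raw_spin_unlock_irqrestore(&sample_lock, flags);
	}

	static u64 sample_read(void)
	{
		unsigned int seq;
		u64 v;

		do {
			seq = read_seqcount_begin(&sample_seq);
			v = sample_value;
		} while (read_seqcount_retry(&sample_seq, seq));

		return v;
	}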
Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 37887dc..bb62b9f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,7 +26,8 @@ #include "tick-internal.h" static struct timekeeper timekeeper; -static DEFINE_SEQLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -186,8 +187,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -195,11 +194,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -208,23 +206,21 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ +/* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { if (clearntp) { @@ -279,12 +275,12 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -300,11 +296,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -331,12 +327,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -364,7 +360,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -373,7 +369,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -413,7 +409,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -427,7 +424,8 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -452,7 +450,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -469,7 +468,8 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -491,7 +491,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -502,7 +503,8 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -552,11 +554,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -572,11 +574,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper_lock, seq)); + } while 
(read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -591,11 +593,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -658,7 +660,8 @@ void __init timekeeping_init(void) ntp_init(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -677,7 +680,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -724,7 +728,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -732,7 +737,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -756,7 +762,8 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -767,7 +774,8 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -786,7 +794,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -809,7 +818,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1147,7 +1157,8 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1202,7 +1213,8 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -1250,13 +1262,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); 
do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1315,10 +1327,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1331,11 +1343,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1366,11 +1378,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1390,14 +1402,14 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1415,9 +1427,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } -- cgit v0.10.2 From 748b68dcd970fb0c8d4151310f05955cbe96b178 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 17:15:49 +0100 Subject: timekeeping: Store cycle_last value in timekeeper struct as well For implementing a shadow timekeeper and a split calculation/update region we need to store the cycle_last value in the timekeeper and update the value in the clocksource struct only in the update region. Add the extra storage to the timekeeper. Signed-off-by: Thomas Gleixner diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 851bc43..2adf9c3 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -20,6 +20,8 @@ struct timekeeper { u32 shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; + /* Last cycle value (also stored in clock->cycle_last) */ + cycle_t cycle_last; /* Number of clock shifted nano seconds in one NTP interval. 
*/ u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bb62b9f..c59c9c6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -96,7 +96,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -247,7 +247,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; -- cgit v0.10.2 From 0d8c8220f41a079c96361f55fe6fb979d34cbb8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:06:18 +0100 Subject: timekeeping: Delay update of clock->cycle_last For calculating the new timekeeper values store the new cycle_last value in the timekeeper and update the clock->cycle_last just when we actually update the new values. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c59c9c6..d317692 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1095,7 +1095,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->clock->cycle_last += interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1210,6 +1210,8 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; timekeeping_update(tk, false); out: -- cgit v0.10.2 From 5ef90233a4bac69c58b7aa0e9348212b8c072e22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 15:47:13 +0100 Subject: timekeeping: Implement a shadow timekeeper Use the shadow timekeeper to do the update_wall_time() adjustments and then copy it over to the real timekeeper. Keep the shadow timekeeper in sync when updating stuff outside of update_wall_time(). This allows us to limit the timekeeper_seq hold time to the update of the real timekeeper and the vsyscall data in the next patch. 
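Sketched on the hypothetical sample_lock/sample_seq state from the lock/seqcount example above, the shadow scheme does the expensive recomputation outside the seqcount write section and only publishes the finished copy under it:

	#include <linux/string.h>

	/* Hypothetical state mirroring the shadow timekeeper idea */
	struct sample_state {
		u64 a;
		u64 b;
	};

	static struct sample_state live_state;		/* what readers see */
	static struct sample_state shadow_state;	/* scratch copy for updates */

	static void sample_shadow_update(void)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&sample_lock, flags);

		/* heavy work on the shadow copy; readers are not delayed */
		shadow_state.a++;
		shadow_state.b = shadow_state.a * 2;

		/* short seqcount write section: just publish the result */
		write_seqcount_begin(&sample_seq);
		memcpy(&live_state, &shadow_state, sizeof(live_state));
		write_seqcount_end(&sample_seq);

		raw_spin_unlock_irqrestore(&sample_lock, flags);
	}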
Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d317692..9a0100b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -28,6 +28,7 @@ static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -221,7 +222,7 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -229,6 +230,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -422,7 +426,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -466,7 +470,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -501,7 +505,7 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -680,6 +684,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -735,7 +741,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -773,7 +779,7 @@ static void timekeeping_resume(void) tk->clock->cycle_last = tk->clock->read(tk->clock); tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); + timekeeping_update(tk, false, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1152,7 +1158,8 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; @@ -1164,16 +1171,16 @@ static void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < 
real_tk->cycle_interval) goto out; /* @@ -1212,12 +1219,22 @@ static void update_wall_time(void) /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; - timekeeping_update(tk, false); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); out: write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - } /** -- cgit v0.10.2 From 22e0e25d8bab8b023fd6a79ea172b8b843e3a062 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2013 00:39:49 +0100 Subject: timekeeping: Shorten seq_count region Shorten the seqcount write hold region to the actual update of the timekeeper and the related data (e.g vsyscall). On a contemporary x86 system this reduces the maximum latencies on Preempt-RT from 8us to 4us on the non-timekeeping cores. Signed-off-by: Thomas Gleixner diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0100b..1612708 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1165,7 +1165,6 @@ static void update_wall_time(void) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1217,6 +1216,7 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; /* @@ -1231,9 +1231,8 @@ static void update_wall_time(void) */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, false, false); - -out: write_seqcount_end(&timekeeper_seq); +out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -- cgit v0.10.2 From ca2954d9349d23d3140294104424a00f1734a4a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:02 +0200 Subject: block: Shorten interrupt disabled regions Moving the blk_sched_flush_plug() call out of the interrupt/preempt disabled region in the scheduler allows us to replace local_irq_save/restore(flags) by local_irq_disable/enable() in blk_flush_plug(). Now instead of doing this we disable interrupts explicitely when we lock the request_queue and reenable them when we drop the lock. That allows interrupts to be handled when the plug list contains requests for more than one queue. Aside of that this change makes the scope of the irq disabled region more obvious. The current code confused the hell out of me when looking at: local_irq_save(flags); spin_lock(q->queue_lock); ... queue_unplugged(q...); scsi_request_fn(); spin_unlock(q->queue_lock); spin_lock(shost->host_lock); spin_unlock_irq(shost->host_lock); -------------------^^^ ???? spin_lock_irq(q->queue_lock); spin_unlock(q->lock); local_irq_restore(flags); Also add a comment to __blk_run_queue() documenting that q->request_fn() can drop q->queue_lock and reenable interrupts, but must return with q->queue_lock held and interrupts disabled. 
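The effect is that interrupts are only disabled while a queue lock is actually held, instead of across the whole flush. On a hypothetical lock the resulting per-queue pattern boils down to (illustrative only):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(sample_queue_lock);

	/* irq-off region covers exactly the locked section */
	static void sample_dispatch_one_queue(void)
	{
		spin_lock_irq(&sample_queue_lock);
		/* drive the hardware for this queue */
		spin_unlock_irq(&sample_queue_lock);
		/* interrupts are enabled again between queues */
	}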
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20110622174919.025446432@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/block/blk-core.c b/block/blk-core.c index c973249..0ff1564 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2929,7 +2929,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, blk_run_queue_async(q); else __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) @@ -2977,7 +2977,6 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -2998,11 +2997,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. - */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -3015,7 +3009,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* @@ -3042,8 +3036,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) -- cgit v0.10.2 From 37349f9dcf63fc0ea41229508489ffd88d2227cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2013 22:36:59 +0100 Subject: timekeeping-split-jiffies-lock.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba..ca9e113 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -67,7 +67,8 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -76,9 +77,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6..e8076c6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,13 +63,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -130,9 +132,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while 
(read_seqcount_retry(&jiffies_seq, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f5c9207..ad8edee 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,7 +4,8 @@ #include #include -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e78feff..3365223 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -56,7 +56,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,7 +80,8 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } /* @@ -89,12 +91,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -325,11 +329,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1612708..99f943b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1461,7 +1461,9 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); } -- cgit v0.10.2 From 5894a3ada26ac4e252ede52b937144ee5df2397e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 21:32:10 +0200 Subject: mips-enable-interrupts-in-signal.patch Signed-off-by: Thomas Gleixner diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index b6aa770..bfcaea6 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -601,6 +601,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { local_irq_enable(); + preempt_check_resched(); /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) -- cgit v0.10.2 From b34f0c2b25edad0ed4f504e4bda53c7b0c9be6ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Sep 2011 12:24:30 -0500 Subject: tracing: Account for preempt off in preempt_schedule() The preempt_schedule() uses the 
preempt_disable_notrace() version because it can cause infinite recursion by the function tracer as the function tracer uses preempt_enable_notrace() which may call back into the preempt_schedule() code as the NEED_RESCHED is still set and the PREEMPT_ACTIVE has not been set yet. See commit: d1f74e20b5b064a130cd0743a256c2d3cfe84010 that made this change. The preemptoff and preemptirqsoff latency tracers require the first and last preempt count modifiers to enable tracing. But this skips the checks. Since we can not convert them back to the non notrace version, we can use the idle() hooks for the latency tracers here. That is, the start/stop_critical_timings() works well to manually start and stop the latency tracer for preempt off timings. Signed-off-by: Steven Rostedt Signed-off-by: Clark Williams Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26058d0..3419f34 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3071,7 +3071,16 @@ asmlinkage void __sched notrace preempt_schedule(void) do { add_preempt_count_notrace(PREEMPT_ACTIVE); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. + */ + start_critical_timings(); __schedule(); + stop_critical_timings(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* -- cgit v0.10.2 From bc9615c6bbf846a09e17b537fc247984db5ad940 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Sep 2011 19:57:12 +0200 Subject: signal-revert-ptrace-preempt-magic.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/signal.c b/kernel/signal.c index 7591ccc..55bff2a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1902,15 +1902,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); freezable_schedule(); } else { /* -- cgit v0.10.2 From 66367892235794ebaf117abef4835b8fa3b8bffe Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 19 Sep 2011 14:51:14 -0700 Subject: preempt-rt: Convert arm boot_lock to raw The arm boot_lock is used by the secondary processor startup code. The locking task is the idle thread, which has idle->sched_class == &idle_sched_class. idle_sched_class->enqueue_task == NULL, so if the idle task blocks on the lock, the attempt to wake it when the lock becomes available will fail: try_to_wake_up() ... activate_task() enqueue_task() p->sched_class->enqueue_task(rq, p, flags) Fix by converting boot_lock to a raw spin lock. 
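On PREEMPT_RT a raw spinlock keeps spinning instead of turning into a sleeping lock, so the idle thread can take it without ever needing to be woken. The conversion amounts to (hypothetical example mirroring the change below):

	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(sample_boot_lock);	/* raw: never sleeps, even on RT */

	static void sample_sync_with_boot_cpu(void)
	{
		raw_spin_lock(&sample_boot_lock);
		/* critical section that may run from the idle thread */
		raw_spin_unlock(&sample_boot_lock);
	}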
Signed-off-by: Frank Rowand Link: http://lkml.kernel.org/r/4E77B952.3010606@am.sony.com Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index c5c840e..d044812 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -71,7 +71,7 @@ static void __iomem *scu_base_addr(void) return (void __iomem *)(S5P_VA_SCU); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __cpuinit exynos_secondary_init(unsigned int cpu) { @@ -91,8 +91,8 @@ static void __cpuinit exynos_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -104,7 +104,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -133,7 +133,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct if (timeout == 0) { printk(KERN_ERR "cpu1 power enable failed"); - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return -ETIMEDOUT; } } @@ -161,7 +161,7 @@ static int __cpuinit exynos_boot_secondary(unsigned int cpu, struct task_struct * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 7ed69b69..7b8043a 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -31,7 +31,7 @@ extern void msm_secondary_startup(void); -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static inline int get_core_count(void) { @@ -58,8 +58,8 @@ static void __cpuinit msm_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static __cpuinit void prepare_cold_cpu(unsigned int cpu) @@ -96,7 +96,7 @@ static int __cpuinit msm_boot_secondary(unsigned int cpu, struct task_struct *id * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -130,7 +130,7 @@ static int __cpuinit msm_boot_secondary(unsigned int cpu, struct task_struct *id * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index cd42d92..ef1fbcc 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -45,7 +45,7 @@ u16 pm44xx_errata; /* SCU base address */ static void __iomem *scu_base; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __iomem *omap4_get_scu_base(void) { @@ -76,8 +76,8 @@ static void __cpuinit omap4_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. 
*/ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -90,7 +90,7 @@ static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct * * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * Update the AuxCoreBoot0 with boot state for secondary core. @@ -163,7 +163,7 @@ static int __cpuinit omap4_boot_secondary(unsigned int cpu, struct task_struct * * Now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return 0; } diff --git a/arch/arm/mach-spear13xx/platsmp.c b/arch/arm/mach-spear13xx/platsmp.c index 2eaa3fa..bb61b1f 100644 --- a/arch/arm/mach-spear13xx/platsmp.c +++ b/arch/arm/mach-spear13xx/platsmp.c @@ -21,7 +21,7 @@ #include #include -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __iomem *scu_base = IOMEM(VA_SCU_BASE); @@ -44,8 +44,8 @@ static void __cpuinit spear13xx_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -56,7 +56,7 @@ static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_stru * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -83,7 +83,7 @@ static int __cpuinit spear13xx_boot_secondary(unsigned int cpu, struct task_stru * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index 3db7782..dcecae1 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -50,7 +50,7 @@ static void __iomem *scu_base_addr(void) return NULL; } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __cpuinit ux500_secondary_init(unsigned int cpu) { @@ -70,8 +70,8 @@ static void __cpuinit ux500_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -82,7 +82,7 @@ static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct * * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -103,7 +103,7 @@ static int __cpuinit ux500_boot_secondary(unsigned int cpu, struct task_struct * * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? 
-ENOSYS : 0; } diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c index 04ca493..20f9044 100644 --- a/arch/arm/plat-versatile/platsmp.c +++ b/arch/arm/plat-versatile/platsmp.c @@ -32,7 +32,7 @@ static void __cpuinit write_pen_release(int val) outer_clean_range(__pa(&pen_release), __pa(&pen_release + 1)); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit versatile_secondary_init(unsigned int cpu) { @@ -52,8 +52,8 @@ void __cpuinit versatile_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -64,7 +64,7 @@ int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idl * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * This is really belt and braces; we hold unintended secondary @@ -94,7 +94,7 @@ int __cpuinit versatile_boot_secondary(unsigned int cpu, struct task_struct *idl * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } -- cgit v0.10.2 From 3e42b4d7e57b7f2095e3bf659398a088835e4ca5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Apr 2012 11:26:38 +0200 Subject: arm-omap-make-wakeupgen_lock-raw.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-omap2/omap-wakeupgen.c b/arch/arm/mach-omap2/omap-wakeupgen.c index 5d3b4f4..8633a43 100644 --- a/arch/arm/mach-omap2/omap-wakeupgen.c +++ b/arch/arm/mach-omap2/omap-wakeupgen.c @@ -46,7 +46,7 @@ static void __iomem *wakeupgen_base; static void __iomem *sar_base; -static DEFINE_SPINLOCK(wakeupgen_lock); +static DEFINE_RAW_SPINLOCK(wakeupgen_lock); static unsigned int irq_target_cpu[MAX_IRQS]; static unsigned int irq_banks = MAX_NR_REG_BANKS; static unsigned int max_irqs = MAX_IRQS; @@ -134,9 +134,9 @@ static void wakeupgen_mask(struct irq_data *d) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); _wakeupgen_clear(d->irq, irq_target_cpu[d->irq]); - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } /* @@ -146,9 +146,9 @@ static void wakeupgen_unmask(struct irq_data *d) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); _wakeupgen_set(d->irq, irq_target_cpu[d->irq]); - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } #ifdef CONFIG_HOTPLUG_CPU @@ -189,7 +189,7 @@ static void wakeupgen_irqmask_all(unsigned int cpu, unsigned int set) { unsigned long flags; - spin_lock_irqsave(&wakeupgen_lock, flags); + raw_spin_lock_irqsave(&wakeupgen_lock, flags); if (set) { _wakeupgen_save_masks(cpu); _wakeupgen_set_all(cpu, WKG_MASK_ALL); @@ -197,7 +197,7 @@ static void wakeupgen_irqmask_all(unsigned int cpu, unsigned int set) _wakeupgen_set_all(cpu, WKG_UNMASK_ALL); _wakeupgen_restore_masks(cpu); } - spin_unlock_irqrestore(&wakeupgen_lock, flags); + raw_spin_unlock_irqrestore(&wakeupgen_lock, flags); } #endif -- cgit v0.10.2 From fa7024086addbeab3173a7126f1bbfd5189ea771 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:29:20 -0500 
Subject: posix-timers: Prevent broadcast signals Posix timers should not send broadcast signals and kernel only signals. Prevent it. Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e885be1..4b7183c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -439,6 +439,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; + int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -447,7 +448,8 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || + sig_kernel_coredump(sig))) return NULL; return task_pid(rtn); -- cgit v0.10.2 From f9d30d1ba5503fbebdeed38f282efa1c7cea52ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:56 -0500 Subject: signals: Allow rt tasks to cache one sigqueue struct To avoid allocation allow rt tasks to cache one sigqueue struct in task struct. Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index d2112477..e59279a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 0a89ffc..8942895 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -226,6 +226,7 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/kernel/exit.c b/kernel/exit.c index b4df219..8fb4a48 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
*/ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->sighand = NULL; spin_unlock(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index 5630e52..e4cee21 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1230,6 +1230,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; diff --git a/kernel/signal.c b/kernel/signal.c index 55bff2a..131247f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -348,13 +348,45 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +#ifdef __HAVE_ARCH_CMPXCHG +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + return 1; +} + +#else + +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + return NULL; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + return 1; +} + +#endif + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -371,7 +403,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= task_rlimit(t, RLIMIT_SIGPENDING)) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -388,6 +423,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit) +{ + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -397,6 +439,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + atomic_dec(&up->sigpending); + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -410,6 +467,21 @@ void flush_sigqueue(struct sigpending *queue) } /* + * Called from __exit_signal. Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + +/* * Flush all pending signals for a task. 
*/ void __flush_signals(struct task_struct *t) @@ -561,7 +633,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -607,6 +679,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -1545,7 +1619,8 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); + /* Preallocated sigqueue objects always from the slabcache ! */ + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); if (q) q->flags |= SIGQUEUE_PREALLOC; -- cgit v0.10.2 From 823f81df78e9af82385b672b53f50c8b81a33b8a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Apr 2013 17:09:08 -0500 Subject: signal/x86: Delay calling signals in atomic On x86_64 we must disable preemption before we enable interrupts for stack faults, int3 and debugging, because the current task is using a per CPU debug stack defined by the IST. If we schedule out, another task can come in and use the same stack and cause the stack to be corrupted and crash the kernel on return. When CONFIG_PREEMPT_RT_FULL is enabled, spin_locks become mutexes, and one of these is the spin lock used in signal handling. Some of the debug code (int3) causes do_trap() to send a signal. This function calls a spin lock that has been converted to a mutex and has the possibility to sleep. If this happens, the above issues with the corrupted stack is possible. Instead of calling the signal right away, for PREEMPT_RT and x86_64, the signal information is stored on the stacks task_struct and TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume code will send the signal when preemption is enabled. [ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT_FULL to ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ] Cc: stable-rt@vger.kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 216bf36..135d598 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -23,6 +23,19 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT_FULL are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. 
+ */ +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d6bf1f3..4a3a5dd 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -808,6 +808,14 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(t->forced_info.si_signo, &t->forced_info, t); + t->forced_info.si_signo = 0; + } +#endif + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/include/linux/sched.h b/include/linux/sched.h index e59279a..7b62065 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1412,6 +1412,10 @@ struct task_struct { sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT_FULL + /* TODO: move me into ->restart_block ? */ + struct siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; diff --git a/kernel/signal.c b/kernel/signal.c index 131247f..3b3e347 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1302,8 +1302,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +static int +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; @@ -1328,6 +1328,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ +/* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. + */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + if (WARN_ON_ONCE(t != current)) + return 0; + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = sig; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif + return do_force_sig_info(sig, info, t); +} + /* * Nuke all other threads in the group. */ -- cgit v0.10.2 From ac2edd42126bf68b898c43629d611babf7644e72 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: drivers: random: Reduce preempt disabled region No need to keep preemption disabled across the whole function. 
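
A minimal sketch of the narrowed-window pattern, using a hypothetical per-CPU counter in place of the driver's trickle logic; only the preempt and per-CPU accessors are the real API:

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned int, sample_count);

static int sample_rate_limited(void)
{
	unsigned int cnt;

	/* Disable preemption only around the per-CPU access itself. */
	preempt_disable();
	cnt = __this_cpu_inc_return(sample_count);
	preempt_enable();

	/* Everything from here on runs preemptibly again. */
	return (cnt & 0xfff) != 0;
}
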
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/char/random.c b/drivers/char/random.c index 32a6c57..50007e1 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -676,9 +676,12 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - ((__this_cpu_inc_return(trickle_count) - 1) & 0xfff)) - goto out; + ((__this_cpu_inc_return(trickle_count) - 1) & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); sample.num = num; @@ -719,8 +722,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) credit_entropy_bits(&input_pool, min_t(int, fls(delta>>1), 11)); } -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, -- cgit v0.10.2 From 08b5e49428ce9d10c0a835b0472513bc7ce72862 Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Sat, 6 Mar 2010 17:47:10 +0100 Subject: ARM: AT91: PIT: Remove irq handler when clock event is unused Set up and remove the interrupt handler in clock event mode selection. This avoids calling the (shared) interrupt handler when the device is not used. Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index cafe988..0c36dcd 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -134,6 +134,7 @@ clkevt32k_mode(enum clock_event_mode mode, struct clock_event_device *dev) break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: + remove_irq(AT91_ID_SYS, &at91rm9200_timer_irq); case CLOCK_EVT_MODE_RESUME: irqmask = 0; break; diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c index 358412f..a6f9751 100644 --- a/arch/arm/mach-at91/at91sam926x_time.c +++ b/arch/arm/mach-at91/at91sam926x_time.c @@ -77,7 +77,7 @@ static struct clocksource pit_clk = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; - +static struct irqaction at91sam926x_pit_irq; /* * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16) */ @@ -86,6 +86,8 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) { switch (mode) { case CLOCK_EVT_MODE_PERIODIC: + /* Set up irq handler */ + setup_irq(AT91_ID_SYS, &at91sam926x_pit_irq); /* update clocksource counter */ pit_cnt += pit_cycle * PIT_PICNT(pit_read(AT91_PIT_PIVR)); pit_write(AT91_PIT_MR, (pit_cycle - 1) | AT91_PIT_PITEN @@ -98,6 +100,7 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) case CLOCK_EVT_MODE_UNUSED: /* disable irq, leaving the clocksource active */ pit_write(AT91_PIT_MR, (pit_cycle - 1) | AT91_PIT_PITEN); + remove_irq(AT91_ID_SYS, &at91sam926x_pit_irq); break; case CLOCK_EVT_MODE_RESUME: break; -- cgit v0.10.2 From cd9b3157ae6bc6e4bb8f324781d8cd1180383074 Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Mon, 8 Mar 2010 18:57:04 +0100 Subject: clocksource: TCLIB: Allow higher clock rates for clock events By default the TCLIB uses the 32 KiHz base clock rate for clock events. Add a compile-time selection to allow a higher clock resolution.
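
A hypothetical helper sketching the arithmetic the change generalises (the function is not part of the patch; div_sc() and clockevent_delta2ns() are the real clockevents helpers used in the diff below): once the event clock is no longer hard-wired to 32768 Hz, mult and the delta limits simply follow from whichever rate is selected for the 16-bit channel.

#include <linux/clockchips.h>
#include <linux/clocksource.h>

static void sketch_clkevt_config(struct clock_event_device *evt, u32 freq)
{
	evt->shift = 32;
	evt->mult = div_sc(freq, NSEC_PER_SEC, evt->shift);
	/* The TC channel counts 16 bits, so 0xffff is the largest latch. */
	evt->max_delta_ns = clockevent_delta2ns(0xffff, evt);
	evt->min_delta_ns = clockevent_delta2ns(1, evt) + 1;
}
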
Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 32cb929..ac0bb2e 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -23,8 +23,7 @@ * this 32 bit free-running counter. the second channel is not used. * * - The third channel may be used to provide a 16-bit clockevent - * source, used in either periodic or oneshot mode. This runs - * at 32 KiHZ, and can handle delays of up to two seconds. + * source, used in either periodic or oneshot mode. * * A boot clocksource and clockevent source are also currently needed, * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so @@ -74,6 +73,7 @@ static struct clocksource clksrc = { struct tc_clkevt_device { struct clock_event_device clkevt; struct clk *clk; + u32 freq; void __iomem *regs; }; @@ -82,13 +82,6 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt) return container_of(clkevt, struct tc_clkevt_device, clkevt); } -/* For now, we always use the 32K clock ... this optimizes for NO_HZ, - * because using one of the divided clocks would usually mean the - * tick rate can never be less than several dozen Hz (vs 0.5 Hz). - * - * A divided clock could be good for high resolution timers, since - * 30.5 usec resolution can seem "low". - */ static u32 timer_clock; static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) @@ -111,11 +104,12 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_PERIODIC: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and restart */ + /* count up to RC, then irq and restart */ __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); - __raw_writel((32768 + HZ/2) / HZ, tcaddr + ATMEL_TC_REG(2, RC)); + __raw_writel((tcd->freq + HZ/2)/HZ, + tcaddr + ATMEL_TC_REG(2, RC)); /* Enable clock and interrupts on RC compare */ __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER)); @@ -128,7 +122,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_ONESHOT: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and stop */ + /* count up to RC, then irq and stop */ __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); @@ -158,8 +152,12 @@ static struct tc_clkevt_device clkevt = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK /* Should be lower than at91rm9200's system timer */ .rating = 125, +#else + .rating = 200, +#endif .set_next_event = tc_next_event, .set_mode = tc_mode, }, @@ -185,8 +183,9 @@ static struct irqaction tc_irqaction = { .handler = ch2_irq, }; -static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) +static void __init setup_clkevents(struct atmel_tc *tc, int divisor_idx) { + unsigned divisor = atmel_tc_divisors[divisor_idx]; struct clk *t2_clk = tc->clk[2]; int irq = tc->irq[2]; @@ -194,11 +193,17 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) clkevt.clk = t2_clk; tc_irqaction.dev_id = &clkevt; - timer_clock = clk32k_divisor_idx; + timer_clock = divisor_idx; - clkevt.clkevt.mult = div_sc(32768, NSEC_PER_SEC, clkevt.clkevt.shift); - clkevt.clkevt.max_delta_ns - = clockevent_delta2ns(0xffff, &clkevt.clkevt); + if (!divisor) + clkevt.freq = 32768; + else + clkevt.freq = 
clk_get_rate(t2_clk)/divisor; + + clkevt.clkevt.mult = div_sc(clkevt.freq, NSEC_PER_SEC, + clkevt.clkevt.shift); + clkevt.clkevt.max_delta_ns = + clockevent_delta2ns(0xffff, &clkevt.clkevt); clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1; clkevt.clkevt.cpumask = cpumask_of(0); @@ -327,8 +332,11 @@ static int __init tcb_clksrc_init(void) clocksource_register_hz(&clksrc, divided_rate); /* channel 2: periodic and oneshot timer support */ +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK setup_clkevents(tc, clk32k_divisor_idx); - +#else + setup_clkevents(tc, best_divisor_idx); +#endif return 0; } arch_initcall(tcb_clksrc_init); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index b151b7c..e94f19d 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -78,8 +78,7 @@ config ATMEL_TCB_CLKSRC are combined to make a single 32-bit timer. When GENERIC_CLOCKEVENTS is defined, the third timer channel - may be used as a clock event device supporting oneshot mode - (delays of up to two seconds) based on the 32 KiHz clock. + may be used as a clock event device supporting oneshot mode. config ATMEL_TCB_CLKSRC_BLOCK int @@ -93,6 +92,14 @@ config ATMEL_TCB_CLKSRC_BLOCK TC can be used for other purposes, such as PWM generation and interval timing. +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK + bool "TC Block use 32 KiHz clock" + depends on ATMEL_TCB_CLKSRC + default y + help + Select this to use 32 KiHz base clock rate as TC block clock + source for clock events. + config IBM_ASM tristate "Device driver for IBM RSA service processor" depends on X86 && PCI && INPUT -- cgit v0.10.2 From fcb4c11643e5a58f1e7089466750995cac1b15bd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:18 -0500 Subject: drivers/net: tulip_remove_one needs to call pci_disable_device() Otherwise the device is not completely shut down. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 1e9443d..d25961b 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1943,6 +1943,7 @@ static void tulip_remove_one(struct pci_dev *pdev) pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ -- cgit v0.10.2 From 959661cdb998fe5c3d9109340f1596e3b61907ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:24 -0500 Subject: drivers/net: Use disable_irq_nosync() in 8139too Use disable_irq_nosync() instead of disable_irq() as this might be called in atomic context with netpoll. 
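
A sketch of the netpoll poll-controller pattern with an invented driver name (not 8139too itself); disable_irq_nosync() only masks the line, whereas disable_irq() would also wait for a running handler to finish, which may block and is therefore unsafe in this atomic context:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

static irqreturn_t sketch_interrupt(int irq, void *dev_id)
{
	/* The driver's normal ISR would run its rx/tx handling here. */
	return IRQ_HANDLED;
}

static void sketch_poll_controller(struct net_device *dev)
{
	const int irq = dev->irq;

	disable_irq_nosync(irq);
	sketch_interrupt(irq, dev);
	enable_irq(irq);
}
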
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 5dc1616..3bed27d 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2216,7 +2216,7 @@ static void rtl8139_poll_controller(struct net_device *dev) struct rtl8139_private *tp = netdev_priv(dev); const int irq = tp->pci_dev->irq; - disable_irq(irq); + disable_irq_nosync(irq); rtl8139_interrupt(irq, dev); enable_irq(irq); } -- cgit v0.10.2 From 86e57c2dc838d57324ccc333a36c32e12df4f11f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:37 -0500 Subject: mm: Prepare decoupling the page fault disabling logic Add a pagefault_disabled variable to task_struct to allow decoupling the pagefault-disabled logic from the preempt count. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 7b62065..3fb5915 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1452,6 +1452,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951..9414a1b 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,37 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. - * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - inc_preempt_count(); - /* - * make sure to have issued the store before a pagefault - * can hit. - */ - barrier(); -} - -static inline void pagefault_enable(void) -{ - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); - preempt_check_resched(); -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS diff --git a/kernel/fork.c b/kernel/fork.c index e4cee21..4349b83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,6 +1285,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; diff --git a/mm/memory.c b/mm/memory.c index bb1369f..2f059a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3669,6 +3669,35 @@ unlock: return 0; } +void pagefault_disable(void) +{ + inc_preempt_count(); + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; + dec_preempt_count(); + /* + * make sure we do.. 
+ */ + barrier(); + preempt_check_resched(); +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ -- cgit v0.10.2 From 1aebc993d201dd35eb834e5171caef7400a323ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Mar 2011 11:32:28 +0100 Subject: mm: Fixup all fault handlers to check current->pagefault_disable Necessary for decoupling pagefault disable from preempt count. Signed-off-by: Thomas Gleixner diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 0c4132d..f7d6c49 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -108,7 +108,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. */ - if (!mm || in_atomic()) + if (!mm || in_atomic() || current->pagefault_disabled) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 5dbf13f..56269df 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -279,7 +279,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b2f2d2d..4850f93 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,7 +81,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) + if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || + current->pagefault_disabled) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 73312ab..c82f44c 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -114,7 +114,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 331c1e2..6372088 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 6cf0341..ecac819 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -98,7 +98,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 3cdfa9c..1eec8af 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index a563727..152361e 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -85,7 +85,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 714b35a..fb9add1 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -108,7 +108,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index ddcec1e..63fec6e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; retry: diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index d48a84f..b245bdf 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; retry: diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 18162ce..09ecc8a 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -176,7 +176,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long acc_type; int fault; - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3a8489a..feef7a6 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -259,7 +259,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2fb9e63..a5fb7bc 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -296,7 +296,8 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. 
*/ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || + tsk->pagefault_disabled)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; @@ -435,7 +436,8 @@ void __kprobes do_asce_exception(struct pt_regs *regs) clear_tsk_thread_flag(current, TIF_PER_TRAP); trans_exc_code = regs->int_parm_long; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || + current->pagefault_disabled())); goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 47b600e..4c12824 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -72,7 +72,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1f49c28..c89dd12 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e98bfda..ef35d69 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -200,7 +200,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 5062ff3..e852238 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -321,7 +321,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_enabled) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 3d2b81c..a0fba28 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -360,7 +360,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f398..bd897f6 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -39,7 +39,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. 
*/ - if (in_atomic()) + if (in_atomic() || current->pagefault_disabled) goto out_nosemaphore; retry: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fb674fd..0adea38 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1108,7 +1108,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 4b7bc8d..0d82503 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { bad_page_fault(regs, address, SIGSEGV); return; } -- cgit v0.10.2 From e580ed044d1d9ee7b4e5e4f26c04f3dbf1b49ea8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:31:31 +0200 Subject: mm: pagefault_disabled() Wrap the test for pagefault_disabled() into a helper, this allows us to remove the need for current->pagefault_disabled on !-rt kernels. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3yy517m8zsi9fpsf14xfaqkw@git.kernel.org diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index f7d6c49..fb9eaa4 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -108,7 +108,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. */ - if (!mm || in_atomic() || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 56269df..149bab5 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -279,7 +279,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; /* diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 4850f93..9577e69 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,8 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || - current->pagefault_disabled) + if (!mm || regs->sr & SYSREG_BIT(GM) || pagefault_disabled()) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c82f44c..1a403d9 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -114,7 +114,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 6372088..e87972c 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index ecac819..dd88415 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -98,7 +98,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 1eec8af..6945056 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 152361e..9ea40db 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -85,7 +85,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index fb9add1..dc1d8c1 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -108,7 +108,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 63fec6e..90d193a 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; retry: diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index b245bdf..34a83b9 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; retry: diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 09ecc8a..df22f39 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -176,7 +176,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long acc_type; int fault; - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index feef7a6..66fdd82 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -259,7 +259,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index a5fb7bc..62d659d 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -296,8 +296,8 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || - tsk->pagefault_disabled)) + if (unlikely(!user_space_fault(trans_exc_code) || + !mm || pagefault_disabled())) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; @@ -436,8 +436,8 @@ void __kprobes do_asce_exception(struct pt_regs *regs) clear_tsk_thread_flag(current, TIF_PER_TRAP); trans_exc_code = regs->int_parm_long; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm || - current->pagefault_disabled())); + if (unlikely(!user_space_fault(trans_exc_code) || !mm || + pagefault_disabled())) goto no_context; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 4c12824..59fccbe 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -72,7 +72,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index c89dd12..8ff1613 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index ef35d69..18cbe13 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -200,7 +200,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index e852238..2764ac6 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -321,7 +321,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_enabled) + if (!mm || pagefault_disabled()) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index a0fba28..1ba0ccc 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -360,7 +360,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index bd897f6..991b33a 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -39,7 +39,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. */ - if (in_atomic() || current->pagefault_disabled) + if (pagefault_disabled()) goto out_nosemaphore; retry: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0adea38..8706173 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1108,7 +1108,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 0d82503..d57c257 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { bad_page_fault(regs, address, SIGSEGV); return; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 3fb5915..fd868f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include @@ -1452,7 +1453,9 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_PREEMPT_RT_FULL int pagefault_disabled; +#endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; @@ -1628,6 +1631,17 @@ static inline void set_numabalancing_state(bool enabled) } #endif +#ifdef CONFIG_PREEMPT_RT_FULL +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; } +#else +static inline bool cur_pf_disabled(void) { return false; } +#endif + +static inline bool pagefault_disabled(void) +{ + return in_atomic() || cur_pf_disabled(); +} + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH diff --git a/kernel/fork.c b/kernel/fork.c index 4349b83..6b22be6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,7 +1285,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif +#ifdef CONFIG_PREEMPT_RT_FULL p->pagefault_disabled = 0; +#endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; -- cgit v0.10.2 From af64dcb7810696ec5d78aec2a09caeae2a1aa743 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Aug 2011 17:16:58 +0200 Subject: mm: raw_pagefault_disable Adding migrate_disable() to pagefault_disable() to preserve the per-cpu thing for kmap_atomic might not have been the best of choices. But short of adding preempt_disable/migrate_disable foo all over the kmap code it still seems the best way. It does however yield the below borkage as well as wreck !-rt builds since !-rt does rely on pagefault_disable() not preempting. So fix all that up by adding raw_pagefault_disable(). [] warn_slowpath_common+0x85/0x9d [] warn_slowpath_fmt+0x46/0x48 [] ? _raw_spin_lock+0x6c/0x73 [] ? watchdog_overflow_callback+0x9b/0xd0 [] watchdog_overflow_callback+0xb7/0xd0 [] __perf_event_overflow+0x11c/0x1fe [] ? perf_event_update_userpage+0x149/0x151 [] ? perf_event_task_disable+0x7c/0x7c [] perf_event_overflow+0x14/0x16 [] x86_pmu_handle_irq+0xcb/0x108 [] perf_event_nmi_handler+0x46/0x91 [] notifier_call_chain+0x79/0xa6 [] __atomic_notifier_call_chain+0x66/0x98 [] ? notifier_call_chain+0xa6/0xa6 [] atomic_notifier_call_chain+0x14/0x16 [] notify_die+0x2e/0x30 [] do_nmi+0x7e/0x22b [] nmi+0x1a/0x2c [] ? sub_preempt_count+0x4b/0xaa <> [] delay_tsc+0xac/0xd1 [] __delay+0xf/0x11 [] do_raw_spin_lock+0xd2/0x13c [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? task_rq_lock+0x35/0x8d [] task_rq_lock+0x35/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] show_trace_log_lvl+0x54/0x5d [] show_trace+0x15/0x17 [] dump_stack+0x77/0x80 [] spin_bug+0x9c/0xa3 [] ? task_rq_lock+0x50/0x8d [] do_raw_spin_lock+0x47/0x13c [] _raw_spin_lock+0x60/0x73 [] ? task_rq_lock+0x50/0x8d [] task_rq_lock+0x50/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] save_stack_trace+0x2f/0x4c [] save_trace+0x3f/0xaf [] mark_lock+0x228/0x530 [] __lock_acquire+0x662/0x1812 [] ? native_sched_clock+0x37/0x6d [] ? 
trace_hardirqs_off_caller+0x1f/0x99 [] ? sched_rt_period_timer+0xbd/0x218 [] lock_acquire+0x145/0x18a [] ? sched_rt_period_timer+0xbd/0x218 [] _raw_spin_lock+0x40/0x73 [] ? sched_rt_period_timer+0xbd/0x218 [] sched_rt_period_timer+0xbd/0x218 [] __run_hrtimer+0x1e4/0x347 [] ? can_migrate_task.clone.82+0x14a/0x14a [] hrtimer_interrupt+0xee/0x1d6 [] ? add_preempt_count+0xae/0xb2 [] smp_apic_timer_interrupt+0x85/0x98 [] apic_timer_interrupt+0x13/0x20 Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-31keae8mkjiv8esq4rl76cib@git.kernel.org diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9414a1b..44b3751 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,8 +8,34 @@ * These routines enable/disable the pagefault handler in that * it will not take any MM locks and go straight to the fixup table. */ +static inline void raw_pagefault_disable(void) +{ + inc_preempt_count(); + barrier(); +} + +static inline void raw_pagefault_enable(void) +{ + barrier(); + dec_preempt_count(); + barrier(); + preempt_check_resched(); +} + +#ifndef CONFIG_PREEMPT_RT_FULL +static inline void pagefault_disable(void) +{ + raw_pagefault_disable(); +} + +static inline void pagefault_enable(void) +{ + raw_pagefault_enable(); +} +#else extern void pagefault_disable(void); extern void pagefault_enable(void); +#endif #ifndef ARCH_HAS_NOCACHE_UACCESS @@ -50,9 +76,9 @@ static inline unsigned long __copy_from_user_nocache(void *to, mm_segment_t old_fs = get_fs(); \ \ set_fs(KERNEL_DS); \ - pagefault_disable(); \ + raw_pagefault_disable(); \ ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ - pagefault_enable(); \ + raw_pagefault_enable(); \ set_fs(old_fs); \ ret; \ }) diff --git a/mm/memory.c b/mm/memory.c index 2f059a9..2eac5e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3669,6 +3669,7 @@ unlock: return 0; } +#ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { inc_preempt_count(); @@ -3697,6 +3698,7 @@ void pagefault_enable(void) preempt_check_resched(); } EXPORT_SYMBOL(pagefault_enable); +#endif /* * By the time we get here, we already hold the mm semaphore -- cgit v0.10.2 From d838f18f5efe70181901dbb2af1d1816875b7816 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 18:56:24 +0200 Subject: filemap-fix-up.patch Signed-off-by: Thomas Gleixner Wrecked-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-m6yuzd6ul717hlnl2gj6p3ou@git.kernel.org diff --git a/mm/filemap.c b/mm/filemap.c index 83efee7..cb81968 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1955,7 +1955,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!in_atomic()); + BUG_ON(!pagefault_disabled()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; -- cgit v0.10.2 From 0cfe69aa58123033d7799fb88b88772c40ba9bdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 22:06:27 +0200 Subject: mm: Remove preempt count from pagefault disable/enable Now that all users are cleaned up, we can remove the preemption count. 
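As a rough illustration of the caller contract that results (a sketch, not taken from this series; the helper name below is made up), an atomic user access under pagefault_disable() simply reports failure instead of faulting the page in, because the arch fault handlers above bail out via pagefault_disabled() rather than sleeping on mmap_sem:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Hypothetical helper, for illustration only. */
static int sketch_get_user_word(u32 *dst, const u32 __user *uaddr)
{
	unsigned long left;

	pagefault_disable();	/* per-task counter on RT, preempt count otherwise */
	left = __copy_from_user_inatomic(dst, uaddr, sizeof(*dst));
	pagefault_enable();

	/*
	 * If the page was not resident, the fault handler saw
	 * pagefault_disabled() and returned early, so the copy reports
	 * the failure here and the caller must retry from a context
	 * that is allowed to sleep.
	 */
	return left ? -EFAULT : 0;
}

On !RT builds the semantics are unchanged, since pagefault_disabled() still reports true whenever in_atomic() does.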
Signed-off-by: Thomas Gleixner
diff --git a/mm/memory.c index 2eac5e6..3f862e4 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -3672,7 +3672,6 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { - inc_preempt_count(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault
@@ -3690,12 +3689,6 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); - preempt_check_resched(); } EXPORT_SYMBOL(pagefault_enable); #endif
-- cgit v0.10.2

From f89b6e68ce427bc5910c7b2c8700880832bf5a06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jul 2010 10:29:00 +0200 Subject: suspend: Prevent might sleep splats

timekeeping suspend/resume calls read_persistent_clock() which takes rtc_lock. That results in might sleep warnings because at that point we run with interrupts disabled. We cannot convert rtc_lock to a raw spinlock as that would trigger other might sleep warnings. As a temporary workaround we disable the might sleep warnings by setting system_state to SYSTEM_SUSPEND before calling sysdev_suspend() and restoring it to SYSTEM_RUNNING after sysdev_resume(). Needs to be revisited.

Signed-off-by: Thomas Gleixner
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c566927..8b3086d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h
@@ -412,6 +412,7 @@ extern enum system_states { SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, + SYSTEM_SUSPEND, } system_state; #define TAINT_PROPRIETARY_MODULE 0
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1..3321e2b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c
@@ -275,6 +275,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, "
@@ -302,6 +304,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus:
@@ -427,6 +430,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error)
@@ -460,6 +464,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus:
@@ -542,6 +547,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN;
@@ -554,6 +560,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); enable_nonboot_cpus();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446..ff2dade 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c
@@ -165,6 +165,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending();
@@ -175,6 +177,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled());
-- cgit v0.10.2

From 14aba70a81f6a5f4058a68ff130b57e1e4640e65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011
11:22:36 +0200 Subject: list-add-list-last-entry.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/list.h b/include/linux/list.h index cc6d2aa..7a9851b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -362,6 +362,17 @@ static inline void list_splice_tail_init(struct list_head *list, list_entry((ptr)->next, type, member) /** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. -- cgit v0.10.2 From af6be1983ba3bf25069a8e6734fbcf855b727c9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011 11:24:35 +0200 Subject: mm-page-alloc-use-list-last-entry.patch Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a83cd3..f61bc0f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -658,7 +658,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); -- cgit v0.10.2 From 5d7da3af215636d0954ae8c96529108b58aa7a55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jun 2011 10:42:04 +0200 Subject: mm-slab-move-debug-out.patch Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index e7667a3..2aa9fd4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3895,10 +3895,10 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) if (!cachep) return; - local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); + local_irq_save(flags); __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); @@ -3924,12 +3924,12 @@ void kfree(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); + local_irq_save(flags); __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); } -- cgit v0.10.2 From 3e067db514d11181c48006f84ce3ad8311d2775f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Jul 2011 21:24:27 +0200 Subject: rwsem-inlcude-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/pid.h b/include/linux/pid.h index 2381c97..3b67343 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,6 +2,7 @@ #define _LINUX_PID_H #include +#include enum pid_type { -- cgit v0.10.2 From df1c1e53dbb279bc0869f51aaf175b8d256d3d00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:52:34 +0100 Subject: sysctl-include-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 14a8ff2..b15655f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include /* For the /proc/sys support */ -- cgit v0.10.2 From f17837aa962f0479dd4d5d6dd1d151b0c58061b8 Mon Sep 17 
00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 10:59:58 +0200 Subject: net-flip-lock-dep-thingy.patch ======================================================= [ INFO: possible circular locking dependency detected ] 3.0.0-rc3+ #26 ------------------------------------------------------- ip/1104 is trying to acquire lock: (local_softirq_lock){+.+...}, at: [] __local_lock+0x25/0x68 but task is already holding lock: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+...}: [] lock_acquire+0x103/0x12e [] lock_sock_nested+0x82/0x92 [] lock_sock+0x10/0x12 [] tcp_close+0x1b/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b -> #0 (local_softirq_lock){+.+...}: [] __lock_acquire+0xacc/0xdc8 [] lock_acquire+0x103/0x12e [] _raw_spin_lock+0x3b/0x4a [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock(local_softirq_lock); lock(sk_lock-AF_INET); lock(local_softirq_lock); *** DEADLOCK *** 1 lock held by ip/1104: #0: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 stack backtrace: Pid: 1104, comm: ip Not tainted 3.0.0-rc3+ #26 Call Trace: [] print_circular_bug+0x1f8/0x209 [] __lock_acquire+0xacc/0xdc8 [] ? __local_lock+0x25/0x68 [] lock_acquire+0x103/0x12e [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x11/0x41 [] _raw_spin_lock+0x3b/0x4a [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x28/0x41 [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] ? 
lock_sock+0x10/0x12 [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b Signed-off-by: Thomas Gleixner diff --git a/net/core/sock.c b/net/core/sock.c index bc131d4..45a50ee 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2287,12 +2287,11 @@ void lock_sock_nested(struct sock *sk, int subclass) if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); -- cgit v0.10.2 From 48226ac47904e36acf1285b748b2d7686d8d73c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:44:15 +0200 Subject: softirq-thread-do-softirq.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5fa5afe..5fc9d8b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -443,6 +443,7 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); +static inline void thread_do_softirq(void) { do_softirq(); } extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/net/core/dev.c b/net/core/dev.c index 1339f77..2edfcf8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3118,7 +3118,7 @@ int netif_rx_ni(struct sk_buff *skb) preempt_disable(); err = netif_rx(skb); if (local_softirq_pending()) - do_softirq(); + thread_do_softirq(); preempt_enable(); return err; -- cgit v0.10.2 From c1ac170df496756d66e1890c613269e6f4b71912 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:46:49 +0200 Subject: softirq-split-out-code.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567ba..18784bb 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,6 +77,34 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +static void handle_pending_softirqs(u32 pending, int cpu) +{ + struct softirq_action *h = softirq_vec; + unsigned int prev_count = preempt_count(); + + local_irq_enable(); + for ( ; pending; h++, pending >>= 1) { + unsigned int vec_nr = h - softirq_vec; + + if (!(pending & 1)) + continue; + + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR + "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qs(cpu); + } + local_irq_disable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -207,7 +235,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { - struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; @@ -224,7 +251,7 @@ asmlinkage void __do_softirq(void) vtime_account_irq_enter(current); __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -232,36 +259,7 @@ restart: /* Reset the 
pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - - h = softirq_vec; - - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count() = prev_count; - } - - rcu_bh_qs(cpu); - } - h++; - pending >>= 1; - } while (pending); - - local_irq_disable(); + handle_pending_softirqs(pending, cpu); pending = local_softirq_pending(); if (pending && --max_restart)
-- cgit v0.10.2

From abe865f720f5a349019187c6d9487ae778c638e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:27 -0500 Subject: x86: Do not unmask io_apic when interrupt is in progress

With threaded interrupts we might see an interrupt in progress on migration. Do not unmask it when this is the case.

Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b739d39..aaa6399 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c
@@ -2428,7 +2428,8 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ - if (unlikely(irqd_is_setaffinity_pending(data))) { + if (unlikely(irqd_is_setaffinity_pending(data) && + !irqd_irq_inprogress(data))) { mask_ioapic(cfg); return true; }
-- cgit v0.10.2

From 6e6edeb1d567f56b3d5ecdbeb41b1fb045bfe668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Apr 2013 17:09:11 -0500 Subject: x86: Do not disable preemption in int3 on 32bit

Preemption must be disabled before enabling interrupts in do_trap on x86_64 because the stack in use for int3 and debug is a per CPU stack set by the IST. But 32bit does not have an IST and the stack still belongs to the current task and there is no problem in scheduling out the task. Keep preemption enabled on X86_32 when enabling interrupts for do_trap(). The name of the function is changed from preempt_conditional_sti/cli() to conditional_sti/cli_ist(), to annotate that this function is used when the stack is on the IST.

Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca1..be18ff6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c
@@ -85,9 +85,21 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void conditional_sti_ist(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 + /* + * X86_64 uses a per CPU stack on the IST for certain traps + * like int3. The task can not be preempted when using one + * of these stacks, thus preemption must be disabled, otherwise + * the stack can be corrupted if the task is scheduled out, + * and another task comes in and uses this stack. + * + * On x86_32 the task keeps its own stack and it is OK if the + * task schedules out.
+ */ inc_preempt_count(); +#endif if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -98,11 +110,13 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void conditional_cli_ist(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); +#ifdef CONFIG_X86_64 dec_preempt_count(); +#endif } static int __kprobes @@ -229,9 +243,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) exception_enter(regs); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); + conditional_sti_ist(regs); do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); } exception_exit(regs); } @@ -331,9 +345,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + conditional_sti_ist(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: exception_exit(regs); @@ -438,12 +452,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + conditional_sti_ist(regs); if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); goto exit; } @@ -463,7 +477,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: -- cgit v0.10.2 From 831ae4682660723aaedeb289a5e744722ecb347e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:07:16 +0100 Subject: pci: Use __wake_up_all_locked pci_unblock_user_cfg_access() The waitqueue is protected by the pci_lock, so we can just avoid to lock the waitqueue lock itself. That prevents the might_sleep()/scheduling while atomic problem on RT Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 32046c5..0941838 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -465,7 +465,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev) WARN_ON(!dev->block_cfg_access); dev->block_cfg_access = 0; - wake_up_all(&pci_cfg_wait); + wake_up_all_locked(&pci_cfg_wait); raw_spin_unlock_irqrestore(&pci_lock, flags); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); -- cgit v0.10.2 From 4d74656d1e80c7aa7de6efcc10eed40270fc699a Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 14:03:41 +0100 Subject: latency-hist.patch This patch provides a recording mechanism to store data of potential sources of system latencies. The recordings separately determine the latency caused by a delayed timer expiration, by a delayed wakeup of the related user space program and by the sum of both. The histograms can be enabled and reset individually. The data are accessible via the debug filesystem. 
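In outline, the wakeup recording works as sketched below (an illustrative sketch only, with hypothetical helper names; the real tracepoint probes are in kernel/trace/latency_hist.c further down): the woken task is timestamped from the sched_wakeup tracepoint, the wakeup latency is taken when the task is finally switched in, and the offset of a late timer, if any, is added on top for the combined timerandwakeup histogram.

#include <linux/types.h>

/* Hypothetical sink for one sample; the real code indexes per-CPU bucket arrays. */
static void sketch_account(const char *hist, long usecs) { }

struct sketch_wakeup_state {
	u64 wakeup_stamp_ns;	/* set from the sched_wakeup tracepoint */
	long timer_offset_us;	/* set when an hrtimer expired too late */
};

static void sketch_on_wakeup(struct sketch_wakeup_state *t, u64 now_ns)
{
	t->wakeup_stamp_ns = now_ns;
}

static void sketch_on_switch_in(struct sketch_wakeup_state *t, u64 now_ns)
{
	long wakeup_us = (long)(now_ns - t->wakeup_stamp_ns) / 1000;

	sketch_account("wakeup", wakeup_us);
	/* the combined histogram records timer offset plus wakeup latency */
	sketch_account("timerandwakeup", t->timer_offset_us + wakeup_us);
	t->timer_offset_us = 0;
}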
For details please consult Documentation/trace/histograms.txt. Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt new file mode 100644 index 0000000..6f2aeab --- /dev/null +++ b/Documentation/trace/histograms.txt @@ -0,0 +1,186 @@ + Using the Linux Kernel Latency Histograms + + +This document gives a short explanation how to enable, configure and use +latency histograms. Latency histograms are primarily relevant in the +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT) +and are used in the quality management of the Linux real-time +capabilities. + + +* Purpose of latency histograms + +A latency histogram continuously accumulates the frequencies of latency +data. There are two types of histograms +- potential sources of latencies +- effective latencies + + +* Potential sources of latencies + +Potential sources of latencies are code segments where interrupts, +preemption or both are disabled (aka critical sections). To create +histograms of potential sources of latency, the kernel stores the time +stamp at the start of a critical section, determines the time elapsed +when the end of the section is reached, and increments the frequency +counter of that latency value - irrespective of whether any concurrently +running process is affected by latency or not. +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_INTERRUPT_OFF_LATENCY + CONFIG_PREEMPT_OFF_LATENCY + + +* Effective latencies + +Effective latencies are actually occuring during wakeup of a process. To +determine effective latencies, the kernel stores the time stamp when a +process is scheduled to be woken up, and determines the duration of the +wakeup time shortly before control is passed over to this process. Note +that the apparent latency in user space may be somewhat longer, since the +process may be interrupted after control is passed over to it but before +the execution in user space takes place. Simply measuring the interval +between enqueuing and wakeup may also not appropriate in cases when a +process is scheduled as a result of a timer expiration. The timer may have +missed its deadline, e.g. due to disabled interrupts, but this latency +would not be registered. Therefore, the offsets of missed timers are +recorded in a separate histogram. If both wakeup latency and missed timer +offsets are configured and enabled, a third histogram may be enabled that +records the overall latency as a sum of the timer latency, if any, and the +wakeup latency. This histogram is called "timerandwakeup". +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_WAKEUP_LATENCY + CONFIG_MISSED_TIMER_OFSETS + + +* Usage + +The interface to the administration of the latency histograms is located +in the debugfs file system. To mount it, either enter + +mount -t sysfs nodev /sys +mount -t debugfs nodev /sys/kernel/debug + +from shell command line level, or add + +nodev /sys sysfs defaults 0 0 +nodev /sys/kernel/debug debugfs defaults 0 0 + +to the file /etc/fstab. All latency histogram related files are then +available in the directory /sys/kernel/debug/tracing/latency_hist. A +particular histogram type is enabled by writing non-zero to the related +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory. +Select "preemptirqsoff" for the histograms of potential sources of +latencies and "wakeup" for histograms of effective latencies etc. 
The +histogram data - one per CPU - are available in the files + +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx + +The histograms are reset by writing non-zero to the file "reset" in a +particular latency directory. To reset all latency data, use + +#!/bin/sh + +TRACINGDIR=/sys/kernel/debug/tracing +HISTDIR=$TRACINGDIR/latency_hist + +if test -d $HISTDIR +then + cd $HISTDIR + for i in `find . | grep /reset$` + do + echo 1 >$i + done +fi + + +* Data format + +Latency data are stored with a resolution of one microsecond. The +maximum latency is 10,240 microseconds. The data are only valid, if the +overflow register is empty. Every output line contains the latency in +microseconds in the first row and the number of samples in the second +row. To display only lines with a positive latency count, use, for +example, + +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 + +#Minimum latency: 0 microseconds. +#Average latency: 0 microseconds. +#Maximum latency: 25 microseconds. +#Total samples: 3104770694 +#There are 0 samples greater or equal than 10240 microseconds +#usecs samples + 0 2984486876 + 1 49843506 + 2 58219047 + 3 5348126 + 4 2187960 + 5 3388262 + 6 959289 + 7 208294 + 8 40420 + 9 4485 + 10 14918 + 11 18340 + 12 25052 + 13 19455 + 14 5602 + 15 969 + 16 47 + 17 18 + 18 14 + 19 1 + 20 3 + 21 2 + 22 5 + 23 2 + 25 1 + + +* Wakeup latency of a selected process + +To only collect wakeup latency data of a particular process, write the +PID of the requested process to + +/sys/kernel/debug/tracing/latency_hist/wakeup/pid + +PIDs are not considered, if this variable is set to 0. + + +* Details of the process with the highest wakeup latency so far + +Selected data of the process that suffered from the highest wakeup +latency that occurred in a particular CPU are available in the file + +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx. + +In addition, other relevant system data at the time when the +latency occurred are given. + +The format of the data is (all in one line): + () \ +<- + +The value of is only relevant in the combined timer +and wakeup latency recording. In the wakeup recording, it is +always 0, in the missed_timer_offsets recording, it is the same +as . + +When retrospectively searching for the origin of a latency and +tracing was not enabled, it may be helpful to know the name and +some basic data of the task that (finally) was switching to the +late real-tlme task. In addition to the victim's data, also the +data of the possible culprit are therefore displayed after the +"<-" symbol. + +Finally, the timestamp of the time when the latency occurred +in . after the most recent system boot +is provided. + +These data are also reset when the wakeup histogram is reset. 
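Internally each histogram boils down to an array of per-microsecond counters plus overflow and summary fields. The sketch below is a simplified rendering of what latency_hist() in the diff further down does for one sample (locking, per-CPU selection, the offset handling and the max-latency process record are left out); names prefixed with sketch_ are illustrative only.

#define SKETCH_ENTRIES 10240	/* 1 us resolution, 10,240 us maximum */

struct sketch_hist {
	unsigned long long samples[SKETCH_ENTRIES];
	unsigned long long below_bound;	/* latency < 0 us */
	unsigned long long above_bound;	/* latency >= 10,240 us */
	unsigned long long total;
	long long accumulated;		/* average = accumulated / total */
	long min_lat;			/* initialise to LONG_MAX */
	long max_lat;
};

static void sketch_record(struct sketch_hist *h, long lat_us)
{
	if (lat_us < 0)
		h->below_bound++;
	else if (lat_us >= SKETCH_ENTRIES)
		h->above_bound++;
	else
		h->samples[lat_us]++;

	if (lat_us > h->max_lat)
		h->max_lat = lat_us;
	if (lat_us < h->min_lat)
		h->min_lat = lat_us;
	h->accumulated += lat_us;
	h->total++;
}

Samples outside the 0..10239 us window only bump the overflow counters, which is why the data format description above notes that the data are only valid while the overflow register is empty.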
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cc07d27..5b9e789 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,9 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + ktime_t praecox; +#endif #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/include/linux/sched.h b/include/linux/sched.h index fd868f6..3eed200 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1598,6 +1598,12 @@ struct task_struct { unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + long timer_offset; +#endif +#endif #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ struct memcg_batch_info { diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h new file mode 100644 index 0000000..28646db --- /dev/null +++ b/include/trace/events/hist.h @@ -0,0 +1,69 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hist + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include "latency_hist.h" +#include + +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST) +#define trace_preemptirqsoff_hist(a,b) +#else +TRACE_EVENT(preemptirqsoff_hist, + + TP_PROTO(int reason, int starthist), + + TP_ARGS(reason, starthist), + + TP_STRUCT__entry( + __field(int, reason ) + __field(int, starthist ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->starthist = starthist; + ), + + TP_printk("reason=%s starthist=%s", getaction(__entry->reason), + __entry->starthist ? "start" : "stop") +); +#endif + +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST +#define trace_hrtimer_interrupt(a,b,c,d) +#else +TRACE_EVENT(hrtimer_interrupt, + + TP_PROTO(int cpu, long long offset, struct task_struct *curr, struct task_struct *task), + + TP_ARGS(cpu, offset, curr, task), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(long long, offset ) + __array(char, ccomm, TASK_COMM_LEN) + __field(int, cprio ) + __array(char, tcomm, TASK_COMM_LEN) + __field(int, tprio ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->offset = offset; + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN); + __entry->cprio = curr->prio; + memcpy(__entry->tcomm, task != NULL ? task->comm : "", task != NULL ? TASK_COMM_LEN : 7); + __entry->tprio = task != NULL ? 
task->prio : -1; + ), + + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]", + __entry->cpu, __entry->offset, __entry->ccomm, __entry->cprio, __entry->tcomm, __entry->tprio) +); +#endif + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h new file mode 100644 index 0000000..7f70794 --- /dev/null +++ b/include/trace/events/latency_hist.h @@ -0,0 +1,29 @@ +#ifndef _LATENCY_HIST_H +#define _LATENCY_HIST_H + +enum hist_action { + IRQS_ON, + PREEMPT_ON, + TRACE_STOP, + IRQS_OFF, + PREEMPT_OFF, + TRACE_START, +}; + +static char *actions[] = { + "IRQS_ON", + "PREEMPT_ON", + "TRACE_STOP", + "IRQS_OFF", + "PREEMPT_OFF", + "TRACE_START", +}; + +static inline char *getaction(int action) +{ + if (action >= 0 && action <= sizeof(actions)/sizeof(actions[0])) + return(actions[action]); + return("unknown"); +} + +#endif /* _LATENCY_HIST_H */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cdd5607..7ee08f8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -49,6 +49,7 @@ #include #include +#include /* * The timer bases: @@ -970,6 +971,17 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, #endif } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + { + ktime_t now = new_base->get_time(); + + if (ktime_to_ns(tim) < ktime_to_ns(now)) + timer->praecox = now; + else + timer->praecox = ktime_set(0, 0); + } +#endif + hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1246,6 +1258,8 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1289,6 +1303,15 @@ retry: timer = container_of(node, struct hrtimer, node); + trace_hrtimer_interrupt(raw_smp_processor_id(), + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? + timer->praecox : hrtimer_get_expires(timer), + basenow)), + current, + timer->function == hrtimer_wakeup ? + container_of(timer, struct hrtimer_sleeper, + timer)->task : NULL); + /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2747967..c872f5f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,6 +202,24 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) +config INTERRUPT_OFF_HIST + bool "Interrupts-off Latency Histogram" + depends on IRQSOFF_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with interrupts disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If PREEMPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/irqsoff + config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -224,6 +242,24 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) 
+config PREEMPT_OFF_HIST + bool "Preemption-off Latency Histogram" + depends on PREEMPT_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with preemption disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If INTERRUPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/preemptoff + config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -233,6 +269,74 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config WAKEUP_LATENCY_HIST + bool "Scheduling Latency Histogram" + depends on SCHED_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the scheduling latency of the highest priority task. + The histograms are disabled by default. To enable them, write a + non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/wakeup + + Two different algorithms are used, one to determine the latency of + processes that exclusively use the highest priority of the system and + another one to determine the latency of processes that share the + highest system priority with other processes. The former is used to + improve hardware and system software, the latter to optimize the + priority design of a given system. The histogram data will be + located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/wakeup + + and + + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + +config MISSED_TIMER_OFFSETS_HIST + depends on HIGH_RES_TIMERS + select GENERIC_TRACER + bool "Missed Timer Offsets Histogram" + help + Generate a histogram of missed timer offsets in microseconds. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets + + The histogram data will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. 
To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068..f5e0243 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c new file mode 100644 index 0000000..6a4c869 --- /dev/null +++ b/kernel/trace/latency_hist.c @@ -0,0 +1,1176 @@ +/* + * kernel/trace/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. + * Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include + +#define NSECS_PER_USECS 1000L + +#define CREATE_TRACE_POINTS +#include + +enum { + IRQSOFF_LATENCY = 0, + PREEMPTOFF_LATENCY, + PREEMPTIRQSOFF_LATENCY, + WAKEUP_LATENCY, + WAKEUP_LATENCY_SHAREDPRIO, + MISSED_TIMER_OFFSETS, + TIMERANDWAKEUP_LATENCY, + MAX_LATENCY_TYPE, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */ + long min_lat; + long max_lat; + unsigned long long below_hist_bound_samples; + unsigned long long above_hist_bound_samples; + long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +struct enable_data { + int latency_type; + int enabled; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist); +static char *irqsoff_hist_dir = "irqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_counting); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist); +static char *preemptoff_hist_dir = "preemptoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist); +static char *preemptirqsoff_hist_dir = "preemptirqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start); +static struct enable_data preemptirqsoff_enabled_data = { + .latency_type = PREEMPTIRQSOFF_LATENCY, + .enabled = 0, +}; 
+#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +struct maxlatproc_data { + char comm[FIELD_SIZEOF(struct task_struct, comm)]; + char current_comm[FIELD_SIZEOF(struct task_struct, comm)]; + int pid; + int current_pid; + int prio; + int current_prio; + long latency; + long timeroffset; + cycle_t timestamp; +}; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio); +static char *wakeup_latency_hist_dir = "wakeup"; +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio"; +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success); +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next); +static notrace void probe_sched_migrate_task(void *, + struct task_struct *task, int cpu); +static struct enable_data wakeup_latency_enabled_data = { + .latency_type = WAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc); +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio); +static DEFINE_PER_CPU(struct task_struct *, wakeup_task); +static DEFINE_PER_CPU(int, wakeup_sharedprio); +static unsigned long wakeup_pid; +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets); +static char *missed_timer_offsets_dir = "missed_timer_offsets"; +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long offset, struct task_struct *curr, struct task_struct *task); +static struct enable_data missed_timer_offsets_enabled_data = { + .latency_type = MISSED_TIMER_OFFSETS, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc); +static unsigned long missed_timer_offsets_pid; +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist); +static char *timerandwakeup_latency_hist_dir = "timerandwakeup"; +static struct enable_data timerandwakeup_enabled_data = { + .latency_type = TIMERANDWAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc); +#endif + +void notrace latency_hist(int latency_type, int cpu, long latency, + long timeroffset, cycle_t stop, + struct task_struct *p) +{ + struct hist_data *my_hist; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + + if (cpu < 0 || cpu >= NR_CPUS || latency_type < 0 || + latency_type >= MAX_LATENCY_TYPE) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + my_hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + my_hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + my_hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef 
CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + my_hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + + default: + return; + } + + latency += my_hist->offset; + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency < 0 || latency >= MAX_ENTRY_NUM) { + if (latency < 0) + my_hist->below_hist_bound_samples++; + else + my_hist->above_hist_bound_samples++; + } else + my_hist->hist_array[latency]++; + + if (unlikely(latency > my_hist->max_lat || + my_hist->min_lat == LONG_MAX)) { +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) { + strncpy(mp->comm, p->comm, sizeof(mp->comm)); + strncpy(mp->current_comm, current->comm, + sizeof(mp->current_comm)); + mp->pid = task_pid_nr(p); + mp->current_pid = task_pid_nr(current); + mp->prio = p->prio; + mp->current_prio = current->prio; + mp->latency = latency; + mp->timeroffset = timeroffset; + mp->timestamp = stop; + } +#endif + my_hist->max_lat = latency; + } + if (unlikely(latency < my_hist->min_lat)) + my_hist->min_lat = latency; + my_hist->total_samples++; + my_hist->accumulate_lat += latency; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = NULL; + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (index == 0) { + char minstr[32], avgstr[32], maxstr[32]; + + atomic_dec(&my_hist->hist_mode); + + if (likely(my_hist->total_samples)) { + long avg = (long) div64_s64(my_hist->accumulate_lat, + my_hist->total_samples); + snprintf(minstr, sizeof(minstr), "%ld", + my_hist->min_lat - my_hist->offset); + snprintf(avgstr, sizeof(avgstr), "%ld", + avg - my_hist->offset); + snprintf(maxstr, sizeof(maxstr), "%ld", + my_hist->max_lat - my_hist->offset); + } else { + strcpy(minstr, ""); + strcpy(avgstr, minstr); + strcpy(maxstr, minstr); + } + + seq_printf(m, "#Minimum latency: %s microseconds\n" + "#Average latency: %s microseconds\n" + "#Maximum latency: %s microseconds\n" + "#Total samples: %llu\n" + "#There are %llu samples lower than %ld" + " microseconds.\n" + "#There are %llu samples greater or equal" + " than %ld microseconds.\n" + "#usecs\t%16s\n", + minstr, avgstr, maxstr, + my_hist->total_samples, + my_hist->below_hist_bound_samples, + -my_hist->offset, + my_hist->above_hist_bound_samples, + MAX_ENTRY_NUM - my_hist->offset, + "samples"); + } + if (index < MAX_ENTRY_NUM) { + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + if (index_ptr) + *index_ptr = index; + } + + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset, + my_hist->hist_array[index]); + return 0; +} + +static 
struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static void clear_maxlatprocdata(struct maxlatproc_data *mp) +{ + mp->comm[0] = mp->current_comm[0] = '\0'; + mp->prio = mp->current_prio = mp->pid = mp->current_pid = + mp->latency = mp->timeroffset = -1; + mp->timestamp = 0; +} +#endif + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->below_hist_bound_samples = 0ULL; + hist->above_hist_bound_samples = 0ULL; + hist->min_lat = LONG_MAX; + hist->max_lat = LONG_MIN; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0LL; + + atomic_inc(&hist->hist_mode); +} + +static ssize_t +latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist = NULL; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + off_t latency_type = (off_t) file->private_data; + + for_each_online_cpu(cpu) { + + switch (latency_type) { +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + } + + hist_reset(hist); +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) + clear_maxlatprocdata(mp); +#endif + } + + return size; +} + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + unsigned long *this_pid = file->private_data; + + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t do_pid(struct file *file, const char 
__user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + unsigned long pid; + unsigned long *this_pid = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + if (strict_strtoul(buf, 10, &pid)) + return(-EINVAL); + + *this_pid = pid; + + return cnt; +} +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int r; + struct maxlatproc_data *mp = file->private_data; + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8); + unsigned long long t; + unsigned long usecs, secs; + char *buf; + + if (mp->pid == -1 || mp->current_pid == -1) { + buf = "(none)\n"; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, + strlen(buf)); + } + + buf = kmalloc(strmaxlen, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + t = ns2usecs(mp->timestamp); + usecs = do_div(t, USEC_PER_SEC); + secs = (unsigned long) t; + r = snprintf(buf, strmaxlen, + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid, + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm, + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm, + secs, usecs); + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return r; +} +#endif + +static ssize_t +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + struct enable_data *ed = file->private_data; + int r; + + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + long enable; + struct enable_data *ed = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strict_strtol(buf, 10, &enable)) + return(-EINVAL); + + if ((enable && ed->enabled) || (!enable && !ed->enabled)) + return cnt; + + if (enable) { + int ret; + + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + ret = register_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_preemptirqsoff_hist " + "to trace_preemptirqsoff_hist\n"); + return ret; + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + ret = register_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup\n"); + return ret; + } + ret = register_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup_new\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_stop " + "to trace_sched_switch\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + 
if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_sched_migrate_task " + "to trace_sched_migrate_task\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + return ret; + } + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + ret = register_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_hrtimer_interrupt " + "to trace_hrtimer_interrupt\n"); + return ret; + } + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + if (!wakeup_latency_enabled_data.enabled || + !missed_timer_offsets_enabled_data.enabled) + return -EINVAL; + break; +#endif + default: + break; + } + } else { + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + { + int cpu; + + unregister_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + for_each_online_cpu(cpu) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + per_cpu(hist_irqsoff_counting, + cpu) = 0; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + per_cpu(hist_preemptoff_counting, + cpu) = 0; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + per_cpu(hist_preemptirqsoff_counting, + cpu) = 0; +#endif + } + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + { + int cpu; + + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + unregister_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + + for_each_online_cpu(cpu) { + per_cpu(wakeup_task, cpu) = NULL; + per_cpu(wakeup_sharedprio, cpu) = 0; + } + } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + unregister_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif + default: + break; + } + } + ed->enabled = enable; + return cnt; +} + +static const struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +static const struct file_operations enable_fops = { + .open = tracing_open_generic, + .read = show_enable, + .write = do_enable, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static const struct file_operations pid_fops = { + .open = tracing_open_generic, + .read = show_pid, + .write = do_pid, +}; + +static const struct file_operations maxlatproc_fops = { + .open = tracing_open_generic, + .read = show_maxlatproc, +}; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(void *v, int reason, + int starthist) +{ + int cpu = raw_smp_processor_id(); + int time_set = 0; + + if (starthist) { + cycle_t uninitialized_var(start); + + if (!preempt_count() && !irqs_disabled()) + return; + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_OFF || reason == TRACE_START) && + !per_cpu(hist_irqsoff_counting, 
cpu)) { + per_cpu(hist_irqsoff_counting, cpu) = 1; + start = ftrace_now(cpu); + time_set++; + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_OFF || reason == TRACE_START) && + !per_cpu(hist_preemptoff_counting, cpu)) { + per_cpu(hist_preemptoff_counting, cpu) = 1; + if (!(time_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (per_cpu(hist_irqsoff_counting, cpu) && + per_cpu(hist_preemptoff_counting, cpu) && + !per_cpu(hist_preemptirqsoff_counting, cpu)) { + per_cpu(hist_preemptirqsoff_counting, cpu) = 1; + if (!time_set) + start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif + } else { + cycle_t uninitialized_var(stop); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_ON || reason == TRACE_STOP) && + per_cpu(hist_irqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_irqsoff_start, cpu); + + stop = ftrace_now(cpu); + time_set++; + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0, + stop, NULL); + } + per_cpu(hist_irqsoff_counting, cpu) = 0; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_ON || reason == TRACE_STOP) && + per_cpu(hist_preemptoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptoff_start, cpu); + + if (!(time_set++)) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTOFF_LATENCY, cpu, latency, + 0, stop, NULL); + } + per_cpu(hist_preemptoff_counting, cpu) = 0; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if ((!per_cpu(hist_irqsoff_counting, cpu) || + !per_cpu(hist_preemptoff_counting, cpu)) && + per_cpu(hist_preemptirqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu); + + if (!time_set) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, + latency, 0, stop, NULL); + } + per_cpu(hist_preemptirqsoff_counting, cpu) = 0; + } +#endif + } +} +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_RAW_SPINLOCK(wakeup_lock); +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task, + int cpu) +{ + int old_cpu = task_cpu(task); + + if (cpu != old_cpu) { + unsigned long flags; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu); + if (task == cpu_wakeup_task) { + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, old_cpu) = NULL; + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task; + get_task_struct(cpu_wakeup_task); + } + + raw_spin_unlock_irqrestore(&wakeup_lock, flags); + } +} + +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success) +{ + unsigned long flags; + struct task_struct *curr = current; + int cpu = task_cpu(p); + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (wakeup_pid) { + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + if (likely(wakeup_pid != task_pid_nr(p))) + goto out; + } else { + if (likely(!rt_task(p)) || + (cpu_wakeup_task && p->prio > 
cpu_wakeup_task->prio) || + p->prio > curr->prio) + goto out; + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + } + + if (cpu_wakeup_task) + put_task_struct(cpu_wakeup_task); + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p; + get_task_struct(cpu_wakeup_task); + cpu_wakeup_task->preempt_timestamp_hist = + ftrace_now(raw_smp_processor_id()); +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long flags; + int cpu = task_cpu(next); + long latency; + cycle_t stop; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (cpu_wakeup_task == NULL) + goto out; + + /* Already running? */ + if (unlikely(current == cpu_wakeup_task)) + goto out_reset; + + if (next != cpu_wakeup_task) { + if (next->prio < cpu_wakeup_task->prio) + goto out_reset; + + if (next->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + goto out; + } + + if (current->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + /* + * The task we are waiting for is about to be switched to. + * Calculate latency and store it in histogram. + */ + stop = ftrace_now(raw_smp_processor_id()); + + latency = ((long) (stop - next->preempt_timestamp_hist)) / + NSECS_PER_USECS; + + if (per_cpu(wakeup_sharedprio, cpu)) { + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop, + next); + per_cpu(wakeup_sharedprio, cpu) = 0; + } else { + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next); +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + if (timerandwakeup_enabled_data.enabled) { + latency_hist(TIMERANDWAKEUP_LATENCY, cpu, + next->timer_offset + latency, next->timer_offset, + stop, next); + } +#endif + } + +out_reset: +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + next->timer_offset = 0; +#endif + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, cpu) = NULL; +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long latency_ns, struct task_struct *curr, struct task_struct *task) +{ + if (latency_ns <= 0 && task != NULL && rt_task(task) && + (task->prio < curr->prio || + (task->prio == curr->prio && + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) { + long latency; + cycle_t now; + + if (missed_timer_offsets_pid) { + if (likely(missed_timer_offsets_pid != + task_pid_nr(task))) + return; + } + + now = ftrace_now(cpu); + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS); + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now, + task); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + task->timer_offset = latency; +#endif + } +} +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + struct dentry *dentry_sharedprio; +#endif + struct dentry *entry; + struct dentry *enable_root; + int i = 0; + struct hist_data *my_hist; + char name[64]; + char *cpufmt = "CPU%d"; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + char *cpufmt_maxlatproc = "max_latency-CPU%d"; + struct maxlatproc_data *mp = NULL; +#endif + + dentry = tracing_init_dentry(); + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry); + enable_root = 
debugfs_create_dir("enable", latency_hist_root); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(irqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(irqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preemptoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preemptirqsoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptirqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + entry = debugfs_create_file("preemptirqsoff", 0644, + enable_root, (void *)&preemptirqsoff_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + dentry_sharedprio = debugfs_create_dir( + wakeup_latency_hist_dir_sharedprio, dentry); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + entry = debugfs_create_file(name, 0444, dentry_sharedprio, + &per_cpu(wakeup_latency_hist_sharedprio, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + + mp = &per_cpu(wakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i); + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&wakeup_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("reset", 0644, dentry_sharedprio, + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops); + entry = debugfs_create_file("wakeup", 0644, + enable_root, (void *)&wakeup_latency_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + dentry = debugfs_create_dir(missed_timer_offsets_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = 
debugfs_create_file(name, 0444, dentry, + &per_cpu(missed_timer_offsets, i), &latency_hist_fops); + my_hist = &per_cpu(missed_timer_offsets, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(missed_timer_offsets_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&missed_timer_offsets_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops); + entry = debugfs_create_file("missed_timer_offsets", 0644, + enable_root, (void *)&missed_timer_offsets_enabled_data, + &enable_fops); +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(timerandwakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(timerandwakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(timerandwakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("timerandwakeup", 0644, + enable_root, (void *)&timerandwakeup_enabled_data, + &enable_fops); +#endif + return 0; +} + +__initcall(latency_hist_init); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2ca..e18da6a 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,6 +17,7 @@ #include #include "trace.h" +#include static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -438,11 +439,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -452,6 +455,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -460,6 +464,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -485,6 +490,7 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -494,11 +500,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) 
stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -508,6 +516,7 @@ void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -517,12 +526,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 1); if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } -- cgit v0.10.2 From 10a55efbc246192763faefcf5bbfa5ab21924c62 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:53:12 +0100 Subject: hwlatdetect.patch Jon Masters developed this wonderful SMI detector. For details please consult Documentation/hwlat_detector.txt. It could be ported to Linux 3.0 RT without any major change. Signed-off-by: Carsten Emde diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt new file mode 100644 index 0000000..cb61516 --- /dev/null +++ b/Documentation/hwlat_detector.txt @@ -0,0 +1,64 @@ +Introduction: +------------- + +The module hwlat_detector is a special purpose kernel module that is used to +detect large system latencies induced by the behavior of certain underlying +hardware or firmware, independent of Linux itself. The code was developed +originally to detect SMIs (System Management Interrupts) on x86 systems, +however there is nothing x86 specific about this patchset. It was +originally written for use by the "RT" patch since the Real Time +kernel is highly latency sensitive. + +SMIs are usually not serviced by the Linux kernel, which typically does not +even know that they are occuring. SMIs are instead are set up by BIOS code +and are serviced by BIOS code, usually for "critical" events such as +management of thermal sensors and fans. Sometimes though, SMIs are used for +other tasks and those tasks can spend an inordinate amount of time in the +handler (sometimes measured in milliseconds). Obviously this is a problem if +you are trying to keep event service latencies down in the microsecond range. + +The hardware latency detector works by hogging all of the cpus for configurable +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter +for some period, then looking for gaps in the TSC data. Any gap indicates a +time when the polling was interrupted and since the machine is stopped and +interrupts turned off the only thing that could do that would be an SMI. + +Note that the SMI detector should *NEVER* be used in a production environment. +It is intended to be run manually to determine if the hardware platform has a +problem with long system firmware service routines. + +Usage: +------ + +Loading the module hwlat_detector passing the parameter "enabled=1" (or by +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only +step required to start the hwlat_detector. It is possible to redefine the +threshold in microseconds (us) above which latency spikes will be taken +into account (parameter "threshold="). + +Example: + + # modprobe hwlat_detector enabled=1 threshold=100 + +After the module is loaded, it creates a directory named "hwlat_detector" under +the debugfs mountpoint, "/debug/hwlat_detector" for this text. 
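
The do_enable() path earlier in this patch attaches several scheduler tracepoints for the wakeup latency histogram and, whenever a later registration fails, detaches every probe it has already attached. A minimal standalone sketch of that unwind pattern is shown below; the attach()/detach() helpers and hook names are illustrative stand-ins, not the kernel tracepoint API:

    #include <stdio.h>

    /* Illustrative stand-ins for the tracepoint register/unregister calls. */
    static int attach(const char *hook) { printf("attach %s\n", hook); return 0; }
    static void detach(const char *hook) { printf("detach %s\n", hook); }

    /* Attach three probes; on failure, detach whatever is already attached. */
    static int enable_wakeup_probes(void)
    {
        int ret;

        ret = attach("sched_wakeup");
        if (ret)
            return ret;
        ret = attach("sched_wakeup_new");
        if (ret)
            goto out_wakeup;
        ret = attach("sched_switch");
        if (ret)
            goto out_wakeup_new;
        return 0;

    out_wakeup_new:
        detach("sched_wakeup_new");
    out_wakeup:
        detach("sched_wakeup");
        return ret;
    }

    int main(void)
    {
        return enable_wakeup_probes();
    }

The in-kernel version unregisters the earlier probes inline in each error branch rather than through goto labels; the effect is the same.
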
It is necessary +to have debugfs mounted, which might be on /sys/debug on your system. + +The /debug/hwlat_detector interface contains the following files: + +count - number of latency spikes observed since last reset +enable - a global enable/disable toggle (0/1), resets count +max - maximum hardware latency actually observed (usecs) +sample - a pipe from which to read current raw sample data + in the format + (can be opened O_NONBLOCK for a single sample) +threshold - minimum latency value to be considered (usecs) +width - time period to sample with CPUs held (usecs) + must be less than the total window size (enforced) +window - total period of sampling, width being inside (usecs) + +By default we will set width to 500,000 and window to 1,000,000, meaning that +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we +observe any latencies that exceed the threshold (initially 100 usecs), +then we write to a global sample ring buffer of 8K samples, which is +consumed by reading from the "sample" (pipe) debugfs file interface. diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index e94f19d..25ffcfd 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -121,6 +121,35 @@ config IBM_ASM for information on the specific driver level and support statement for your IBM server. +config HWLAT_DETECTOR + tristate "Testing module to detect hardware-induced latencies" + depends on DEBUG_FS + depends on RING_BUFFER + default m + ---help--- + A simple hardware latency detector. Use this module to detect + large latencies introduced by the behavior of the underlying + system firmware external to Linux. We do this using periodic + use of stop_machine to grab all available CPUs and measure + for unexplainable gaps in the CPU timestamp counter(s). By + default, the module is not enabled until the "enable" file + within the "hwlat_detector" debugfs directory is toggled. + + This module is often used to detect SMI (System Management + Interrupts) on x86 systems, though is not x86 specific. To + this end, we default to using a sample window of 1 second, + during which we will sample for 0.5 seconds. If an SMI or + similar event occurs during that time, it is recorded + into an 8K samples global ring buffer until retreived. + + WARNING: This software should never be enabled (it can be built + but should not be turned on after it is loaded) in a production + environment where high latencies are a concern since the + sampling mechanism actually introduces latencies for + regular tasks while the CPU(s) are being held. + + If unsure, say N + config PHANTOM tristate "Sensable PHANToM (PCI)" depends on PCI diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 2129377..ec6ee3f 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -49,3 +49,4 @@ obj-y += carma/ obj-$(CONFIG_USB_SWITCH_FSA9480) += fsa9480.o obj-$(CONFIG_ALTERA_STAPL) +=altera-stapl/ obj-$(CONFIG_INTEL_MEI) += mei/ +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c new file mode 100644 index 0000000..b7b7c90 --- /dev/null +++ b/drivers/misc/hwlat_detector.c @@ -0,0 +1,1212 @@ +/* + * hwlat_detector.c - A simple Hardware Latency detector. + * + * Use this module to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. 
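
For reference, the interface described above can be driven from user space with nothing beyond the standard C library. The sketch below assumes debugfs is mounted at /sys/kernel/debug (adjust BASE if your system mounts it at /debug as in the text above) and that the module is already loaded:

    #include <stdio.h>
    #include <unistd.h>

    /* Adjust BASE to wherever debugfs is mounted on the target system. */
    #define BASE "/sys/kernel/debug/hwlat_detector/"

    static int write_attr(const char *name, const char *val)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), BASE "%s", name);
        f = fopen(path, "w");
        if (!f)
            return -1;
        fputs(val, f);
        return fclose(f);
    }

    static long read_attr(const char *name)
    {
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), BASE "%s", name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        write_attr("threshold", "100");    /* microseconds */
        write_attr("enable", "1");
        sleep(30);                         /* let a few sample windows elapse */
        printf("count=%ld max=%ld us\n", read_attr("count"), read_attr("max"));
        write_attr("enable", "0");
        return 0;
    }
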
+ * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this module is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this module in a production environment + * requiring any kind of low-latency performance guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * + * Includes useful feedback from Clark Williams + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ +#define U64STR_SIZE 22 /* 20 digits max */ + +#define VERSION "1.0.0" +#define BANNER "hwlat_detector: " +#define DRVNAME "hwlat_detector" +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* Module metadata */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jon Masters "); +MODULE_DESCRIPTION("A simple hardware latency detector"); +MODULE_VERSION(VERSION); + +/* Module parameters */ + +static int debug; +static int enabled; +static int threshold; + +module_param(debug, int, 0); /* enable debug */ +module_param(enabled, int, 0); /* enable detector */ +module_param(threshold, int, 0); /* latency threshold */ + +/* Buffering and sampling */ + +static struct ring_buffer *ring_buffer; /* sample buffer */ +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */ +static unsigned long buf_size = BUF_SIZE_DEFAULT; +static struct task_struct *kthread; /* sampling thread */ + +/* DebugFS filesystem entries */ + +static struct dentry *debug_dir; /* debugfs directory */ +static struct dentry *debug_max; /* maximum TSC delta */ +static struct dentry *debug_count; /* total detect count */ +static struct dentry *debug_sample_width; /* sample width us */ +static struct dentry *debug_sample_window; /* sample window us */ +static struct dentry *debug_sample; /* raw samples us */ +static struct dentry *debug_threshold; /* threshold us */ +static struct dentry *debug_enable; /* enable/disable */ + +/* Individual samples and global state */ + +struct sample; /* latency sample */ +struct data; /* Global state */ + +/* Sampling functions */ +static int __buffer_add_sample(struct sample *sample); +static struct sample *buffer_get_sample(struct sample *sample); +static int get_sample(void *unused); + +/* Threading and state */ +static int kthread_fn(void *unused); +static int start_kthread(void); +static int stop_kthread(void); +static void __reset_stats(void); +static int init_stats(void); + +/* Debugfs interface */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry); +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry); +static int debug_sample_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static int debug_sample_release(struct inode *inode, struct file *filp); +static int debug_enable_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static ssize_t debug_enable_fwrite(struct file *file, + const char __user *user_buffer, + size_t user_size, loff_t *offset); + +/* Initialization functions */ +static int init_debugfs(void); +static void free_debugfs(void); +static int detector_init(void); +static void detector_exit(void); + +/* Individual latency samples are stored here when detected and packed into + * the ring_buffer circular buffer, where they are overwritten when + * more than buf_size/sizeof(sample) samples are received. 
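
A note on sizing: BUF_SIZE_DEFAULT is 262144 bytes and the "8K*(sizeof(entry))" comment assumes a 32-byte record. With the lost field and a 16-byte struct timespec on a 64-bit build the record defined below is somewhat larger, and the ring buffer stores its own per-event header as well, so "8K samples" is best read as an order of magnitude rather than an exact capacity. A quick check (the struct layout is mirrored here only for the arithmetic):

    #include <stdio.h>
    #include <time.h>

    /* Layout mirrored from the driver's struct sample, for sizing only. */
    struct sample {
        unsigned long long seqnum;
        unsigned long long duration;
        struct timespec timestamp;
        unsigned long lost;
    };

    int main(void)
    {
        unsigned long buf_size = 262144UL;    /* BUF_SIZE_DEFAULT */

        printf("sizeof(sample) = %zu bytes, ~%lu samples per buffer\n",
               sizeof(struct sample), buf_size / sizeof(struct sample));
        return 0;
    }
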
*/ +struct sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* ktime delta */ + struct timespec timestamp; /* wall time */ + unsigned long lost; +}; + +/* keep the global state somewhere. Mostly used under stop_machine. */ +static struct data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + u64 max_sample; /* max hardware latency */ + u64 threshold; /* sample threshold level */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + atomic_t sample_open; /* whether the sample file is open */ + + wait_queue_head_t wq; /* waitqeue for new sample values */ + +} data; + +/** + * __buffer_add_sample - add a new latency sample recording to the ring buffer + * @sample: The new latency sample value + * + * This receives a new latency sample and records it in a global ring buffer. + * No additional locking is used in this case - suited for stop_machine use. + */ +static int __buffer_add_sample(struct sample *sample) +{ + return ring_buffer_write(ring_buffer, + sizeof(struct sample), sample); +} + +/** + * buffer_get_sample - remove a hardware latency sample from the ring buffer + * @sample: Pre-allocated storage for the sample + * + * This retrieves a hardware latency sample from the global circular buffer + */ +static struct sample *buffer_get_sample(struct sample *sample) +{ + struct ring_buffer_event *e = NULL; + struct sample *s = NULL; + unsigned int cpu = 0; + + if (!sample) + return NULL; + + mutex_lock(&ring_buffer_mutex); + for_each_online_cpu(cpu) { + e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost); + if (e) + break; + } + + if (e) { + s = ring_buffer_event_data(e); + memcpy(sample, s, sizeof(struct sample)); + } else + sample = NULL; + mutex_unlock(&ring_buffer_mutex); + + return sample; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * @unused: This is not used but is a part of the stop_machine API + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called under stop_machine, with data.lock held. + */ +static int get_sample(void *unused) +{ + ktime_t start, t1, t2; + s64 diff, total = 0; + u64 sample = 0; + int ret = 1; + + start = ktime_get(); /* start timestamp */ + + do { + + t1 = ktime_get(); /* we'll look for a discontinuity */ + t2 = ktime_get(); + + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + printk(KERN_ERR BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= data.sample_width); + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > data.threshold) { + struct sample s; + + data.count++; + s.seqnum = data.count; + s.duration = sample; + s.timestamp = CURRENT_TIME; + __buffer_add_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > data.max_sample) + data.max_sample = sample; + } + + ret = 0; +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * @unused: A required part of the kthread API. + * + * Used to periodically sample the CPU TSC via a call to get_sample. 
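
get_sample() above hunts for discontinuities by taking two back-to-back timestamps and tracking the largest gap seen during the sample width. The same loop can be sketched in user space with clock_gettime(); since nothing here runs under stop_machine with interrupts off, the gaps it reports also include ordinary preemption and interrupts, but the width/threshold logic is identical:

    #include <stdio.h>
    #include <time.h>

    static long long now_us(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
    }

    int main(void)
    {
        const long long width_us = 500000;    /* DEFAULT_SAMPLE_WIDTH */
        const long long threshold_us = 10;    /* DEFAULT_LAT_THRESHOLD */
        long long start = now_us(), total = 0, max_gap = 0;

        do {
            long long t1 = now_us();
            long long t2 = now_us();    /* look for a discontinuity */

            if (t2 - t1 > max_gap)
                max_gap = t2 - t1;
            total = t2 - start;
        } while (total <= width_us);

        if (max_gap > threshold_us)
            printf("largest gap %lld us exceeds threshold\n", max_gap);
        else
            printf("largest gap %lld us, below threshold\n", max_gap);
        return 0;
    }
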
We + * use stop_machine, whith does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus pre-empting). + * Obviously this should never be used in production environments. + * + * stop_machine will schedule us typically only on CPU0 which is fine for + * almost every real-world hardware latency situation - but we might later + * generalize this if we find there are any actualy systems with alternate + * SMI delivery or other non CPU0 hardware latencies. + */ +static int kthread_fn(void *unused) +{ + int err = 0; + u64 interval = 0; + + while (!kthread_should_stop()) { + + mutex_lock(&data.lock); + + err = stop_machine(get_sample, unused, 0); + if (err) { + /* Houston, we have a problem */ + mutex_unlock(&data.lock); + goto err_out; + } + + wake_up(&data.wq); /* wake up reader(s) */ + + interval = data.sample_window - data.sample_width; + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + mutex_unlock(&data.lock); + + if (msleep_interruptible(interval)) + goto out; + } + goto out; +err_out: + printk(KERN_ERR BANNER "could not call stop_machine, disabling\n"); + enabled = 0; +out: + return err; + +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts a kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(void) +{ + kthread = kthread_run(kthread_fn, NULL, + DRVNAME); + if (IS_ERR(kthread)) { + printk(KERN_ERR BANNER "could not start sampling thread\n"); + enabled = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static int stop_kthread(void) +{ + int ret; + + ret = kthread_stop(kthread); + + return ret; +} + +/** + * __reset_stats - Reset statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We call this + * function in order to reset those when "enable" is toggled on or off, and + * also at initialization. Should be called with data.lock held. + */ +static void __reset_stats(void) +{ + data.count = 0; + data.max_sample = 0; + ring_buffer_reset(ring_buffer); /* flush out old sample entries */ +} + +/** + * init_stats - Setup global state statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We also use + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware + * induced system latencies. This function initializes these structures and + * allocates the global ring buffer also. 
+ */ +static int init_stats(void) +{ + int ret = -ENOMEM; + + mutex_init(&data.lock); + init_waitqueue_head(&data.wq); + atomic_set(&data.sample_open, 0); + + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS); + + if (WARN(!ring_buffer, KERN_ERR BANNER + "failed to allocate ring buffer!\n")) + goto out; + + __reset_stats(); + data.threshold = DEFAULT_LAT_THRESHOLD; /* threshold us */ + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ + + ret = 0; + +out: + return ret; + +} + +/* + * simple_data_read - Wrapper read function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * @entry: The entry to read from + * + * This function provides a generic read implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_read directly, but we need to make sure that the data.lock + * spinlock is held during the actual read (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry) +{ + char buf[U64STR_SIZE]; + u64 val = 0; + int len = 0; + + memset(buf, 0, sizeof(buf)); + + if (!entry) + return -EFAULT; + + mutex_lock(&data.lock); + val = *entry; + mutex_unlock(&data.lock); + + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); + +} + +/* + * simple_data_write - Wrapper write function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to write value from + * @cnt: The maximum number of bytes to write + * @ppos: The current "file" position + * @entry: The entry to write to + * + * This function provides a generic write implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_write directly, but we need to make sure that the data.lock + * spinlock is held during the actual write (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (err) + return -EINVAL; + + mutex_lock(&data.lock); + *entry = val; + mutex_unlock(&data.lock); + + return csize; +} + +/** + * debug_count_fopen - Open function for "count" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "count" debugfs + * interface to the hardware latency detector. 
+ */ +static int debug_count_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_count_fread - Read function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to read the + * number of latency readings exceeding the configured threshold since + * the detector was last reset (e.g. by writing a zero into "count"). + */ +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_count_fwrite - Write function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to write a + * desired value, especially to zero the total count. + */ +static ssize_t debug_count_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_enable_fopen - Dummy open function for "enable" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "enable" debugfs + * interface to the hardware latency detector. + */ +static int debug_enable_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_enable_fread - Read function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to determine + * whether the detector is currently enabled ("0\n" or "1\n" returned). + */ +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[4]; + + if ((cnt < sizeof(buf)) || (*ppos)) + return 0; + + buf[0] = enabled ? '1' : '0'; + buf[1] = '\n'; + buf[2] = '\0'; + if (copy_to_user(ubuf, buf, strlen(buf))) + return -EFAULT; + return *ppos = strlen(buf); +} + +/** + * debug_enable_fwrite - Write function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to enable or + * disable the detector, which will have the side-effect of possibly + * also resetting the global stats and kicking off the measuring + * kthread (on an enable) or the converse (upon a disable). 
+ */ +static ssize_t debug_enable_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[4]; + int csize = min(cnt, sizeof(buf)); + long val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[sizeof(buf)-1] = '\0'; /* just in case */ + err = strict_strtoul(buf, 10, &val); + if (0 != err) + return -EINVAL; + + if (val) { + if (enabled) + goto unlock; + enabled = 1; + __reset_stats(); + if (start_kthread()) + return -EFAULT; + } else { + if (!enabled) + goto unlock; + enabled = 0; + err = stop_kthread(); + if (err) { + printk(KERN_ERR BANNER "cannot stop kthread\n"); + return -EFAULT; + } + wake_up(&data.wq); /* reader(s) should return */ + } +unlock: + return csize; +} + +/** + * debug_max_fopen - Open function for "max" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "max" debugfs + * interface to the hardware latency detector. + */ +static int debug_max_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_max_fread - Read function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to determine + * the maximum latency value observed since it was last reset. + */ +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample); +} + +/** + * debug_max_fwrite - Write function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to reset the + * maximum or set it to some other desired value - if, then, subsequent + * measurements exceed this value, the maximum will be updated. + */ +static ssize_t debug_max_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample); +} + + +/** + * debug_sample_fopen - An open function for "sample" debugfs interface + * @inode: The in-kernel inode representation of this debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function handles opening the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. Can be opened blocking or non-blocking, + * affecting whether it behaves as a buffer read pipe, or does not. + * Implements simple locking to prevent multiple simultaneous use. 
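
debug_sample_fopen() below enforces a single reader by using atomic_add_unless(&data.sample_open, 1, 1) as a try-lock and returning -EBUSY to any second opener. A userspace analogue of that guard, written with C11 atomics (the names here are illustrative only):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int sample_open;    /* 0 = free, 1 = taken */

    /* Succeed only while the counter is still 0, mirroring
     * atomic_add_unless(&data.sample_open, 1, 1) in the driver. */
    static int sample_fopen(void)
    {
        int expected = 0;

        if (!atomic_compare_exchange_strong(&sample_open, &expected, 1))
            return -1;    /* the driver returns -EBUSY here */
        return 0;
    }

    static void sample_release(void)
    {
        atomic_fetch_sub(&sample_open, 1);
    }

    int main(void)
    {
        printf("first open:  %d\n", sample_fopen());
        printf("second open: %d\n", sample_fopen());
        sample_release();                  /* close the first opener */
        printf("after close: %d\n", sample_fopen());
        return 0;
    }
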
+ */ +static int debug_sample_fopen(struct inode *inode, struct file *filp) +{ + if (!atomic_add_unless(&data.sample_open, 1, 1)) + return -EBUSY; + else + return 0; +} + +/** + * debug_sample_fread - A read function for "sample" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that will contain the samples read + * @cnt: The maximum bytes to read from the debugfs "file" + * @ppos: The current position in the debugfs "file" + * + * This function handles reading from the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. By default this will block pending a new + * value written into the sample buffer, unless there are already a + * number of value(s) waiting in the buffer, or the sample file was + * previously opened in a non-blocking mode of operation. + */ +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int len = 0; + char buf[64]; + struct sample *sample = NULL; + + if (!enabled) + return 0; + + sample = kzalloc(sizeof(struct sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + while (!buffer_get_sample(sample)) { + + DEFINE_WAIT(wait); + + if (filp->f_flags & O_NONBLOCK) { + len = -EAGAIN; + goto out; + } + + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&data.wq, &wait); + + if (signal_pending(current)) { + len = -EINTR; + goto out; + } + + if (!enabled) { /* enable was toggled */ + len = 0; + goto out; + } + } + + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration); + + + /* handling partial reads is more trouble than it's worth */ + if (len > cnt) + goto out; + + if (copy_to_user(ubuf, buf, len)) + len = -EFAULT; + +out: + kfree(sample); + return len; +} + +/** + * debug_sample_release - Release function for "sample" debugfs interface + * @inode: The in-kernel inode represenation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function completes the close of the debugfs interface "sample" file. + * Frees the sample_open "lock" so that other users may open the interface. + */ +static int debug_sample_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&data.sample_open); + + return 0; +} + +/** + * debug_threshold_fopen - Open function for "threshold" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "threshold" debugfs + * interface to the hardware latency detector. + */ +static int debug_threshold_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_threshold_fread - Read function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to determine + * the current threshold level at which a latency will be recorded in the + * global ring buffer, typically on the order of 10us. 
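
Each record that debug_sample_fread() above emits is formatted as "seconds.nanoseconds<TAB>duration", with the duration in microseconds. A small reader for the sample pipe might look like the sketch below; it assumes debugfs at /sys/kernel/debug and, because the file is opened blocking, it simply waits for new records:

    #include <stdio.h>

    /* Assumes debugfs is mounted at /sys/kernel/debug; adjust if needed. */
    #define SAMPLE_FILE "/sys/kernel/debug/hwlat_detector/sample"

    int main(void)
    {
        unsigned long sec, nsec;
        unsigned long long dur;
        FILE *f = fopen(SAMPLE_FILE, "r");    /* blocking read, the default mode */

        if (!f) {
            perror(SAMPLE_FILE);
            return 1;
        }
        /* Each record: "seconds.nanoseconds<TAB>duration_in_us\n" */
        while (fscanf(f, "%lu.%lu %llu", &sec, &nsec, &dur) == 3)
            printf("%lu.%010lu: %llu us\n", sec, nsec, dur);
        fclose(f);
        return 0;
    }

Opening the file with O_NONBLOCK instead yields at most one sample per read, as the documentation above notes.
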
+ */ +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold); +} + +/** + * debug_threshold_fwrite - Write function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to configure + * the threshold level at which any subsequently detected latencies will + * be recorded into the global ring buffer. + */ +static ssize_t debug_threshold_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + int ret; + + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold); + + if (enabled) + wake_up_process(kthread); + + return ret; +} + +/** + * debug_width_fopen - Open function for "width" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "width" debugfs + * interface to the hardware latency detector. + */ +static int debug_width_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_width_fread - Read function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to determine + * for how many us of the total window us we will actively sample for any + * hardware-induced latecy periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. + */ +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width); +} + +/** + * debug_width_fwrite - Write function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less that the total window size. 
+ */ +static ssize_t debug_width_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (val < data.sample_window) + data.sample_width = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + if (enabled) + wake_up_process(kthread); + + return csize; +} + +/** + * debug_window_fopen - Open function for "window" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. + */ +static int debug_window_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_window_fread - Read function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to read the total window size. + */ +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window); +} + +/** + * debug_window_fwrite - Write function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enforced that any value written must be greater than the sample width + * size, or an error results. 
+ */ +static ssize_t debug_window_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (data.sample_width < val) + data.sample_window = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + return csize; +} + +/* + * Function pointers for the "count" debugfs file operations + */ +static const struct file_operations count_fops = { + .open = debug_count_fopen, + .read = debug_count_fread, + .write = debug_count_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "enable" debugfs file operations + */ +static const struct file_operations enable_fops = { + .open = debug_enable_fopen, + .read = debug_enable_fread, + .write = debug_enable_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "max" debugfs file operations + */ +static const struct file_operations max_fops = { + .open = debug_max_fopen, + .read = debug_max_fread, + .write = debug_max_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "sample" debugfs file operations + */ +static const struct file_operations sample_fops = { + .open = debug_sample_fopen, + .read = debug_sample_fread, + .release = debug_sample_release, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "threshold" debugfs file operations + */ +static const struct file_operations threshold_fops = { + .open = debug_threshold_fopen, + .read = debug_threshold_fread, + .write = debug_threshold_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "width" debugfs file operations + */ +static const struct file_operations width_fops = { + .open = debug_width_fopen, + .read = debug_width_fread, + .write = debug_width_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "window" debugfs file operations + */ +static const struct file_operations window_fops = { + .open = debug_window_fopen, + .read = debug_window_fread, + .write = debug_window_fwrite, + .owner = THIS_MODULE, +}; + +/** + * init_debugfs - A function to initialize the debugfs interface files + * + * This function creates entries in debugfs for "hwlat_detector", including + * files to read values from the detector, current samples, and the + * maximum sample that has been captured since the hardware latency + * dectector was started. 
+ */ +static int init_debugfs(void) +{ + int ret = -ENOMEM; + + debug_dir = debugfs_create_dir(DRVNAME, NULL); + if (!debug_dir) + goto err_debug_dir; + + debug_sample = debugfs_create_file("sample", 0444, + debug_dir, NULL, + &sample_fops); + if (!debug_sample) + goto err_sample; + + debug_count = debugfs_create_file("count", 0444, + debug_dir, NULL, + &count_fops); + if (!debug_count) + goto err_count; + + debug_max = debugfs_create_file("max", 0444, + debug_dir, NULL, + &max_fops); + if (!debug_max) + goto err_max; + + debug_sample_window = debugfs_create_file("window", 0644, + debug_dir, NULL, + &window_fops); + if (!debug_sample_window) + goto err_window; + + debug_sample_width = debugfs_create_file("width", 0644, + debug_dir, NULL, + &width_fops); + if (!debug_sample_width) + goto err_width; + + debug_threshold = debugfs_create_file("threshold", 0644, + debug_dir, NULL, + &threshold_fops); + if (!debug_threshold) + goto err_threshold; + + debug_enable = debugfs_create_file("enable", 0644, + debug_dir, &enabled, + &enable_fops); + if (!debug_enable) + goto err_enable; + + else { + ret = 0; + goto out; + } + +err_enable: + debugfs_remove(debug_threshold); +err_threshold: + debugfs_remove(debug_sample_width); +err_width: + debugfs_remove(debug_sample_window); +err_window: + debugfs_remove(debug_max); +err_max: + debugfs_remove(debug_count); +err_count: + debugfs_remove(debug_sample); +err_sample: + debugfs_remove(debug_dir); +err_debug_dir: +out: + return ret; +} + +/** + * free_debugfs - A function to cleanup the debugfs file interface + */ +static void free_debugfs(void) +{ + /* could also use a debugfs_remove_recursive */ + debugfs_remove(debug_enable); + debugfs_remove(debug_threshold); + debugfs_remove(debug_sample_width); + debugfs_remove(debug_sample_window); + debugfs_remove(debug_max); + debugfs_remove(debug_count); + debugfs_remove(debug_sample); + debugfs_remove(debug_dir); +} + +/** + * detector_init - Standard module initialization code + */ +static int detector_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO BANNER "version %s\n", VERSION); + + ret = init_stats(); + if (0 != ret) + goto out; + + ret = init_debugfs(); + if (0 != ret) + goto err_stats; + + if (enabled) + ret = start_kthread(); + + goto out; + +err_stats: + ring_buffer_free(ring_buffer); +out: + return ret; + +} + +/** + * detector_exit - Standard module cleanup code + */ +static void detector_exit(void) +{ + int err; + + if (enabled) { + enabled = 0; + err = stop_kthread(); + if (err) + printk(KERN_ERR BANNER "cannot stop kthread\n"); + } + + free_debugfs(); + ring_buffer_free(ring_buffer); /* free up the ring buffer */ + +} + +module_init(detector_init); +module_exit(detector_exit); -- cgit v0.10.2 From b87fc2a0054a6173cd8ce8b000ea46c7a409c086 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 20:25:16 +0200 Subject: localversion.patch Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8vdw4bfcsds27cvox6rpb334@git.kernel.org diff --git a/localversion-rt b/localversion-rt new file mode 100644 index 0000000..c3054d0 --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ +-rt2 -- cgit v0.10.2 From cb950e4e5bfe1b0c18c46371b9ad9765b1270b97 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Jul 2011 17:58:40 +0200 Subject: printk-kill.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/printk.h b/include/linux/printk.h index 8111ffa..812d102 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -99,9 +99,11 @@ int 
no_printk(const char *fmt, ...) extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); void early_vprintk(const char *fmt, va_list ap); +extern void printk_kill(void); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } +static inline void printk_kill(void) { } #endif extern int printk_needs_cpu(int cpu); @@ -138,7 +140,6 @@ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); - extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; diff --git a/kernel/printk.c b/kernel/printk.c index 69270a6..d429e73 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -770,6 +770,32 @@ asmlinkage void early_printk(const char *fmt, ...) early_vprintk(fmt, ap); va_end(ap); } + +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. + */ +static bool __read_mostly printk_killswitch; + +void printk_kill(void) +{ + printk_killswitch = true; +} + +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; +} +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} #endif #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -1511,6 +1537,13 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; + /* + * Fall back to early_printk if a debugging subsystem has + * killed printk output + */ + if (unlikely(forced_early_printk(fmt, args))) + return 1; + boot_delay_msec(level); printk_delay(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3..608d2fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -206,6 +206,8 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -240,10 +242,19 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + /* + * If early-printk is enabled then make sure we do not + * lock up in printk() and kill console logging: + */ + printk_kill(); + + if (hardlockup_panic) { panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { + raw_spin_lock(&watchdog_output_lock); WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + raw_spin_unlock(&watchdog_output_lock); + } __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v0.10.2 From ccf8b548190376bad90e3e3f4b549ac45c8be84f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:33 +0200 Subject: printk: 'force_early_printk' boot param to help with debugging Gives me an option to screw printk and actually see what the machine says. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967289.1301.11.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-ykb97nsfmobq44xketrxs977@git.kernel.org diff --git a/kernel/printk.c b/kernel/printk.c index d429e73..98aad49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -779,6 +779,13 @@ asmlinkage void early_printk(const char *fmt, ...) 
 */ static bool __read_mostly printk_killswitch; +static int __init force_early_printk_setup(char *str) +{ + printk_killswitch = true; + return 0; +} +early_param("force_early_printk", force_early_printk_setup); + void printk_kill(void) { printk_killswitch = true; -- cgit v0.10.2 From c7fa111207c7df6bab343de170d13db8629108d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 12:39:57 +0200 Subject: rt-preempt-base-config.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c974..885efbd 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,10 @@ +config PREEMPT + bool + select PREEMPT_COUNT + +config PREEMPT_RT_BASE + bool + select PREEMPT choice prompt "Preemption Model" @@ -33,9 +40,9 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT__LL bool "Preemptible Kernel (Low-Latency Desktop)" - select PREEMPT_COUNT + select PREEMPT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -52,6 +59,14 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. +config PREEMPT_RTB + bool "Preemptible Kernel (Basic RT)" + select PREEMPT_RT_BASE + help + This option is basically the same as (Low-Latency Desktop) but + enables changes which are preliminary for the full preemptible + RT kernel. + endchoice config PREEMPT_COUNT -- cgit v0.10.2 From 1af9f4ba35bd2b09fc85613eb58a2040ace27aca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: bug: BUG_ON/WARN_ON variants dependent on RT/!RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7d10f96..aee7fd2 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -202,6 +202,20 @@ extern void warn_slowpath_null(const char *file, const int line); # define WARN_ON_SMP(x) ({0;}) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) +#endif + #endif /* __ASSEMBLY__ */ #endif -- cgit v0.10.2 From f7a8a5faf3acad0c3f5e4cf244d5bf978d0f37b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 22:34:14 +0200 Subject: rt: local_irq_* variants depending on RT/!RT Add local_irq_*_(no)rt variants, which are mainly used to break interrupt disabled sections on PREEMPT_RT or to explicitly disable interrupts on PREEMPT_RT. 
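As an editorial illustration of the intended usage (a minimal sketch, not part of the patch; foo_dev, foo_poll() and foo_process() are made-up names), a section that must run with interrupts off on !RT but is already serialized by a spinlock would use the _nort variants like this:

  #include <linux/spinlock.h>

  /* Illustrative only: made-up device structure and work function. */
  struct foo_dev {
  	spinlock_t lock;
  };

  static void foo_process(struct foo_dev *dev) { /* device work, elided */ }

  static void foo_poll(struct foo_dev *dev)
  {
  	unsigned long flags;

  	/*
  	 * On !RT the _nort variants compile to plain local_irq_save()/
  	 * local_irq_restore(), so behaviour is unchanged. On
  	 * PREEMPT_RT_FULL they only save/restore the flags and leave
  	 * interrupts enabled, because dev->lock is a sleeping lock and
  	 * already provides the serialization.
  	 */
  	local_irq_save_nort(flags);
  	spin_lock(&dev->lock);
  	foo_process(dev);
  	spin_unlock(&dev->lock);
  	local_irq_restore_nort(flags);
  }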
Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5fc9d8b..3f3a4e2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -211,7 +211,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index d176d65..37b13c4 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -147,4 +147,23 @@ #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +/* + * local_irq* variants depending on RT/!RT + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +# define local_irq_disable_rt() local_irq_disable() +# define local_irq_enable_rt() local_irq_enable() +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define local_irq_disable_rt() do { } while (0) +# define local_irq_enable_rt() do { } while (0) +#endif + #endif -- cgit v0.10.2 From 49de0e27b26dae8dd09d26b98a438955b34d140d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 12:38:56 +0200 Subject: preempt: Provide preempt_*_(no)rt variants RT needs a few preempt_disable/enable points which are not necessary otherwise. Implement variants to avoid #ifdeffery. Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5a710b9..5b46536 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -54,11 +54,15 @@ do { \ dec_preempt_count(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT_BASE +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#else +# define preempt_enable_no_resched() preempt_enable() +#endif #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + sched_preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -104,6 +108,18 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() do { } while (0) +# define preempt_enable_nort() do { } while (0) +#else +# define preempt_disable_rt() do { } while (0) +# define preempt_enable_rt() do { } while (0) +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; -- cgit v0.10.2 From aba7dac56a905f43809109c248d07d48ad039e50 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:44:29 -0500 Subject: ata: Do not disable interrupts in ide code for preempt-rt Use the local_irq_*_nort variants. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index d8af325..ad3130d 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned long flags; unsigned int consumed; - local_irq_save(flags); + local_irq_save_nort(flags); consumed = ata_sff_data_xfer32(dev, buf, buflen, rw); - local_irq_restore(flags); + local_irq_restore_nort(flags); return consumed; } @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned long flags; /* FIXME: use a bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) do_write); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, @@ -864,7 +864,7 @@ next_sg: unsigned long flags; /* FIXME: use bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -872,7 +872,7 @@ next_sg: count, rw); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); consumed = ap->ops->sff_data_xfer(dev, buf + offset, -- cgit v0.10.2 From fc791acdab27c366a032193b1a8f81084e7e4bb4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: ide: Do not disable interrupts for PREEMPT-RT Use the local_irq_*_nort variants. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 36f76e2..394f142f 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -325,7 +325,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 696b6c1..0d0a966 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, dma_old = inb(base + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 
0x4b : 0x43, &masterdma); @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, if (dma_new != dma_old) outb(dma_new, base + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", hwif->name, base, base + 7); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 1976397..4169433 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 177db6d..079ae6b 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data) /* disable_irq_nosync ?? */ disable_irq(hwif->irq); /* local CPU only, as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwif->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 376f2dc..f014dd1 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, if ((stat & ATA_BUSY) == 0) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 068cef0..38e69e1 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) int bswap = 1; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* read 512 bytes of id info */ hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); + local_irq_restore_nort(flags); drive->dev_flags |= IDE_DFLAG_ID_READ; #ifdef DEBUG diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 729428e..3a9a1fc 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page_is_high = PageHighMem(page); if (page_is_high) - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page) + offset; @@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf); if (page_is_high) - local_irq_restore(flags); + local_irq_restore_nort(flags); len -= nr_bytes; } @@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, } if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); -- cgit v0.10.2 From e6a05fc7bf5cbbbd1b5f104197a9bf0548131613 Mon Sep 17 00:00:00 2001 From: Sven-Thorsten Dietrich Date: Fri, 3 Jul 2009 08:30:35 -0500 Subject: infiniband: Mellanox IB driver patch use _nort() primitives Fixes in_atomic stack-dump, when Mellanox module is loaded into the RT Kernel. Michael S. Tsirkin sayeth: "Basically, if you just make spin_lock_irqsave (and spin_lock_irq) not disable interrupts for non-raw spinlocks, I think all of infiniband will be fine without changes." Signed-off-by: Sven-Thorsten Dietrich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a..3800ef5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -783,7 +783,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); + local_irq_save_nort(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -865,7 +865,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) spin_unlock(&priv->lock); netif_addr_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { -- cgit v0.10.2 From 1394824856a601cb9212dafdb930f0704b2fb512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: input: gameport: Do not disable interrupts on PREEMPT_RT Use the _nort() primitives. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index da739d9..18fdafe 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -87,12 +87,12 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -111,11 +111,11 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } -- cgit v0.10.2 From 6d56ea8fa272ebe66cff738166e9658531d03615 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 22:54:51 +0200 Subject: acpi: Do not disable interrupts on PREEMPT_RT Use the local_irq_*_nort() variants. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 0c44630..1785dd7 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -51,8 +51,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); -- cgit v0.10.2 From b425591c4a32d2bf13a31e32e81f9a94d961348d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 23:06:05 +0200 Subject: core: Do not disable interrupts on RT in kernel/users.c Use the local_irq_*_nort variants to reduce latencies in RT. The code is serialized by the locks. No need to disable interrupts. Signed-off-by: Thomas Gleixner diff --git a/kernel/user.c b/kernel/user.c index 33acb5e..8dbfdb7 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -157,11 +157,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct *alloc_uid(kuid_t uid) -- cgit v0.10.2 From 502c367fab76b71f5c37fd7b9a8fb18aa4605f95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:33 -0500 Subject: core: Do not disable interrupts on RT in res_counter.c Frederic Weisbecker reported this warning: [ 45.228562] BUG: sleeping function called from invalid context at kernel/rtmutex.c:683 [ 45.228571] in_atomic(): 0, irqs_disabled(): 1, pid: 4290, name: ntpdate [ 45.228576] INFO: lockdep is turned off. [ 45.228580] irq event stamp: 0 [ 45.228583] hardirqs last enabled at (0): [<(null)>] (null) [ 45.228589] hardirqs last disabled at (0): [] copy_process+0x68d/0x1500 [ 45.228602] softirqs last enabled at (0): [] copy_process+0x68d/0x1500 [ 45.228609] softirqs last disabled at (0): [<(null)>] (null) [ 45.228617] Pid: 4290, comm: ntpdate Tainted: G W 2.6.29-rc4-rt1-tip #1 [ 45.228622] Call Trace: [ 45.228632] [] ? 
print_irqtrace_events+0xd0/0xe0 [ 45.228639] [] __might_sleep+0x113/0x130 [ 45.228646] [] rt_spin_lock+0xa1/0xb0 [ 45.228653] [] res_counter_charge+0x5d/0x130 [ 45.228660] [] __mem_cgroup_try_charge+0x7f/0x180 [ 45.228667] [] mem_cgroup_charge_common+0x57/0x90 [ 45.228674] [] ? ftrace_call+0x5/0x2b [ 45.228680] [] mem_cgroup_newpage_charge+0x5d/0x60 [ 45.228688] [] __do_fault+0x29e/0x4c0 [ 45.228694] [] ? rt_spin_unlock+0x23/0x80 [ 45.228700] [] handle_mm_fault+0x205/0x890 [ 45.228707] [] ? ftrace_call+0x5/0x2b [ 45.228714] [] do_page_fault+0x11e/0x2a0 [ 45.228720] [] page_fault+0x25/0x30 [ 45.228727] [] ? __clear_user+0x3d/0x70 [ 45.228733] [] ? __clear_user+0x21/0x70 The reason is the raw IRQ flag use of kernel/res_counter.c. The irq flags tricks there seem a bit pointless: it cannot protect the c->parent linkage because local_irq_save() is only per CPU. So replace it with _nort(). This code needs a second look. Reported-by: Frederic Weisbecker Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247..cfecbee 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -49,7 +49,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, r = ret = 0; *limit_fail_at = NULL; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); r = res_counter_charge_locked(c, val, force); @@ -69,7 +69,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, spin_unlock(&u->lock); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } @@ -103,7 +103,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *c; u64 ret = 0; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != top; c = c->parent) { u64 r; spin_lock(&c->lock); @@ -112,7 +112,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, ret = r; spin_unlock(&c->lock); } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } -- cgit v0.10.2 From 45525e7d4d02641c8d87131ea88e86ba294bc7e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:44:26 -0500 Subject: usb: Use local_irq_*_nort() variants [ tglx: Now that irqf_disabled is dead we should kill that ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 8e64adf..59c4d3c 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2217,7 +2217,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) * when the first handler doesn't use it. So let's just * assume it's never used. 
*/ - local_irq_save(flags); + local_irq_save_nort(flags); if (unlikely(HCD_DEAD(hcd) || !HCD_HW_ACCESSIBLE(hcd))) rc = IRQ_NONE; @@ -2226,7 +2226,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) else rc = IRQ_HANDLED; - local_irq_restore(flags); + local_irq_restore_nort(flags); return rc; } EXPORT_SYMBOL_GPL(usb_hcd_irq); -- cgit v0.10.2 From b19f6cfca314b287fa9b7b52d73ce6453d4ecb4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:34 -0500 Subject: mm: scatterlist dont disable irqs on RT Signed-off-by: Thomas Gleixner diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7874b01..43603ee 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -499,7 +499,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); @@ -539,7 +539,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_start(&miter, sgl, nents, sg_flags); - local_irq_save(flags); + local_irq_save_nort(flags); while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -556,7 +556,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); - local_irq_restore(flags); + local_irq_restore_nort(flags); return offset; } -- cgit v0.10.2 From 3c26de19712190e964dbe650058482f130f8c821 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Jul 2011 08:07:08 +0200 Subject: signal-fix-up-rcu-wreckage.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/signal.c b/kernel/signal.c index 3b3e347..8222b89 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1391,12 +1391,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { - local_irq_save(*flags); + local_irq_save_nort(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) { rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); break; } @@ -1407,7 +1407,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } spin_unlock(&sighand->siglock); rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); } return sighand; -- cgit v0.10.2 From 7c4226daccde03dcf8ab75dc4dd475c880574911 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:05:33 +0200 Subject: net-wireless-warn-nort.patch Signed-off-by: Thomas Gleixner diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 580704e..c58f3cd 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3144,7 +3144,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_supported_band *sband; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - WARN_ON_ONCE(softirq_count() == 0); + WARN_ON_ONCE_NONRT(softirq_count() == 0); if (WARN_ON(status->band >= IEEE80211_NUM_BANDS)) goto drop; -- cgit v0.10.2 From b4a92c94d5e6999aa775a4e4afc9c5293bc0bfdb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 09:56:42 +0200 Subject: mm: Replace cgroup_page bit spinlock Bit spinlocks are not working on RT. Replace them. 
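The general shape of that conversion, shown here as an editorial sketch with a made-up structure (struct foo and FOO_LOCK_BIT are illustrative, not part of the patch): keep the bit spinlock in the flags word on !RT, and add a real spinlock_t which PREEMPT_RT can turn into a sleeping lock:

  #include <linux/bit_spinlock.h>
  #include <linux/spinlock.h>

  #define FOO_LOCK_BIT	0	/* illustrative lock bit in foo->flags */

  struct foo {
  	unsigned long flags;		/* FOO_LOCK_BIT is the lock on !RT */
  #ifdef CONFIG_PREEMPT_RT_BASE
  	spinlock_t lock;		/* real lock, may sleep on RT */
  #endif
  };

  static inline void foo_lock(struct foo *f)
  {
  #ifndef CONFIG_PREEMPT_RT_BASE
  	bit_spin_lock(FOO_LOCK_BIT, &f->flags);
  #else
  	spin_lock(&f->lock);
  #endif
  }

  static inline void foo_unlock(struct foo *f)
  {
  #ifndef CONFIG_PREEMPT_RT_BASE
  	bit_spin_unlock(FOO_LOCK_BIT, &f->flags);
  #else
  	spin_unlock(&f->lock);
  #endif
  }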
Signed-off-by: Thomas Gleixner diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 777a524..ca67e80 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -24,6 +24,9 @@ enum { */ struct page_cgroup { unsigned long flags; +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t pcg_lock; +#endif struct mem_cgroup *mem_cgroup; }; @@ -74,12 +77,20 @@ static inline void lock_page_cgroup(struct page_cgroup *pc) * Don't take this lock in IRQ context. * This lock is for pc->mem_cgroup, USED, MIGRATION */ +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(PCG_LOCK, &pc->flags); +#else + spin_lock(&pc->pcg_lock); +#endif } static inline void unlock_page_cgroup(struct page_cgroup *pc) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(PCG_LOCK, &pc->flags); +#else + spin_unlock(&pc->pcg_lock); +#endif } #else /* CONFIG_MEMCG */ @@ -102,6 +113,10 @@ static inline void __init page_cgroup_init_flatmem(void) { } +static inline void page_cgroup_lock_init(struct page_cgroup *pc) +{ +} + #endif /* CONFIG_MEMCG */ #include diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a..98caeee 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -13,6 +13,14 @@ static unsigned long total_usage; +static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + for (; nr_pages; nr_pages--, pc++) + spin_lock_init(&pc->pcg_lock); +#endif +} + #if !defined(CONFIG_SPARSEMEM) @@ -60,6 +68,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; + page_cgroup_lock_init(base, nr_pages); return 0; } @@ -150,6 +159,8 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } + page_cgroup_lock_init(base, PAGES_PER_SECTION); + /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. -- cgit v0.10.2 From 57eecc70b5ee4460c47cf8ab771c7e5a714b1a1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 09:18:52 +0100 Subject: buffer_head: Replace bh_uptodate_lock for -rt Wrap the bit_spin_lock calls into a separate inline and add the RT replacements with a real spinlock. Signed-off-by: Thomas Gleixner diff --git a/fs/buffer.c b/fs/buffer.c index 7a75c3e..8863f45 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -280,8 +280,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -294,8 +293,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors and they are all @@ -307,9 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /* @@ -343,8 +339,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -356,15 +351,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } EXPORT_SYMBOL(end_buffer_async_write); @@ -3256,6 +3248,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + buffer_head_init_locks(ret); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f..4e4ca73 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 458f497..5c16cf1 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -72,8 +72,42 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t b_uptodate_lock; +#endif }; +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh) +{ + unsigned long flags; + +#ifndef CONFIG_PREEMPT_RT_BASE 
+ local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state); +#else + spin_lock_irqsave(&bh->b_uptodate_lock, flags); +#endif + return flags; +} + +static inline void +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state); + local_irq_restore(flags); +#else + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags); +#endif +} + +static inline void buffer_head_init_locks(struct buffer_head *bh) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + spin_lock_init(&bh->b_uptodate_lock); +#endif +} + /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. -- cgit v0.10.2 From e27866fd45e5a12b92fc572d7c3076308a9969a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:11:25 +0100 Subject: fs: jbd/jbd2: Make state lock and journal head lock rt safe bit_spin_locks break under RT. Based on a previous patch from Steven Rostedt Signed-off-by: Thomas Gleixner -- include/linux/buffer_head.h | 10 ++++++++++ include/linux/jbd_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5c16cf1..3f8e27b 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -74,6 +74,11 @@ struct buffer_head { atomic_t b_count; /* users using this buffer_head */ #ifdef CONFIG_PREEMPT_RT_BASE spinlock_t b_uptodate_lock; +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spinlock_t b_state_lock; + spinlock_t b_journal_head_lock; +#endif #endif }; @@ -105,6 +110,11 @@ static inline void buffer_head_init_locks(struct buffer_head *bh) { #ifdef CONFIG_PREEMPT_RT_BASE spin_lock_init(&bh->b_uptodate_lock); +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spin_lock_init(&bh->b_state_lock); + spin_lock_init(&bh->b_journal_head_lock); +#endif #endif } diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6133679..0dbc151 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h @@ -39,32 +39,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_State, &bh->b_state); +#else + spin_lock(&bh->b_state_lock); +#endif } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_trylock(BH_State, &bh->b_state); +#else + return spin_trylock(&bh->b_state_lock); +#endif } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_is_locked(BH_State, &bh->b_state); +#else + return spin_is_locked(&bh->b_state_lock); +#endif } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_State, &bh->b_state); +#else + spin_unlock(&bh->b_state_lock); +#endif } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_JournalHead, &bh->b_state); +#else + spin_lock(&bh->b_journal_head_lock); +#endif } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_JournalHead, &bh->b_state); +#else + spin_unlock(&bh->b_journal_head_lock); +#endif } #endif -- cgit v0.10.2 From 
044f74d54bbbb233d16252f82c00232c43b70646 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:22:04 +0100 Subject: genirq: Disable DEBUG_SHIRQ for rt Signed-off-by: Thomas Gleixner diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 67604e5..24b60ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -164,7 +164,7 @@ config DEBUG_KERNEL config DEBUG_SHIRQ bool "Debug shared IRQ handlers" - depends on DEBUG_KERNEL && GENERIC_HARDIRQS + depends on DEBUG_KERNEL && GENERIC_HARDIRQS && !PREEMPT_RT_BASE help Enable this to generate a spurious interrupt as soon as a shared interrupt handler is registered, and just before one is deregistered. -- cgit v0.10.2 From 9d007f268e7fe68cb1a08c01e38bfad7cfcf5983 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:57 -0500 Subject: genirq: disable irqpoll on -rt Creates long latencies for no value Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012..38a32b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -340,6 +340,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -352,6 +357,11 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); -- cgit v0.10.2 From e496e6ab6d027163f4948ab062bce951e2f589c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 Apr 2011 11:57:29 +0200 Subject: genirq-force-threading.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3f3a4e2..6fb89c4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -383,9 +383,13 @@ static inline int disable_irq_wake(unsigned int irq) #ifdef CONFIG_IRQ_FORCED_THREADING -extern bool force_irqthreads; +# ifndef CONFIG_PREEMPT_RT_BASE + extern bool force_irqthreads; +# else +# define force_irqthreads (true) +# endif #else -#define force_irqthreads (0) +#define force_irqthreads (false) #endif #ifndef __ARCH_SET_SOFTIRQ_PENDING diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288..902f880 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -21,6 +21,7 @@ #include "internals.h" #ifdef CONFIG_IRQ_FORCED_THREADING +# ifndef CONFIG_PREEMPT_RT_BASE __read_mostly bool force_irqthreads; static int __init setup_forced_irqthreads(char *arg) @@ -29,6 +30,7 @@ static int __init setup_forced_irqthreads(char *arg) return 0; } early_param("threadirqs", setup_forced_irqthreads); +# endif #endif /** -- cgit v0.10.2 From d135abb1f167eda99cc27d57583ba7c440e2ad02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Jun 2009 11:36:54 +0200 Subject: drivers/net: fix livelock issues Preempt-RT runs into a live lock issue with the NETDEV_TX_LOCKED micro optimization. The reason is that the softirq thread is rescheduling itself on that return value. Depending on priorities it starts to monopolize the CPU and livelock on UP systems. Remove it. 
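Each driver below gets the same conversion; as an editorial sketch (foo_xmit() and struct foo_adapter are made up and not taken from any of the drivers touched here):

  #include <linux/netdevice.h>
  #include <linux/skbuff.h>
  #include <linux/spinlock.h>

  /* Illustrative only: minimal private data with the tx lock. */
  struct foo_adapter {
  	spinlock_t tx_lock;
  };

  static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
  {
  	struct foo_adapter *adapter = netdev_priv(dev);
  	unsigned long flags;

  	/*
  	 * Old pattern being removed:
  	 *	if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
  	 *		return NETDEV_TX_LOCKED;
  	 * which makes the softirq thread requeue the packet and retry
  	 * immediately - the source of the livelock on RT.
  	 */
  	spin_lock_irqsave(&adapter->tx_lock, flags);
  	/* ... queue the skb to the hardware ring ... */
  	spin_unlock_irqrestore(&adapter->tx_lock, flags);
  	return NETDEV_TX_OK;
  }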
Signed-off-by: Thomas Gleixner diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index bfcb8bc..dbe44ba 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2171,11 +2171,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, } tpd_req = atl1c_cal_tpd_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) { - if (netif_msg_pktdata(adapter)) - dev_info(&adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1c_tpd_avail(adapter, type) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index e4466a3..01866d7 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1803,8 +1803,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } tpd_req = atl1e_cal_tdp_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1e_tpd_avail(adapter) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index d84872e..1420ea8 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -1666,8 +1666,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 7c94c08..8757a2c 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -4088,12 +4088,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) [skb->priority & (MAX_TX_FIFOS - 1)]; fifo = &mac_control->fifos[queue]; - if (do_spin_lock) - spin_lock_irqsave(&fifo->tx_lock, flags); - else { - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags))) - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&fifo->tx_lock, flags); if (sp->config.multiq) { if (__netif_subqueue_stopped(dev, fifo->fifo_no)) { diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 39ab4d0..990a6d7 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -2114,10 +2114,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev) struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring; unsigned long flags; - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { - /* Collision - tell upper layer to requeue */ - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&tx_ring->tx_lock, flags); + if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) { netif_stop_queue(netdev); spin_unlock_irqrestore(&tx_ring->tx_lock, flags); diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 1e4d743..9dfd4f5 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1630,13 +1630,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, unsigned long flags; ENTER; - 
local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index d8b9b1e..7ae06a7 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) unsigned long flags; int add_num = 1; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if (is_multicast_ether_addr(eth->h_dest)) add_num = nets[rnet->mport->id].nact; -- cgit v0.10.2 From 149fc739163da5be748e2091dd27b1c339344406 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: drivers/net: vortex fix locking issues Argh, cut and paste wasn't enough... Use this patch instead. It needs an irq disable. But, believe it or not, on SMP this is actually better. If the irq is shared (as it is in Mark's case), we don't stop the irq of other devices from being handled on another CPU (unfortunately for Mark, he pinned all interrupts to one CPU). Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner drivers/net/ethernet/3com/3c59x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Signed-off-by: Ingo Molnar diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index ed0feb3..0da3917 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -843,9 +843,9 @@ static void poll_vortex(struct net_device *dev) { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1919,12 +1919,12 @@ static void vortex_tx_timeout(struct net_device *dev) * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } -- cgit v0.10.2 From 52d703204c6a71aa8be8c1634656ec4889318faf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Apr 2010 20:20:57 +0200 Subject: drivers: net: gianfar: Make RT aware The adjust_link() disables interrupts before taking the queue locks. On RT those locks are converted to "sleeping" locks and therefore the local_irq_save/restore must be converted to local_irq_save/restore_nort. 
Reported-by: Xianghua Xiao Signed-off-by: Thomas Gleixner Tested-by: Xianghua Xiao diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index bffb2ed..5c53535 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1663,7 +1663,7 @@ void stop_gfar(struct net_device *dev) /* Lock it down */ - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); lock_rx_qs(priv); @@ -1671,7 +1671,7 @@ void stop_gfar(struct net_device *dev) unlock_rx_qs(priv); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* Free the IRQs */ if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) { @@ -2951,7 +2951,7 @@ static void adjust_link(struct net_device *dev) struct phy_device *phydev = priv->phydev; int new_state = 0; - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); if (phydev->link) { @@ -3020,7 +3020,7 @@ static void adjust_link(struct net_device *dev) if (new_state && netif_msg_link(priv)) phy_print_status(phydev); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Update the hash table based on the current list of multicast -- cgit v0.10.2 From a5f80d7030cff1295bb9e766ce2be116202eeaea Mon Sep 17 00:00:00 2001 From: Wu Zhangjin Date: Mon, 4 Jan 2010 11:33:02 +0800 Subject: USB: Fix the mouse problem when copying large amounts of data When copying large amounts of data between the USB storage devices and the hard disk, the USB mouse will not work, this patch fixes it. [NOTE: This problem have been found in the Loongson family machines, not sure whether it is producible on other platforms] Signed-off-by: Hu Hongbing Signed-off-by: Wu Zhangjin diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 180a2b0..1a3e81a 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -857,9 +857,13 @@ static irqreturn_t ohci_irq (struct usb_hcd *hcd) } if (ints & OHCI_INTR_WDH) { - spin_lock (&ohci->lock); - dl_done_list (ohci); - spin_unlock (&ohci->lock); + if (ohci->hcca->done_head == 0) { + ints &= ~OHCI_INTR_WDH; + } else { + spin_lock (&ohci->lock); + dl_done_list (ohci); + spin_unlock (&ohci->lock); + } } if (quirk_zfmicro(ohci) && (ints & OHCI_INTR_SF)) { -- cgit v0.10.2 From 97b29d76e8d9216c173418983ccd49164879fa17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jun 2011 18:40:37 +0200 Subject: local-var.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/percpu.h b/include/linux/percpu.h index cc88172..bf69fd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,6 +48,11 @@ preempt_enable(); \ } while (0) +#define get_local_var(var) get_cpu_var(var) +#define put_local_var(var) put_cpu_var(var) +#define get_local_ptr(var) get_cpu_ptr(var) +#define put_local_ptr(var) put_cpu_ptr(var) + /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v0.10.2 From 880393a23f6ea69c019d604cf463e6aaabc28c53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jun 2011 09:03:47 +0200 Subject: rt-local-irq-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/locallock.h b/include/linux/locallock.h new file mode 100644 index 0000000..0161fbb --- /dev/null +++ b/include/linux/locallock.h @@ -0,0 +1,238 @@ +#ifndef _LINUX_LOCALLOCK_H +#define _LINUX_LOCALLOCK_H + +#include + +#ifdef CONFIG_PREEMPT_RT_BASE + +#ifdef CONFIG_DEBUG_SPINLOCK +# define LL_WARN(cond) 
WARN_ON(cond) +#else +# define LL_WARN(cond) do { } while (0) +#endif + +/* + * per cpu lock based substitute for local_irq_*() + */ +struct local_irq_lock { + spinlock_t lock; + struct task_struct *owner; + int nestcnt; + unsigned long flags; +}; + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \ + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } + +#define local_irq_lock_init(lvar) \ + do { \ + int __cpu; \ + for_each_possible_cpu(__cpu) \ + spin_lock_init(&per_cpu(lvar, __cpu).lock); \ + } while (0) + +static inline void __local_lock(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + spin_lock(&lv->lock); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + } + lv->nestcnt++; +} + +#define local_lock(lvar) \ + do { __local_lock(&get_local_var(lvar)); } while (0) + +static inline int __local_trylock(struct local_irq_lock *lv) +{ + if (lv->owner != current && spin_trylock(&lv->lock)) { + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; + return 1; + } + return 0; +} + +#define local_trylock(lvar) \ + ({ \ + int __locked; \ + __locked = __local_trylock(&get_local_var(lvar)); \ + if (!__locked) \ + put_local_var(lvar); \ + __locked; \ + }) + +static inline void __local_unlock(struct local_irq_lock *lv) +{ + LL_WARN(lv->nestcnt == 0); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return; + + lv->owner = NULL; + spin_unlock(&lv->lock); +} + +#define local_unlock(lvar) \ + do { \ + __local_unlock(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +static inline void __local_lock_irq(struct local_irq_lock *lv) +{ + spin_lock_irqsave(&lv->lock, lv->flags); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; +} + +#define local_lock_irq(lvar) \ + do { __local_lock_irq(&get_local_var(lvar)); } while (0) + +#define local_lock_irq_on(lvar, cpu) \ + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0) + +static inline void __local_unlock_irq(struct local_irq_lock *lv) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + lv->owner = NULL; + lv->nestcnt = 0; + spin_unlock_irq(&lv->lock); +} + +#define local_unlock_irq(lvar) \ + do { \ + __local_unlock_irq(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +#define local_unlock_irq_on(lvar, cpu) \ + do { \ + __local_unlock_irq(&per_cpu(lvar, cpu)); \ + } while (0) + +static inline int __local_lock_irqsave(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + __local_lock_irq(lv); + return 0; + } else { + lv->nestcnt++; + return 1; + } +} + +#define local_lock_irqsave(lvar, _flags) \ + do { \ + if (__local_lock_irqsave(&get_local_var(lvar))) \ + put_local_var(lvar); \ + _flags = __get_cpu_var(lvar).flags; \ + } while (0) + +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, + unsigned long flags) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return 0; + + lv->owner = NULL; + spin_unlock_irqrestore(&lv->lock, lv->flags); + return 1; +} + +#define local_unlock_irqrestore(lvar, flags) \ + do { \ + if (__local_unlock_irqrestore(&__get_cpu_var(lvar), flags)) \ + put_local_var(lvar); \ + } while (0) + +#define local_spin_trylock_irq(lvar, lock) \ + ({ \ + int __locked; \ + local_lock_irq(lvar); \ + __locked = spin_trylock(lock); \ + if (!__locked) \ + local_unlock_irq(lvar); \ + __locked; \ + }) + +#define local_spin_lock_irq(lvar, lock) \ + do { \ + local_lock_irq(lvar); \ + spin_lock(lock); \ + } while 
(0) + +#define local_spin_unlock_irq(lvar, lock) \ + do { \ + spin_unlock(lock); \ + local_unlock_irq(lvar); \ + } while (0) + +#define local_spin_lock_irqsave(lvar, lock, flags) \ + do { \ + local_lock_irqsave(lvar, flags); \ + spin_lock(lock); \ + } while (0) + +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + do { \ + spin_unlock(lock); \ + local_unlock_irqrestore(lvar, flags); \ + } while (0) + +#define get_locked_var(lvar, var) \ + (*({ \ + local_lock(lvar); \ + &__get_cpu_var(var); \ + })) + +#define put_locked_var(lvar, var) local_unlock(lvar) + +#define local_lock_cpu(lvar) \ + ({ \ + local_lock(lvar); \ + smp_processor_id(); \ + }) + +#define local_unlock_cpu(lvar) local_unlock(lvar) + +#else /* PREEMPT_RT_BASE */ + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar + +static inline void local_irq_lock_init(int lvar) { } + +#define local_lock(lvar) preempt_disable() +#define local_unlock(lvar) preempt_enable() +#define local_lock_irq(lvar) local_irq_disable() +#define local_unlock_irq(lvar) local_irq_enable() +#define local_lock_irqsave(lvar, flags) local_irq_save(flags) +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags) + +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock) +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock) +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock) +#define local_spin_lock_irqsave(lvar, lock, flags) \ + spin_lock_irqsave(lock, flags) +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + spin_unlock_irqrestore(lock, flags) + +#define get_locked_var(lvar, var) get_cpu_var(var) +#define put_locked_var(lvar, var) put_cpu_var(var) + +#define local_lock_cpu(lvar) get_cpu() +#define local_unlock_cpu(lvar) put_cpu() + +#endif + +#endif -- cgit v0.10.2 From 24a257f109c4fa5db6a0f4930ee700371cdf1551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 15:42:38 +0200 Subject: cpu-rt-variants.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/smp.h b/include/linux/smp.h index dd6f06b..8624321 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -218,6 +218,14 @@ static inline void kick_all_cpus_sync(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() +#ifndef CONFIG_PREEMPT_RT_FULL +# define get_cpu_light() get_cpu() +# define put_cpu_light() put_cpu() +#else +# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +# define put_cpu_light() migrate_enable() +#endif + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: -- cgit v0.10.2 From 9d750b721c90d2eb16a16d67ef3001f5657798d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 18 Jun 2011 19:44:43 +0200 Subject: mm-slab-wrap-functions.patch Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index 2aa9fd4..e083470 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -116,6 +116,7 @@ #include #include #include +#include #include @@ -696,12 +697,49 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) #endif static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); +static DEFINE_LOCAL_IRQ_LOCK(slab_lock); + +#ifndef CONFIG_PREEMPT_RT_BASE +# define slab_on_each_cpu(func, cp) on_each_cpu(func, cp, 1) +#else +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) 
+ */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + get_cpu_light(); + for_each_online_cpu(i) + func(arg, i); + put_cpu_light(); +} + +static void lock_slab_on(unsigned int cpu) +{ + local_lock_irq_on(slab_lock, cpu); +} + +static void unlock_slab_on(unsigned int cpu) +{ + local_unlock_irq_on(slab_lock, cpu); +} +#endif static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; } +static inline struct array_cache *cpu_cache_get_on_cpu(struct kmem_cache *cachep, + int cpu) +{ + return cachep->array[cpu]; +} + static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { @@ -1171,9 +1209,10 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { + if (ac && ac->avail && + local_spin_trylock_irq(slab_lock, &ac->lock)) { __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + local_spin_unlock_irq(slab_lock, &ac->lock); } } } @@ -1188,9 +1227,9 @@ static void drain_alien_cache(struct kmem_cache *cachep, for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); + local_spin_lock_irqsave(slab_lock, &ac->lock, flags); __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + local_spin_unlock_irqrestore(slab_lock, &ac->lock, flags); } } } @@ -1269,11 +1308,11 @@ static int init_cache_nodelists_node(int node) cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + local_spin_lock_irq(slab_lock, &cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + local_spin_unlock_irq(slab_lock, &cachep->nodelists[node]->list_lock); } return 0; } @@ -1298,7 +1337,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (!l3) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; @@ -1306,7 +1345,7 @@ static void __cpuinit cpuup_canceled(long cpu) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); goto free_array_cache; } @@ -1320,7 +1359,7 @@ static void __cpuinit cpuup_canceled(long cpu) alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); if (alien) { @@ -1394,7 +1433,7 @@ static int __cpuinit cpuup_prepare(long cpu) l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1409,7 +1448,7 @@ static int __cpuinit cpuup_prepare(long cpu) alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) @@ -1612,6 +1651,8 @@ void __init kmem_cache_init(void) if (num_possible_nodes() == 1) use_alien_caches = 0; + local_irq_lock_init(slab_lock); + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); @@ -2533,7 +2574,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) #if DEBUG static void 
check_irq_off(void) { - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); } static void check_irq_on(void) @@ -2568,26 +2609,37 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, unsigned int cpu) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_mem_id(); + int node = cpu_to_mem(cpu); - check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get_on_cpu(cachep, cpu); spin_lock(&cachep->nodelists[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#else +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2618,10 +2670,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); goto out; } @@ -2635,7 +2687,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); slab_destroy(cache, slabp); nr_freed++; } @@ -2910,7 +2962,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); /* * The test for missing atomic flag is performed here, rather than @@ -2940,7 +2992,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); check_irq_off(); spin_lock(&l3->list_lock); @@ -2954,7 +3006,7 @@ opps1: kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); return 0; } @@ -3368,11 +3420,11 @@ retry: * set and go into memory reserves if necessary. 
*/ if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); if (obj) { /* * Insert into the appropriate per node queues @@ -3492,7 +3544,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cachep = memcg_kmem_get_cache(cachep, flags); cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); if (nodeid == NUMA_NO_NODE) nodeid = slab_node; @@ -3517,7 +3569,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, /* ___cache_alloc_node can fall back to other nodes */ ptr = ____cache_alloc_node(cachep, flags, nodeid); out: - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); @@ -3579,9 +3631,9 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) cachep = memcg_kmem_get_cache(cachep, flags); cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags); @@ -3898,9 +3950,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); - local_irq_save(flags); + local_lock_irqsave(slab_lock, flags); __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + local_unlock_irqrestore(slab_lock, flags); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3929,9 +3981,9 @@ void kfree(const void *objp) debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); - local_irq_save(flags); + local_lock_irqsave(slab_lock, flags); __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); + local_unlock_irqrestore(slab_lock, flags); } EXPORT_SYMBOL(kfree); @@ -3968,7 +4020,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (shared) free_block(cachep, shared->entry, @@ -3981,7 +4033,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); free_alien_cache(new_alien); continue; @@ -4028,17 +4080,28 @@ struct ccupdate_struct { struct array_cache *new[0]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int cpu) { struct ccupdate_struct *new = info; struct array_cache *old; - check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get_on_cpu(new->cachep, cpu); - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; + new->cachep->array[cpu] = new->new[cpu]; + new->new[cpu] = old; +} + +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_ccupdate_local(void *info) +{ + 
__do_ccupdate_local(info, smp_processor_id()); } +#else +static void do_ccupdate_local(void *info, int cpu) +{ + __do_ccupdate_local(info, cpu); +} +#endif /* Always called with the slab_mutex held */ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -4064,7 +4127,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)new); check_irq_on(); cachep->batchcount = batchcount; @@ -4075,9 +4138,11 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + local_spin_lock_irq(slab_lock, + &cachep->nodelists[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + local_spin_unlock_irq(slab_lock, + &cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4192,7 +4257,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) @@ -4202,7 +4267,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } } @@ -4295,7 +4360,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4320,7 +4385,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4520,13 +4585,13 @@ static int leaks_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } name = cachep->name; if (n[0] == n[1]) { -- cgit v0.10.2 From dc9291821ce438a834c81c4860c78e7e824b6d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:43 -0500 Subject: mm: More lock breaks in slab.c Handle __free_pages outside of the locked regions. This reduces the lock contention on the percpu slab locks in -rt significantly. 
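For illustration only, here is a minimal user-space sketch of the same "collect under the lock, free after the lock is dropped" pattern; the names (drain_cache, struct node, cache_lock) are invented for the example and do not exist in the kernel tree:

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	void *payload;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *cache_head;

/* Detach the whole list while holding the lock, but do the actual
 * free()s only after the lock has been dropped, so waiters are not
 * held up by the expensive part of the operation. */
static void drain_cache(void)
{
	struct node *batch, *n;

	pthread_mutex_lock(&cache_lock);
	batch = cache_head;
	cache_head = NULL;
	pthread_mutex_unlock(&cache_lock);

	while (batch) {			/* lock is no longer held here */
		n = batch;
		batch = batch->next;
		free(n->payload);
		free(n);
	}
}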
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/mm/slab.c b/mm/slab.c index e083470..6604ced 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -697,6 +697,7 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) #endif static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); +static DEFINE_PER_CPU(struct list_head, slab_free_list); static DEFINE_LOCAL_IRQ_LOCK(slab_lock); #ifndef CONFIG_PREEMPT_RT_BASE @@ -729,6 +730,34 @@ static void unlock_slab_on(unsigned int cpu) } #endif +static void free_delayed(struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +static void unlock_l3_and_free_delayed(spinlock_t *list_lock) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_spin_unlock_irq(slab_lock, list_lock); + free_delayed(&tmp); +} + +static void unlock_slab_and_free_delayed(unsigned long flags) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_unlock_irqrestore(slab_lock, flags); + free_delayed(&tmp); +} + static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; @@ -1345,7 +1374,7 @@ static void __cpuinit cpuup_canceled(long cpu) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); goto free_array_cache; } @@ -1359,7 +1388,7 @@ static void __cpuinit cpuup_canceled(long cpu) alien = l3->alien; l3->alien = NULL; - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); kfree(shared); if (alien) { @@ -1652,6 +1681,8 @@ void __init kmem_cache_init(void) use_alien_caches = 0; local_irq_lock_init(slab_lock); + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(slab_free_list, i)); for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); @@ -1953,12 +1984,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr, bool delayed) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); + struct page *page, *basepage = virt_to_page(addr); const unsigned long nr_freed = i; + page = basepage; + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1977,7 +2010,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + if (!delayed) { + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + } else { + basepage->index = cachep->gfporder; + list_add(&basepage->lru, &__get_cpu_var(slab_free_list)); + } } static void kmem_rcu_free(struct rcu_head *head) @@ -1985,7 +2023,7 @@ static void kmem_rcu_free(struct rcu_head *head) struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->addr, false); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -2204,7 +2242,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp, + bool delayed) { void *addr = slabp->s_mem - slabp->colouroff; @@ -2217,7 +2256,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); + kmem_freepages(cachep, addr, delayed); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slabp); } @@ -2628,9 +2667,15 @@ static void do_drain(void *arg) __do_drain(arg, smp_processor_id()); } #else -static void do_drain(void *arg, int this_cpu) +static void do_drain(void *arg, int cpu) { - __do_drain(arg, this_cpu); + LIST_HEAD(tmp); + + lock_slab_on(cpu); + __do_drain(arg, cpu); + list_splice_init(&per_cpu(slab_free_list, cpu), &tmp); + unlock_slab_on(cpu); + free_delayed(&tmp); } #endif @@ -2688,7 +2733,7 @@ static int drain_freelist(struct kmem_cache *cache, */ l3->free_objects -= cache->num; local_spin_unlock_irq(slab_lock, &l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, false); nr_freed++; } out: @@ -3003,7 +3048,7 @@ static int cache_grow(struct kmem_cache *cachep, spin_unlock(&l3->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, objp, false); failed: if (local_flags & __GFP_WAIT) local_lock_irq(slab_lock); @@ -3684,7 +3729,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. 
*/ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, true); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3952,7 +3997,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, cachep->object_size); local_lock_irqsave(slab_lock, flags); __cache_free(cachep, objp, _RET_IP_); - local_unlock_irqrestore(slab_lock, flags); + unlock_slab_and_free_delayed(flags); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3983,7 +4028,7 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, c->object_size); local_lock_irqsave(slab_lock, flags); __cache_free(c, (void *)objp, _RET_IP_); - local_unlock_irqrestore(slab_lock, flags); + unlock_slab_and_free_delayed(flags); } EXPORT_SYMBOL(kfree); @@ -4033,7 +4078,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - local_spin_unlock_irq(slab_lock, &l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); + kfree(shared); free_alien_cache(new_alien); continue; @@ -4141,8 +4187,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, local_spin_lock_irq(slab_lock, &cachep->nodelists[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - local_spin_unlock_irq(slab_lock, - &cachep->nodelists[cpu_to_mem(i)]->list_lock); + + unlock_l3_and_free_delayed(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); -- cgit v0.10.2 From 478022118fa8b49b03dafbbee339698863a76920 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:37 -0500 Subject: mm: page_alloc: rt-friendly per-cpu pages rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method into a preemptible, explicit-per-cpu-locks method. 
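As a rough user-space analogue (not kernel code; all names below are invented for the example), the conversion replaces "disable interrupts and touch this CPU's data" with "take this CPU's named lock", which stays cheap in the uncontended case but remains preemptible:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

#define MAX_CPUS 64

static pthread_mutex_t pcpu_lock[MAX_CPUS];
static unsigned long pcpu_count[MAX_CPUS];

static void pcpu_locks_init(void)
{
	for (int i = 0; i < MAX_CPUS; i++)
		pthread_mutex_init(&pcpu_lock[i], NULL);
}

/* The per-CPU lock serializes access to this CPU's data the way the
 * irqs-off section used to, but a contending or preempted thread can
 * sleep on it instead of being shut out by interrupt disabling. */
static void pcpu_count_inc(void)
{
	int cpu = sched_getcpu();

	if (cpu < 0)
		cpu = 0;
	cpu %= MAX_CPUS;

	pthread_mutex_lock(&pcpu_lock[cpu]);
	pcpu_count[cpu]++;
	pthread_mutex_unlock(&pcpu_lock[cpu]);
}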
Contains fixes from: Peter Zijlstra Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f61bc0f..e93a0d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -219,6 +220,18 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static DEFINE_LOCAL_IRQ_LOCK(pa_lock); + +#ifdef CONFIG_PREEMPT_RT_BASE +# define cpu_lock_irqsave(cpu, flags) \ + spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) +# define cpu_unlock_irqrestore(cpu, flags) \ + spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) +#else +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) +#endif + int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -722,12 +735,12 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (!free_pages_prepare(page, order)) return; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_events(PGFREE, 1 << order); migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, order, migratetype); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } /* @@ -1169,7 +1182,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else @@ -1178,7 +1191,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #endif @@ -1198,7 +1211,7 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - local_irq_save(flags); + cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; @@ -1206,7 +1219,7 @@ static void drain_pages(unsigned int cpu) free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; } - local_irq_restore(flags); + cpu_unlock_irqrestore(cpu, flags); } } @@ -1259,7 +1272,12 @@ void drain_all_pages(void) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } +#ifndef CONFIG_PREEMPT_RT_BASE on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); +#else + for_each_cpu(cpu, &cpus_with_pcps) + drain_pages(cpu); +#endif } #ifdef CONFIG_HIBERNATION @@ -1314,7 +1332,7 @@ void free_hot_cold_page(struct page *page, int cold) migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_event(PGFREE); /* @@ -1344,7 +1362,7 @@ void free_hot_cold_page(struct page *page, int cold) } out: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } /* @@ -1473,7 +1491,7 @@ again: struct per_cpu_pages *pcp; struct list_head *list; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; if (list_empty(list)) { @@ -1505,18 +1523,20 @@ again: */ WARN_ON_ONCE(order > 1); } - spin_lock_irqsave(&zone->lock, flags); + local_spin_lock_irqsave(pa_lock, &zone->lock, flags); page = __rmqueue(zone, order, migratetype); - spin_unlock(&zone->lock); - if (!page) + if (!page) { + spin_unlock(&zone->lock); goto 
failed; + } __mod_zone_freepage_state(zone, -(1 << order), get_pageblock_migratetype(page)); + spin_unlock(&zone->lock); } __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1524,7 +1544,7 @@ again: return page; failed: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); return NULL; } @@ -5133,6 +5153,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, void __init page_alloc_init(void) { hotcpu_notifier(page_alloc_cpu_notify, 0); + local_irq_lock_init(pa_lock); } /* @@ -5965,12 +5986,12 @@ static int __meminit __zone_pcp_update(void *data) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); + cpu_lock_irqsave(cpu, flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); drain_zonestat(zone, pset); setup_pageset(pset, batch); - local_irq_restore(flags); + cpu_unlock_irqrestore(cpu, flags); } return 0; } @@ -5988,7 +6009,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -5997,7 +6018,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v0.10.2 From 4e2c02fa284193c25c609d94a066e9adec5a2502 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:37 -0500 Subject: mm: page_alloc reduce lock sections further Split out the pages which are to be freed into a separate list and call free_pages_bulk() outside of the percpu page allocator locks. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e93a0d8..9122f48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -625,7 +625,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a number of pages from the PCP lists + * Frees a number of pages which have been collected from the pcp lists. * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -636,16 +636,50 @@ static inline int free_pages_check(struct page *page) * pinned" detection logic. 
*/ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct list_head *list) { - int migratetype = 0; - int batch_free = 0; int to_free = count; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; + while (!list_empty(list)) { + struct page *page = list_first_entry(list, struct page, lru); + int mt; /* migratetype of the to-be-freed page */ + + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + + mt = get_freepage_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + } + + to_free--; + } + WARN_ON(to_free != 0); + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Moves a number of pages from the PCP lists to free list which + * is freed outside of the locked region. + * + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + */ +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src, + struct list_head *dst) +{ + int migratetype = 0, batch_free = 0; + while (to_free) { struct page *page; struct list_head *list; @@ -661,7 +695,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; - list = &pcp->lists[migratetype]; + list = &src->lists[migratetype]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. */ @@ -669,36 +703,26 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } + list_add(&page->lru, dst); } while (--to_free && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); if (unlikely(migratetype != MIGRATE_ISOLATE)) __mod_zone_freepage_state(zone, 1 << order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static bool free_pages_prepare(struct page *page, unsigned int order) @@ -1180,6 +1204,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; + LIST_HEAD(dst); int to_drain; local_lock_irqsave(pa_lock, flags); @@ -1188,10 +1213,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) else to_drain = pcp->count; if (to_drain > 0) { - free_pcppages_bulk(zone, to_drain, pcp); + isolate_pcp_pages(to_drain, pcp, 
&dst); pcp->count -= to_drain; } local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, to_drain, &dst); } #endif @@ -1210,16 +1236,21 @@ static void drain_pages(unsigned int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); + int count; cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); + count = pcp->count; + if (count) { + isolate_pcp_pages(count, pcp, &dst); pcp->count = 0; } cpu_unlock_irqrestore(cpu, flags); + if (count) + free_pcppages_bulk(zone, count, &dst); } } @@ -1357,8 +1388,15 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); + LIST_HEAD(dst); + int count; + + isolate_pcp_pages(pcp->batch, pcp, &dst); pcp->count -= pcp->batch; + count = pcp->batch; + local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, count, &dst); + return; } out: @@ -5977,20 +6015,22 @@ static int __meminit __zone_pcp_update(void *data) { struct zone *zone = data; int cpu; - unsigned long batch = zone_batchsize(zone), flags; + unsigned long flags; for_each_possible_cpu(cpu) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; cpu_lock_irqsave(cpu, flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count > 0) { + isolate_pcp_pages(pcp->count, pcp, &dst); + free_pcppages_bulk(zone, pcp->count, &dst); + } drain_zonestat(zone, pset); - setup_pageset(pset, batch); cpu_unlock_irqrestore(cpu, flags); } return 0; -- cgit v0.10.2 From f9c7209047f892f441941faee719b8d5aa4b09c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 16:47:49 +0200 Subject: mm-page-alloc-fix.patch Signed-off-by: Thomas Gleixner diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9122f48..364c3c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2204,8 +2204,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct page *page; /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + drain_pages(get_cpu_light()); + put_cpu_light(); page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, -- cgit v0.10.2 From 7beb959732fc4f37598be826a3649097fed8e978 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:51 -0500 Subject: mm: convert swap to percpu locked Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/mm/swap.c b/mm/swap.c index 6310dc2..63f42b8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" @@ -40,6 +41,9 @@ static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock); +static DEFINE_LOCAL_IRQ_LOCK(swap_lock); + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. 
@@ -354,11 +358,11 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; page_cache_get(page); - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pvec = &__get_cpu_var(lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } } @@ -403,12 +407,13 @@ static void activate_page_drain(int cpu) void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec = &get_locked_var(swap_lock, + activate_page_pvecs); page_cache_get(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + put_locked_var(swap_lock, activate_page_pvecs); } } @@ -456,13 +461,13 @@ EXPORT_SYMBOL(mark_page_accessed); */ void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_locked_var(swap_lock, lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvecs); + put_locked_var(swap_lock, lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -597,9 +602,9 @@ void lru_add_drain_cpu(int cpu) unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } pvec = &per_cpu(lru_deactivate_pvecs, cpu); @@ -627,18 +632,19 @@ void deactivate_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_locked_var(swap_lock, + lru_deactivate_pvecs); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + put_locked_var(swap_lock, lru_deactivate_pvecs); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + lru_add_drain_cpu(local_lock_cpu(swap_lock)); + local_unlock_cpu(swap_lock); } static void lru_add_drain_per_cpu(struct work_struct *dummy) -- cgit v0.10.2 From 6d28ca1e5cb3077863d889eb8e8d3b07318c50dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:13 -0500 Subject: mm: make vmstat -rt aware Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a13291f..839806b 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -29,7 +29,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { + preempt_disable_rt(); __this_cpu_inc(vm_event_states.event[item]); + preempt_enable_rt(); } static inline void count_vm_event(enum vm_event_item item) @@ -39,7 +41,9 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { + preempt_disable_rt(); __this_cpu_add(vm_event_states.event[item], delta); + preempt_enable_rt(); } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306..a4dbd77 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -216,6 +216,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + preempt_disable_rt(); x = delta + 
__this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -225,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + preempt_enable_rt(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -257,6 +259,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -265,6 +268,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + preempt_enable_rt(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -279,6 +283,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -287,6 +292,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + preempt_enable_rt(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -- cgit v0.10.2 From 9dfa1ebfb0efad9b9de0d7940df00b193773f89a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:54 -0500 Subject: mm: shrink the page frame to !-rt size He below is a boot-tested hack to shrink the page frame size back to normal. Should be a net win since there should be many less PTE-pages than page-frames. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e2f7c..f499754 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1259,27 +1259,59 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT_FULL + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* !PREEMPT_RT_FULL */ + +/* + * On PREEMPT_RT_FULL the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages. + * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. + */ + +#define __pte_lockptr(page) ((page)->ptl) + +extern struct page *pte_lock_init(struct page *page); +extern void pte_lock_deinit(struct page *page); + +#endif /* PREEMPT_RT_FULL */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. 
*/ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8f5162..8b53bdf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,7 +141,11 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTLOCKS +# ifndef CONFIG_PREEMPT_RT_FULL spinlock_t ptl; +# else + spinlock_t *ptl; +# endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ diff --git a/mm/memory.c b/mm/memory.c index 3f862e4..9d55b54 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4264,3 +4264,35 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0) +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif -- cgit v0.10.2 From 78cde0eb9514dee0954efe6f6cdbf082c952742b Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Sat, 1 Oct 2011 18:58:13 -0700 Subject: ARM: Initialize ptl->lock for vector page Without this patch, ARM can not use SPLIT_PTLOCK_CPUS if PREEMPT_RT_FULL=y because vectors_user_mapping() creates a VM_ALWAYSDUMP mapping of the vector page (address 0xffff0000), but no ptl->lock has been allocated for the page. An attempt to coredump that page will result in a kernel NULL pointer dereference when follow_page() attempts to lock the page. The call tree to the NULL pointer dereference is: do_notify_resume() get_signal_to_deliver() do_coredump() elf_core_dump() get_dump_page() __get_user_pages() follow_page() pte_offset_map_lock() <----- a #define ... rt_spin_lock() The underlying problem is exposed by mm-shrink-the-page-frame-to-rt-size.patch. Signed-off-by: Frank Rowand Cc: Frank Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E87C535.2030907@am.sony.com Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c6dec5f..43ac178 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -459,6 +459,31 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } #ifdef CONFIG_MMU + +/* + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not + * initialized by pgtable_page_ctor() then a coredump of the vector page will + * fail. 
+ */ +static int __init vectors_user_mapping_init_page(void) +{ + struct page *page; + unsigned long addr = 0xffff0000; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_k(addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + page = pmd_page(*(pmd)); + + pgtable_page_ctor(page); + + return 0; +} +late_initcall(vectors_user_mapping_init_page); + /* * The vectors page is always readable from user space for the * atomic helpers and the signal restart code. Insert it into the -- cgit v0.10.2 From 2b9dfcb4573cdfafe60172a19d8779b92010873f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:03 -0500 Subject: mm: Allow only slab on RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index be8b7f5..fbc0785 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1493,6 +1493,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + depends on !PREEMPT_RT_FULL help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). @@ -1504,6 +1505,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT_FULL help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but -- cgit v0.10.2 From 2c48e85a65779ca18b7fdc4a2218662b8a96f81d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:33:18 +0200 Subject: radix-tree-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c..7ddfbf9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -230,7 +230,13 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); + +#ifndef CONFIG_PREEMPT_RT_FULL int radix_tree_preload(gfp_t gfp_mask); +#else +static inline int radix_tree_preload(gfp_t gm) { return 0; } +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -255,7 +261,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { - preempt_enable(); + preempt_enable_nort(); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e796429..63bac7d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -215,12 +215,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = &get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); @@ -255,6 +256,7 @@ radix_tree_node_free(struct radix_tree_node *node) call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. 
On @@ -289,6 +291,7 @@ out: return ret; } EXPORT_SYMBOL(radix_tree_preload); +#endif /* * Return the maximum key which can be store into a -- cgit v0.10.2 From 50b76dd1c5ac90e5ae721f52e5966c251f6a1256 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Apr 2013 17:09:18 -0500 Subject: panic: skip get_random_bytes for RT_FULL in init_oops_id diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822..5dc4381 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -371,9 +371,11 @@ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; -- cgit v0.10.2 From c666e561c45f71ff9241c2785d626d2c2e3a091b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:12 -0500 Subject: ipc: Make the ipc code -rt aware RT serializes the code with the (rt)spinlock but keeps preemption enabled. Some parts of the code need to be atomic nevertheless. Protect it with preempt_disable/enable_rt pairts. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 71a3ca1..87d4d0e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -912,12 +912,17 @@ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable_rt(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() diff --git a/ipc/msg.c b/ipc/msg.c index 31cd1bf..22c722d 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -259,12 +259,20 @@ static void expunge_all(struct msg_queue *msq, int res) while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = ERR_PTR(res); + + preempt_enable_rt(); } } @@ -614,6 +622,12 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; @@ -627,9 +641,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = msg; + preempt_enable_rt(); return 1; } + preempt_enable_rt(); } } return 0; -- cgit v0.10.2 From 2beec17abdd76d15517c2f3a189f18e561e32e93 Mon Sep 17 00:00:00 2001 From: KOBAYASHI Yoshitake Date: Sat, 23 Jul 2011 11:57:36 +0900 Subject: ipc/mqueue: Add a critical section to avoid a deadlock (Repost for v3.0-rt1 and changed the distination addreses) I have tested the following patch on v3.0-rt1 with PREEMPT_RT_FULL. In POSIX message queue, if a sender process uses SCHED_FIFO and has a higher priority than a receiver process, the sender will be stuck at ipc/mqueue.c:452 452 while (ewp->state == STATE_PENDING) 453 cpu_relax(); Description of the problem (receiver process) 1. receiver changes sender's state to STATE_PENDING (mqueue.c:846) 2. 
wake up sender process and "switch to sender" (mqueue.c:847) Note: This context switch only happens in PREEMPT_RT_FULL kernel. (sender process) 3. sender check the own state in above loop (mqueue.c:452-453) *. receiver will never wake up and cannot change sender's state to STATE_READY because sender has higher priority Signed-off-by: Yoshitake Kobayashi Cc: viro@zeniv.linux.org.uk Cc: dchinner@redhat.com Cc: npiggin@kernel.dk Cc: hch@lst.de Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/4E2A38A0.1090601@toshiba.co.jp Signed-off-by: Thomas Gleixner diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 87d4d0e..4b8724e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -936,13 +936,18 @@ static inline void pipelined_receive(struct mqueue_inode_info *info) wake_up_interruptible(&info->wait_q); return; } - if (msg_insert(sender->msg, info)) - return; - list_del(&sender->list); - sender->state = STATE_PENDING; - wake_up_process(sender->task); - smp_wmb(); - sender->state = STATE_READY; + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); + if (!msg_insert(sender->msg, info)) { + list_del(&sender->list); + sender->state = STATE_PENDING; + wake_up_process(sender->task); + smp_wmb(); + sender->state = STATE_READY; + } + preempt_enable_rt(); } SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, -- cgit v0.10.2 From 692a9db4c11c21bcb64c25654bee7e15880df138 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:07 -0500 Subject: relay: fix timer madness remove timer calls (!!!) from deep within the tracing infrastructure. This was totally bogus code that can cause lockups and worse. Poll the buffer every 2 jiffies for now. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/relay.c b/kernel/relay.c index e8cd202..56ba44f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -340,6 +340,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -357,6 +361,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; -- cgit v0.10.2 From 96a0e3242988d9d96b3e5ab01baebdcd4e9efe9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: timers: prepare for full preemption When softirqs can be preempted we need to make sure that cancelling the timer from the active thread can not deadlock vs. a running timer callback. Add a waitqueue to resolve that. 
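The shape of the fix can be sketched in user space with a condition variable standing in for the waitqueue (all names below are invented for the example; this is not the timer code itself):

#include <pthread.h>

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  callback_done = PTHREAD_COND_INITIALIZER;
static void *running_timer;	/* which timer's callback runs right now */

static void run_callback(void *timer, void (*fn)(void *))
{
	pthread_mutex_lock(&base_lock);
	running_timer = timer;
	pthread_mutex_unlock(&base_lock);

	fn(timer);		/* the callback runs without the lock */

	pthread_mutex_lock(&base_lock);
	running_timer = NULL;
	pthread_cond_broadcast(&callback_done);
	pthread_mutex_unlock(&base_lock);
}

/* Cancel and wait: instead of spinning on the running callback,
 * sleep until it signals completion. */
static void cancel_sync(void *timer)
{
	pthread_mutex_lock(&base_lock);
	while (running_timer == timer)
		pthread_cond_wait(&callback_done, &base_lock);
	/* the timer can now be detached safely */
	pthread_mutex_unlock(&base_lock);
}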
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197..5fcd72c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) extern int del_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) diff --git a/kernel/timer.c b/kernel/timer.c index 367d008..10fcbae 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -76,6 +76,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -735,12 +736,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); + preempt_disable_rt(); cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif + preempt_enable_rt(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -941,6 +945,29 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * Wait for a running timer + */ +static void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_tunning_timer) +#else +static inline void wait_for_running_timer(struct timer_list *timer) +{ + cpu_relax(); +} + +# define wakeup_timer_waiters(b) do { } while (0) +#endif + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -998,7 +1025,7 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. 
* @timer: the timer to be deactivated @@ -1058,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } EXPORT_SYMBOL(del_timer_sync); @@ -1175,15 +1202,17 @@ static inline void __run_timers(struct tvec_base *base) if (irqsafe) { spin_unlock(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock(&base->lock); } else { spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock_irq(&base->lock); } } } - base->running_timer = NULL; + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1684,6 +1713,7 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v0.10.2 From 013f2f6b262db82f2db166d02f9c456b46d1c49e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:20 -0500 Subject: timers: preempt-rt support Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 10fcbae..9b51abd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1352,7 +1352,17 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (cpu_is_offline(smp_processor_id())) return expires; +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (!spin_trylock(&base->lock)) + return now + 1; +#else spin_lock(&base->lock); +#endif if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1362,7 +1372,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (time_before_eq(expires, now)) return now; - return cmp_next_hrtimer_event(now, expires); } #endif @@ -1752,7 +1761,7 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + new_base = get_local_var(tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
@@ -1773,7 +1782,7 @@ static void __cpuinit migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); + put_local_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From 05654c2506df312b01752586e42a48b041a877fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:30 -0500 Subject: timers: mov printk_tick to soft interrupt Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/kernel/timer.c b/kernel/timer.c index 9b51abd..f4b3b68 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1389,7 +1389,6 @@ void update_process_times(int user_tick) account_process_tick(p, user_tick); run_local_timers(); rcu_check_callbacks(cpu, user_tick); - printk_tick(); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); @@ -1405,6 +1404,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); + printk_tick(); hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v0.10.2 From 55baeef23b35cbec2a5aa6d05e73b6070dbd385c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 11:56:45 +0200 Subject: timer: delay waking softirqs from the jiffy tick People were complaining about broken balancing with the recent -rt series. A look at /proc/sched_debug yielded: cpu#0, 2393.874 MHz .nr_running : 0 .load : 0 .cpu_load[0] : 177522 .cpu_load[1] : 177522 .cpu_load[2] : 177522 .cpu_load[3] : 177522 .cpu_load[4] : 177522 cpu#1, 2393.874 MHz .nr_running : 4 .load : 4096 .cpu_load[0] : 181618 .cpu_load[1] : 180850 .cpu_load[2] : 180274 .cpu_load[3] : 179938 .cpu_load[4] : 179758 Which indicated the cpu_load computation was hosed, the 177522 value indicates that there is one RT task runnable. Initially I thought the old problem of calculating the cpu_load from a softirq had re-surfaced, however looking at the code shows its being done from scheduler_tick(). [ we really should fix this RT/cfs interaction some day... ] A few trace_printk()s later: sirq-timer/1-19 [001] 174.289744: 19: 50:S ==> [001] 0:140:R -0 [001] 174.290724: enqueue_task_rt: adding task: 19/sirq-timer/1 with load: 177522 -0 [001] 174.290725: 0:140:R + [001] 19: 50:S sirq-timer/1 -0 [001] 174.290730: scheduler_tick: current load: 177522 -0 [001] 174.290732: scheduler_tick: current: 0/swapper -0 [001] 174.290736: 0:140:R ==> [001] 19: 50:R sirq-timer/1 sirq-timer/1-19 [001] 174.290741: dequeue_task_rt: removing task: 19/sirq-timer/1 with load: 177522 sirq-timer/1-19 [001] 174.290743: 19: 50:S ==> [001] 0:140:R We see that we always raise the timer softirq before doing the load calculation. Avoid this by re-ordering the scheduler_tick() call in update_process_times() to occur before we deal with timers. This lowers the load back to sanity and restores regular load-balancing behaviour. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index f4b3b68..828ed8f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1387,13 +1387,13 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. 
*/ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); #endif - scheduler_tick(); run_posix_cpu_timers(p); } -- cgit v0.10.2 From 42e0d87679dad5fbcff6d5138a0da87ac910da8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 15:23:39 +0200 Subject: timers: Avoid the switch timers base set to NULL trick on RT On RT that code is preemptible, so we cannot assign NULL to timers base as a preempter would spin forever in lock_timer_base(). Signed-off-by: Thomas Gleixner diff --git a/kernel/timer.c b/kernel/timer.c index 828ed8f..5470c3e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,6 +717,36 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&old->lock); + spin_lock(&new->lock); + timer_set_base(timer, new); + return new; +} +#else +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* + * We cannot do the above because we might be preempted and + * then the preempter would see NULL and loop forever. + */ + if (spin_trylock(&new->lock)) { + timer_set_base(timer, new); + spin_unlock(&old->lock); + return new; + } + return old; +} +#endif + static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only, int pinned) @@ -755,14 +785,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } + if (likely(base->running_timer != timer)) + base = switch_timer_base(timer, base, new_base); } timer->expires = expires; -- cgit v0.10.2 From 8d58f4e7f0a9b8b318c49c8b1fda06165086b0c8 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:45 +0800 Subject: printk: Don't call printk_tick in printk_needs_cpu() on RT printk_tick() can't be called in atomic context when RT is enabled, otherwise below warning will show: [ 117.597095] BUG: sleeping function called from invalid context at kernel/rtmutex.c:645 [ 117.597102] in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: kworker/0:0 [ 117.597111] Pid: 0, comm: kworker/0:0 Not tainted 3.0.6-rt17-00284-gb76d419-dirty #7 [ 117.597116] Call Trace: [ 117.597131] [] ? printk+0x1d/0x24 [ 117.597142] [] __might_sleep+0xe6/0x110 [ 117.597151] [] rt_spin_lock+0x1c/0x30 [ 117.597158] [] __wake_up+0x26/0x60 [ 117.597166] [] printk_tick+0x3e/0x40 [ 117.597173] [] printk_needs_cpu+0x24/0x30 [ 117.597181] [] tick_nohz_stop_sched_tick+0x2e8/0x410 [ 117.597191] [] ? sched_clock_idle_wakeup_event+0x1a/0x20 [ 117.597201] [] cpu_idle+0x4a/0xb0 [ 117.597209] [] start_secondary+0xd3/0xd7 Now this is a really rare case and it's very unlikely that we starve an logbuf waiter that way. 
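For reference, with this change the check reduces to roughly the following; this is a sketch of the resulting function matching the hunk below, not a verbatim copy:

int printk_needs_cpu(int cpu)
{
	/*
	 * An offline CPU can no longer run the flush, and flushing from
	 * this atomic path would take a sleeping lock on RT, so simply
	 * drop the pending flag instead of calling printk_tick().
	 */
	if (unlikely(cpu_is_offline(cpu)))
		__this_cpu_write(printk_pending, 0);
	return __this_cpu_read(printk_pending);
}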
Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-4-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index 98aad49..df4b56b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2030,8 +2030,8 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { - if (cpu_is_offline(cpu)) - printk_tick(); + if (unlikely(cpu_is_offline(cpu))) + __this_cpu_write(printk_pending, 0); return __this_cpu_read(printk_pending); } -- cgit v0.10.2 From 268c20db7e01cd30533283b0aa64914ebf12feff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: hrtimers: prepare full preemption Make cancellation of a running callback in softirq context safe against preemption. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5b9e789..ff5680c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -192,6 +192,9 @@ struct hrtimer_cpu_base { unsigned long nr_hangs; ktime_t max_hang_time; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + wait_queue_head_t wait; +#endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; @@ -385,6 +388,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_RT_BASE + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 7ee08f8..483fd3b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -844,6 +844,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(hrtimer_forward); +#ifdef CONFIG_PREEMPT_RT_BASE +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. 
+ */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -1094,7 +1120,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1509,6 +1535,8 @@ void hrtimer_run_queues(void) } raw_spin_unlock(&cpu_base->lock); } + + wake_up_timer_waiters(cpu_base); } /* @@ -1671,6 +1699,9 @@ static void __cpuinit init_hrtimers_cpu(int cpu) } hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_RT_BASE + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/itimer.c b/kernel/itimer.c index 8d262b4..d051390 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -213,6 +213,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4b7183c..1d5e435 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -773,6 +773,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } +/* + * Protected by RCU! + */ +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + if (kc->timer_set == common_timer_set) + hrtimer_wait_for_timer(&timr->it.real.timer); + else + /* FIXME: Whacky hack for posix-cpu-timers */ + schedule_timeout(1); +#endif +} + /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ static int @@ -850,6 +864,7 @@ retry: if (!timr) return -EINVAL; + rcu_read_lock(); kc = clockid_to_kclock(timr->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -858,9 +873,12 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + timer_wait_for_callback(kc, timr); rtn = NULL; // We already got the old time... 
+ rcu_read_unlock(); goto retry; } + rcu_read_unlock(); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) @@ -898,10 +916,15 @@ retry_delete: if (!timer) return -EINVAL; + rcu_read_lock(); if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } + rcu_read_unlock(); spin_lock(¤t->sighand->siglock); list_del(&timer->list); @@ -927,8 +950,18 @@ static void itimer_delete(struct k_itimer *timer) retry_delete: spin_lock_irqsave(&timer->it_lock, flags); + /* On RT we can race with a deletion */ + if (!timer->it_signal) { + unlock_timer(timer, flags); + return; + } + if (timer_delete_hook(timer) == TIMER_RETRY) { + rcu_read_lock(); unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } list_del(&timer->list); -- cgit v0.10.2 From 3bc739a0e3a70b7b196c2044b0824a7292556509 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:31 -0500 Subject: hrtimer: fixup hrtimer callback changes for preempt-rt In preempt-rt we can not call the callbacks which take sleeping locks from the timer interrupt context. Bring back the softirq split for now, until we fixed the signal delivery problem for real. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index ff5680c..113bcf1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,8 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; + struct list_head cb_entry; + int irqsafe; #ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST ktime_t praecox; #endif @@ -150,6 +152,7 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; + struct list_head expired; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 483fd3b..a8b8527 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -589,8 +589,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * reprogramming is handled at the end of the hrtimer_interrupt. */ if (hrtimer_callback_running(timer)) return 0; @@ -625,6 +624,9 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); +static int hrtimer_rt_defer(struct hrtimer *timer); + /* * Initialize the high resolution related parts of cpu_base */ @@ -641,9 +643,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. 
*/ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); + if (!(base->cpu_base->hres_active && hrtimer_reprogram(timer, base))) + return 0; + if (!wakeup) + return -ETIME; +#ifdef CONFIG_PREEMPT_RT_BASE + if (!hrtimer_rt_defer(timer)) + return -ETIME; +#endif + return 1; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -724,12 +735,18 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -861,9 +878,9 @@ void hrtimer_wait_for_timer(const struct hrtimer *timer) { struct hrtimer_clock_base *base = timer->base; - if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + if (base && base->cpu_base && !timer->irqsafe) wait_event(base->cpu_base->wait, - !(timer->state & HRTIMER_STATE_CALLBACK)); + !(timer->state & HRTIMER_STATE_CALLBACK)); } #else @@ -913,6 +930,11 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; + if (unlikely(!list_empty(&timer->cb_entry))) { + list_del_init(&timer->cb_entry); + goto out; + } + next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (&timer->node == next_timer) { @@ -1020,9 +1042,19 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { - if (wakeup) { + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) { + ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (ret < 0) { + /* + * In case we failed to reprogram the timer (mostly + * because out current timer is already elapsed), + * remove it again and report a failure. This avoids + * stale base->first entries. + */ + debug_deactivate(timer); + __remove_hrtimer(timer, new_base, + timer->state & HRTIMER_STATE_CALLBACK, 0); + } else if (ret > 0) { /* * We need to drop cpu_base->lock to avoid a * lock ordering issue vs. rq->lock. 
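/*
 * Condensed view of the new hrtimer_enqueue_reprogram() contract used in
 * the hunk above (a sketch, not verbatim kernel code): a negative return
 * means the timer already expired or cannot be deferred and must be
 * removed again, a positive return means the hrtimer softirq has to be
 * raised, and zero means no reprogramming was necessary.
 */
static void handle_enqueue_reprogram(struct hrtimer *timer,
				     struct hrtimer_clock_base *base, int ret)
{
	if (ret < 0) {
		/* drop it again so no stale base->first entry survives */
		debug_deactivate(timer);
		__remove_hrtimer(timer, base,
				 timer->state & HRTIMER_STATE_CALLBACK, 0);
	} else if (ret > 0) {
		/* run the deferred callback from HRTIMER_SOFTIRQ */
		raw_spin_unlock(&base->cpu_base->lock);
		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
	}
}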
@@ -1030,9 +1062,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, raw_spin_unlock(&new_base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 0; } } @@ -1199,6 +1229,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; + INIT_LIST_HEAD(&timer->cb_entry); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1282,10 +1313,128 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); +#ifdef CONFIG_PREEMPT_RT_BASE +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * Note, we clear the callback flag before we requeue the + * timer otherwise we trigger the callback_running() check + * in hrtimer_reprogram(). + */ + timer->state &= ~HRTIMER_STATE_CALLBACK; + + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, if it's the leftmost timer then + * we need to reprogram it. + */ + if (!enqueue_hrtimer(timer, base)) + return; + +#ifndef CONFIG_HIGH_RES_TIMERS + } +#else + if (base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (&timer->node == base->active.next && + base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + } + return; + +requeue: + /* + * Timer is expired. Thus move it from tree to pending list + * again. + */ + __remove_hrtimer(timer, base, timer->state, 0); + list_add_tail(&timer->cb_entry, &base->expired); +#endif +} + +/* + * The changes in mainline which removed the callback modes from + * hrtimer are not yet working with -rt. The non wakeup_process() + * based callbacks which involve sleeping locks need to be treated + * seperately. + */ +static void hrtimer_rt_run_pending(void) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; + struct hrtimer *timer; + int index, restart; + + local_irq_disable(); + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + raw_spin_lock(&cpu_base->lock); + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + + while (!list_empty(&base->expired)) { + timer = list_first_entry(&base->expired, + struct hrtimer, cb_entry); + + /* + * Same as the above __run_hrtimer function + * just we run with interrupts enabled. 
+ */ + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + raw_spin_unlock_irq(&cpu_base->lock); + restart = fn(timer); + raw_spin_lock_irq(&cpu_base->lock); + + hrtimer_rt_reprogram(restart, timer, base); + } + } + + raw_spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); +} + +static int hrtimer_rt_defer(struct hrtimer *timer) +{ + if (timer->irqsafe) + return 0; + + __remove_hrtimer(timer, timer->base, timer->state, 0); + list_add_tail(&timer->cb_entry, &timer->base->expired); + return 1; +} + +#else + +static inline void hrtimer_rt_run_pending(void) +{ + hrtimer_peek_ahead_timers(); +} + +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } + +#endif + +#ifdef CONFIG_HIGH_RES_TIMERS + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1294,7 +1443,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; + int i, retries = 0, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1361,7 +1510,10 @@ retry: break; } - __run_hrtimer(timer, &basenow); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &basenow); + else + raise = 1; } } @@ -1376,6 +1528,10 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return; } @@ -1456,24 +1612,26 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + static void run_hrtimer_softirq(struct softirq_action *h) { +#ifdef CONFIG_HIGH_RES_TIMERS struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); if (cpu_base->clock_was_set) { cpu_base->clock_was_set = 0; clock_was_set(); } +#endif - hrtimer_peek_ahead_timers(); + hrtimer_rt_run_pending(); } -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ - /* * Called from timer softirq every jiffy, expire hrtimers: * @@ -1506,7 +1664,7 @@ void hrtimer_run_queues(void) struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - int index, gettime = 1; + int index, gettime = 1, raise = 0; if (hrtimer_hres_active()) return; @@ -1531,12 +1689,16 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer, &base->softirq_time); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &base->softirq_time); + else + raise = 1; } raw_spin_unlock(&cpu_base->lock); } - wake_up_timer_waiters(cpu_base); + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1558,6 +1720,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; + sl->timer.irqsafe = 1; sl->task = task; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1696,6 +1859,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); } 
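/*
 * With the per-clock-base expired list initialised here, only callbacks
 * explicitly marked irqsafe are still executed from hard interrupt
 * context on RT; everything else is queued on base->expired and run by
 * hrtimer_rt_run_pending() from the hrtimer softirq.  A sketch of how a
 * wakeup-only user flags its timer (illustrative; the
 * hrtimer_init_sleeper() change above and the scheduler/watchdog hunks
 * below do the same):
 */
static void example_setup_wakeup_timer(struct hrtimer *timer,
				       enum hrtimer_restart (*fn)(struct hrtimer *))
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer->function = fn;
	/* callback only does a wake_up_process(), safe from hardirq on RT */
	timer->irqsafe = 1;
}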
hrtimer_init_hres(cpu_base); @@ -1814,9 +1978,7 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3419f34..a5c0bd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -489,6 +489,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4f02b28..46faf69 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -41,6 +41,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3365223..5fe688c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -867,6 +867,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 608d2fe..7bbc18a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -358,6 +358,7 @@ static void watchdog_enable(unsigned int cpu) /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer->irqsafe = 1; if (!watchdog_enabled) { kthread_park(current); -- cgit v0.10.2 From 42b4e8d5f84744d513fd65b9b999b3badf76e92e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jan 2012 11:08:40 +0100 Subject: timer-fd: Prevent live lock If hrtimer_try_to_cancel() requires a retry, then depending on the priority setting te retry loop might prevent timer callback completion on RT. Prevent that by waiting for completion on RT, no change for a non RT kernel. Reported-by: Sankara Muthukrishnan Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/fs/timerfd.c b/fs/timerfd.c index d03822b..522aeb8 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -311,7 +311,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) break; spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + hrtimer_wait_for_timer(&ctx->tmr); } /* -- cgit v0.10.2 From 8d295db46092fd19c8f8592461dfc6d0ac3ffaa5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: posix-timers: thread posix-cpu-timers on -rt posix-cpu-timer code takes non -rt safe locks in hard irq context. Move it to a thread. 
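The per-CPU worker walks a singly linked list threaded through task_struct::posix_timer_list; NULL means "not queued" and the list is terminated by a self-pointing entry rather than by NULL. The queueing step, reduced to its core (a sketch with the irq disabling and the per-cpu accessors stripped out; listp stands in for per_cpu(posix_timer_tasklist, cpu)):

static void queue_for_timer_thread(struct task_struct *tsk,
				   struct task_struct **listp)
{
	if (tsk->posix_timer_list)
		return;			/* already queued */
	get_task_struct(tsk);
	/* empty list: terminate with a self-pointer, else chain to head */
	tsk->posix_timer_list = *listp ? *listp : tsk;
	*listp = tsk;
}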
[ 3.0 fixes from Peter Zijlstra ] Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6d087c5..10f32ab 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -141,6 +141,12 @@ extern struct task_group root_task_group; # define INIT_PERF_EVENTS(tsk) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define INIT_TIMER_LIST .posix_timer_list = NULL, +#else +# define INIT_TIMER_LIST +#endif + #define INIT_TASK_COMM "swapper" /* @@ -196,6 +202,7 @@ extern struct task_group root_task_group; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + INIT_TIMER_LIST \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3eed200..77f1fa5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1377,6 +1377,9 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *posix_timer_list; +#endif /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task diff --git a/init/main.c b/init/main.c index cee4b5c..a6766db 100644 --- a/init/main.c +++ b/init/main.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 6b22be6..fae9ab2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1112,6 +1112,9 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { +#ifdef CONFIG_PREEMPT_RT_BASE + tsk->posix_timer_list = NULL; +#endif tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 942ca27..15c4821 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -661,7 +661,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -1177,7 +1177,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); arm_timer(timer); spin_unlock(&p->sighand->siglock); @@ -1241,10 +1241,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sig = tsk->signal; if (sig->cputimer.running) { struct task_cputime group_sample; + unsigned long flags; - raw_spin_lock(&sig->cputimer.lock); + raw_spin_lock_irqsave(&sig->cputimer.lock, flags); group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1258,13 +1259,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. 
*/ -void run_posix_cpu_timers(struct task_struct *tsk) +static void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1322,6 +1323,175 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#ifdef CONFIG_PREEMPT_RT_BASE +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task, cpu)); +} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posix_cpu_timers/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(posix_timer_task, cpu), + cpumask_any(cpu_online_mask)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *hcpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_possible_cpu(cpu) + per_cpu(posix_timer_tasklist, cpu) = NULL; + + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); +#else /* CONFIG_PREEMPT_RT_BASE */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + __run_posix_cpu_timers(tsk); +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. -- cgit v0.10.2 From f95a978392f74534158fead16aab64850bfd1e43 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: posix-timers: Shorten posix_cpu_timers/ kernel thread names Shorten the softirq kernel thread names because they always overflow the limited comm length, appearing as "posix_cpu_timer" CPU# times. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 15c4821..578189a 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1430,7 +1430,7 @@ static int posix_cpu_thread_call(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: p = kthread_create(posix_cpu_timers_thread, hcpu, - "posix_cpu_timers/%d",cpu); + "posixcputmr/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; -- cgit v0.10.2 From 613094dd9f151f22efc98c9e921d646ef3594e63 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:44 -0500 Subject: posix-timers: Avoid wakeups when no timers are active Waking the thread even when no timers are scheduled is useless. 
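The wakeup is therefore gated on a cheap check of the per-task and per-process expiry caches; a condensed equivalent of the __fastpath_timer_check() added below (sketch only):

static inline int timers_pending(struct task_struct *tsk)
{
	if (unlikely(tsk->exit_state))
		return 0;	/* ->signal/->sighand may be going away */
	return !task_cputime_zero(&tsk->cputime_expires) ||
	       !task_cputime_zero(&tsk->signal->cputime_expires);
}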
Signed-off-by: Thomas Gleixner diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 578189a..06692e8 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1387,6 +1387,21 @@ wait_to_die: return 0; } +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + void run_posix_cpu_timers(struct task_struct *tsk) { unsigned long cpu = smp_processor_id(); @@ -1399,7 +1414,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) tasklist = per_cpu(posix_timer_tasklist, cpu); /* check to see if we're already queued */ - if (!tsk->posix_timer_list) { + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { get_task_struct(tsk); if (tasklist) { tsk->posix_timer_list = tasklist; @@ -1411,9 +1426,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) tsk->posix_timer_list = tsk; } per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); } - /* XXX signal the thread somehow */ - wake_up_process(per_cpu(posix_timer_task, cpu)); } /* -- cgit v0.10.2 From 63daca0d9eda1b883705f66d738b45c554cb16ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 May 2011 16:59:16 +0200 Subject: sched-delay-put-task.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f1fa5..7b58175 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1623,6 +1623,9 @@ struct task_struct { #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head put_rcu; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ @@ -1813,6 +1816,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->put_rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1820,6 +1832,7 @@ static inline void put_task_struct(struct task_struct *t) if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/kernel/fork.c b/kernel/fork.c index fae9ab2..c2e0120 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,9 @@ static inline void put_signal_struct(struct signal_struct *sig) if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } - +#ifdef CONFIG_PREEMPT_RT_BASE +static +#endif void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -244,7 +246,18 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +#ifndef CONFIG_PREEMPT_RT_BASE EXPORT_SYMBOL_GPL(__put_task_struct); +#else +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); + + __put_task_struct(tsk); + +} +EXPORT_SYMBOL_GPL(__put_task_struct_cb); +#endif void __init __weak arch_task_cache_init(void) { } -- cgit v0.10.2 From 8e6176bd44fb853d7a0dd753bfa57484ecee31d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:12:51 +0200 Subject: sched-limit-nr-migrate.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a5c0bd0..26b009a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -272,7 +272,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured -- cgit v0.10.2 From 247c109c085c68d2bbff5376a981f7f96c04b6f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:20:33 +0200 Subject: sched-mmdrop-delayed.patch Needs thread context (pgd_lock) -> ifdeffed. 
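As with the preceding put_task_struct() change, the final teardown is pushed into an RCU callback so that the last reference can be dropped from atomic context on RT. The shape of the pattern, with an illustrative structure and names (not the kernel's types):

struct thing {
	atomic_t refcount;
	struct rcu_head rcu;
	/* ... payload ... */
};

static void thing_free_rcu(struct rcu_head *rhp)
{
	struct thing *t = container_of(rhp, struct thing, rcu);

	kfree(t);		/* heavy teardown runs in RCU callback context */
}

static inline void thing_put(struct thing *t)
{
	if (atomic_dec_and_test(&t->refcount))
		call_rcu(&t->rcu, thing_free_rcu);
}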
workqueues wont work with RT Signed-off-by: Thomas Gleixner diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b53bdf..6270199 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -440,6 +441,9 @@ struct mm_struct { int first_nid; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head delayed_drop; +#endif }; /* first nid will either be a valid NID or one of these values */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7b58175..9e2700c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2354,12 +2354,24 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ diff --git a/kernel/fork.c b/kernel/fork.c index c2e0120..591d2bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -616,6 +616,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT_BASE +/* + * RCU callback for delayed mm drop. Not strictly rcu, but we don't + * want another facility to make this work. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + /* * Decrement the use count and release all resources for an mm. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26b009a..7248fdb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1845,8 +1845,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -4833,6 +4837,8 @@ static int migration_cpu_stop(void *data) #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4845,7 +4851,12 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - mmdrop(mm); + + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. 
+ */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -5162,6 +5173,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: calc_load_migrate(rq); + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } break; #endif } -- cgit v0.10.2 From 60547234f5ae3160259b753271bd0243983ff023 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jun 2011 09:21:04 +0200 Subject: sched-rt-mutex-wakeup.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e2700c..40277fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1062,6 +1062,7 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x04 /* internal use, task got migrated */ +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 @@ -1238,6 +1239,7 @@ enum perf_event_task_context { struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + volatile long saved_state; /* saved state for "spinlock sleepers" */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ @@ -2250,6 +2252,7 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7248fdb..d5c225d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1438,8 +1438,25 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) + p->saved_state = TASK_RUNNING; + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1533,6 +1550,18 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. 
+ */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); -- cgit v0.10.2 From e2cc53780829dfb1f0f516ef9fd1180fe9641389 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 09:19:06 +0200 Subject: sched-might-sleep-do-not-account-rcu-depth.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 275aa3f1..d870976 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -182,6 +182,11 @@ void synchronize_rcu(void); * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) +#ifndef CONFIG_PREEMPT_RT_FULL +#define sched_rcu_preempt_depth() rcu_preempt_depth() +#else +static inline int sched_rcu_preempt_depth(void) { return 0; } +#endif #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -205,6 +210,8 @@ static inline int rcu_preempt_depth(void) return 0; } +#define sched_rcu_preempt_depth() rcu_preempt_depth() + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5c225d..5394bc9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7058,7 +7058,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v0.10.2 From ebc39bc73e25b8afa8dcb73d4f0ba1f3664744ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 11:25:03 +0200 Subject: sched-cond-resched.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5394bc9..d65d523 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4315,9 +4315,17 @@ static inline int should_resched(void) static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + do { + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. 
+ */ + barrier(); + + } while (need_resched()); } int __sched _cond_resched(void) -- cgit v0.10.2 From 318009722ed057a6f8be5d4df8299aa6d1aaff44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jul 2011 09:56:44 +0200 Subject: cond-resched-softirq-fix.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 40277fc..a3042d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,12 +2730,16 @@ extern int __cond_resched_lock(spinlock_t *lock); __cond_resched_lock(lock); \ }) +#ifndef CONFIG_PREEMPT_RT_FULL extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +#else +# define cond_resched_softirq() cond_resched() +#endif /* * Does a critical section need to be broken due to another diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d65d523..9e456ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4366,6 +4366,7 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -4379,6 +4380,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. -- cgit v0.10.2 From a747afb1d23efca5a3cc00c79fa4d2bd60f9e49e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:51:33 +0200 Subject: cond-resched-lock-rt-tweak.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index a3042d2..956e826 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2719,7 +2719,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT_COUNT +#if defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT_FULL) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 -- cgit v0.10.2 From 8a8e4af3ff1d34d5f89a1eb9b2254fe0b900dc88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Sep 2011 16:42:35 +0200 Subject: sched-disable-ttwu-queue.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b..e5de4db 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -57,11 +57,15 @@ SCHED_FEAT(OWNER_SPIN, true) */ SCHED_FEAT(NONTASK_POWER, true) +#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#else +SCHED_FEAT(TTWU_QUEUE, false) +#endif SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) -- cgit v0.10.2 From 32f4d7ce8d2935a6ed77e8e54a48514574948fea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:03:52 +0200 Subject: sched: Disable CONFIG_RT_GROUP_SCHED on RT Carsten reported problems when running: taskset 01 chrt -f 1 sleep 1 from within rc.local on a F15 machine. The task stays running and never gets on the run queue because some of the run queues have rt_throttled=1 which does not go away. Works nice from a ssh login shell. Disabling CONFIG_RT_GROUP_SCHED solves that as well. 
Signed-off-by: Thomas Gleixner diff --git a/init/Kconfig b/init/Kconfig index fbc0785..f6a406d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -946,6 +946,7 @@ config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on EXPERIMENTAL depends on CGROUP_SCHED + depends on !PREEMPT_RT_FULL default n help This feature lets you explicitly allocate real CPU bandwidth -- cgit v0.10.2 From b8d3ae988c39dc81f2d72a10cf431cc5098aea3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Dec 2011 21:42:19 +0100 Subject: sched: ttwu: Return success when only changing the saved_state value When a task blocks on a rt lock, it saves the current state in p->saved_state, so a lock related wake up will not destroy the original state. When a real wakeup happens, while the task is running due to a lock wakeup already, we update p->saved_state to TASK_RUNNING, but we do not return success, which might cause another wakeup in the waitqueue code and the task remains in the waitqueue list. Return success in that case as well. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e456ec..d89e1d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1445,8 +1445,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * if the wakeup condition is true. */ if (!(wake_flags & WF_LOCK_SLEEPER)) { - if (p->saved_state & state) + if (p->saved_state & state) { p->saved_state = TASK_RUNNING; + success = 1; + } } goto out; } -- cgit v0.10.2 From 1e05219e0ca80d8568dc39e3365646540ce75b65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:27 -0500 Subject: stop_machine: convert stop_machine_run() to PREEMPT_RT Instead of playing with non-preemption, introduce explicit startup serialization. This is more robust and cleaner as well. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e9..61779f8 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -135,6 +135,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); +static DEFINE_MUTEX(stopper_lock); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, @@ -153,15 +154,14 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, } /* - * Disable preemption while queueing to avoid getting - * preempted by a stopper which might wait for other stoppers - * to enter @fn which can lead to deadlock. + * Make sure that all work is queued on all cpus before we + * any of the cpus can execute it. 
*/ - preempt_disable(); + mutex_lock(&stopper_lock); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); - preempt_enable(); + mutex_unlock(&stopper_lock); } static int __stop_cpus(const struct cpumask *cpumask, @@ -275,6 +275,16 @@ repeat: __set_current_state(TASK_RUNNING); + /* + * Wait until the stopper finished scheduling on all + * cpus + */ + mutex_lock(&stopper_lock); + /* + * Let other cpu threads continue as well + */ + mutex_unlock(&stopper_lock); + /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); -- cgit v0.10.2 From b3960a79c9c8fa549c26bef16b7beffddbdb4399 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:53:19 +0200 Subject: stomp-machine-mark-stomper-thread.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 956e826..de395b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1859,6 +1859,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_STOMPER 0x00080000 /* I am a stomp machine thread */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 61779f8..484a335 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -327,6 +327,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); + p->flags |= PF_STOMPER; kthread_bind(p, cpu); sched_set_stop_task(cpu, p); stopper->thread = p; -- cgit v0.10.2 From d4fabb90d7fac82f92d17801cf5deb3b045aab28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 11:01:51 +0200 Subject: stomp-machine-raw-lock.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 484a335..561ba3a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -29,12 +29,12 @@ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ bool executed; /* actually executed? */ int ret; /* collected return value */ - struct completion completion; /* fired if nr_todo reaches 0 */ + struct task_struct *waiter; /* woken when nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? 
*/ struct list_head works; /* list of pending works */ struct task_struct *thread; /* stopper thread */ @@ -47,7 +47,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); - init_completion(&done->completion); + done->waiter = current; } /* signal completion unless @done is NULL */ @@ -56,8 +56,10 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) if (done) { if (executed) done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); + if (atomic_dec_and_test(&done->nr_todo)) { + wake_up_process(done->waiter); + done->waiter = NULL; + } } } @@ -67,7 +69,7 @@ static void cpu_stop_queue_work(struct cpu_stopper *stopper, { unsigned long flags; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); @@ -75,7 +77,23 @@ static void cpu_stop_queue_work(struct cpu_stopper *stopper, } else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); +} + +static void wait_for_stop_done(struct cpu_stop_done *done) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (atomic_read(&done->nr_todo)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + /* + * We need to wait until cpu_stop_signal_done() has cleared + * done->waiter. + */ + while (done->waiter) + cpu_relax(); + set_current_state(TASK_RUNNING); } /** @@ -109,7 +127,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -171,7 +189,7 @@ static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_init_done(&done, cpumask_weight(cpumask)); queue_stop_cpus_work(cpumask, fn, arg, &done); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -259,13 +277,13 @@ repeat: } work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -299,7 +317,13 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); + /* + * Make sure that the wakeup and setting done->waiter + * to NULL is atomic. 
+ */ + local_irq_disable(); cpu_stop_signal_done(done, true); + local_irq_enable(); } else schedule(); @@ -337,9 +361,9 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); stopper->enabled = true; - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); break; #ifdef CONFIG_HOTPLUG_CPU @@ -352,11 +376,11 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); list_for_each_entry(work, &stopper->works, list) cpu_stop_signal_done(work->done, false); stopper->enabled = false; - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); /* release the stopper */ put_task_struct(stopper->thread); stopper->thread = NULL; @@ -387,7 +411,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } @@ -581,7 +605,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, ret = stop_machine_cpu_stop(&smdata); /* Busy wait for completion. */ - while (!completion_done(&done.completion)) + while (atomic_read(&done.nr_todo)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); -- cgit v0.10.2 From bae8dfcdc6b8673633a89be65f918627a6f2f6f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 12:36:06 +0200 Subject: hotplug: Lightweight get online cpus get_online_cpus() is a heavyweight function which involves a global mutex. migrate_disable() wants a simpler construct which prevents only a CPU from going down while a task is in a migrate disabled section. Implement a per cpu lockless mechanism, which serializes only in the real unplug case on a global mutex. That serialization affects only tasks on the cpu which should be brought down. Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ce7a074..7781c9e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -175,6 +175,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void pin_current_cpu(void); +extern void unpin_current_cpu(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -198,6 +200,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +static inline void pin_current_cpu(void) { } +static inline void unpin_current_cpu(void) { } #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug.
*/ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a50..05e0381 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,6 +63,102 @@ static struct { .refcount = 0, }; +struct hotplug_pcp { + struct task_struct *unplug; + int refcount; + struct completion synced; +}; + +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); + +/** + * pin_current_cpu - Prevent the current cpu from being unplugged + * + * Lightweight version of get_online_cpus() to prevent cpu from being + * unplugged when code runs in a migration disabled region. + * + * Must be called with preemption disabled (preempt_count = 1)! + */ +void pin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + +retry: + if (!hp->unplug || hp->refcount || preempt_count() > 1 || + hp->unplug == current || (current->flags & PF_STOMPER)) { + hp->refcount++; + return; + } + preempt_enable(); + mutex_lock(&cpu_hotplug.lock); + mutex_unlock(&cpu_hotplug.lock); + preempt_disable(); + goto retry; +} + +/** + * unpin_current_cpu - Allow unplug of current cpu + * + * Must be called with preemption or interrupts disabled! + */ +void unpin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + + WARN_ON(hp->refcount <= 0); + + /* This is safe. sync_unplug_thread is pinned to this cpu */ + if (!--hp->refcount && hp->unplug && hp->unplug != current && + !(current->flags & PF_STOMPER)) + wake_up_process(hp->unplug); +} + +/* + * FIXME: Is this really correct under all circumstances ? + */ +static int sync_unplug_thread(void *data) +{ + struct hotplug_pcp *hp = data; + + preempt_disable(); + hp->unplug = current; + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + preempt_enable(); + complete(&hp->synced); + return 0; +} + +/* + * Start the sync_unplug_thread on the target cpu and wait for it to + * complete. + */ +static int cpu_unplug_begin(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + struct task_struct *tsk; + + init_completion(&hp->synced); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + if (IS_ERR(tsk)) + return (PTR_ERR(tsk)); + kthread_bind(tsk, cpu); + wake_up_process(tsk); + wait_for_completion(&hp->synced); + return 0; +} + +static void cpu_unplug_done(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + hp->unplug = NULL; +} + void get_online_cpus(void) { might_sleep(); @@ -260,13 +356,14 @@ static int __ref take_cpu_down(void *_param) /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err, nr_calls = 0; + int mycpu, err, nr_calls = 0; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { .mod = mod, .hcpu = hcpu, }; + cpumask_var_t cpumask; if (num_online_cpus() == 1) return -EBUSY; @@ -274,7 +371,20 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - cpu_hotplug_begin(); + /* Move the downtaker off the unplug cpu */ + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); + set_cpus_allowed_ptr(current, cpumask); + free_cpumask_var(cpumask); + preempt_disable(); + mycpu = smp_processor_id(); + if (mycpu == cpu) { + printk(KERN_ERR "Yuck! 
Still on unplug CPU\n!"); + preempt_enable(); + return -EBUSY; + } + preempt_enable(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -282,7 +392,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __func__, cpu); - goto out_release; + goto out_cancel; + } + + cpu_hotplug_begin(); + err = cpu_unplug_begin(cpu); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + printk("cpu_unplug_begin(%d) failed\n", cpu); + goto out_cancel; } smpboot_park_threads(cpu); @@ -314,6 +433,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_release: + cpu_unplug_done(cpu); +out_cancel: cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v0.10.2 From 12e93eb728c6165cbd237ac9162e001a8c963f1e Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:43 +0800 Subject: hotplug: sync_unplug: No "\n" in task name Otherwise the output will look a little odd. Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 05e0381..b06757c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -143,7 +143,7 @@ static int cpu_unplug_begin(unsigned int cpu) struct task_struct *tsk; init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); if (IS_ERR(tsk)) return (PTR_ERR(tsk)); kthread_bind(tsk, cpu); -- cgit v0.10.2 From 8daba52cac6e6246d08bb1da85f72b160cc7f3b7 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 28 Jul 2011 11:16:00 +0800 Subject: hotplug: Reread hotplug_pcp on pin_current_cpu() retry When retry happens, it's likely that the task has been migrated to another cpu (unless the unplug failed), but it still dereferences the original hotplug_pcp per cpu data. Update the pointer to hotplug_pcp in the retry path, so it points to the current cpu.
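For illustration, a minimal sketch of the hazard (hypothetical code, not the in-tree implementation): a per cpu pointer resolved before a preemptible window goes stale as soon as the task migrates.

	struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp);	/* resolved on CPU A */

	preempt_enable();			/* task may be migrated to CPU B here */
	mutex_lock(&cpu_hotplug.lock);
	mutex_unlock(&cpu_hotplug.lock);
	preempt_disable();

	hp->refcount++;				/* wrong: still touches CPU A's hotplug_pcp */

The fix below simply moves the __get_cpu_var() lookup under the retry: label, so the pointer is re-read after every such window.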
Signed-off-by: Yong Zhang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20110728031600.GA338@windriver.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index b06757c..3231d1c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -81,9 +81,11 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); */ void pin_current_cpu(void) { - struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + struct hotplug_pcp *hp; retry: + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->unplug || hp->refcount || preempt_count() > 1 || hp->unplug == current || (current->flags & PF_STOMPER)) { hp->refcount++; -- cgit v0.10.2 From e50b8707cb426a1eb02f5df26a148fa9a0ea3678 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jun 2011 13:26:08 +0200 Subject: sched-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5b46536..c5d2a36 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -108,6 +108,14 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef CONFIG_SMP +extern void migrate_disable(void); +extern void migrate_enable(void); +#else +# define migrate_disable() do { } while (0) +# define migrate_enable() do { } while (0) +#endif + #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() diff --git a/include/linux/sched.h b/include/linux/sched.h index de395b6..3021406 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1279,6 +1279,7 @@ struct task_struct { #endif unsigned int policy; + int migrate_disable; int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -1630,9 +1631,6 @@ struct task_struct { #endif }; -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) - #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int node, int pages, bool migrated); extern void set_numabalancing_state(bool enabled); @@ -2812,6 +2810,15 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) +{ + if (p->migrate_disable) + return cpumask_of(task_cpu(p)); + + return &p->cpus_allowed; +} + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d89e1d6..dfb1a35 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4734,11 +4734,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - + if (!p->migrate_disable) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -4789,7 +4790,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4808,6 +4809,83 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +void migrate_disable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + if (unlikely(!scheduler_running)) { + p->migrate_disable = 1; + preempt_enable(); + return; + } + rq = task_rq_lock(p, &flags); + p->migrate_disable = 1; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + task_rq_unlock(rq, p, &flags); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (p->migrate_disable > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(!scheduler_running)) { + p->migrate_disable = 0; + unpin_current_cpu(); + preempt_enable(); + return; + } + + rq = task_rq_lock(p, &flags); + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + + task_rq_unlock(rq, p, &flags); + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); + /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e5..0a846e7 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -39,9 +39,9 @@ notrace unsigned int debug_smp_processor_id(void) if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] " - "code: %s/%d\n", - preempt_count() - 1, current->comm, current->pid); + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x %08x] " + "code: %s/%d\n", preempt_count() - 1, + current->migrate_disable, current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); -- cgit v0.10.2 From 715f6c61ab0e2294ead1364ee54ca59c30d16ecd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:35:29 +0200 Subject: hotplug-use-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 3231d1c..3a931bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -379,14 +379,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); set_cpus_allowed_ptr(current, cpumask); free_cpumask_var(cpumask); - preempt_disable(); + migrate_disable(); mycpu = smp_processor_id(); if (mycpu == cpu) { printk(KERN_ERR "Yuck! 
Still on unplug CPU\n!"); - preempt_enable(); + migrate_enable(); return -EBUSY; } - preempt_enable(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -437,6 +436,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_unplug_done(cpu); out_cancel: + migrate_enable(); cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v0.10.2 From 32d0add0c33f1eb77f84093d4832112541ae3ffb Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:44 +0800 Subject: hotplug: Call cpu_unplug_begin() before DOWN_PREPARE cpu_unplug_begin() should be called before CPU_DOWN_PREPARE, because at CPU_DOWN_PREPARE cpu_active is cleared and sched_domain is rebuilt. Otherwise the 'sync_unplug' thread will be running on the cpu on which it's created and not bound on the cpu which is about to go down. I found that by an incorrect warning on smp_processor_id() called by sync_unplug/1, and trace shows below: (echo 1 > /sys/device/system/cpu/cpu1/online) bash-1664 [000] 83.136620: _cpu_down: Bind sync_unplug to cpu 1 bash-1664 [000] 83.136623: sched_wait_task: comm=sync_unplug/1 pid=1724 prio=120 bash-1664 [000] 83.136624: _cpu_down: Wake sync_unplug bash-1664 [000] 83.136629: sched_wakeup: comm=sync_unplug/1 pid=1724 prio=120 success=1 target_cpu=000 Wants to be folded back.... Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-3-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 3a931bb..7b5f4cb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -387,22 +387,20 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EBUSY; } - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + cpu_hotplug_begin(); + err = cpu_unplug_begin(cpu); if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + printk("cpu_unplug_begin(%d) failed\n", cpu); goto out_cancel; } - cpu_hotplug_begin(); - err = cpu_unplug_begin(cpu); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("cpu_unplug_begin(%d) failed\n", cpu); - goto out_cancel; + printk("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + goto out_release; } smpboot_park_threads(cpu); -- cgit v0.10.2 From 04afba88ec7c030c1ff81a80bb418d761f5fd320 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:56:42 +0200 Subject: ftrace-migrate-disable-tracing.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index a3d4895..806c02b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -49,7 +49,8 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int padding; + unsigned short migrate_disable; + unsigned short padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46..acb499e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1177,6 +1177,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + + entry->migrate_disable = (tsk) ? 
tsk->migrate_disable & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -2034,9 +2036,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| / _--=> migrate-disable\n"); + seq_puts(m, "# ||||| / delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_array *tr, struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d..dbd6ae1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,6 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned short, migrate_disable); __common_field(int, padding); return ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d796..e36737f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -593,6 +593,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) + ret = trace_seq_printf(s, "%x", entry->migrate_disable); + else + ret = trace_seq_putc(s, '.'); + return ret; } -- cgit v0.10.2 From caefdd54e349a40a44045a15b48a2f9075e6c6bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Nov 2011 13:19:35 -0500 Subject: tracing: Show padding as unsigned short RT added two bytes to trace migrate disable counting to the trace events and used two bytes of the padding to make the change. The structures and all were updated correctly, but the display in the event formats was not: cat /debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 51 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned short common_migrate_disable; offset:8; size:2; signed:0; field:int common_padding; offset:10; size:2; signed:0; The field for common_padding has the correct size and offset, but the use of "int" might confuse some parsers (and people that are reading it). This needs to be changed to "unsigned short". 
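To illustrate why the stale type string matters, here is a hypothetical userspace check (a sketch only, not taken from trace-cmd or any real parser) that derives the field width from the C type and therefore disagrees with the recorded size:2 at offset:10:

	#include <stdio.h>
	#include <string.h>

	struct format_field { const char *name; const char *type; int offset; int size; };

	/* warn when the declared C type and the recorded size disagree */
	static void check_common_field(const struct format_field *f)
	{
		if (strcmp(f->type, "int") == 0 && f->size != (int)sizeof(int))
			fprintf(stderr, "size mismatch for %s: type says %zu, format says %d\n",
				f->name, sizeof(int), f->size);
	}

With the padding declared as "unsigned short", type, size and offset are mutually consistent and such a check stays quiet.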
Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/1321467575.4181.36.camel@frodo Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index dbd6ae1..a45a22d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -117,7 +117,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, preempt_count); __common_field(int, pid); __common_field(unsigned short, migrate_disable); - __common_field(int, padding); + __common_field(unsigned short, padding); return ret; } -- cgit v0.10.2 From ec22901a5db1de828f5fd055c3178211a16728ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:48:20 +0200 Subject: migrate-disable-rt-variant.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index c5d2a36..714a08c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -121,11 +121,15 @@ extern void migrate_enable(void); # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() do { } while (0) # define preempt_enable_nort() do { } while (0) +# define migrate_disable_rt() migrate_disable() +# define migrate_enable_rt() migrate_enable() #else # define preempt_disable_rt() do { } while (0) # define preempt_enable_rt() do { } while (0) # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() +# define migrate_disable_rt() do { } while (0) +# define migrate_enable_rt() do { } while (0) #endif #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v0.10.2 From c1dbc939c9b46bb2001f6580dc4def865ff6d96c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:03:35 +0200 Subject: sched: Optimize migrate_disable Change from task_rq_lock() to raw_spin_lock(&rq->lock) to avoid a few atomic ops. See comment on why it should be safe. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-cbz6hkl5r5mvwtx5s3tor2y6@git.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dfb1a35..321f079 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4829,7 +4829,19 @@ void migrate_disable(void) preempt_enable(); return; } - rq = task_rq_lock(p, &flags); + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * it current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Taking rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 1; mask = tsk_cpus_allowed(p); @@ -4840,7 +4852,7 @@ void migrate_disable(void) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); preempt_enable(); } EXPORT_SYMBOL(migrate_disable); @@ -4868,7 +4880,11 @@ void migrate_enable(void) return; } - rq = task_rq_lock(p, &flags); + /* + * See comment in migrate_disable(). 
+ */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 0; mask = tsk_cpus_allowed(p); @@ -4880,7 +4896,7 @@ void migrate_enable(void) p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); unpin_current_cpu(); preempt_enable(); } -- cgit v0.10.2 From ef9eac1a1d7a8c33fcec62b5c74ba574598919ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:14:58 +0200 Subject: sched: Generic migrate_disable Make migrate_disable() be a preempt_disable() for !rt kernels. This allows generic code to use it but still enforces that these code sections stay relatively small. A preemptible migrate_disable() accessible for general use would allow people growing arbitrary per-cpu crap instead of clean these things up. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-275i87sl8e1jcamtchmehonm@git.kernel.org diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 714a08c..5af46d0 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -108,28 +108,25 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ -#ifdef CONFIG_SMP -extern void migrate_disable(void); -extern void migrate_enable(void); -#else -# define migrate_disable() do { } while (0) -# define migrate_enable() do { } while (0) -#endif - #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() do { } while (0) # define preempt_enable_nort() do { } while (0) -# define migrate_disable_rt() migrate_disable() -# define migrate_enable_rt() migrate_enable() +# ifdef CONFIG_SMP + extern void migrate_disable(void); + extern void migrate_enable(void); +# else /* CONFIG_SMP */ +# define migrate_disable() do { } while (0) +# define migrate_enable() do { } while (0) +# endif /* CONFIG_SMP */ #else # define preempt_disable_rt() do { } while (0) # define preempt_enable_rt() do { } while (0) # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() -# define migrate_disable_rt() do { } while (0) -# define migrate_enable_rt() do { } while (0) +# define migrate_disable() preempt_disable() +# define migrate_enable() preempt_enable() #endif #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/sched.h b/include/linux/sched.h index 3021406..6efcd19 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1279,7 +1279,9 @@ struct task_struct { #endif unsigned int policy; +#ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +#endif int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -2810,11 +2812,22 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline int __migrate_disabled(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + return p->migrate_disable; +#else + return 0; +#endif +} + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RT_FULL if (p->migrate_disable) return cpumask_of(task_cpu(p)); +#endif return &p->cpus_allowed; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 8624321..9e3255b 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -218,13 +218,8 @@ static inline void kick_all_cpus_sync(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#ifndef CONFIG_PREEMPT_RT_FULL -# define get_cpu_light() get_cpu() -# define put_cpu_light() put_cpu() -#else -# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) -# define put_cpu_light() migrate_enable() -#endif +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +#define put_cpu_light() migrate_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 321f079..22e5f9d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4734,7 +4734,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!p->migrate_disable) { + if (!__migrate_disabled(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4790,7 +4790,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4809,6 +4809,7 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +#ifdef CONFIG_PREEMPT_RT_FULL void migrate_disable(void) { struct task_struct *p = current; @@ -4901,6 +4902,7 @@ void migrate_enable(void) preempt_enable(); } EXPORT_SYMBOL(migrate_enable); +#endif /* CONFIG_PREEMPT_RT_FULL */ /* * Move (not current) task off this cpu, onto dest cpu. We're doing diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acb499e..3a0f978 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1178,7 +1178,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); - entry->migrate_disable = (tsk) ? tsk->migrate_disable & 0xFF : 0; + entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 0a846e7..dbb1570 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -41,7 +41,7 @@ notrace unsigned int debug_smp_processor_id(void) printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x %08x] " "code: %s/%d\n", preempt_count() - 1, - current->migrate_disable, current->comm, current->pid); + __migrate_disabled(current), current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); -- cgit v0.10.2 From 299095bdcc7a7a040b6c6895ffaf0ec8c3b055fd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 23 Aug 2011 16:12:43 +0200 Subject: sched, rt: Fix migrate_enable() thinko Assigning mask = tsk_cpus_allowed(p) after p->migrate_disable = 0 ensures that we won't see a mask change.. no push/pull, we stack tasks on one CPU. Also add a couple fields to sched_debug for the next guy. [ Build fix from Stratos Psomadakis ] Signed-off-by: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1314108763.6689.4.camel@marge.simson.net Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22e5f9d..3fa81f5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4886,12 +4886,14 @@ void migrate_enable(void) */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 0; mask = tsk_cpus_allowed(p); + p->migrate_disable = 0; WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); if (!cpumask_equal(&p->cpus_allowed, mask)) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c..2cac500 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -253,6 +253,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) P(rt_throttled); PN(rt_time); PN(rt_runtime); +#ifdef CONFIG_SMP + P(rt_nr_migratory); +#endif #undef PN #undef P @@ -507,6 +510,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); P(policy); P(prio); +#ifdef CONFIG_PREEMPT_RT_FULL + P(migrate_disable); +#endif + P(nr_cpus_allowed); #undef PN #undef __PN #undef P -- cgit v0.10.2 From 5236188f0403219120e7148e9a806210b06118d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:27 +0200 Subject: sched: teach migrate_disable about atomic contexts [] spin_bug+0x94/0xa8 [] do_raw_spin_lock+0x43/0xea [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? migrate_disable+0x75/0x12d [] ? pin_current_cpu+0x36/0xb0 [] migrate_disable+0x75/0x12d [] pagefault_disable+0xe/0x1f [] copy_from_user_nmi+0x74/0xe6 [] perf_callchain_user+0xf3/0x135 Now clearly we can't go around taking locks from NMI context, cure this by short-circuiting migrate_disable() when we're in an atomic context already. 
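Reduced to a sketch, the cure looks like this (simplified; the actual patch below additionally maintains a CONFIG_SCHED_DEBUG counter so unbalanced calls can be spotted):

	void migrate_disable(void)
	{
		if (in_atomic())	/* NMI/hardirq/preempt-off: the task cannot migrate anyway, */
			return;		/* and pin_current_cpu()/rq->lock must not be taken from here */

		/* ... regular slow path: pin_current_cpu(), update the affinity mask ... */
	}

migrate_enable() gets the mirror check, so the pair stays balanced in atomic context as well.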
Add some extra debugging to avoid things like: preempt_disable() migrate_disable(); preempt_enable(); migrate_enable(); Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967297.1301.14.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-wbot4vsmwhi8vmbf83hsclk6@git.kernel.org diff --git a/include/linux/sched.h b/include/linux/sched.h index 6efcd19..4c758c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1281,6 +1281,9 @@ struct task_struct { unsigned int policy; #ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +# ifdef CONFIG_SCHED_DEBUG + int migrate_disable_atomic; +# endif #endif int nr_cpus_allowed; cpumask_t cpus_allowed; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3fa81f5..c111f13 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4817,6 +4817,17 @@ void migrate_disable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + preempt_disable(); if (p->migrate_disable) { p->migrate_disable++; @@ -4865,6 +4876,16 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif WARN_ON_ONCE(p->migrate_disable <= 0); preempt_disable(); -- cgit v0.10.2 From fda8ca447dcb7f15f6409a856586d446ec690fe1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Sep 2011 08:40:23 -0400 Subject: sched: Postpone actual migration disalbe to schedule The migrate_disable() can cause a bit of a overhead to the RT kernel, as changing the affinity is expensive to do at every lock encountered. As a running task can not migrate, the actual disabling of migration does not need to occur until the task is about to schedule out. In most cases, a task that disables migration will enable it before it schedules making this change improve performance tremendously. [ Frank Rowand: UP compile fix ] Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124422.779693167@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c111f13..f5dadee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2848,6 +2848,135 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) + +static inline void update_migrate_disable(struct task_struct *p) +{ + const struct cpumask *mask; + + if (likely(!p->migrate_disable)) + return; + + /* Did we already update affinity? */ + if (unlikely(migrate_disabled_updated(p))) + return; + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * is current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. 
+ * + * Having rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; + } +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (migrate_disable_count(p) > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(migrate_disabled_updated(p))) { + /* + * See comment in update_migrate_disable() about locking. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); + mask = tsk_cpus_allowed(p); + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. + */ + p->migrate_disable = 0; + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else + p->migrate_disable = 0; + + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); +#else +static inline void update_migrate_disable(struct task_struct *p) { } +#define migrate_disabled_updated(p) 0 +#endif + static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) @@ -2941,6 +3070,8 @@ need_resched: raw_spin_lock_irq(&rq->lock); + update_migrate_disable(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -4734,7 +4865,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!__migrate_disabled(p)) { + if (!migrate_disabled_updated(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4809,124 +4940,6 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -#ifdef CONFIG_PREEMPT_RT_FULL -void migrate_disable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - 
WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - - preempt_disable(); - if (p->migrate_disable) { - p->migrate_disable++; - preempt_enable(); - return; - } - - pin_current_cpu(); - if (unlikely(!scheduler_running)) { - p->migrate_disable = 1; - preempt_enable(); - return; - } - - /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * it current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. - * - * Taking rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 1; - mask = tsk_cpus_allowed(p); - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_disable); - -void migrate_enable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - WARN_ON_ONCE(p->migrate_disable <= 0); - - preempt_disable(); - if (p->migrate_disable > 1) { - p->migrate_disable--; - preempt_enable(); - return; - } - - if (unlikely(!scheduler_running)) { - p->migrate_disable = 0; - unpin_current_cpu(); - preempt_enable(); - return; - } - - /* - * See comment in migrate_disable(). - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); - p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - - raw_spin_unlock_irqrestore(&rq->lock, flags); - unpin_current_cpu(); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_enable); -#endif /* CONFIG_PREEMPT_RT_FULL */ - /* * Move (not current) task off this cpu, onto dest cpu. 
We're doing * this because either it can't run here any more (set_cpus_allowed() -- cgit v0.10.2 From 04ffc10c13bc154fa797666fd28c1e5bde7f2690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:24 -0400 Subject: sched: Do not compare cpu masks in scheduler Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.128129033@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5dadee..d80031c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2876,16 +2876,12 @@ static inline void update_migrate_disable(struct task_struct *p) */ mask = tsk_cpus_allowed(p); - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; - } + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; } void migrate_disable(void) -- cgit v0.10.2 From 206e294f11d5043f5567636e9401cb034ba4841d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:25 -0400 Subject: sched: Have migrate_disable ignore bounded threads Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.567944215@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d80031c..3ee8829 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2888,7 +2888,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_THREAD_BOUND) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2919,7 +2919,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_THREAD_BOUND) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -2940,26 +2940,21 @@ void migrate_enable(void) if (unlikely(migrate_disabled_updated(p))) { /* - * See comment in update_migrate_disable() about locking. + * Undo whatever update_migrate_disable() did, also see there + * about locking. */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); + /* * Clearing migrate_disable causes tsk_cpus_allowed to * show the tasks original cpu affinity. 
*/ p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); raw_spin_unlock_irqrestore(&rq->lock, flags); } else p->migrate_disable = 0; -- cgit v0.10.2 From 80304b397d165b9f6f742a719292d607e0d0daa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Nov 2011 20:48:36 +0100 Subject: sched-clear-pf-thread-bound-on-fallback-rq.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3ee8829..7d852af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1192,6 +1192,12 @@ out: } } + /* + * Clear PF_THREAD_BOUND, otherwise we wreckage + * migrate_disable/enable. See optimization for + * PF_THREAD_BOUND tasks there. + */ + p->flags &= ~PF_THREAD_BOUND; return dest_cpu; } -- cgit v0.10.2 From 7b0dd3249c498b6f67f7d222d5f23a2185ab453a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 16:29:27 +0200 Subject: net-netif_rx_ni-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index 2edfcf8..fad0a35 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3115,11 +3115,11 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); + migrate_disable(); err = netif_rx(skb); if (local_softirq_pending()) thread_do_softirq(); - preempt_enable(); + migrate_enable(); return err; } -- cgit v0.10.2 From cd71086eab7dfc8fceb61553fd4059d16ca64e4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 13:16:38 -0500 Subject: softirq: Sanitize softirq pending for NOHZ/RT Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6fb89c4..89f1075 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -455,6 +455,8 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void softirq_check_pending_idle(void); + /* This is the worklist that queues up per-cpu softirq work. * * send_remote_sendirq() adds work to these lists, and diff --git a/kernel/softirq.c b/kernel/softirq.c index 18784bb..46bf4f2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -62,6 +62,69 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_NO_HZ +# ifdef CONFIG_PREEMPT_RT_FULL +/* + * On preempt-rt a softirq might be blocked on a lock. There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefor we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. 
+ */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending; + + if (rate_limit >= 10) + return; + + pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + if (pending) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd); + /* + * The wakeup code in rtmutex.c wakes up the task + * _before_ it sets pi_blocked_on to NULL under + * tsk->pi_lock. So we need to check for both: state + * and pi_blocked_on. + */ + raw_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) + warnpending = 1; + + raw_spin_unlock(&tsk->pi_lock); + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + pending); + rate_limit++; + } +} +# else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} +# endif +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5fe688c..626b320f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -481,14 +481,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10 && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + softirq_check_pending_idle(); return false; } -- cgit v0.10.2 From 44eb905c3d2d94cbdf1593476c62cfe5e64630bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 18:51:23 +0200 Subject: lockdep-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 37b13c4..a52b35d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -25,8 +25,6 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -39,9 +37,15 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) +# define INIT_TRACE_IRQFLAGS +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +#else # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) -# define INIT_TRACE_IRQFLAGS #endif #if defined(CONFIG_IRQSOFF_TRACER) || \ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b..7f03801 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3534,6 +3534,7 @@ static void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT_FULL /* * We dont accurately track softirq state in e.g. 
* hardirq contexts (such as on 4KSTACKS), so only @@ -3548,6 +3549,7 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); -- cgit v0.10.2 From 1c9d6ad3e1c556dd43cfc3b831b7eff79dc875db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:51:45 +0200 Subject: mutex-no-spin-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d1..f60af90 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -222,4 +222,4 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES + depends on SMP && !DEBUG_MUTEXES && !PREEMPT_RT_FULL -- cgit v0.10.2 From cde2be3a7c7b4e3f0b4f9ac34460b5ef0a8217ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:57:18 +0200 Subject: softirq-local-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 624ef3f..2ae0a9d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,7 +61,11 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#ifndef CONFIG_PREEMPT_RT_FULL +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#else +# define SOFTIRQ_DISABLE_OFFSET (0) +#endif #ifndef PREEMPT_ACTIVE #define PREEMPT_ACTIVE_BITS 1 @@ -74,10 +78,17 @@ #endif #define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | NMI_MASK)) +#ifndef CONFIG_PREEMPT_RT_FULL +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +#else +# define softirq_count() (0U) +extern int in_serving_softirq(void); +#endif + /* * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? @@ -87,7 +98,6 @@ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 89f1075..77ed66d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -447,7 +447,13 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); + +#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } +#else +extern void thread_do_softirq(void); +#endif + extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); @@ -634,6 +640,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) tasklet_kill(&ttimer->tasklet); } +#ifdef CONFIG_PREEMPT_RT_FULL +extern void softirq_early_init(void); +#else +static inline void softirq_early_init(void) { } +#endif + /* * Autoprobing for irqs: * diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c758c5..8589292 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1633,6 +1633,7 @@ struct task_struct { #endif #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; + int softirq_nestcnt; #endif }; diff --git a/init/main.c b/init/main.c index a6766db..4df397d 100644 --- a/init/main.c +++ b/init/main.c @@ -493,6 +493,7 @@ asmlinkage void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ + softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 46bf4f2..4cdca2e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -168,6 +169,7 @@ static void handle_pending_softirqs(u32 pending, int cpu) local_irq_disable(); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -360,6 +362,162 @@ asmlinkage void do_softirq(void) #endif +static inline void local_bh_disable_nort(void) { local_bh_disable(); } +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } + +#else /* !PREEMPT_RT_FULL */ + +/* + * On RT we serialize softirq execution with a cpu local lock + */ +static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); + +static void __do_softirq(void); + +void __init softirq_early_init(void) +{ + local_irq_lock_init(local_softirq_lock); +} + +void local_bh_disable(void) +{ + migrate_disable(); + current->softirq_nestcnt++; +} +EXPORT_SYMBOL(local_bh_disable); + +void local_bh_enable(void) +{ + if (WARN_ON(current->softirq_nestcnt == 0)) + return; + + if ((current->softirq_nestcnt == 1) && + local_softirq_pending() && + local_trylock(local_softirq_lock)) { + + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); + local_unlock(local_softirq_lock); + WARN_ON(current->softirq_nestcnt != 1); + } + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ + local_bh_enable(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + +/* For tracing */ +int notrace __in_softirq(void) +{ + if (__get_cpu_var(local_softirq_lock).owner == current) + return __get_cpu_var(local_softirq_lock).nestcnt; + return 0; +} + +int in_serving_softirq(void) +{ + int res; + + preempt_disable(); + res = __get_cpu_var(local_softirq_runner) == current; + preempt_enable(); + return res; +} + +/* + * Called with bh and 
local interrupts disabled. For full RT cpu must + * be pinned. + */ +static void __do_softirq(void) +{ + u32 pending = local_softirq_pending(); + int cpu = smp_processor_id(); + + current->softirq_nestcnt++; + + /* Reset the pending bitmask before enabling irqs */ + set_softirq_pending(0); + + __get_cpu_var(local_softirq_runner) = current; + + lockdep_softirq_enter(); + + handle_pending_softirqs(pending, cpu); + + pending = local_softirq_pending(); + if (pending) + wakeup_softirqd(); + + lockdep_softirq_exit(); + __get_cpu_var(local_softirq_runner) = NULL; + + current->softirq_nestcnt--; +} + +static int __thread_do_softirq(int cpu) +{ + /* + * Prevent the current cpu from going offline. + * pin_current_cpu() can reenable preemption and block on the + * hotplug mutex. When it returns, the current cpu is + * pinned. It might be the wrong one, but the offline check + * below catches that. + */ + pin_current_cpu(); + /* + * If called from ksoftirqd (cpu >= 0) we need to check + * whether we are on the wrong cpu due to cpu offlining. If + * called via thread_do_softirq() no action required. + */ + if (cpu >= 0 && cpu_is_offline(cpu)) { + unpin_current_cpu(); + return -1; + } + preempt_enable(); + local_lock(local_softirq_lock); + local_irq_disable(); + /* + * We cannot switch stacks on RT as we want to be able to + * schedule! + */ + if (local_softirq_pending()) + __do_softirq(); + local_unlock(local_softirq_lock); + unpin_current_cpu(); + preempt_disable(); + local_irq_enable(); + return 0; +} + +/* + * Called from netif_rx_ni(). Preemption enabled. + */ +void thread_do_softirq(void) +{ + if (!in_serving_softirq()) { + preempt_disable(); + __thread_do_softirq(-1); + preempt_enable(); + } +} + +static int ksoftirqd_do_softirq(int cpu) +{ + return __thread_do_softirq(cpu); +} + +static inline void local_bh_disable_nort(void) { } +static inline void _local_bh_enable_nort(void) { } + +#endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. */ @@ -373,9 +531,9 @@ void irq_enter(void) * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. */ - local_bh_disable(); + local_bh_disable_nort(); tick_check_idle(cpu); - _local_bh_enable(); + _local_bh_enable_nort(); } __irq_enter(); @@ -383,6 +541,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) { #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED __do_softirq(); @@ -395,6 +554,9 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } +#else + wakeup_softirqd(); +#endif } /* -- cgit v0.10.2 From ba4b90074c24413febf1c542aab2245dca450a3a Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 14 Nov 2011 02:44:43 +0100 Subject: softirq: Export in_serving_softirq() ERROR: "in_serving_softirq" [net/sched/cls_cgroup.ko] undefined! The above can be fixed by exporting in_serving_softirq Signed-off-by: John Kacur Cc: Paul McKenney Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/1321235083-21756-2-git-send-email-jkacur@redhat.com Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 4cdca2e..bca3e64 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -431,6 +431,7 @@ int in_serving_softirq(void) preempt_enable(); return res; } +EXPORT_SYMBOL(in_serving_softirq); /* * Called with bh and local interrupts disabled. 
For full RT cpu must -- cgit v0.10.2 From 2f5ab9420943bfce5d721306b36b87b3459ccbb9 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 13 Oct 2011 17:19:09 +0800 Subject: hardirq.h: Define softirq_count() as OUL to kill build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/lockdep.c: In function ‘print_bad_irq_dependency’: kernel/lockdep.c:1476:3: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 7 has type ‘unsigned int’ kernel/lockdep.c: In function ‘print_usage_bug’: kernel/lockdep.c:2193:3: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 7 has type ‘unsigned int’ kernel/lockdep.i show this: printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, ((current_thread_info()->preempt_count) & (((1UL << (10))-1) << ((0 + 8) + 8))) >> ((0 + 8) + 8), curr->softirq_context, (0U) >> (0 + 8), curr->hardirqs_enabled, curr->softirqs_enabled); Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/20111013091909.GA32739@zhy Signed-off-by: Thomas Gleixner diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 2ae0a9d..dfa97de 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -85,7 +85,7 @@ # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #else -# define softirq_count() (0U) +# define softirq_count() (0UL) extern int in_serving_softirq(void); #endif -- cgit v0.10.2 From b6d336ed51b97bc5ee0e2c900ecfef371e19c125 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 13:59:17 +0200 Subject: softirq-disable-softirq-stacks-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 71413f4..bb73a2e 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -584,6 +584,7 @@ void irq_ctx_init(void) } } +#ifndef CONFIG_PREEMPT_RT_FULL static inline void do_softirq_onstack(void) { struct thread_info *curtp, *irqtp; @@ -620,6 +621,7 @@ void do_softirq(void) local_irq_restore(flags); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 19e096b..313078b 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -36,6 +36,7 @@ .text +#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) @@ -46,6 +47,7 @@ _GLOBAL(call_do_softirq) lwz r0,4(r1) mtlr r0 blr +#endif _GLOBAL(call_handle_irq) mflr r0 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 5cfa800..39b77e8 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -29,6 +29,7 @@ .text +#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 std r0,16(r1) @@ -39,6 +40,7 @@ _GLOBAL(call_do_softirq) ld r0,16(r1) mtlr r0 blr +#endif _GLOBAL(call_handle_irq) ld r8,0(r6) diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 063af10..ae4b141 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void) { unsigned long flags; @@ -191,6 +192,7 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } +#endif #else static inline void handle_one_irq(unsigned int irq) { diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 9bcbbe2..1bc5cd8 100644 --- 
a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -698,6 +698,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq(void) { unsigned long flags; @@ -723,6 +724,7 @@ void do_softirq(void) local_irq_restore(flags); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cb3c591..3c06da6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1337,6 +1337,7 @@ bad_gs: jmp 2b .previous +#ifndef CONFIG_PREEMPT_RT_FULL /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC @@ -1356,6 +1357,7 @@ ENTRY(call_softirq) ret CFI_ENDPROC END(call_softirq) +#endif #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 344faf8..f60ecc0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -149,6 +149,7 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void) { unsigned long flags; @@ -179,6 +180,7 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } +#endif bool handle_irq(unsigned irq, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d04d3ec..831f247 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -88,7 +88,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - +#ifndef CONFIG_PREEMPT_RT_FULL extern void call_softirq(void); asmlinkage void do_softirq(void) @@ -108,3 +108,4 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } +#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 77ed66d..a103ef8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -445,10 +445,9 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); - -#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } #else extern void thread_do_softirq(void); -- cgit v0.10.2 From 01b8de7ad30dd000532acebf4b4684fa6e620270 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:06:43 +0200 Subject: softirq-make-fifo.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index bca3e64..7a19af0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -364,6 +364,8 @@ asmlinkage void do_softirq(void) static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } +static void ksoftirqd_set_sched_params(unsigned int cpu) { } +static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ @@ -518,6 +520,20 @@ static int ksoftirqd_do_softirq(int cpu) static inline void local_bh_disable_nort(void) { } static inline void _local_bh_enable_nort(void) { } +static inline void ksoftirqd_set_sched_params(unsigned int cpu) +{ + struct sched_param param = { .sched_priority = 1 }; + + sched_setscheduler(current, SCHED_FIFO, ¶m); +} + +static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_NORMAL, ¶m); +} + #endif /* PREEMPT_RT_FULL */ 
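/*
 * Editor's illustration, not part of the patch: the setup/cleanup hooks
 * above apply the usual in-kernel pattern for switching a kthread between
 * SCHED_FIFO and SCHED_NORMAL. A minimal sketch of the same pattern for an
 * arbitrary worker thread ("my_thread_fn" and the thread name are made up;
 * the kthread and scheduler calls are the regular APIs of this kernel):
 */
#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* placeholder work loop */
	return 0;
}

static struct task_struct *start_fifo_worker(void)
{
	struct sched_param param = { .sched_priority = 1 };
	struct task_struct *tsk;

	tsk = kthread_run(my_thread_fn, NULL, "my_fifo_worker");
	if (!IS_ERR(tsk))
		sched_setscheduler(tsk, SCHED_FIFO, &param);	/* prio 1, as for ksoftirqd above */
	return tsk;
}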
/* * Enter an interrupt context. @@ -1065,6 +1081,8 @@ static struct notifier_block __cpuinitdata cpu_nfb = { static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, + .setup = ksoftirqd_set_sched_params, + .cleanup = ksoftirqd_clr_sched_params, .thread_should_run = ksoftirqd_should_run, .thread_fn = run_ksoftirqd, .thread_comm = "ksoftirqd/%u", -- cgit v0.10.2 From 53b052a24a0d8daf924b0023a1ec108d2ceca5ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Nov 2011 20:18:22 -0500 Subject: tasklet: Prevent tasklets from going into infinite spin in RT When CONFIG_PREEMPT_RT_FULL is enabled, tasklets run as threads, and spinlocks turn are mutexes. But this can cause issues with tasks disabling tasklets. A tasklet runs under ksoftirqd, and if a tasklets are disabled with tasklet_disable(), the tasklet count is increased. When a tasklet runs, it checks this counter and if it is set, it adds itself back on the softirq queue and returns. The problem arises in RT because ksoftirq will see that a softirq is ready to run (the tasklet softirq just re-armed itself), and will not sleep, but instead run the softirqs again. The tasklet softirq will still see that the count is non-zero and will not execute the tasklet and requeue itself on the softirq again, which will cause ksoftirqd to run it again and again and again. It gets worse because ksoftirqd runs as a real-time thread. If it preempted the task that disabled tasklets, and that task has migration disabled, or can't run for other reasons, the tasklet softirq will never run because the count will never be zero, and ksoftirqd will go into an infinite loop. As an RT task, it this becomes a big problem. This is a hack solution to have tasklet_disable stop tasklets, and when a tasklet runs, instead of requeueing the tasklet softirqd it delays it. When tasklet_enable() is called, and tasklets are waiting, then the tasklet_enable() will kick the tasklets to continue. This prevents the lock up from ksoftirq going into an infinite loop. [ rostedt@goodmis.org: ported to 3.0-rt ] Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a103ef8..8784459 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -502,8 +502,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu, to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
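/*
 * Editor's illustration, not part of the patch: the driver-side pattern the
 * change below is concerned with. tasklet_disable() bumps t->count; on
 * mainline a scheduled-but-disabled tasklet is requeued by the softirq,
 * while with this patch on RT it is parked via TASKLET_STATE_PENDING and
 * tasklet_enable() reschedules it. The "my_*" names are invented; the calls
 * are the stock <linux/interrupt.h> tasklet API of this kernel version.
 */
#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)
{
	/* deferred work; on RT this runs in the threaded softirq */
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	tasklet_schedule(&my_tasklet);		/* sets TASKLET_STATE_SCHED */
	return IRQ_HANDLED;
}

static void my_reconfigure(void)
{
	tasklet_disable(&my_tasklet);		/* also waits for a running instance */
	/* ... modify state shared with the tasklet ... */
	tasklet_enable(&my_tasklet);		/* RT: kicks it if it was marked PENDING */
}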
@@ -528,27 +529,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -597,17 +607,8 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index 7a19af0..fcf6997 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -648,15 +649,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. 
Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -667,10 +698,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } @@ -678,50 +706,119 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_hi_schedule(t); } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. 
+ * We only allow the STATE_RUN -> 0 transition here. + */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -732,29 +829,7 @@ static void tasklet_hi_action(struct softirq_action *a) __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } @@ -777,7 +852,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - yield(); + msleep(1); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -983,6 +1058,23 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT_FULL + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); +#endif + static int ksoftirqd_should_run(unsigned int cpu) { return local_softirq_pending(); -- cgit v0.10.2 From 843c16311744fb8ae6c3b547e15ef209830aa828 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2012 13:01:27 +0100 Subject: genirq: Allow disabling of softirq processing in irq thread context The processing of softirqs in irq thread context is a performance gain for the non-rt workloads of a system, but it's counterproductive for interrupts which are explicitely related to the realtime workload. Allow such interrupts to prevent softirq processing in their thread context. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8784459..11bdb1e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -58,6 +58,7 @@ * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. 
+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT) */ #define IRQF_DISABLED 0x00000020 #define IRQF_SHARED 0x00000080 @@ -71,6 +72,7 @@ #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 +#define IRQF_NO_SOFTIRQ_CALL 0x00040000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/include/linux/irq.h b/include/linux/irq.h index fdf2c4a..3929bbe 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -70,6 +70,7 @@ typedef void (*irq_preflow_handler_t)(struct irq_data *data); * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_TRHEAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT) */ enum { IRQ_TYPE_NONE = 0x00000000, @@ -94,12 +95,14 @@ enum { IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), + IRQ_NO_SOFTIRQ_CALL = (1 << 18), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ - IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID) + IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ + IRQ_NO_SOFTIRQ_CALL) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 902f880..5bc2f41 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -782,7 +782,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action); - local_bh_enable(); + /* + * Interrupts which have real time requirements can be set up + * to avoid softirq processing in the thread handler. This is + * safe as these interrupts do not raise soft interrupts. 
+ */ + if (irq_settings_no_softirq_call(desc)) + _local_bh_enable(); + else + local_bh_enable(); return ret; } @@ -1127,6 +1135,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } + if (new->flags & IRQF_NO_SOFTIRQ_CALL) + irq_settings_set_no_softirq_call(desc); + /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f10..0d2c381 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -26,6 +27,7 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -36,6 +38,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL; +} + +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL; +} + static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU; diff --git a/kernel/softirq.c b/kernel/softirq.c index fcf6997..f69c069 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -417,6 +417,13 @@ void local_bh_enable_ip(unsigned long ip) } EXPORT_SYMBOL(local_bh_enable_ip); +void _local_bh_enable(void) +{ + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(_local_bh_enable); + /* For tracing */ int notrace __in_softirq(void) { -- cgit v0.10.2 From d79878cfb0c2dfe18c04634deebe0d9aa374cee7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 20:42:16 +0200 Subject: local-vars-migrate-disable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/percpu.h b/include/linux/percpu.h index bf69fd1..12b394f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,10 +48,30 @@ preempt_enable(); \ } while (0) -#define get_local_var(var) get_cpu_var(var) -#define put_local_var(var) put_cpu_var(var) -#define get_local_ptr(var) get_cpu_ptr(var) -#define put_local_ptr(var) put_cpu_ptr(var) +#ifndef CONFIG_PREEMPT_RT_FULL +# define get_local_var(var) get_cpu_var(var) +# define put_local_var(var) put_cpu_var(var) +# define get_local_ptr(var) get_cpu_ptr(var) +# define put_local_ptr(var) put_cpu_ptr(var) +#else +# define get_local_var(var) (*({ \ + migrate_disable(); \ + &__get_cpu_var(var); })) + +# define put_local_var(var) do { \ + (void)&(var); \ + migrate_enable(); \ +} while (0) + +# define get_local_ptr(var) ({ \ + migrate_disable(); \ + this_cpu_ptr(var); }) + +# define put_local_ptr(var) do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#endif /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v0.10.2 From a1863832249afbb64bdc158d87a3ae865d4189f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Apr 2010 16:51:31 +0200 Subject: md: raid5: Make raid5_percpu handling RT aware __raid_run_ops() disables preemption with get_cpu() around the 
access to the raid5_percpu variables. That causes scheduling while atomic spews on RT. Serialize the access to the percpu data with a lock and keep the code preemptible. Reported-by: Udo van den Heuvel Signed-off-by: Thomas Gleixner Tested-by: Udo van den Heuvel diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 19d77a0..627faa2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1415,8 +1415,9 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -1468,7 +1469,8 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } #ifdef CONFIG_MULTICORE_RAID456 @@ -5093,6 +5095,7 @@ static int raid5_alloc_percpu(struct r5conf *conf) break; } per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); } #ifdef CONFIG_HOTPLUG_CPU conf->cpu_notify.notifier_call = raid456_cpu_notify; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 18b2c4a..e091966 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -428,6 +428,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address -- cgit v0.10.2 From e2ae77b62ca9f6fadbace98de610d9c8f6f2758f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:04:15 +0200 Subject: rtmutex-futex-prepare-rt.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index 8879430..bcf86f7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1442,6 +1442,16 @@ retry_private: requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; + } else if (ret == -EAGAIN) { + /* + * Waiter was woken by timeout or + * signal and has set pi_blocked_on to + * PI_WAKEUP_INPROGRESS before we + * tried to enqueue it on the rtmutex. + */ + this->pi_state = NULL; + free_pi_state(pi_state); + continue; } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; @@ -2286,7 +2296,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -2333,20 +2343,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; + /* + * On RT we must avoid races with requeue and trying to block + * on two mutexes (hb->lock and uaddr2's rtmutex) by + * serializing access to pi_blocked_on with pi_lock. + */ + raw_spin_lock_irq(¤t->pi_lock); + if (current->pi_blocked_on) { + /* + * We have been requeued or are in the process of + * being requeued. 
+ */ + raw_spin_unlock_irq(¤t->pi_lock); + } else { + /* + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS + * prevents a concurrent requeue from moving us to the + * uaddr2 rtmutex. After that we can safely acquire + * (and possibly block on) hb->lock. + */ + current->pi_blocked_on = PI_WAKEUP_INPROGRESS; + raw_spin_unlock_irq(¤t->pi_lock); + + spin_lock(&hb->lock); + + /* + * Clean up pi_blocked_on. We might leak it otherwise + * when we succeeded with the hb->lock in the fast + * path. + */ + raw_spin_lock_irq(¤t->pi_lock); + current->pi_blocked_on = NULL; + raw_spin_unlock_irq(¤t->pi_lock); + + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + } /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * In order to be here, we have either been requeued, are in + * the process of being requeued, or requeue successfully + * acquired uaddr2 on our behalf. If pi_blocked_on was + * non-null above, we may be racing with a requeue. Do not + * rely on q->lock_ptr to be hb2->lock until after blocking on + * hb->lock or hb2->lock. The futex_requeue dropped our key1 + * reference and incremented our key2 reference count. */ + hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { @@ -2355,9 +2400,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - spin_unlock(q.lock_ptr); + spin_unlock(&hb2->lock); } } else { /* @@ -2370,7 +2416,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e69..f1a6137 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -67,6 +67,11 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) clear_rt_mutex_waiters(lock); } +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) +{ + return waiter && waiter != PI_WAKEUP_INPROGRESS; +} + /* * We can speed up the acquire/release, if the architecture * supports cmpxchg and if there's no debugging state to be set up @@ -196,7 +201,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter) + if (!rt_mutex_real_waiter(waiter)) goto out_unlock_pi; /* @@ -399,6 +404,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, int chain_walk = 0, res; raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * In the case of futex requeue PI, this will be a proxy + * lock. The task will wake unaware that it is enqueueed on + * this lock. Avoid blocking on two locks and corrupting + * pi_blocked_on via the PI_WAKEUP_INPROGRESS + * flag. 
futex_wait_requeue_pi() sets this when it wakes up + * before requeue (due to a signal or timeout). Do not enqueue + * the task if PI_WAKEUP_INPROGRESS is set. + */ + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return -EAGAIN; + } + + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); + __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -423,7 +445,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } @@ -517,7 +539,7 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -551,7 +573,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!rt_mutex_real_waiter(waiter) || + waiter->list_entry.prio == task->prio) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 53a66c8..b43d832 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -103,6 +103,8 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) /* * PI-futex support (proxy locking functions, etc.): */ +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -- cgit v0.10.2 From 7f5192717440bea7d6d445b279ba00b60f3d0f48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Apr 2013 17:09:25 -0500 Subject: futex: Fix bug on when a requeued RT task times out Requeue with timeout causes a bug with PREEMPT_RT_FULL. The bug comes from a timed out condition. TASK 1 TASK 2 ------ ------ futex_wait_requeue_pi() futex_wait_queue_me() double_lock_hb(); raw_spin_lock(pi_lock); if (current->pi_blocked_on) { } else { current->pi_blocked_on = PI_WAKE_INPROGRESS; run_spin_unlock(pi_lock); spin_lock(hb->lock); <-- blocked! plist_for_each_entry_safe(this) { rt_mutex_start_proxy_lock(); task_blocks_on_rt_mutex(); BUG_ON(task->pi_blocked_on)!!!! The BUG_ON() actually has a check for PI_WAKE_INPROGRESS, but the problem is that, after TASK 1 sets PI_WAKE_INPROGRESS, it then tries to grab the hb->lock, which it fails to do so. As the hb->lock is a mutex, it will block and set the "pi_blocked_on" to the hb->lock. When TASK 2 goes to requeue it, the check for PI_WAKE_INPROGESS fails because the task1's pi_blocked_on is no longer set to that, but instead, set to the hb->lock. The fix: When calling rt_mutex_start_proxy_lock() a check is made to see if the proxy tasks pi_blocked_on is set. If so, exit out early. Otherwise set it to a new flag PI_REQUEUE_INPROGRESS, which notifies the proxy task that it is being requeued, and will handle things appropriately. 
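For reference, the user-visible path that reaches futex_wait_requeue_pi() with a timeout is a timed condition-variable wait against a priority-inheritance mutex: with a PTHREAD_PRIO_INHERIT mutex, glibc's NPTL generally uses the requeue-PI futex operations, so a pthread_cond_timedwait() timing out while another thread broadcasts is the race described above. A minimal user-space sketch follows; the names, the 100ms period and the timeout are arbitrary, whether requeue-PI is actually used depends on the glibc build, and hitting the original report also requires the waiter to run as an RT task (SCHED_FIFO) on a PREEMPT_RT_FULL kernel.

#include <pthread.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t m;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

static void *waker(void *arg)
{
	for (;;) {
		usleep(100 * 1000);
		pthread_mutex_lock(&m);
		pthread_cond_broadcast(&cond);	/* requeues waiters onto the PI mutex */
		pthread_mutex_unlock(&m);
	}
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t tid;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);
	pthread_create(&tid, NULL, waker, NULL);

	for (;;) {
		struct timespec ts;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_nsec += 100 * 1000 * 1000;	/* ~100ms, racing the waker */
		if (ts.tv_nsec >= 1000000000L) {
			ts.tv_sec++;
			ts.tv_nsec -= 1000000000L;
		}

		pthread_mutex_lock(&m);
		/* timeout vs. concurrent requeue is the window described above */
		pthread_cond_timedwait(&cond, &m, &ts);
		pthread_mutex_unlock(&m);
	}
	return 0;
}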
Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index f1a6137..a8a1cd0 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -69,7 +69,8 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) { - return waiter && waiter != PI_WAKEUP_INPROGRESS; + return waiter && waiter != PI_WAKEUP_INPROGRESS && + waiter != PI_REQUEUE_INPROGRESS; } /* @@ -981,6 +982,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * In PREEMPT_RT there's an added race. + * If the task, that we are about to requeue, times out, + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue + * to skip this task. But right after the task sets + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then + * block on the spin_lock(&hb->lock), which in RT is an rtmutex. + * This will replace the PI_WAKEUP_INPROGRESS with the actual + * lock that it blocks on. We *must not* place this task + * on this proxy lock in that case. + * + * To prevent this race, we first take the task's pi_lock + * and check if it has updated its pi_blocked_on. If it has, + * we assume that it woke up and we return -EAGAIN. + * Otherwise, we set the task's pi_blocked_on to + * PI_REQUEUE_INPROGRESS, so that if the task is waking up + * it will know that we are in the process of requeuing it. + */ + raw_spin_lock_irq(&task->pi_lock); + if (task->pi_blocked_on) { + raw_spin_unlock_irq(&task->pi_lock); + raw_spin_unlock(&lock->wait_lock); + return -EAGAIN; + } + task->pi_blocked_on = PI_REQUEUE_INPROGRESS; + raw_spin_unlock_irq(&task->pi_lock); +#endif + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); if (ret && !rt_mutex_owner(lock)) { diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index b43d832..47290ec 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -104,6 +104,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) * PI-futex support (proxy locking functions, etc.): */ #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, -- cgit v0.10.2 From f55fbd861f95d92e591bf6df3a816f251cbdf009 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2013 11:17:42 +0100 Subject: futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock In exit_pi_state_list() we have the following locking construct: spin_lock(&hb->lock); raw_spin_lock_irq(&curr->pi_lock); ... spin_unlock(&hb->lock); In !RT this works, but on RT the migrate_enable() function which is called from spin_unlock() sees atomic context due to the held pi_lock and just decrements the migrate_disable_atomic counter of the task. Now the next call to migrate_disable() sees the counter being negative and issues a warning. That check should be in migrate_enable() already. Fix this by dropping pi_lock before unlocking hb->lock and reaquire pi_lock after that again. This is safe as the loop code reevaluates head again under the pi_lock. 
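The rule the fix below restores can be sketched with two hypothetical locks: outer_lock, a spinlock_t that becomes a sleeping lock on RT, nested around inner_lock, a raw_spinlock_t that stays a true spinlock. Drop the raw lock before the sleeping unlock, so the migrate_enable() inside spin_unlock() never runs with the raw lock held, and re-take the raw lock afterwards if the surrounding loop still needs it. This is an illustration of the pattern only; the actual change to exit_pi_state_list() is in the diff below.

/* Hypothetical locks, illustration only */
static DEFINE_SPINLOCK(outer_lock);		/* sleeping rtmutex-based lock on RT */
static DEFINE_RAW_SPINLOCK(inner_lock);		/* remains a real spinlock on RT */

static void scan_step(void)
{
	spin_lock(&outer_lock);
	raw_spin_lock_irq(&inner_lock);

	/* ... inspect state protected by both locks ... */

	/*
	 * Wrong on RT: spin_unlock(&outer_lock) here, with inner_lock still
	 * held, runs migrate_enable() in atomic context.
	 */
	raw_spin_unlock_irq(&inner_lock);	/* drop the raw lock first */
	spin_unlock(&outer_lock);		/* sleeping unlock is now safe */
	raw_spin_lock_irq(&inner_lock);		/* re-take; state must be re-checked */

	/* ... */

	raw_spin_unlock_irq(&inner_lock);
}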
Reported-by: Yong Zhang Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior diff --git a/kernel/futex.c b/kernel/futex.c index bcf86f7..1d74b85 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -568,7 +568,9 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); + raw_spin_lock_irq(&curr->pi_lock); continue; } -- cgit v0.10.2 From 798cfa2881bc7d5d92b735adcbc54f9f740c3b08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jun 2011 11:43:52 +0200 Subject: rtmutex-lock-killable.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index de17134..3561eb2 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -90,6 +90,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock, int detect_deadlock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a8a1cd0..0d129f9 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -815,12 +815,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, @@ -834,17 +834,38 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ -- cgit v0.10.2 From 407e5730b94ab18962fa1cbb1474adef039aa8a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:21:25 +0200 Subject: rt-mutex-add-sleeping-spinlocks-support.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 3561eb2..928d93e 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -29,9 +29,10 @@ struct rt_mutex { raw_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; 
-#ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; +#ifdef CONFIG_DEBUG_RT_MUTEXES + const char *file; + const char *name; int line; void *magic; #endif @@ -56,19 +57,39 @@ struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ , .name = #mutexname, .file = __FILE__, .line = __LINE__ -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__) + +# define rt_mutex_init(mutex) \ + do { \ + raw_spin_lock_init(&(mutex)->wait_lock); \ + __rt_mutex_init(mutex, #mutex); \ + } while (0) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) + +# define rt_mutex_init(mutex) \ + do { \ + raw_spin_lock_init(&(mutex)->wait_lock); \ + __rt_mutex_init(mutex, #mutex); \ + } while (0) + # define rt_mutex_debug_task_free(t) do { } while (0) #endif -#define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + __DEBUG_RT_MUTEX_INITIALIZER(mutexname) + + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) } + +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + , .save_state = 1 } #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) diff --git a/kernel/futex.c b/kernel/futex.c index 1d74b85..473c3c4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2323,8 +2323,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter, false); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0d129f9..b562de3 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -8,6 +8,12 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt + * * See Documentation/rt-mutex-design.txt for details. 
*/ #include @@ -96,6 +102,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.node_list.prev)) + plist_head_init(&lock->wait_list); +} + /* * Calculate task priority from the waiter list priority * @@ -142,6 +154,14 @@ static void rt_mutex_adjust_prio(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); } +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) +{ + if (waiter->savestate) + wake_up_lock_sleeper(waiter->task); + else + wake_up_process(waiter->task); +} + /* * Max number of times we'll walk the boosting chain: */ @@ -253,13 +273,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (!rt_mutex_owner(lock)) { + struct rt_mutex_waiter *lock_top_waiter; + /* * If the requeue above changed the top waiter, then we need * to wake the new top waiter up to try to get the lock. */ - - if (top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + lock_top_waiter = rt_mutex_top_waiter(lock); + if (top_waiter != lock_top_waiter) + rt_mutex_wake_waiter(lock_top_waiter); raw_spin_unlock(&lock->wait_lock); goto out_put_task; } @@ -304,6 +326,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } + +#define STEAL_NORMAL 0 +#define STEAL_LATERAL 1 + +/* + * Note that RT tasks are excluded from lateral-steals to prevent the + * introduction of an unbounded latency + */ +static inline int lock_is_stealable(struct task_struct *task, + struct task_struct *pendowner, int mode) +{ + if (mode == STEAL_NORMAL || rt_task(task)) { + if (task->prio >= pendowner->prio) + return 0; + } else if (task->prio > pendowner->prio) + return 0; + return 1; +} + /* * Try to take an rt-mutex * @@ -313,8 +354,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @task: the task which wants to acquire the lock * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int +__try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter, int mode) { /* * We have to be careful here if the atomic speedups are @@ -347,12 +389,14 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } + struct task_struct *pown = rt_mutex_top_waiter(lock)->task; + + if (task != pown && !lock_is_stealable(task, pown, mode)) + return 0; } + /* We got the lock. */ + if (waiter || rt_mutex_has_waiters(lock)) { unsigned long flags; struct rt_mutex_waiter *top; @@ -377,7 +421,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock_irqrestore(&task->pi_lock, flags); } - /* We got the lock. 
*/ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task); @@ -387,6 +430,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, return 1; } +static inline int +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -501,7 +551,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - wake_up_process(waiter->task); + rt_mutex_wake_waiter(waiter); } /* @@ -580,18 +630,315 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * preemptible spin_lock functions: + */ +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + might_sleep(); + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +#ifdef CONFIG_SMP +/* + * Note that owner is a speculative pointer and dereferencing relies + * on rcu_read_lock() and the check against the lock owner. + */ +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *owner) +{ + int res = 0; + + rcu_read_lock(); + for (;;) { + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that owner->on_cpu is dereferenced _after_ + * checking the above to be valid. + */ + barrier(); + if (!owner->on_cpu) { + res = 1; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +# define pi_lock(lock) raw_spin_lock_irq(lock) +# define pi_unlock(lock) raw_spin_unlock_irq(lock) + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * We store the current state under p->pi_lock in p->saved_state and + * the try_to_wake_up() code handles this accordingly. + */ +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct task_struct *lock_owner, *self = current; + struct rt_mutex_waiter waiter, *top_waiter; + int ret; + + rt_mutex_init_waiter(&waiter, true); + + raw_spin_lock(&lock->wait_lock); + init_lists(lock); + + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) { + raw_spin_unlock(&lock->wait_lock); + return; + } + + BUG_ON(rt_mutex_owner(lock) == self); + + /* + * We save whatever state the task is in and we'll restore it + * after acquiring the lock taking real wakeups into account + * as well. We are serialized via pi_lock against wakeups. See + * try_to_wake_up(). + */ + pi_lock(&self->pi_lock); + self->saved_state = self->state; + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + + ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0); + BUG_ON(ret); + + for (;;) { + /* Try to acquire the lock again. 
*/ + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL)) + break; + + top_waiter = rt_mutex_top_waiter(lock); + lock_owner = rt_mutex_owner(lock); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) + schedule_rt_mutex(lock); + + raw_spin_lock(&lock->wait_lock); + + pi_lock(&self->pi_lock); + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + } + + /* + * Restore the task state to current->saved_state. We set it + * to the original state above and the try_to_wake_up() code + * has possibly updated it when a real (non-rtmutex) wakeup + * happened while we were blocked. Clear saved_state so + * try_to_wakeup() does not get confused. + */ + pi_lock(&self->pi_lock); + __set_current_state(self->saved_state); + self->saved_state = TASK_RUNNING; + pi_unlock(&self->pi_lock); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock)); + BUG_ON(!plist_node_empty(&waiter.list_entry)); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret; + + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) { 
+ migrate_disable(); + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } else + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + migrate_disable(); + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + migrate_enable(); + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif /* PREEMPT_RT_FULL */ + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * @@ -647,9 +994,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); + rt_mutex_init_waiter(&waiter, false); raw_spin_lock(&lock->wait_lock); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -702,6 +1050,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) int ret = 0; raw_spin_lock(&lock->wait_lock); + init_lists(lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -934,12 +1283,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } -EXPORT_SYMBOL_GPL(__rt_mutex_init); +EXPORT_SYMBOL(__rt_mutex_init); /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a @@ -954,7 +1302,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + rt_mutex_init(lock); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 47290ec..6ec3dc1 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -49,6 +49,7 @@ struct rt_mutex_waiter { struct plist_node pi_list_entry; struct task_struct *task; struct rt_mutex *lock; + bool savestate; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; @@ -126,4 +127,12 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, # include "rtmutex.h" #endif +static inline void +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) +{ + debug_rt_mutex_init_waiter(waiter); + waiter->task = NULL; + waiter->savestate = savestate; +} + #endif -- cgit v0.10.2 From 
850dafafb60b6a7d03ba85824abc20917cf836af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:34:01 +0200 Subject: spinlock-types-separate-raw.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e..5317cd9 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -1,6 +1,10 @@ #ifndef __LINUX_RWLOCK_TYPES_H #define __LINUX_RWLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) +# error "Do not include directly, include spinlock_types.h" +#endif + /* * include/linux/rwlock_types.h - generic rwlock type definitions * and initializers diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb..5c8664d 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,79 +9,9 @@ * Released under the General Public License (GPL). */ -#if defined(CONFIG_SMP) -# include -#else -# include -#endif +#include -#include - -typedef struct raw_spinlock { - arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_cpu; - void *owner; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_spinlock_t; - -#define SPINLOCK_MAGIC 0xdead4ead - -#define SPINLOCK_OWNER_INIT ((void *)-1L) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } -#else -# define SPIN_DEP_MAP_INIT(lockname) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK -# define SPIN_DEBUG_INIT(lockname) \ - .magic = SPINLOCK_MAGIC, \ - .owner_cpu = -1, \ - .owner = SPINLOCK_OWNER_INIT, -#else -# define SPIN_DEBUG_INIT(lockname) -#endif - -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ - { \ - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ - SPIN_DEBUG_INIT(lockname) \ - SPIN_DEP_MAP_INIT(lockname) } - -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) - -typedef struct spinlock { - union { - struct raw_spinlock rlock; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) - struct { - u8 __padding[LOCK_PADSIZE]; - struct lockdep_map dep_map; - }; -#endif - }; -} spinlock_t; - -#define __SPIN_LOCK_INITIALIZER(lockname) \ - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } - -#define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#include #include diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h new file mode 100644 index 0000000..f1dac1f --- /dev/null +++ b/include/linux/spinlock_types_nort.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H +#define __LINUX_SPINLOCK_TYPES_NORT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. 
Include spinlock_types.h instead" +#endif + +/* + * The non RT version maps spinlocks to raw_spinlocks + */ +typedef struct spinlock { + union { + struct raw_spinlock rlock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) + struct { + u8 __padding[LOCK_PADSIZE]; + struct lockdep_map dep_map; + }; +#endif + }; +} spinlock_t; + +#define __SPIN_LOCK_INITIALIZER(lockname) \ + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } + +#define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) + +#endif diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h new file mode 100644 index 0000000..edffc4d --- /dev/null +++ b/include/linux/spinlock_types_raw.h @@ -0,0 +1,56 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H +#define __LINUX_SPINLOCK_TYPES_RAW_H + +#if defined(CONFIG_SMP) +# include +#else +# include +#endif + +#include + +typedef struct raw_spinlock { + arch_spinlock_t raw_lock; +#ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} raw_spinlock_t; + +#define SPINLOCK_MAGIC 0xdead4ead + +#define SPINLOCK_OWNER_INIT ((void *)-1L) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define SPIN_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK +# define SPIN_DEBUG_INIT(lockname) \ + .magic = SPINLOCK_MAGIC, \ + .owner_cpu = -1, \ + .owner = SPINLOCK_OWNER_INIT, +#else +# define SPIN_DEBUG_INIT(lockname) +#endif + +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ + { \ + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ + SPIN_DEBUG_INIT(lockname) \ + SPIN_DEP_MAP_INIT(lockname) } + +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) + +#endif -- cgit v0.10.2 From 674ee1eedc075544d7f8f3c44e3333995439257c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:06:39 +0200 Subject: rtmutex-avoid-include-hell.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 928d93e..5ebd0bb 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -14,7 +14,7 @@ #include #include -#include +#include extern int max_lock_depth; /* for sysctl */ -- cgit v0.10.2 From 18a69ffce6fbbabe011b368e138f7d3db067932a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:43:35 +0200 Subject: rt-add-rt-spinlocks.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h new file mode 100644 index 0000000..b138321 --- /dev/null +++ b/include/linux/rwlock_types_rt.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_RWLOCK_TYPES_RT_H +#define __LINUX_RWLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. 
Include spinlock_types.h instead" +#endif + +/* + * rwlocks - rtmutex which allows single reader recursion + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#define __RW_LOCK_UNLOCKED(name) \ + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#endif diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 5c8664d..10bac71 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -11,8 +11,13 @@ #include -#include - -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +# include +#else +# include +# include +# include +#endif #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h new file mode 100644 index 0000000..1fe8fc0 --- /dev/null +++ b/include/linux/spinlock_types_rt.h @@ -0,0 +1,49 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RT_H +#define __LINUX_SPINLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +/* + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__ , \ + } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + } +#endif + +/* +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) +*/ + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name.lock), \ + SPIN_DEP_MAP_INIT(name) } + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#endif -- cgit v0.10.2 From c6b8bb2fbebdf7ae1a8e7dff4c666f2c8278178a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:56:22 +0200 Subject: rt-add-rt-to-mutex-headers.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 9121595..bdf1da2 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -17,6 +17,17 @@ #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else + /* * Simple, straightforward mutexes with strict semantics: * @@ -95,13 +106,6 @@ do { \ static inline void mutex_destroy(struct mutex *lock) {} #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -167,6 +171,9 @@ extern int 
__must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); + +#endif /* !PREEMPT_RT_FULL */ + extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h new file mode 100644 index 0000000..c38a44b --- /dev/null +++ b/include/linux/mutex_rt.h @@ -0,0 +1,84 @@ +#ifndef __LINUX_MUTEX_RT_H +#define __LINUX_MUTEX_RT_H + +#ifndef __LINUX_MUTEX_H +#error "Please include mutex.h" +#endif + +#include + +/* FIXME: Just for __lockfunc */ +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) + +# define mutex_lock_nest_lock(lock, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +} while (0) + +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), #mutex, &__key); \ +} while (0) + +# define __mutex_init(mutex, name, key) \ +do { \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), name, key); \ +} while (0) + +#endif -- cgit v0.10.2 From 891fb15c46eae2e29bf3aad918aa78a8a4a42dd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 21:02:53 +0200 Subject: rwsem-add-rt-variant.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d6..f994bd3 100644 --- a/include/linux/rwsem.h +++ 
b/include/linux/rwsem.h @@ -16,6 +16,10 @@ #include +#ifdef CONFIG_PREEMPT_RT_FULL +#include +#else /* PREEMPT_RT_FULL */ + struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -139,4 +143,6 @@ do { \ # define down_write_nested(sem, subclass) down_write(sem) #endif +#endif /* !PREEMPT_RT_FULL */ + #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h new file mode 100644 index 0000000..e94d945 --- /dev/null +++ b/include/linux/rwsem_rt.h @@ -0,0 +1,128 @@ +#ifndef _LINUX_RWSEM_RT_H +#define _LINUX_RWSEM_RT_H + +#ifndef _LINUX_RWSEM_H +#error "Include rwsem.h" +#endif + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well. + */ + +#include + +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key); + +#define __rt_init_rwsem(sem, name, key) \ + do { \ + rt_mutex_init(&(sem)->lock); \ + __rt_rwsem_init((sem), (name), (key));\ + } while (0) + +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key) + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_init_rwsem((sem), #sem, &__key); \ +} while (0) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +#define init_rwsem(sem) rt_init_rwsem(sem) +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock) + +static inline void down_read(struct rw_semaphore *sem) +{ + rt_down_read(sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return rt_down_read_trylock(sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + rt_down_write(sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return rt_down_write_trylock(sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + rt_up_read(sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + rt_up_write(sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + rt_downgrade_write(sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + return rt_down_read_nested(sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_write_nested(sem, subclass); +} +#ifdef CONFIG_DEBUG_LOCK_ALLOC 
+static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, &nest_lock->dep_map); +} + +#else + +static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, NULL); +} +#endif +#endif diff --git a/lib/Makefile b/lib/Makefile index 02ed6c0..7e961f1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,8 +38,11 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o + +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +endif lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) -- cgit v0.10.2 From 48eb8241f1898d0bf68e0557f70eb9ecf147276d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 19:39:56 +0200 Subject: rt: Add the preempt-rt lock replacement APIs Map spinlocks, rwlocks, rw_semaphores and semaphores to the rt_mutex based locking functions for preempt-rt. Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h new file mode 100644 index 0000000..853ee36 --- /dev/null +++ b/include/linux/rwlock_rt.h @@ -0,0 +1,123 @@ +#ifndef __LINUX_RWLOCK_RT_H +#define __LINUX_RWLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(rwl)->lock); \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags)) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_read_lock_irqsave(lock); \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_write_lock_irqsave(lock); \ + } while (0) + +#define read_lock(lock) \ + do { \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_irq(lock) read_lock(lock) + +#define write_lock(lock) \ + do { \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define write_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define 
write_lock_irq(lock) write_lock(lock) + +#define read_unlock(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define read_unlock_bh(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define read_unlock_irq(lock) read_unlock(lock) + +#define write_unlock(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_bh(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define write_unlock_irq(lock) write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7d537ce..0c11a7c 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -254,7 +254,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) /* Include rwlock functions */ -#include +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else +# include +#endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -265,6 +269,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # include #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else /* PREEMPT_RT_FULL */ + /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ @@ -394,4 +402,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +#endif /* !PREEMPT_RT_FULL */ + #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51df117..3f68f50 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -191,6 +191,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) return 0; } -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h new file mode 100644 index 0000000..a9ea392 --- /dev/null +++ b/include/linux/spinlock_rt.h @@ -0,0 +1,158 @@ +#ifndef __LINUX_SPINLOCK_RT_H +#define __LINUX_SPINLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. 
Use spinlock.h +#endif + +#include + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +#define spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. + */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#define spin_lock_local(lock) rt_spin_lock(lock) +#define spin_unlock_local(lock) rt_spin_unlock(lock) + +#define spin_lock(lock) \ + do { \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_irq(lock) spin_lock(lock) + +#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + do { \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) +#else +# define spin_lock_nested(lock, subclass) spin_lock(lock) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) +#endif + +#define spin_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) + +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock) +{ + unsigned long flags = 0; +#ifdef CONFIG_TRACE_IRQFLAGS + flags = rt_spin_lock_trace_flags(lock); +#else + spin_lock(lock); /* lock_local */ +#endif + return flags; +} + +/* FIXME: we need rt_spin_lock_nest_lock */ +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + +#define spin_unlock(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define spin_unlock_bh(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define spin_unlock_irq(lock) spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + spin_unlock(lock); \ + } while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) +#define spin_trylock_irq(lock) spin_trylock(lock) + +#define spin_trylock_irqsave(lock, flags) \ + rt_spin_trylock_irqsave(lock, &(flags)) + +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) + +#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define 
spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + +#define atomic_dec_and_lock(atomic, lock) \ + atomic_dec_and_spin_lock(atomic, lock) + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6..0e12fc2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,8 +7,8 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ + hrtimer.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ async.o range.o groups.o lglock.o smpboot.o @@ -32,7 +32,11 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-y += rwsem.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -44,6 +48,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/rt.c b/kernel/rt.c new file mode 100644 index 0000000..92a16e1 --- /dev/null +++ b/kernel/rt.c @@ -0,0 +1,442 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. 
+ * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +/* + * struct mutex functions + */ +void __mutex_do_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map(&mutex->dep_map, name, key, 0); +#endif + mutex->lock.save_state = 0; +} +EXPORT_SYMBOL(__mutex_do_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nest_lock); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + 
migrate_disable(); + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_write_trylock(rwlock); + if (!ret) + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the lock, + * but not when read_depth == 0 which means that the lock is + * write locked. + */ + migrate_disable(); + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(lock); + else if (!rwlock->read_depth) + ret = 0; + + if (ret) { + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + } else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + + /* + * recursive read locks succeed when current owns the lock + */ + if (rt_mutex_owner(lock) != current) + __rt_spin_lock(lock); + rwlock->read_depth++; +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + + /* Release the lock only when read_depth is down to 0 */ + if (--rwlock->read_depth == 0) + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + rwlock->lock.save_state = 1; + rwlock->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG_ON(rt_mutex_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, 
_RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. + */ + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + struct rt_mutex *lock = &rwsem->lock; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + if (rt_mutex_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + rwsem->read_depth = 0; + rwsem->lock.save_state = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd806..da9775b 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -110,8 +110,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT_FULL BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -195,6 +198,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT_FULL + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -339,6 +344,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !PREEMPT_RT_FULL */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c 
index 0374a59..9497033 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT_FULL void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT_FULL static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif -- cgit v0.10.2 From b43f83b53aff708b1dde2a4b166c3e1555dd443c Mon Sep 17 00:00:00 2001 From: RT patchquilt Date: Thu, 4 Apr 2013 17:09:27 -0500 Subject: percpu-rwsem-compilefix diff --git a/lib/percpu-rwsem.c b/lib/percpu-rwsem.c index 652a8ee..2db0f42 100644 --- a/lib/percpu-rwsem.c +++ b/lib/percpu-rwsem.c @@ -84,8 +84,12 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); +#ifdef CONFIG_PREEMPT_RT_FULL + up_read(&brw->rw_sem); +#else /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); +#endif } void percpu_up_read(struct percpu_rw_semaphore *brw) -- cgit v0.10.2 From b225e91e1ea601aa54a6220ec8bbe573b5e2540c Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 19 Sep 2011 11:09:27 +0200 Subject: rwlocks: Fix section mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following build error for the preempt-rt kernel. make kernel/fork.o CC kernel/fork.o kernel/fork.c:90: error: section of 'tasklist_lock' conflicts with previous declaration make[2]: *** [kernel/fork.o] Error 1 make[1]: *** [kernel/fork.o] Error 2 The rt kernel cache aligns the RWLOCK in DEFINE_RWLOCK by default. The non-rt kernels explicitly cache align only the tasklist_lock in kernel/fork.c. That can create a build conflict. This fixes the build problem by making the non-rt kernels cache align RWLOCKs by default. The side effect is that the other RWLOCKs are also cache aligned for non-rt. This is a short term solution for rt only. The longer term solution would be to push the cache aligned DEFINE_RWLOCK to mainline. If there are objections, then we could create a DEFINE_RWLOCK_CACHE_ALIGNED or something of that nature. Comments? Objections? 
Signed-off-by: John Kacur Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.00.1109191104010.23118@localhost6.localdomain6 Signed-off-by: Thomas Gleixner diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index 5317cd9..d0da966 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -47,6 +47,7 @@ typedef struct { RW_DEP_MAP_INIT(lockname) } #endif -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 591d2bd..b814a02 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,7 +93,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) -- cgit v0.10.2 From 841a7fefa879aa2c75e327f9f0931ba9eb3e9940 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:08:38 +0200 Subject: timer-handle-idle-trylock-in-get-next-timer-irq.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index a9ea392..0618387 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -53,7 +53,17 @@ extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); #define spin_lock_irq(lock) spin_lock(lock) -#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock(lock) \ +({ \ + int __locked; \ + migrate_disable(); \ + __locked = spin_do_trylock(lock); \ + if (!__locked) \ + migrate_enable(); \ + __locked; \ +}) #ifdef CONFIG_LOCKDEP # define spin_lock_nested(lock, subclass) \ diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index b562de3..3bff726 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -861,15 +861,10 @@ EXPORT_SYMBOL(rt_spin_unlock_wait); int __lockfunc rt_spin_trylock(spinlock_t *lock) { - int ret; + int ret = rt_mutex_trylock(&lock->lock); - migrate_disable(); - ret = rt_mutex_trylock(&lock->lock); if (ret) spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - else - migrate_enable(); - return ret; } EXPORT_SYMBOL(rt_spin_trylock); diff --git a/kernel/timer.c b/kernel/timer.c index 5470c3e..8af44de 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1380,9 +1380,10 @@ unsigned long get_next_timer_interrupt(unsigned long now) /* * On PREEMPT_RT we cannot sleep here. If the trylock does not * succeed then we return the worst-case 'expires in 1 tick' - * value: + * value. We use the rt functions here directly to avoid a + * migrate_disable() call. 
*/ - if (!spin_trylock(&base->lock)) + if (!spin_do_trylock(&base->lock)) return now + 1; #else spin_lock(&base->lock); @@ -1392,7 +1393,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) base->next_timer = __next_timer_interrupt(base); expires = base->next_timer; } +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock(&base->lock); +#else spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; -- cgit v0.10.2 From fc5f9f4ee7487ad22dc22e358e2e6a077ad1d240 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Aug 2011 00:23:17 +0200 Subject: rcu: Frob softirq test With RT_FULL we get the below wreckage: [ 126.060484] ======================================================= [ 126.060486] [ INFO: possible circular locking dependency detected ] [ 126.060489] 3.0.1-rt10+ #30 [ 126.060490] ------------------------------------------------------- [ 126.060492] irq/24-eth0/1235 is trying to acquire lock: [ 126.060495] (&(lock)->wait_lock#2){+.+...}, at: [] rt_mutex_slowunlock+0x16/0x55 [ 126.060503] [ 126.060504] but task is already holding lock: [ 126.060506] (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060511] [ 126.060511] which lock already depends on the new lock. [ 126.060513] [ 126.060514] [ 126.060514] the existing dependency chain (in reverse order) is: [ 126.060516] [ 126.060516] -> #1 (&p->pi_lock){-...-.}: [ 126.060519] [] lock_acquire+0x145/0x18a [ 126.060524] [] _raw_spin_lock_irqsave+0x4b/0x85 [ 126.060527] [] task_blocks_on_rt_mutex+0x36/0x20f [ 126.060531] [] rt_mutex_slowlock+0xd1/0x15a [ 126.060534] [] rt_mutex_lock+0x2d/0x2f [ 126.060537] [] rcu_boost+0xad/0xde [ 126.060541] [] rcu_boost_kthread+0x7d/0x9b [ 126.060544] [] kthread+0x99/0xa1 [ 126.060547] [] kernel_thread_helper+0x4/0x10 [ 126.060551] [ 126.060552] -> #0 (&(lock)->wait_lock#2){+.+...}: [ 126.060555] [] __lock_acquire+0x1157/0x1816 [ 126.060558] [] lock_acquire+0x145/0x18a [ 126.060561] [] _raw_spin_lock+0x40/0x73 [ 126.060564] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060566] [] rt_mutex_unlock+0x27/0x29 [ 126.060569] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060573] [] __rcu_read_unlock+0x48/0x89 [ 126.060576] [] select_task_rq_rt+0xc7/0xd5 [ 126.060580] [] try_to_wake_up+0x175/0x429 [ 126.060583] [] wake_up_process+0x15/0x17 [ 126.060585] [] wakeup_softirqd+0x24/0x26 [ 126.060590] [] irq_exit+0x49/0x55 [ 126.060593] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060597] [] apic_timer_interrupt+0x13/0x20 [ 126.060600] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060603] [] irq_thread+0xde/0x1af [ 126.060606] [] kthread+0x99/0xa1 [ 126.060608] [] kernel_thread_helper+0x4/0x10 [ 126.060611] [ 126.060612] other info that might help us debug this: [ 126.060614] [ 126.060615] Possible unsafe locking scenario: [ 126.060616] [ 126.060617] CPU0 CPU1 [ 126.060619] ---- ---- [ 126.060620] lock(&p->pi_lock); [ 126.060623] lock(&(lock)->wait_lock); [ 126.060625] lock(&p->pi_lock); [ 126.060627] lock(&(lock)->wait_lock); [ 126.060629] [ 126.060629] *** DEADLOCK *** [ 126.060630] [ 126.060632] 1 lock held by irq/24-eth0/1235: [ 126.060633] #0: (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060638] [ 126.060638] stack backtrace: [ 126.060641] Pid: 1235, comm: irq/24-eth0 Not tainted 3.0.1-rt10+ #30 [ 126.060643] Call Trace: [ 126.060644] [] print_circular_bug+0x289/0x29a [ 126.060651] [] __lock_acquire+0x1157/0x1816 [ 126.060655] [] ? trace_hardirqs_off_caller+0x1f/0x99 [ 126.060658] [] ? 
rt_mutex_slowunlock+0x16/0x55 [ 126.060661] [] lock_acquire+0x145/0x18a [ 126.060664] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060668] [] _raw_spin_lock+0x40/0x73 [ 126.060671] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060674] [] ? rcu_report_qs_rsp+0x87/0x8c [ 126.060677] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060680] [] ? rcu_read_unlock_special+0x9b/0x1c4 [ 126.060683] [] rt_mutex_unlock+0x27/0x29 [ 126.060687] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060690] [] __rcu_read_unlock+0x48/0x89 [ 126.060693] [] select_task_rq_rt+0xc7/0xd5 [ 126.060696] [] ? select_task_rq_rt+0x27/0xd5 [ 126.060701] [] ? clockevents_program_event+0x8e/0x90 [ 126.060704] [] try_to_wake_up+0x175/0x429 [ 126.060708] [] ? tick_program_event+0x1f/0x21 [ 126.060711] [] wake_up_process+0x15/0x17 [ 126.060715] [] wakeup_softirqd+0x24/0x26 [ 126.060718] [] irq_exit+0x49/0x55 [ 126.060721] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060724] [] apic_timer_interrupt+0x13/0x20 [ 126.060726] [] ? migrate_disable+0x75/0x12d [ 126.060733] [] ? local_bh_disable+0xe/0x1f [ 126.060736] [] ? local_bh_disable+0x1d/0x1f [ 126.060739] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060742] [] ? _raw_spin_unlock_irq+0x3b/0x59 [ 126.060745] [] irq_thread+0xde/0x1af [ 126.060748] [] ? irq_thread_fn+0x3a/0x3a [ 126.060751] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060754] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060757] [] kthread+0x99/0xa1 [ 126.060761] [] kernel_thread_helper+0x4/0x10 [ 126.060764] [] ? finish_task_switch+0x87/0x10a [ 126.060768] [] ? retint_restore_args+0xe/0xe [ 126.060771] [] ? __init_kthread_worker+0x8c/0x8c [ 126.060774] [] ? gs_change+0xb/0xb Because irq_exit() does: void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); ... } Which triggers a wakeup, which uses RCU, now if the interrupted task has t->rcu_read_unlock_special set, the rcu usage from the wakeup will end up in rcu_read_unlock_special(). rcu_read_unlock_special() will test for in_irq(), which will fail as we just decremented preempt_count with IRQ_EXIT_OFFSET, and in_sering_softirq(), which for PREEMPT_RT_FULL reads: int in_serving_softirq(void) { int res; preempt_disable(); res = __get_cpu_var(local_softirq_runner) == current; preempt_enable(); return res; } Which will thus also fail, resulting in the above wreckage. The 'somewhat' ugly solution is to open-code the preempt_count() test in rcu_read_unlock_special(). Also, we're not at all sure how ->rcu_read_unlock_special gets set here... so this is very likely a bandaid and more thought is required. Cc: Paul E. McKenney Signed-off-by: Peter Zijlstra diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e1..68cdd0a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -351,7 +351,7 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { local_irq_restore(flags); return; } -- cgit v0.10.2 From 3b5debe1ded9158d97fc4b00dd7592d5e2b77b69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Oct 2011 11:59:38 -0700 Subject: rcu: Merge RCU-bh into RCU-preempt The Linux kernel has long RCU-bh read-side critical sections that intolerably increase scheduling latency under mainline's RCU-bh rules, which include RCU-bh read-side critical sections being non-preemptible. 
This patch therefore arranges for RCU-bh to be implemented in terms of RCU-preempt for CONFIG_PREEMPT_RT_FULL=y. This has the downside of defeating the purpose of RCU-bh, namely, handling the case where the system is subjected to a network-based denial-of-service attack that keeps at least one CPU doing full-time softirq processing. This issue will be fixed by a later commit. The current commit will need some work to make it appropriate for mainline use, for example, it needs to be extended to cover Tiny RCU. [ paulmck: Added a useful changelog ] Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20111005185938.GA20403@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d870976..76ba53a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -120,6 +120,9 @@ extern void call_rcu(struct rcu_head *head, #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_PREEMPT_RT_FULL +#define call_rcu_bh call_rcu +#else /** * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -143,6 +146,7 @@ extern void call_rcu(struct rcu_head *head, */ extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#endif /** * call_rcu_sched() - Queue an RCU for invocation after sched grace period. @@ -216,7 +220,13 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); + +#ifndef CONFIG_PREEMPT_RT_FULL extern void rcu_bh_qs(int cpu); +#else +static inline void rcu_bh_qs(int cpu) { } +#endif + extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); @@ -366,7 +376,14 @@ static inline int rcu_read_lock_held(void) * rcu_read_lock_bh_held() is defined out of line to avoid #include-file * hell. */ +#ifdef CONFIG_PREEMPT_RT_FULL +static inline int rcu_read_lock_bh_held(void) +{ + return rcu_read_lock_held(); +} +#else extern int rcu_read_lock_bh_held(void); +#endif /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
@@ -814,10 +831,14 @@ static inline void rcu_read_unlock(void) static inline void rcu_read_lock_bh(void) { local_bh_disable(); +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_lock(); +#else __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); rcu_lockdep_assert(!rcu_is_cpu_idle(), "rcu_read_lock_bh() used illegally while idle"); +#endif } /* @@ -827,10 +848,14 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_unlock(); +#else rcu_lockdep_assert(!rcu_is_cpu_idle(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); +#endif local_bh_enable(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 952b793..f1472a2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,7 +45,11 @@ static inline void rcu_virt_note_context_switch(int cpu) rcu_note_context_switch(cpu); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define synchronize_rcu_bh synchronize_rcu +#else extern void synchronize_rcu_bh(void); +#endif extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); @@ -73,20 +77,30 @@ static inline void synchronize_rcu_bh_expedited(void) } extern void rcu_barrier(void); +#ifdef CONFIG_PREEMPT_RT_FULL +# define rcu_barrier_bh rcu_barrier +#else extern void rcu_barrier_bh(void); +#endif extern void rcu_barrier_sched(void); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); extern void rcu_force_quiescent_state(void); -extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; +#ifndef CONFIG_PREEMPT_RT_FULL +extern void rcu_bh_force_quiescent_state(void); +extern long rcu_batches_completed_bh(void); +#else +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state +# define rcu_batches_completed_bh rcu_batches_completed +#endif + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf761..8906a79 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -156,6 +156,7 @@ int debug_lockdep_rcu_enabled(void) } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * @@ -182,6 +183,7 @@ int rcu_read_lock_bh_held(void) return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +#endif #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77..e6fae48 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -181,6 +181,7 @@ void rcu_sched_qs(int cpu) rdp->passed_quiesce = 1; } +#ifndef CONFIG_PREEMPT_RT_FULL void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); @@ -189,6 +190,7 @@ void rcu_bh_qs(int cpu) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; } +#endif /* * Note a context switch. This is a quiescent state for RCU-sched, @@ -242,6 +244,7 @@ long rcu_batches_completed_sched(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Return the number of RCU BH batches processed thus far for debug & stats. 
*/ @@ -259,6 +262,7 @@ void rcu_bh_force_quiescent_state(void) force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +#endif /* * Record the number of times rcutorture tests have been initiated and @@ -2183,6 +2187,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue an RCU callback for invocation after a quicker grace period. */ @@ -2191,6 +2196,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif /* * Because a context switch is a grace period for RCU-sched and RCU-bh, @@ -2268,6 +2274,7 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * @@ -2294,6 +2301,7 @@ void synchronize_rcu_bh(void) wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +#endif static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -2682,6 +2690,7 @@ static void _rcu_barrier(struct rcu_state *rsp) mutex_unlock(&rsp->barrier_mutex); } +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ @@ -2690,6 +2699,7 @@ void rcu_barrier_bh(void) _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); +#endif /** * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. -- cgit v0.10.2 From 381f07e238b2821b4f643d320a9eab1c295cce8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:57:54 +0100 Subject: rcu-more-fallout.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58..9bc2fe6 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -371,6 +371,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. @@ -380,3 +381,4 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif -- cgit v0.10.2 From 08c25ad7ac582bc80b9e0e23abd8f1b4c5bafbc8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Oct 2011 11:45:18 -0700 Subject: rcu: Make ksoftirqd do RCU quiescent states Implementing RCU-bh in terms of RCU-preempt makes the system vulnerable to network-based denial-of-service attacks. This patch therefore makes __do_softirq() invoke rcu_bh_qs(), but only when __do_softirq() is running in ksoftirqd context. A wrapper layer in interposed so that other calls to __do_softirq() avoid invoking rcu_bh_qs(). The underlying function __do_softirq_common() does the actual work. The reason that rcu_bh_qs() is bad in these non-ksoftirqd contexts is that there might be a local_bh_enable() inside an RCU-preempt read-side critical section. This local_bh_enable() can invoke __do_softirq() directly, so if __do_softirq() were to invoke rcu_bh_qs() (which just calls rcu_preempt_qs() in the PREEMPT_RT_FULL case), there would be an illegal RCU-preempt quiescent state in the middle of an RCU-preempt read-side critical section. Therefore, quiescent states can only happen in cases where __do_softirq() is invoked directly from ksoftirqd. Signed-off-by: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20111005184518.GA21601@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 76ba53a..33e1d2e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -220,13 +220,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); - -#ifndef CONFIG_PREEMPT_RT_FULL extern void rcu_bh_qs(int cpu); -#else -static inline void rcu_bh_qs(int cpu) { } -#endif - extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e6fae48..85fd9a7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -181,7 +181,14 @@ void rcu_sched_qs(int cpu) rdp->passed_quiesce = 1; } -#ifndef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_FULL +static void rcu_preempt_qs(int cpu); + +void rcu_bh_qs(int cpu) +{ + rcu_preempt_qs(cpu); +} +#else void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 68cdd0a..778f138 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1519,7 +1519,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) /* * Check to see if any future RCU-related work will need to be done @@ -1535,6 +1535,9 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) /* * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. @@ -1651,6 +1654,7 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1694,6 +1698,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) } return 0; } +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ /* * Handler for smp_call_function_single(). 
The only point of this diff --git a/kernel/softirq.c b/kernel/softirq.c index f69c069..f52cdbc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,7 +142,7 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending, int cpu) +static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) { struct softirq_action *h = softirq_vec; unsigned int prev_count = preempt_count(); @@ -165,7 +165,8 @@ static void handle_pending_softirqs(u32 pending, int cpu) prev_count, (unsigned int) preempt_count()); preempt_count() = prev_count; } - rcu_bh_qs(cpu); + if (need_rcu_bh_qs) + rcu_bh_qs(cpu); } local_irq_disable(); } @@ -325,7 +326,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - handle_pending_softirqs(pending, cpu); + handle_pending_softirqs(pending, cpu, 1); pending = local_softirq_pending(); if (pending && --max_restart) @@ -376,7 +377,12 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); -static void __do_softirq(void); +static void __do_softirq_common(int need_rcu_bh_qs); + +void __do_softirq(void) +{ + __do_softirq_common(0); +} void __init softirq_early_init(void) { @@ -447,7 +453,7 @@ EXPORT_SYMBOL(in_serving_softirq); * Called with bh and local interrupts disabled. For full RT cpu must * be pinned. */ -static void __do_softirq(void) +static void __do_softirq_common(int need_rcu_bh_qs) { u32 pending = local_softirq_pending(); int cpu = smp_processor_id(); @@ -461,7 +467,7 @@ static void __do_softirq(void) lockdep_softirq_enter(); - handle_pending_softirqs(pending, cpu); + handle_pending_softirqs(pending, cpu, need_rcu_bh_qs); pending = local_softirq_pending(); if (pending) @@ -500,7 +506,7 @@ static int __thread_do_softirq(int cpu) * schedule! 
*/ if (local_softirq_pending()) - __do_softirq(); + __do_softirq_common(cpu >= 0); local_unlock(local_softirq_lock); unpin_current_cpu(); preempt_disable(); -- cgit v0.10.2 From 537b4bfc5ba2baea42cec8f36650807d0eb65bca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 11:02:21 +0200 Subject: lglocks-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0d24e93..d2c0d6d 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -42,22 +42,37 @@ #endif struct lglock { +#ifndef CONFIG_PREEMPT_RT_FULL arch_spinlock_t __percpu *lock; +#else + struct rt_mutex __percpu *lock; +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lock_key; struct lockdep_map lock_dep_map; #endif }; -#define DEFINE_LGLOCK(name) \ +#ifndef CONFIG_PREEMPT_RT_FULL +# define DEFINE_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ struct lglock name = { .lock = &name ## _lock } -#define DEFINE_STATIC_LGLOCK(name) \ +# define DEFINE_STATIC_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ static struct lglock name = { .lock = &name ## _lock } +#else + +# define DEFINE_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock); \ + struct lglock name = { .lock = &name ## _lock } + +# define DEFINE_STATIC_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock); \ + static struct lglock name = { .lock = &name ## _lock } +#endif void lg_lock_init(struct lglock *lg, char *name); void lg_local_lock(struct lglock *lg); diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a66..0bbf5d1 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -4,6 +4,15 @@ #include #include +#ifndef CONFIG_PREEMPT_RT_FULL +# define lg_lock_ptr arch_spinlock_t +# define lg_do_lock(l) arch_spin_lock(l) +# define lg_do_unlock(l) arch_spin_unlock(l) +#else +# define lg_lock_ptr struct rt_mutex +# define lg_do_lock(l) __rt_spin_lock(l) +# define lg_do_unlock(l) __rt_spin_unlock(l) +#endif /* * Note there is no uninit, so lglocks cannot be defined in * modules (but it's fine to use them from there) @@ -12,51 +21,60 @@ void lg_lock_init(struct lglock *lg, char *name) { +#ifdef CONFIG_PREEMPT_RT_FULL + int i; + + for_each_possible_cpu(i) { + struct rt_mutex *lock = per_cpu_ptr(lg->lock, i); + + rt_mutex_init(lock); + } +#endif LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); } EXPORT_SYMBOL(lg_lock_init); void lg_local_lock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + migrate_disable(); rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock); void lg_local_unlock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + migrate_enable(); } EXPORT_SYMBOL(lg_local_unlock); void lg_local_lock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + preempt_disable_nort(); rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock_cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; rwlock_release(&lg->lock_dep_map, 1, 
_RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_local_unlock_cpu); @@ -64,12 +82,12 @@ void lg_global_lock(struct lglock *lg) { int i; - preempt_disable(); + preempt_disable_nort(); rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); + lg_do_lock(lock); } } EXPORT_SYMBOL(lg_global_lock); @@ -80,10 +98,10 @@ void lg_global_unlock(struct lglock *lg) rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); + lg_do_unlock(lock); } - preempt_enable(); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_global_unlock); -- cgit v0.10.2 From 09f976b0a008b1f01409640adce478eac4ea0e7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:01 -0500 Subject: serial: 8250: Clean up the locking for -rt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index 733f22c..c6cb88c 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -2900,14 +2900,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - local_irq_save(flags); - if (port->sysrq) { - /* serial8250_handle_irq() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&port->lock); - } else - spin_lock(&port->lock); + if (port->sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&port->lock, flags); + else + spin_lock_irqsave(&port->lock, flags); /* * First save the IER then disable the interrupts @@ -2939,8 +2935,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) serial8250_modem_status(up); if (locked) - spin_unlock(&port->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&port->lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) -- cgit v0.10.2 From 76fe289ac7f900836967b9b3b035f565339ef967 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:01 -0500 Subject: serial: 8250: Call flush_to_ldisc when the irq is threaded Signed-off-by: Ingo Molnar diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index cd1f861..7c021eb 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -566,10 +566,15 @@ void tty_flip_buffer_push(struct tty_struct *tty) buf->tail->commit = buf->tail->used; spin_unlock_irqrestore(&buf->lock, flags); +#ifndef CONFIG_PREEMPT_RT_FULL if (tty->low_latency) flush_to_ldisc(&buf->work); else schedule_work(&buf->work); +#else + flush_to_ldisc(&buf->work); +#endif + } EXPORT_SYMBOL(tty_flip_buffer_push); -- cgit v0.10.2 From 67ed98e93176ea79d54681731f6d1ba578a63ca3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Jul 2011 13:32:57 +0200 Subject: drivers-tty-fix-omap-lock-crap.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 57d6b29..f33fa96 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1166,13 +1166,10 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_get_sync(up->dev); - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = 
spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -1201,8 +1198,7 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init -- cgit v0.10.2 From ffdeba1b6094ad684eefd18e36025a8b7088bd11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jan 2013 21:36:51 +0100 Subject: drivers-tty-pl011-irq-disable-madness.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 7fca402..d7294f7 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1779,13 +1779,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @@ -1807,8 +1813,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) writew(old_cr, uap->port.membase + UART011_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } -- cgit v0.10.2 From dd7ccc4b1081913746426ef0c40cd2710f9e1c15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Dec 2011 13:05:54 +0100 Subject: rt: Improve the serial console PASS_LIMIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Beyond the warning: drivers/tty/serial/8250/8250.c:1613:6: warning: unused variable ‘pass_counter’ [-Wunused-variable] the solution of just looping infinitely was ugly - up it to 1 million to give it a chance to continue in some really ugly situation. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index c6cb88c..25a5451 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -80,7 +80,16 @@ static unsigned int skip_txen_test; /* force skip of txen test at init time */ #define DEBUG_INTR(fmt...) do { } while (0) #endif -#define PASS_LIMIT 512 +/* + * On -rt we can have a more delays, and legitimately + * so - so don't drop work spuriously and spam the + * syslog: + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define PASS_LIMIT 1000000 +#else +# define PASS_LIMIT 512 +#endif #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) -- cgit v0.10.2 From 314c75446106d892f8b1ecc8bfeb591cce3b5ec8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 08:44:27 -0500 Subject: fs: namespace preemption fix On RT we cannot loop with preemption disabled here as mnt_make_readonly() might have been preempted. We can safely enable preemption while waiting for MNT_WRITE_HOLD to be cleared. Safe on !RT as well. 
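In other words, the MNT_WRITE_HOLD wait in __mnt_want_write() becomes a preemption-friendly spin. A simplified, illustrative sketch of the resulting loop (not the literal patch; all identifiers are taken from the hunk that follows):

	/*
	 * Open a preemption window on each iteration so the task that set
	 * MNT_WRITE_HOLD can run and clear it, instead of being starved by
	 * a preempt-disabled spinner on RT.
	 */
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
		preempt_enable();
		cpu_relax();
		preempt_disable();
	}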
Signed-off-by: Thomas Gleixner diff --git a/fs/namespace.c b/fs/namespace.c index a51054f..0a189ed 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -313,8 +313,11 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); cpu_relax(); + preempt_disable(); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until -- cgit v0.10.2 From be04edec1b97e57211e3b10c2fe485b5d29ed0d2 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 15 May 2012 13:53:56 +0800 Subject: mm: Protect activate_mm() by preempt_[disable&enable]_rt() User preempt_*_rt instead of local_irq_*_rt or otherwise there will be warning on ARM like below: WARNING: at build/linux/kernel/smp.c:459 smp_call_function_many+0x98/0x264() Modules linked in: [] (unwind_backtrace+0x0/0xe4) from [] (warn_slowpath_common+0x4c/0x64) [] (warn_slowpath_common+0x4c/0x64) from [] (warn_slowpath_null+0x18/0x1c) [] (warn_slowpath_null+0x18/0x1c) from [](smp_call_function_many+0x98/0x264) [] (smp_call_function_many+0x98/0x264) from [] (smp_call_function+0x44/0x6c) [] (smp_call_function+0x44/0x6c) from [] (__new_context+0xbc/0x124) [] (__new_context+0xbc/0x124) from [] (flush_old_exec+0x460/0x5e4) [] (flush_old_exec+0x460/0x5e4) from [] (load_elf_binary+0x2e0/0x11ac) [] (load_elf_binary+0x2e0/0x11ac) from [] (search_binary_handler+0x94/0x2a4) [] (search_binary_handler+0x94/0x2a4) from [] (do_execve+0x254/0x364) [] (do_execve+0x254/0x364) from [] (sys_execve+0x34/0x54) [] (sys_execve+0x34/0x54) from [] (ret_fast_syscall+0x0/0x30) ---[ end trace 0000000000000002 ]--- The reason is that ARM need irq enabled when doing activate_mm(). According to mm-protect-activate-switch-mm.patch, actually preempt_[disable|enable]_rt() is sufficient. 
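The same pattern is applied in both exec_mmap() and use_mm(); a condensed sketch of the idea (illustrative only, identifiers taken from the hunks below, see those for the real context):

	task_lock(tsk);
	preempt_disable_rt();		/* compiles away on !RT kernels */
	active_mm = tsk->active_mm;
	tsk->mm = mm;
	tsk->active_mm = mm;
	activate_mm(active_mm, mm);	/* on ARM this can IPI, so IRQs must stay enabled */
	preempt_enable_rt();
	task_unlock(tsk);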
Inspired-by: Steven Rostedt Signed-off-by: Yong Zhang Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1337061236-1766-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/fs/exec.c b/fs/exec.c index 20df02c..db1c4ec 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -827,10 +827,12 @@ static int exec_mmap(struct mm_struct *mm) } } task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + preempt_enable_rt(); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4..1385e48 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -26,6 +26,7 @@ void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; if (active_mm != mm) { atomic_inc(&mm->mm_count); @@ -33,6 +34,7 @@ void use_mm(struct mm_struct *mm) } tsk->mm = mm; switch_mm(active_mm, mm, tsk); + preempt_enable_rt(); task_unlock(tsk); if (active_mm != mm) -- cgit v0.10.2 From 0bc73c675a0fcf7ebb360d8ed9aa39206b29b54f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jun 2011 17:05:09 +0200 Subject: fs-block-rt-support.patch Signed-off-by: Thomas Gleixner diff --git a/block/blk-core.c b/block/blk-core.c index 0ff1564..372ddb3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -241,7 +241,7 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); diff --git a/fs/file.c b/fs/file.c index 3906d95..328087b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -98,14 +98,14 @@ static void free_fdtable_rcu(struct rcu_head *rcu) kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + fddef = &per_cpu(fdtable_defer_list, get_cpu_light()); spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); + put_cpu_light(); } } -- cgit v0.10.2 From d1c925b864730c54fc93d34de73ce2fe5d5da0ff Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 3 Jul 2009 08:44:12 -0500 Subject: fs: ntfs: disable interrupt only on !RT On Sat, 2007-10-27 at 11:44 +0200, Ingo Molnar wrote: > * Nick Piggin wrote: > > > > [10138.175796] [] show_trace+0x12/0x14 > > > [10138.180291] [] dump_stack+0x16/0x18 > > > [10138.184769] [] native_smp_call_function_mask+0x138/0x13d > > > [10138.191117] [] smp_call_function+0x1e/0x24 > > > [10138.196210] [] on_each_cpu+0x25/0x50 > > > [10138.200807] [] flush_tlb_all+0x1e/0x20 > > > [10138.205553] [] kmap_high+0x1b6/0x417 > > > [10138.210118] [] kmap+0x4d/0x4f > > > [10138.214102] [] ntfs_end_buffer_async_read+0x228/0x2f9 > > > [10138.220163] [] end_bio_bh_io_sync+0x26/0x3f > > > [10138.225352] [] bio_endio+0x42/0x6d > > > [10138.229769] [] __end_that_request_first+0x115/0x4ac > > > [10138.235682] [] end_that_request_chunk+0x8/0xa > > > [10138.241052] [] ide_end_request+0x55/0x10a > > > [10138.246058] [] ide_dma_intr+0x6f/0xac > > > [10138.250727] [] ide_intr+0x93/0x1e0 > > > [10138.255125] [] handle_IRQ_event+0x5c/0xc9 > > > > Looks like ntfs is kmap()ing from interrupt context. Should be using > > kmap_atomic instead, I think. > > it's not atomic interrupt context but irq thread context - and -rt > remaps kmap_atomic() to kmap() internally. Hm. 
Looking at the change to mm/bounce.c, perhaps I should do this instead? Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e4ca73..f5d4565 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -144,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); -- cgit v0.10.2 From 6ba122741b86f7d12f0bf06860f9be57867442ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Dec 2010 16:33:39 +0100 Subject: x86: Convert mce timer to hrtimer mce_timer is started in atomic contexts of cpu bringup. This results in might_sleep() warnings on RT. Convert mce_timer to a hrtimer to avoid this. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 80dbda8..b57a6ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1259,7 +1260,7 @@ void mce_log_therm_throt_event(__u64 status) static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static DEFINE_PER_CPU(struct hrtimer, mce_timer); static unsigned long mce_adjust_timer_default(unsigned long interval) { @@ -1269,13 +1270,10 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void mce_timer_fn(unsigned long data) +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) { - struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; - WARN_ON(smp_processor_id() != data); - if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -1296,9 +1294,10 @@ static void mce_timer_fn(unsigned long data) __this_cpu_write(mce_next_interval, iv); /* Might have become 0 after CMCI storm subsided */ if (iv) { - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_usecs(iv))); + return HRTIMER_RESTART; } + return HRTIMER_NORESTART; } /* @@ -1306,28 +1305,37 @@ static void mce_timer_fn(unsigned long data) */ void mce_timer_kick(unsigned long interval) { - struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long when = jiffies + interval; + struct hrtimer *t = &__get_cpu_var(mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + if (hrtimer_active(t)) { + s64 exp; + s64 intv_us; + + intv_us = jiffies_to_usecs(interval); + exp = ktime_to_us(hrtimer_expires_remaining(t)); + if (intv_us < exp) { + hrtimer_cancel(t); + hrtimer_start_range_ns(t, + ns_to_ktime(intv_us * 1000), + 0, HRTIMER_MODE_REL_PINNED); + } } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); + hrtimer_start_range_ns(t, + ns_to_ktime(jiffies_to_usecs(interval) 
* 1000), + 0, HRTIMER_MODE_REL_PINNED); } if (interval < iv) __this_cpu_write(mce_next_interval, interval); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); + hrtimer_cancel(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1632,7 +1640,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(unsigned int cpu, struct hrtimer *t) { unsigned long iv = mce_adjust_timer(check_interval * HZ); @@ -1641,16 +1649,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) if (mca_cfg.ignore_ce || !iv) return; - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, smp_processor_id()); + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000), + 0, HRTIMER_MODE_REL_PINNED); } static void __mcheck_cpu_init_timer(void) { - struct timer_list *t = &__get_cpu_var(mce_timer); + struct hrtimer *t = &__get_cpu_var(mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = mce_timer_fn; mce_start_timer(cpu, t); } @@ -2307,6 +2316,8 @@ static void __cpuinit mce_disable_cpu(void *h) if (!mce_available(__this_cpu_ptr(&cpu_info))) return; + hrtimer_cancel(&__get_cpu_var(mce_timer)); + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < mca_cfg.banks; i++) { @@ -2333,6 +2344,7 @@ static void __cpuinit mce_reenable_cpu(void *h) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } + __mcheck_cpu_init_timer(); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -2340,7 +2352,6 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: @@ -2356,11 +2367,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); break; case CPU_DOWN_FAILED: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); break; } -- cgit v0.10.2 From 61734b02458784ac8f2f2a59d79760522eef79ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 14:25:18 +0100 Subject: x86: stackprotector: Avoid random pool on rt CPU bringup calls into the random pool to initialize the stack canary. During boot that works nicely even on RT as the might sleep checks are disabled. During CPU hotplug the might sleep checks trigger. Making the locks in random raw is a major PITA, so avoid the call on RT is the only sensible solution. This is basically the same randomness which we get during boot where the random pool has no entropy and we rely on the TSC randomnness. 
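Condensed, the RT behaviour of boot_init_stack_canary() after this change looks as follows (illustrative sketch only; identifiers come from the full hunk below):

	u64 uninitialized_var(canary);
	u64 tsc;

#ifndef CONFIG_PREEMPT_RT_FULL
	/* Skipped on RT: would take sleeping locks from atomic context */
	get_random_bytes(&canary, sizeof(canary));
#endif
	/* The TSC provides the (weaker) randomness on RT */
	tsc = __native_read_tsc();
	canary += tsc + (tsc << 32UL);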
Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 6a99859..64fb5cb 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -57,7 +57,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 uninitialized_var(canary); u64 tsc; #ifdef CONFIG_X86_64 @@ -68,8 +68,16 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of + * it. */ +#ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); +#endif tsc = __native_read_tsc(); canary += tsc + (tsc << 32UL); -- cgit v0.10.2 From 4553856d0fd9ffbcdce259d886ca532b04aae91f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 02:21:32 +0200 Subject: x86: Use generic rwsem_spinlocks on -rt Simplifies the separation of anon_rw_semaphores and rw_semaphores for -rt. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0694d09..8b866b2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -173,8 +173,11 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API +config RWSEM_GENERIC_SPINLOCK + def_bool PREEMPT_RT_FULL + config RWSEM_XCHGADD_ALGORITHM - def_bool y + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL config GENERIC_CALIBRATE_DELAY def_bool y -- cgit v0.10.2 From 317ef1d1362bf7931460446a46d9cb908d6c1722 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Jul 2009 08:44:10 -0500 Subject: x86: Disable IST stacks for debug/int 3/stack fault for PREEMPT_RT Normally the x86-64 trap handlers for debug/int 3/stack fault run on a special interrupt stack to make them more robust when dealing with kernel code. The PREEMPT_RT kernel can sleep in locks even while allocating GFP_ATOMIC memory. When one of these trap handlers needs to send real time signals for ptrace it allocates memory and could then try to to schedule. But it is not allowed to schedule on a IST stack. This can cause warnings and hangs. This patch disables the IST stacks for these handlers for PREEMPT_RT kernel. Instead let them run on the normal process stack. The kernel only really needs the ISTs here to make kernel debuggers more robust in case someone sets a break point somewhere where the stack is invalid. But there are no kernel debuggers in the standard kernel that do this. It also means kprobes cannot be set in situations with invalid stack; but that sounds like a reasonable restriction. The stack fault change could minimally impact oops quality, but not very much because stack faults are fairly rare. A better solution would be to use similar logic as the NMI "paranoid" path: check if signal is for user space, if yes go back to entry.S, switch stack, call sync_regs, then do the signal sending etc. But this patch is much simpler and should work too with minimal impact. 
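Mechanically, the change boils down to defining the IST indices of these vectors as 0 for RT, leaving only the double fault, NMI and MCE stacks, and guarding the users with checks such as '#if DEBUG_STACK > 0'. Simplified excerpt of that table (the full hunks follow):

#ifdef CONFIG_PREEMPT_RT_FULL
# define STACKFAULT_STACK	0	/* #SS handled on the normal kernel stack */
# define DEBUG_STACK		0	/* #DB and int3 handled on the normal kernel stack */
# define DOUBLEFAULT_STACK	1
# define NMI_STACK		2
# define MCE_STACK		3
# define N_EXCEPTION_STACKS	3	/* hw limit: 7 */
#endif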
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb..65b85f4 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,21 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define STACKFAULT_STACK 0 +# define DOUBLEFAULT_STACK 1 +# define NMI_STACK 2 +# define DEBUG_STACK 0 +# define MCE_STACK 3 +# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ +#else +# define STACKFAULT_STACK 1 +# define DOUBLEFAULT_STACK 2 +# define NMI_STACK 3 +# define DEBUG_STACK 4 +# define MCE_STACK 5 +# define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#endif #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9c3ab43..2636e0f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1103,7 +1103,9 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = DEBUG_STKSZ +#endif }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b653675..f16c07b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -21,10 +21,14 @@ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { +#if DEBUG_STACK > 0 [ DEBUG_STACK-1 ] = "#DB", +#endif [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", +#if STACKFAULT_STACK > 0 [ STACKFAULT_STACK-1 ] = "#SS", +#endif [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... 
-- cgit v0.10.2 From 0f8e4bb7c87f0a0a84fe9ea5dcdba3e1dd552344 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 16:35:35 +0200 Subject: epoll.patch Signed-off-by: Thomas Gleixner diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec183..343e14a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -497,12 +497,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) */ static void ep_poll_safewake(wait_queue_head_t *wq) { - int this_cpu = get_cpu(); + int this_cpu = get_cpu_light(); ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); - put_cpu(); + put_cpu_light(); } static void ep_remove_wait_queue(struct eppoll_entry *pwq) -- cgit v0.10.2 From e611d85aba3bd11b3072194d4577509e0760415e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 11:39:36 +0200 Subject: mm-vmalloc.patch Signed-off-by: Thomas Gleixner diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a16..f4b4fee 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -782,7 +782,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; - int node, err; + int node, err, cpu; node = numa_node_id(); @@ -821,12 +821,13 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) BUG_ON(err); radix_tree_preload_end(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vb; } @@ -900,7 +901,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; + int purge = 0, cpu; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -916,7 +917,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) again: rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { int i; @@ -953,7 +955,7 @@ next: if (purge) purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); if (!addr) { -- cgit v0.10.2 From 89207ed8deef22377119ae50dd8afe6481e2fd6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:42:26 +0200 Subject: Use local irq lock instead of irq disable regions Signed-off-by: Thomas Gleixner diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3a3a98f..11285e4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "workqueue_sched.h" @@ -278,6 +279,8 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); + #define CREATE_TRACE_POINTS #include @@ -1092,7 +1095,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, { struct global_cwq *gcwq; - local_irq_save(*flags); + local_lock_irqsave(pendingb_lock, *flags); /* try to steal the timer if it exists */ if (is_dwork) { @@ -1151,7 +1154,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, } spin_unlock(&gcwq->lock); fail: - local_irq_restore(*flags); + local_unlock_irqrestore(pendingb_lock, *flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); @@ -1246,7 +1249,7 
@@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); debug_work_activate(work); @@ -1336,14 +1339,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(pendingb_lock,flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1451,14 +1454,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, unsigned long flags; /* read the comment in __queue_work() */ - local_irq_save(flags); + local_lock_irqsave(pendingb_lock, flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL_GPL(queue_delayed_work_on); @@ -1508,7 +1511,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ @@ -2936,7 +2939,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); flush_work(work); clear_work_data(work); @@ -2981,11 +2984,11 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { - local_irq_disable(); + local_lock_irq(pendingb_lock); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); - local_irq_enable(); + local_unlock_irq(pendingb_lock); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); @@ -3015,7 +3018,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) return false; set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v0.10.2 From 23e91e8aca1dd0472801f62dc59043ad0a1fff42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:41:35 +0200 Subject: debugobjects-rt.patch Signed-off-by: Thomas Gleixner diff --git a/lib/debugobjects.c b/lib/debugobjects.c index d11808c..cf5f02f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) struct debug_obj *obj; unsigned long flags; - fill_pool(); +#ifdef CONFIG_PREEMPT_RT_FULL + if (preempt_count() == 0 && !irqs_disabled()) +#endif + fill_pool(); db = get_bucket((unsigned long) addr); -- cgit v0.10.2 From ac17a776dd42a067b69b5e12de5f1919ab923f94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 11:03:16 +0200 Subject: jump-label-rt.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0976fc4..40c876b 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -50,7 +50,8 @@ #include #include -#if defined(CC_HAVE_ASM_GOTO) && 
defined(CONFIG_JUMP_LABEL) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) && \ + !defined(CONFIG_PREEMPT_BASE) struct static_key { atomic_t enabled; -- cgit v0.10.2 From abf99e68a31845cbc4ba827728adb3aaeffcc065 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 15:38:34 +0200 Subject: skbufhead-raw-lock.patch Signed-off-by: Thomas Gleixner diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ef07d0..b705426 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1783,6 +1783,7 @@ struct softnet_data { unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + struct sk_buff_head tofree_queue; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 320e976..1b5987d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,7 @@ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @@ -1008,6 +1009,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { diff --git a/net/core/dev.c b/net/core/dev.c index fad0a35..3eda126 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); + raw_spin_lock(&sd->input_pkt_queue.raw_lock); #endif } static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); #endif } @@ -3528,7 +3528,7 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } @@ -3537,10 +3537,13 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } + + if (!skb_queue_empty(&sd->tofree_queue)) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } static int napi_gro_complete(struct sk_buff *skb) @@ -4045,10 +4048,17 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = &__get_cpu_var(softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + struct sk_buff *skb; void *have; local_irq_disable(); + while ((skb = __skb_dequeue(&sd->tofree_queue))) { + local_irq_enable(); + kfree_skb(skb); + local_irq_disable(); + } + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -6530,6 +6540,9 @@ static int dev_cpu_callback(struct notifier_block *nfb, netif_rx(skb); input_queue_head_incr(oldsd); } + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) { + kfree_skb(skb); + } return NOTIFY_OK; } @@ -6802,8 +6815,9 @@ static int __init net_dev_init(void) struct softnet_data *sd = &per_cpu(softnet_data, i); memset(sd, 0, sizeof(*sd)); - skb_queue_head_init(&sd->input_pkt_queue); - 
skb_queue_head_init(&sd->process_queue); + skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init_raw(&sd->process_queue); + skb_queue_head_init_raw(&sd->tofree_queue); sd->completion_queue = NULL; INIT_LIST_HEAD(&sd->poll_list); sd->output_queue = NULL; -- cgit v0.10.2 From c257932f43b73f6a414e9dbb0f4be0dadfd3665a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 14:05:05 +0200 Subject: x86-no-perf-irq-work-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703..129b8bb 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -18,6 +18,7 @@ void smp_irq_work_interrupt(struct pt_regs *regs) irq_exit(); } +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -28,3 +29,4 @@ void arch_irq_work_raise(void) apic_wait_icr_idle(); #endif } +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b..170c2ea 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -107,8 +107,10 @@ void irq_work_run(void) if (llist_empty(this_list)) return; +#ifndef CONFIG_PREEMPT_RT_FULL BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); +#endif llnode = llist_del_all(this_list); while (llnode != NULL) { diff --git a/kernel/timer.c b/kernel/timer.c index 8af44de..374e7b1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1419,7 +1419,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) if (in_irq()) irq_work_run(); #endif @@ -1433,6 +1433,10 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) + irq_work_run(); +#endif + printk_tick(); hrtimer_run_pending(); -- cgit v0.10.2 From 00c59821fe6309ba22590cbe2a93e2b769bb8c4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Sep 2012 14:50:37 +0200 Subject: printk-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/kernel/printk.c b/kernel/printk.c index df4b56b..2609082 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1315,6 +1315,7 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!console_drivers) return; + migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1327,6 +1328,7 @@ static void call_console_drivers(int level, const char *text, size_t len) continue; con->write(con, text, len); } + migrate_enable(); } /* @@ -1386,12 +1388,18 @@ static inline int can_use_console(unsigned int cpu) * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(unsigned int cpu, unsigned long flags) __releases(&logbuf_lock) { int retval = 0, wake = 0; +#ifdef CONFIG_PREEMPT_RT_FULL + int lock = !early_boot_irqs_disabled && !irqs_disabled_flags(flags) && + (preempt_count() <= 1); +#else + int lock = 1; +#endif - if (console_trylock()) { + if (lock && console_trylock()) { retval = 1; /* @@ -1670,8 +1678,15 @@ asmlinkage int vprintk_emit(int facility, int level, * The console_trylock_for_printk() function will release 'logbuf_lock' * regardless of whether it actually gets the console semaphore or not. 
*/ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk(this_cpu, flags)) { +#ifndef CONFIG_PREEMPT_RT_FULL + console_unlock(); +#else + raw_local_irq_restore(flags); console_unlock(); + raw_local_irq_save(flags); +#endif + } lockdep_on(); out_restore_irqs: @@ -2060,11 +2075,16 @@ static void console_cont_flush(char *text, size_t size) goto out; len = cont_print_text(text, size); +#ifndef CONFIG_PREEMPT_RT_FULL raw_spin_unlock(&logbuf_lock); stop_critical_timings(); call_console_drivers(cont.level, text, len); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(cont.level, text, len); +#endif return; out: raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -2147,12 +2167,17 @@ skip: console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; - raw_spin_unlock(&logbuf_lock); +#ifndef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(level, text, len); +#endif } console_locked = 0; -- cgit v0.10.2 From 4a6e8f41050664c793a4a3beb764c5be2604d41f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Apr 2013 17:09:31 -0500 Subject: Powerpc: Use generic rwsem on RT Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dabe429..d2bf89e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT config RWSEM_GENERIC_SPINLOCK bool + default y if PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM bool - default y + default y if !PREEMPT_RT_FULL config GENERIC_LOCKBREAK bool -- cgit v0.10.2 From e60e15e60ae205405edccf598db30beeaddccf3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:08:34 +0200 Subject: power-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d2bf89e..bd0885eb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -291,7 +291,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" - depends on PPC32 + depends on PPC32 && !PREEMPT_RT_FULL source kernel/Kconfig.hz source kernel/Kconfig.preempt -- cgit v0.10.2 From f478821c4f9eecf14e0e13ef3ad6ba79faff41d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:09:28 +0200 Subject: arm-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4890796..772c2d3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1752,7 +1752,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" - depends on MMU + depends on MMU && !PREEMPT_RT_FULL help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address -- cgit v0.10.2 From d68ca15307b6cd1b393c4d454ca3657062cd500b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 May 2010 18:29:35 +0200 Subject: ARM: at91: tclib: Default to tclib timer for RT RT is not too happy about the shared timer interrupt in AT91 devices. Default to tclib timer for RT. 
Signed-off-by: Thomas Gleixner diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 25ffcfd..bb07512 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -63,6 +63,7 @@ config ATMEL_PWM config ATMEL_TCLIB bool "Atmel AT32/AT91 Timer/Counter Library" depends on (AVR32 || ARCH_AT91) + default y if PREEMPT_RT_FULL help Select this if you want a library to allocate the Timer/Counter blocks found on many Atmel processors. This facilitates using @@ -95,7 +96,7 @@ config ATMEL_TCB_CLKSRC_BLOCK config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK bool "TC Block use 32 KiHz clock" depends on ATMEL_TCB_CLKSRC - default y + default y if !PREEMPT_RT_FULL help Select this to use 32 KiHz base clock rate as TC block clock source for clock events. -- cgit v0.10.2 From f9ef56f04da6196037457fbeab409e19023c4b27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:10:12 +0200 Subject: mips-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2ac626a..1c355a1 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2102,7 +2102,7 @@ config CPU_R4400_WORKAROUNDS # config HIGHMEM bool "High Memory Support" - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !PREEMPT_RT_FULL config CPU_SUPPORTS_HIGHMEM bool -- cgit v0.10.2 From ae5a202857700dc5f731646883e01f8123fa6505 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Oct 2011 10:48:39 -0400 Subject: net: Avoid livelock in net_tx_action() on RT qdisc_lock is taken w/o disabling interrupts or bottom halfs. So code holding a qdisc_lock() can be interrupted and softirqs can run on the return of interrupt in !RT. The spin_trylock() in net_tx_action() makes sure, that the softirq does not deadlock. When the lock can't be acquired q is requeued and the NET_TX softirq is raised. That causes the softirq to run over and over. That works in mainline as do_softirq() has a retry loop limit and leaves the softirq processing in the interrupt return path and schedules ksoftirqd. The task which holds qdisc_lock cannot be preempted, so the lock is released and either ksoftirqd or the next softirq in the return from interrupt path can proceed. Though it's a bit strange to actually run MAX_SOFTIRQ_RESTART (10) loops before it decides to bail out even if it's clear in the first iteration :) On RT all softirq processing is done in a FIFO thread and we don't have a loop limit, so ksoftirqd preempts the lock holder forever and unqueues and requeues until the reset button is hit. Due to the forced threading of ksoftirqd on RT we actually cannot deadlock on qdisc_lock because it's a "sleeping lock". So it's safe to replace the spin_trylock() with a spin_lock(). When contended, ksoftirqd is scheduled out and the lock holder can proceed. [ tglx: Massaged changelog and code comments ] Solved-by: Thomas Gleixner Signed-off-by: Steven Rostedt Tested-by: Carsten Emde Cc: Clark Williams Cc: John Kacur Cc: Luis Claudio R. Goncalves Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index 3eda126..6c87379 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3125,6 +3125,36 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT runs ksoftirqd as a real time thread and the root_lock is a + * "sleeping spinlock". 
If the trylock fails then we can go into an + * infinite loop when ksoftirqd preempted the task which actually + * holds the lock, because we requeue q and raise NET_TX softirq + * causing ksoftirqd to loop forever. + * + * It's safe to use spin_lock on RT here as softirqs run in thread + * context and cannot deadlock against the thread which is holding + * root_lock. + * + * On !RT the trylock might fail, but there we bail out from the + * softirq loop after 10 attempts which we can't do on RT. And the + * task holding root_lock cannot be preempted, so the only downside of + * that trylock is that we need 10 loops to decide that we should have + * given up in the first one :) + */ +static inline int take_root_lock(spinlock_t *lock) +{ + spin_lock(lock); + return 1; +} +#else +static inline int take_root_lock(spinlock_t *lock) +{ + return spin_trylock(lock); +} +#endif + static void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = &__get_cpu_var(softnet_data); @@ -3163,7 +3193,7 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { + if (take_root_lock(root_lock)) { smp_mb__before_clear_bit(); clear_bit(__QDISC_STATE_SCHED, &q->state); -- cgit v0.10.2 From 738dc294e30e50c521e24867b18e67344b840b73 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:51:17 +0100 Subject: net: sysrq via icmp There are (probably rare) situations when a system has crashed and the system console becomes unresponsive but the network ICMP layer is still alive. Wouldn't it be wonderful if we could then submit a SysRq command via ping? This patch provides that facility. Please consult the updated documentation in Documentation/sysrq.txt for details. Signed-off-by: Carsten Emde diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 2a4cdda..67ac78c 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -57,10 +57,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - , On other - If you know of the key combos for other architectures, please let me know so I can add them to this section. -On all - write a character to /proc/sysrq-trigger. e.g.: - +On all - write a character to /proc/sysrq-trigger, e.g.: echo t > /proc/sysrq-trigger +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g. + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq + Send an ICMP echo request with this pattern plus the particular + SysRq command key. Example: + # ping -c1 -s57 -p0102030468 + will trigger the SysRq-H (help) command. + + * What are the 'command' keys?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 'b' - Will immediately reboot the system without syncing or unmounting diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2ae2b83..17e815d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -56,6 +56,7 @@ struct netns_ipv4 { int sysctl_icmp_echo_ignore_all; int sysctl_icmp_echo_ignore_broadcasts; + int sysctl_icmp_echo_sysrq; int sysctl_icmp_ignore_bogus_error_responses; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3ac5dff..d8bbe94 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -768,6 +769,30 @@ static void icmp_redirect(struct sk_buff *skb) } /* + * 32bit and 64bit have different timestamp length, so we check for + * the cookie at offset 20 and verify it is repeated at offset 50 + */ +#define CO_POS0 20 +#define CO_POS1 50 +#define CO_SIZE sizeof(int) +#define ICMP_SYSRQ_SIZE 57 + +/* + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie + * pattern and if it matches send the next byte as a trigger to sysrq. + */ +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb) +{ + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq); + char *p = skb->data; + + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) && + !memcmp(&cookie, p + CO_POS1, CO_SIZE) && + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE]) + handle_sysrq(p[CO_POS0 + CO_SIZE]); +} + +/* * Handle ICMP_ECHO ("ping") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo @@ -794,6 +819,11 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); + + if (skb->len == ICMP_SYSRQ_SIZE && + net->ipv4.sysctl_icmp_echo_sysrq) { + icmp_check_sysrq(net, skb); + } } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d84400b..44bf3b0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -815,6 +815,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_echo_sysrq", + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(int), -- cgit v0.10.2 From 5cda76efb06a4fea245bf23510188719d8e2dddc Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jul 2011 12:42:23 -0500 Subject: kgdb/serial: Short term workaround On 07/27/2011 04:37 PM, Thomas Gleixner wrote: > - KGDB (not yet disabled) is reportedly unusable on -rt right now due > to missing hacks in the console locking which I dropped on purpose. > To work around this in the short term you can use this patch, in addition to the clocksource watchdog patch that Thomas brewed up. Comments are welcome of course. Ultimately the right solution is to change separation between the console and the HW to have a polled mode + work queue so as not to introduce any kind of latency. Thanks, Jason. 
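The essence of the workaround is to treat kdb output like oops/sysrq output in the UART console write path: only try the port lock instead of blocking on it. A condensed sketch of the pattern (illustrative only; the putchar helper is hypothetical, in_kdb_printk() comes from the patch below):

/*
 * Sketch of the trylock pattern used below. If kdb, sysrq or an oops
 * is printing, the port lock may already be held by the interrupted
 * context, so only try it and accept losing output rather than
 * deadlocking.
 */
static void example_console_write(struct uart_port *port,
				  const char *s, unsigned int count)
{
	unsigned long flags;
	int locked = 1;

	if (port->sysrq || oops_in_progress || in_kdb_printk())
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	uart_console_write(port, s, count, example_putchar); /* hypothetical putchar */

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);
}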
diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index 25a5451..f318c96 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_SPARC #include #endif @@ -2909,7 +2910,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - if (port->sysrq || oops_in_progress) + if (port->sysrq || oops_in_progress || in_kdb_printk()) locked = spin_trylock_irqsave(&port->lock, flags); else spin_lock_irqsave(&port->lock, flags); diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 7f6fe6e..680ad23 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -115,7 +115,7 @@ extern int kdb_trap_printk; extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); - +#define in_kdb_printk() (kdb_trap_printk) extern void kdb_init(int level); /* Access to kdb specific polling devices */ @@ -150,6 +150,7 @@ extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, extern int kdb_unregister(char *); #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } +#define in_kdb_printk() (0) static inline void kdb_init(int level) {} static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, char *help, short minlen) { return 0; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff484..399dba6 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -554,7 +554,6 @@ int vkdb_printf(const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int got_printf_lock = 0; int retlen = 0; int fnd, len; @@ -565,8 +564,6 @@ int vkdb_printf(const char *fmt, va_list ap) unsigned long uninitialized_var(flags); preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, @@ -833,7 +830,6 @@ kdb_print_out: } else { __release(kdb_printf_lock); } - kdb_trap_printk = saved_trap_printk; preempt_enable(); return retlen; } @@ -843,9 +839,11 @@ int kdb_printf(const char *fmt, ...) va_list ap; int r; + kdb_trap_printk++; va_start(ap, fmt); r = vkdb_printf(fmt, ap); va_end(ap); + kdb_trap_printk--; return r; } -- cgit v0.10.2 From 27d303ac8047747ae0ad66624cecf30578b50f5d Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Sat, 30 Jul 2011 21:55:53 -0500 Subject: add /sys/kernel/realtime entry Add a /sys/kernel entry to indicate that the kernel is a realtime kernel. Clark says that he needs this for udev rules, udev needs to evaluate if its a PREEMPT_RT kernel a few thousand times and parsing uname output is too slow or so. Are there better solutions? Should it exist and return 0 on !-rt? 
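For the udev use case, the consumer side could be as simple as the following userspace sketch (it assumes the attribute reads "1" on an RT kernel and is simply absent otherwise):

#include <stdio.h>

/* Returns 1 on a PREEMPT_RT_FULL kernel, 0 otherwise or when the
 * attribute does not exist (e.g. on a !-rt kernel). Sketch only. */
static int kernel_is_realtime(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (!f)
		return 0;
	if (fscanf(f, "%d", &rt) != 1)
		rt = 0;
	fclose(f);
	return rt == 1;
}

int main(void)
{
	printf("realtime kernel: %s\n", kernel_is_realtime() ? "yes" : "no");
	return 0;
}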
Signed-off-by: Clark Williams Signed-off-by: Peter Zijlstra diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c..1c991e3 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -132,6 +132,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ +#if defined(CONFIG_PREEMPT_RT_FULL) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -197,6 +206,9 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, +#ifdef CONFIG_PREEMPT_RT_FULL + &realtime_attr.attr, +#endif NULL }; -- cgit v0.10.2 From d45d69c38799a8d5051914f76114fc0631791fc4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jul 2011 10:43:51 +0200 Subject: mm, rt: kmap_atomic scheduling In fact, with migrate_disable() existing one could play games with kmap_atomic. You could save/restore the kmap_atomic slots on context switch (if there are any in use of course), this should be esp easy now that we have a kmap_atomic stack. Something like the below.. it wants replacing all the preempt_disable() stuff with pagefault_disable() && migrate_disable() of course, but then you can flip kmaps around like below. Signed-off-by: Peter Zijlstra [dvhart@linux.intel.com: build fix] Link: http://lkml.kernel.org/r/1311842631.5890.208.camel@twins [tglx@linutronix.de: Get rid of the per cpu variable and store the idx and the pte content right away in the task struct. Shortens the context switch code. ] diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5a8905..139ad27 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -216,6 +217,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_PREEMPT_RT_FULL +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + pte_t *ptep = kmap_pte - idx; + + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_pte(kmap_pte - idx, next_p->kmap_pte[i]); + } +} +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * switch_to(x,y) should switch tasks from x to y. @@ -295,6 +325,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + switch_kmaps(prev_p, next_p); + /* * Leave lazy mode, flushing any hypercalls made here. 
* This must be done before restoring TLS segments so diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 6f31ee5..01f7c99 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(kunmap); */ void *kmap_atomic_prot(struct page *page, pgprot_t prot) { + pte_t pte = mk_pte(page, prot); unsigned long vaddr; int idx, type; @@ -44,7 +45,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte-idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -87,6 +91,9 @@ void __kunmap_atomic(void *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. */ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); arch_flush_lazy_mmu_mode(); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b4..0c953e3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { + pte_t pte = pfn_pte(pfn, prot); unsigned long vaddr; int idx, type; @@ -64,7 +65,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte - idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -110,6 +114,9 @@ iounmap_atomic(void __iomem *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. 
*/ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ef788b5..4ec3d97 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -85,32 +85,51 @@ static inline void __kunmap_atomic(void *addr) #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#ifndef CONFIG_PREEMPT_RT_FULL DECLARE_PER_CPU(int, __kmap_atomic_idx); +#endif static inline int kmap_atomic_idx_push(void) { +#ifndef CONFIG_PREEMPT_RT_FULL int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; -#ifdef CONFIG_DEBUG_HIGHMEM +# ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); BUG_ON(idx > KM_TYPE_NR); -#endif +# endif return idx; +#else + current->kmap_idx++; + BUG_ON(current->kmap_idx > KM_TYPE_NR); + return current->kmap_idx - 1; +#endif } static inline int kmap_atomic_idx(void) { +#ifndef CONFIG_PREEMPT_RT_FULL return __this_cpu_read(__kmap_atomic_idx) - 1; +#else + return current->kmap_idx - 1; +#endif } static inline void kmap_atomic_idx_pop(void) { -#ifdef CONFIG_DEBUG_HIGHMEM +#ifndef CONFIG_PREEMPT_RT_FULL +# ifdef CONFIG_DEBUG_HIGHMEM int idx = __this_cpu_dec_return(__kmap_atomic_idx); BUG_ON(idx < 0); -#else +# else __this_cpu_dec(__kmap_atomic_idx); +# endif +#else + current->kmap_idx--; +# ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(current->kmap_idx < 0); +# endif #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8589292..747e0a6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -23,6 +23,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -1635,6 +1636,12 @@ struct task_struct { struct rcu_head put_rcu; int softirq_nestcnt; #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 + int kmap_idx; + pte_t kmap_pte[KM_TYPE_NR]; +# endif +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/highmem.c b/mm/highmem.c index b32b70c..b1c7d43 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,10 +29,11 @@ #include #include - +#ifndef CONFIG_PREEMPT_RT_FULL #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif +#endif /* * Virtual_count is not a pure "count". @@ -47,8 +48,9 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); - +#ifndef CONFIG_PREEMPT_RT_FULL EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +#endif unsigned int nr_free_highpages (void) { diff --git a/mm/memory.c b/mm/memory.c index 9d55b54..2f62dab 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3672,6 +3672,7 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { + migrate_disable(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault @@ -3689,6 +3690,7 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; + migrate_enable(); } EXPORT_SYMBOL(pagefault_enable); #endif -- cgit v0.10.2 From 620e478a6db371238797d99be971524d4d9d3a68 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 17:09:55 +0100 Subject: x86/highmem: add a "already used pte" check This is a copy from kmap_atomic_prot(). 
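For context, a minimal usage sketch of the kmap_atomic() pattern the changes above target (illustrative only; with the RT changes the per-task kmap slots are saved and restored across context switches, so the section stays pagefault-disabled but may be preempted):

#include <linux/highmem.h>
#include <linux/string.h>

/* Illustrative only: copy data into a (possibly highmem) page. */
static void copy_into_page(struct page *page, const void *src, size_t len)
{
	void *vaddr = kmap_atomic(page);

	memcpy(vaddr, src, len);
	kunmap_atomic(vaddr);
}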
Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 0c953e3..62377d6 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -65,6 +65,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + WARN_ON(!pte_none(*(kmap_pte - idx))); + #ifdef CONFIG_PREEMPT_RT_FULL current->kmap_pte[type] = pte; #endif -- cgit v0.10.2 From 902099faef68019ef6e67b3fab402e6a387057e3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 21:37:27 +0100 Subject: arm/highmem: flush tlb on unmap The tlb should be flushed on unmap and thus make the mapping entry invalid. This is only done in the non-debug case which does not look right. Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 21b9e1b..3688c8b 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -95,10 +95,10 @@ void __kunmap_atomic(void *kvaddr) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - set_top_pte(vaddr, __pte(0)); #else (void) idx; /* to kill a warning */ #endif + set_top_pte(vaddr, __pte(0)); kmap_atomic_idx_pop(); } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { /* this address was obtained through kmap_high_get() */ -- cgit v0.10.2 From c6455064d0093cb0ed200e37c276e10cb5c18ca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Feb 2013 11:03:11 +0100 Subject: arm-enable-highmem-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 772c2d3..4890796 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1752,7 +1752,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" - depends on MMU && !PREEMPT_RT_FULL + depends on MMU help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h index fa09e6b..fbd0ba7 100644 --- a/arch/arm/include/asm/switch_to.h +++ b/arch/arm/include/asm/switch_to.h @@ -3,6 +3,14 @@ #include +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + + /* * switch_to(prev, next) should switch from task `prev' to `next' * `prev' will never be the same as `next'. schedule() itself @@ -12,6 +20,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info #define switch_to(prev,next,last) \ do { \ + switch_kmaps(prev, next); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 3688c8b..bd41dd8 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -38,6 +38,7 @@ EXPORT_SYMBOL(kunmap); void *kmap_atomic(struct page *page) { + pte_t pte = mk_pte(page, kmap_prot); unsigned int idx; unsigned long vaddr; void *kmap; @@ -76,7 +77,10 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_top_pte(vaddr, mk_pte(page, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_top_pte(vaddr, pte); return (void *)vaddr; } @@ -93,6 +97,9 @@ void __kunmap_atomic(void *kvaddr) if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); #else @@ -110,6 +117,7 @@ EXPORT_SYMBOL(__kunmap_atomic); void *kmap_atomic_pfn(unsigned long pfn) { + pte_t pte = pfn_pte(pfn, kmap_prot); unsigned long vaddr; int idx, type; @@ -121,7 +129,10 @@ void *kmap_atomic_pfn(unsigned long pfn) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(get_top_pte(vaddr))); #endif - set_top_pte(vaddr, pfn_pte(pfn, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_top_pte(vaddr, pte); return (void *)vaddr; } @@ -135,3 +146,29 @@ struct page *kmap_atomic_to_page(const void *ptr) return pte_page(get_top_pte(vaddr)); } + +#if defined CONFIG_PREEMPT_RT_FULL +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + set_top_pte(__fix_to_virt(FIX_KMAP_BEGIN + idx), __pte(0)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_top_pte(__fix_to_virt(FIX_KMAP_BEGIN + idx), + next_p->kmap_pte[i]); + } +} +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4ec3d97..84223de 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -7,6 +7,7 @@ #include #include #include +#include #include -- cgit v0.10.2 From 651ceecd0c5d89012cec986cdf0b37e12c3c4ef3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Sep 2011 15:09:40 +0200 Subject: ipc/sem: Rework semaphore wakeups Current sysv sems have a weird ass wakeup scheme that involves keeping preemption disabled over a potential O(n^2) loop and busy waiting on that on other CPUs. Kill this and simply wake the task directly from under the sem_lock. 
This was discovered by a migrate_disable() debug feature that disallows: spin_lock(); preempt_disable(); spin_unlock() preempt_enable(); Cc: Manfred Spraul Suggested-by: Thomas Gleixner Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Manfred Spraul Link: http://lkml.kernel.org/r/1315994224.5040.1.camel@twins Signed-off-by: Thomas Gleixner diff --git a/ipc/sem.c b/ipc/sem.c index 58d31f1..d7bdd84 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -461,6 +461,13 @@ undo: static void wake_up_sem_queue_prepare(struct list_head *pt, struct sem_queue *q, int error) { +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *p = q->sleeper; + get_task_struct(p); + q->status = error; + wake_up_process(p); + put_task_struct(p); +#else if (list_empty(pt)) { /* * Hold preempt off so that we don't get preempted and have the @@ -472,6 +479,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, q->pid = error; list_add_tail(&q->simple_list, pt); +#endif } /** @@ -485,6 +493,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, */ static void wake_up_sem_queue_do(struct list_head *pt) { +#ifndef CONFIG_PREEMPT_RT_BASE struct sem_queue *q, *t; int did_something; @@ -497,6 +506,7 @@ static void wake_up_sem_queue_do(struct list_head *pt) } if (did_something) preempt_enable(); +#endif } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) -- cgit v0.10.2 From 10ca88e2c48b1ae14aa007c07fca2b405d4826d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Nov 2011 12:26:18 +0100 Subject: x86-kvm-require-const-tsc-for-rt.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c243b81..f384710 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5242,6 +5242,13 @@ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT_FULL + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); + return -EOPNOTSUPP; + } +#endif + r = kvm_mmu_module_init(); if (r) goto out_free_percpu; -- cgit v0.10.2 From 1d7dd77b0e38ebb85badf91974be866449aa6e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Nov 2011 14:00:48 +0100 Subject: scsi-fcoe-rt-aware.patch Signed-off-by: Thomas Gleixner diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 666b7ac..61c1d2a 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1272,7 +1272,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) struct sk_buff *skb; #ifdef CONFIG_SMP struct fcoe_percpu_s *p0; - unsigned targ_cpu = get_cpu(); + unsigned targ_cpu = get_cpu_light(); #endif /* CONFIG_SMP */ FCOE_DBG("Destroying receive thread for CPU %d\n", cpu); @@ -1328,7 +1328,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) kfree_skb(skb); spin_unlock_bh(&p->fcoe_rx_list.lock); } - put_cpu(); + put_cpu_light(); #else /* * This a non-SMP scenario where the singular Rx thread is @@ -1546,11 +1546,11 @@ err2: static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @@ -1745,11 +1745,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) 
printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @@ -1825,13 +1825,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 4a909d7..69d36cc 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -792,7 +792,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index c772d8d..67ca13a 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -730,10 +730,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { -- cgit v0.10.2 From 571d6cf4e8ca219be4b8139324b60a5c901111c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2011 18:19:27 +0100 Subject: x86: crypto: Reduce preempt disabled regions Restrict the preempt disabled regions to the actual floating point operations and enable preemption for the administrative actions. This is necessary on RT to avoid that kfree and other operations are called with preemption disabled. 
Reported-and-tested-by: Carsten Emde Signed-off-by: Peter Zijlstra Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1b9c22b..653314e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -250,14 +250,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); + nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -274,14 +274,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -298,14 +298,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -322,14 +322,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -362,18 +362,20 @@ static int ctr_crypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + kernel_fpu_begin(); aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } if (walk.nbytes) { + kernel_fpu_begin(); ctr_crypt_final(ctx, &walk); + kernel_fpu_end(); err = blkcipher_walk_done(desc, &walk, 0); } - kernel_fpu_end(); return err; } -- cgit v0.10.2 From 7e5afc5e4f36acd9d31b66d675c932768a310d42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 23:06:09 +0100 Subject: dm: Make rt aware Use the BUG_ON_NORT variant for the irq_disabled() checks. RT has interrupts legitimately enabled here as we cant deadlock against the irq thread due to the "sleeping spinlocks" conversion. Reported-by: Luis Claudio R. 
Goncalves Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d8f086..0e1699e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1677,14 +1677,14 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); } goto out; requeued: - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); delay_and_out: -- cgit v0.10.2 From 5a5757b3d4207a77408c336e0bcf4ec29bbafe0b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Dec 2011 01:03:49 +0100 Subject: cpumask: Disable CONFIG_CPUMASK_OFFSTACK for RT We can't deal with the cpumask allocations which happen in atomic context (see arch/x86/kernel/apic/io_apic.c) on RT right now. Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b866b2..989e191 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -775,7 +775,7 @@ config IOMMU_HELPER config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL - select CPUMASK_OFFSTACK + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL ---help--- Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. diff --git a/lib/Kconfig b/lib/Kconfig index 75cdb77..7669d65 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -315,6 +315,7 @@ config CHECK_SIGNATURE config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + depends on !PREEMPT_RT_FULL help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids -- cgit v0.10.2 From ce9c0cba2799ec2657231ae7b5e86f154b9d2a73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Feb 2012 12:03:30 +0100 Subject: seqlock: Prevent rt starvation If a low prio writer gets preempted while holding the seqlock write locked, a high prio reader spins forever on RT. To prevent this, let the reader grab the spinlock, so it blocks and eventually boosts the writer. This way the writer can proceed and endless spinning is prevented. For seqcount writers we disable preemption over the update code path. Thanks to Al Viro for disentangling some VFS code to make that possible. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1829905..939ea1a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -146,18 +146,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) * Sequence counter only version assumes that callers are using their * own mutexing. */ -static inline void write_seqcount_begin(seqcount_t *s) +static inline void __write_seqcount_begin(seqcount_t *s) { s->sequence++; smp_wmb(); } -static inline void write_seqcount_end(seqcount_t *s) +static inline void write_seqcount_begin(seqcount_t *s) +{ + preempt_disable_rt(); + __write_seqcount_begin(s); +} + +static inline void __write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; } +static inline void write_seqcount_end(seqcount_t *s) +{ + __write_seqcount_end(s); + preempt_enable_rt(); +} + /** * write_seqcount_barrier - invalidate in-progress read-side seq operations * @s: pointer to seqcount_t @@ -198,10 +210,33 @@ typedef struct { /* * Read side functions for starting and finalizing a read side section.
*/ +#ifndef CONFIG_PREEMPT_RT_FULL static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } +#else +/* + * Starvation safe read side for RT + */ +static inline unsigned read_seqbegin(seqlock_t *sl) +{ + unsigned ret; + +repeat: + ret = ACCESS_ONCE(sl->seqcount.sequence); + if (unlikely(ret & 1)) { + /* + * Take the lock and let the writer proceed (i.e. evtl + * boost it), otherwise we could loop here forever. + */ + spin_lock(&sl->lock); + spin_unlock(&sl->lock); + goto repeat; + } + return ret; +} +#endif static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { @@ -216,36 +251,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock(&sl->lock); } static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_bh(&sl->lock); } static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_irq(&sl->lock); } @@ -254,7 +289,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); return flags; } @@ -264,7 +299,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_end(&sl->seqcount); + __write_seqcount_end(&sl->seqcount); spin_unlock_irqrestore(&sl->lock, flags); } diff --git a/include/net/dst.h b/include/net/dst.h index b3ebe17..446d7b1 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -392,7 +392,7 @@ static inline void dst_confirm(struct dst_entry *dst) static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, struct sk_buff *skb) { - const struct hh_cache *hh; + struct hh_cache *hh; if (dst->pending_confirm) { unsigned long now = jiffies; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 0dab173..f28b70c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -334,7 +334,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) } #endif -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq; int hh_len; @@ -389,7 +389,7 @@ struct neighbour_cb { #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n, const struct net_device *dev) { unsigned int seq; -- cgit v0.10.2 From 5673cd604eeee718e563f6a551a82e7cb48df7d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Mar 2012 13:55:33 -0500 Subject: sched/rt: Fix 
wait_task_inactive() to test rt_spin_lock state The wait_task_inactive() will have a task sleep waiting for another task to have a certain state. But it ignores the rt_spin_locks state and can return with an incorrect result if the task it is waiting for is blocked on an rt_spin_lock() and is waking up. The rt_spin_locks save the task's state in the saved_state field and the wait_task_inactive() must also test that state. Signed-off-by: Steven Rostedt Cc: Carsten Emde Cc: John Kacur Cc: Peter Zijlstra Cc: Clark Williams Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/20120301190345.979435764@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d852af..c11a9cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1041,7 +1041,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && unlikely(p->state != match_state) + && unlikely(p->saved_state != match_state)) return 0; cpu_relax(); } @@ -1056,7 +1057,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->on_rq; ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || p->state == match_state + || p->saved_state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From bccf7742b7f3b50cbc0087663f5e1bddf88e54af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Mar 2012 10:36:57 -0500 Subject: cpu: Make hotplug.lock a "sleeping" spinlock on RT Tasks can block on hotplug.lock in pin_current_cpu(), but their state might be != RUNNING. So the mutex wakeup will set the state unconditionally to RUNNING. That might cause spurious unexpected wakeups. We could provide a state preserving mutex_lock() function, but this is semantically backwards. So instead we convert the hotplug.lock() to a spinlock for RT, which has the state preserving semantics already. Signed-off-by: Steven Rostedt Cc: Carsten Emde Cc: John Kacur Cc: Peter Zijlstra Cc: Clark Williams Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/1330702617.25686.265.camel@gandalf.stny.rr.com Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 7b5f4cb..2942f54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -51,7 +51,12 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; +#ifdef CONFIG_PREEMPT_RT_FULL + /* Makes the lock keep the task's state */ + spinlock_t lock; +#else struct mutex lock; /* Synchronizes accesses to refcount, */ +#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation.
@@ -59,10 +64,22 @@ static struct { int refcount; } cpu_hotplug = { .active_writer = NULL, +#ifdef CONFIG_PREEMPT_RT_FULL + .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), +#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), +#endif .refcount = 0, }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) +# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) +#else +# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) +# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) +#endif + struct hotplug_pcp { struct task_struct *unplug; int refcount; @@ -92,8 +109,8 @@ retry: return; } preempt_enable(); - mutex_lock(&cpu_hotplug.lock); - mutex_unlock(&cpu_hotplug.lock); + hotplug_lock(); + hotplug_unlock(); preempt_disable(); goto retry; } @@ -166,9 +183,9 @@ void get_online_cpus(void) might_sleep(); if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -177,14 +194,14 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -216,11 +233,11 @@ static void cpu_hotplug_begin(void) cpu_hotplug.active_writer = current; for (;;) { - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); schedule(); } } @@ -228,7 +245,7 @@ static void cpu_hotplug_begin(void) static void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } #else /* #if CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From 23473a1d9b6223b2ca7b92b10953860a87a71a3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Nov 2011 17:17:09 +0100 Subject: softirq: Check preemption after reenabling interrupts raise_softirq_irqoff() disables interrupts and wakes the softirq daemon, but after reenabling interrupts there is no preemption check, so the execution of the softirq thread might be delayed arbitrarily. In principle we could add that check to local_irq_enable/restore, but that's overkill as the raise_softirq_irqoff() sections are the only ones which show this behaviour.
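The pattern added at each such site, as a sketch (flags and the softirq number are just placeholders from the surrounding code):

	/*
	 * Sketch: after raising the softirq with interrupts off, reenable
	 * interrupts and then let RT reschedule if needed; on !RT the
	 * check compiles to a no-op.
	 */
	local_irq_save(flags);
	__raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
	preempt_check_resched_rt();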
Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 58916af..f7ca9b4 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -38,6 +38,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(blk_iopoll_sched); @@ -135,6 +136,7 @@ static void blk_iopoll_softirq(struct softirq_action *h) __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } /** @@ -204,6 +206,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, &__get_cpu_var(blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 467c8de..3fe2368 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -51,6 +51,7 @@ static void trigger_softirq(void *data) raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } /* @@ -93,6 +94,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, &__get_cpu_var(blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; @@ -150,6 +152,7 @@ do_local: goto do_local; local_irq_restore(flags); + preempt_check_resched_rt(); } /** diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5af46d0..5e71285 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -56,8 +56,10 @@ do { \ #ifndef CONFIG_PREEMPT_RT_BASE # define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() do { } while (0) #else # define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() #endif #define preempt_enable() \ @@ -105,6 +107,7 @@ do { \ #define preempt_disable_notrace() do { } while (0) #define preempt_enable_no_resched_notrace() do { } while (0) #define preempt_enable_notrace() do { } while (0) +#define preempt_check_resched_rt() do { } while (0) #endif /* CONFIG_PREEMPT_COUNT */ diff --git a/net/core/dev.c b/net/core/dev.c index 6c87379..19f2584 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1946,6 +1946,7 @@ static inline void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @@ -1967,6 +1968,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb) sd->completion_queue = skb; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } } EXPORT_SYMBOL(dev_kfree_skb_irq); @@ -3052,6 +3054,7 @@ enqueue: rps_unlock(sd); local_irq_restore(flags); + preempt_check_resched_rt(); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -3932,6 +3935,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static int process_backlog(struct napi_struct *napi, int quota) @@ -4004,6 +4008,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @@ -6560,6 +6565,7 @@ static int dev_cpu_callback(struct 
notifier_block *nfb, raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { -- cgit v0.10.2 From 5370e36a9a17b2c5d245b39fde23a20a617265d1 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Fri, 27 Apr 2012 12:48:46 +0200 Subject: scsi: qla2xxx: Use local_irq_save_nort() in qla2x00_poll RT triggers the following: [ 11.307652] [] __might_sleep+0xe7/0x110 [ 11.307663] [] rt_spin_lock+0x24/0x60 [ 11.307670] [] ? rt_spin_lock_slowunlock+0x78/0x90 [ 11.307703] [] qla24xx_intr_handler+0x63/0x2d0 [qla2xxx] [ 11.307736] [] qla2x00_poll+0x67/0x90 [qla2xxx] Function qla2x00_poll does local_irq_save() before calling qla24xx_intr_handler which has a spinlock. Since spinlocks are sleepable on RT, it is not allowed to take them with interrupts disabled. Therefore we use local_irq_save_nort() instead, which saves flags without disabling interrupts. This fix needs to be applied to v3.0-rt, v3.2-rt and v3.4-rt. Suggested-by: Thomas Gleixner Signed-off-by: John Kacur Cc: Steven Rostedt Cc: David Sommerseth Link: http://lkml.kernel.org/r/1335523726-10024-1-git-send-email-jkacur@redhat.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h index c0462c0..41192aa 100644 --- a/drivers/scsi/qla2xxx/qla_inline.h +++ b/drivers/scsi/qla2xxx/qla_inline.h @@ -36,12 +36,12 @@ qla2x00_poll(struct rsp_que *rsp) { unsigned long flags; struct qla_hw_data *ha = rsp->hw; - local_irq_save(flags); + local_irq_save_nort(flags); if (IS_QLA82XX(ha)) qla82xx_poll(0, rsp); else ha->isp_ops->intr_handler(0, rsp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } static inline uint8_t * -- cgit v0.10.2 From 4d20afb3828d4b59f37e0c9d442e9a964dd2eaae Mon Sep 17 00:00:00 2001 From: Priyanka Jain Date: Thu, 17 May 2012 09:35:11 +0530 Subject: net, RT: Remove preemption disabling in netif_rx() 1) enqueue_to_backlog() (called from netif_rx) should be bound to a particular CPU. This can be achieved by disabling migration. No need to disable preemption. 2) Fixes crash "BUG: scheduling while atomic: ksoftirqd" in case of RT. If preemption is disabled, enqueue_to_backlog() is called in atomic context. And if backlog exceeds its count, kfree_skb() is called. But in RT, kfree_skb() might get scheduled out, so it expects a non-atomic context. 3) When CONFIG_PREEMPT_RT_FULL is not defined, migrate_enable() and migrate_disable() map to preempt_enable() and preempt_disable(), so there is no change in functionality in the non-RT case.
-Replace preempt_enable(), preempt_disable() with migrate_enable(), migrate_disable() respectively -Replace get_cpu(), put_cpu() with get_cpu_light(), put_cpu_light() respectively Signed-off-by: Priyanka Jain Acked-by: Rajan Srivastava Cc: Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index 19f2584..3250939 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3092,7 +3092,7 @@ int netif_rx(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); + migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -3102,13 +3102,13 @@ int netif_rx(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); + migrate_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); + put_cpu_light(); } return ret; } -- cgit v0.10.2 From 68d8ac49e21ac491135ca2b71cc88cdc51385aa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 20:51:03 +0100 Subject: rt: Introduce cpu_chill() Retry loops on RT might loop forever when the modifying side was preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill() defaults to cpu_relax() for non RT. On RT it puts the looping task to sleep for a tick so the preempted task can make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/delay.h b/include/linux/delay.h index a6ecb34..e23a7c0 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define cpu_chill() msleep(1) +#else +# define cpu_chill() cpu_relax() +#endif + #endif /* defined(_LINUX_DELAY_H) */ -- cgit v0.10.2 From a442efcdd891980eed4a65af2412e95f05d303fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:00:34 +0100 Subject: fs: dcache: Use cpu_chill() in trylock loops Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. 
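The retry-loop shape being converted, as a minimal sketch (field names are illustrative, not tied to a particular caller):

	/*
	 * Sketch of a trylock retry loop: on RT, cpu_relax() can spin
	 * forever if the lock holder was preempted; cpu_chill() (msleep(1)
	 * on RT, cpu_relax() otherwise) lets the holder run and make
	 * progress.
	 */
again:
	spin_lock(&child->d_lock);
	if (!spin_trylock(&parent->d_lock)) {
		spin_unlock(&child->d_lock);
		cpu_chill();
		goto again;
	}
	/* ... both locks held ... */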
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b785e77..b0f12d7 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 01443ce..5928f92 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -166,7 +166,7 @@ again: parent = p->d_parent; if (!spin_trylock(&parent->d_lock)) { spin_unlock(&p->d_lock); - cpu_relax(); + cpu_chill(); goto relock; } spin_unlock(&p->d_lock); diff --git a/fs/dcache.c b/fs/dcache.c index 19153a0..a3b4587 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -470,7 +471,7 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) if (inode && !spin_trylock(&inode->i_lock)) { relock: spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -852,7 +853,7 @@ relock: if (!spin_trylock(&dentry->d_lock)) { spin_unlock(&dcache_lru_lock); - cpu_relax(); + cpu_chill(); goto relock; } @@ -2084,7 +2085,7 @@ again: if (dentry->d_count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; diff --git a/fs/namespace.c b/fs/namespace.c index 0a189ed..c8b0aaa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include /* fsnotify_vfsmount_delete */ #include #include +#include #include "pnode.h" #include "internal.h" @@ -315,7 +316,7 @@ int __mnt_want_write(struct vfsmount *m) smp_mb(); while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { preempt_enable(); - cpu_relax(); + cpu_chill(); preempt_disable(); } /* -- cgit v0.10.2 From 77927da01054f748106944994ea63fcbabd594ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:10:04 +0100 Subject: net: Use cpu_chill() instead of cpu_relax() Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c111bd0..92a2359 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -88,6 +88,7 @@ #include #include #include +#include #ifdef CONFIG_INET #include @@ -553,7 +554,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) if (BLOCK_NUM_PKTS(pbd)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } @@ -807,7 +808,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, if (!(status & TP_STATUS_BLK_TMO)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... 
*/ - cpu_relax(); + cpu_chill(); } } prb_close_block(pkc, pbd, po, status); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e8fdb17..5a44c6e 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "rds.h" #include "ib.h" @@ -286,7 +287,7 @@ static inline void wait_clean_list_grace(void) for_each_online_cpu(cpu) { flag = &per_cpu(clean_list_grace, cpu); while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); + cpu_chill(); } } -- cgit v0.10.2 From 57f1fccabda40ec21ada338cdb8a1bc16bc8f459 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 15:01:55 +0800 Subject: lockdep: Selftest: convert spinlock to raw spinlock spinlock is sleepable on -rt and can not be used in interrupt context. Signed-off-by: Yong Zhang Cc: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 7aae0f2..c3eb261 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -47,10 +47,10 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose); * Normal standalone locks, for the circular and irq-context * dependency tests: */ -static DEFINE_SPINLOCK(lock_A); -static DEFINE_SPINLOCK(lock_B); -static DEFINE_SPINLOCK(lock_C); -static DEFINE_SPINLOCK(lock_D); +static DEFINE_RAW_SPINLOCK(lock_A); +static DEFINE_RAW_SPINLOCK(lock_B); +static DEFINE_RAW_SPINLOCK(lock_C); +static DEFINE_RAW_SPINLOCK(lock_D); static DEFINE_RWLOCK(rwlock_A); static DEFINE_RWLOCK(rwlock_B); @@ -73,12 +73,12 @@ static DECLARE_RWSEM(rwsem_D); * but X* and Y* are different classes. We do this so that * we do not trigger a real lockup: */ -static DEFINE_SPINLOCK(lock_X1); -static DEFINE_SPINLOCK(lock_X2); -static DEFINE_SPINLOCK(lock_Y1); -static DEFINE_SPINLOCK(lock_Y2); -static DEFINE_SPINLOCK(lock_Z1); -static DEFINE_SPINLOCK(lock_Z2); +static DEFINE_RAW_SPINLOCK(lock_X1); +static DEFINE_RAW_SPINLOCK(lock_X2); +static DEFINE_RAW_SPINLOCK(lock_Y1); +static DEFINE_RAW_SPINLOCK(lock_Y2); +static DEFINE_RAW_SPINLOCK(lock_Z1); +static DEFINE_RAW_SPINLOCK(lock_Z2); static DEFINE_RWLOCK(rwlock_X1); static DEFINE_RWLOCK(rwlock_X2); @@ -107,10 +107,10 @@ static DECLARE_RWSEM(rwsem_Z2); */ #define INIT_CLASS_FUNC(class) \ static noinline void \ -init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \ - struct rw_semaphore *rwsem) \ +init_class_##class(raw_spinlock_t *lock, rwlock_t *rwlock, \ + struct mutex *mutex, struct rw_semaphore *rwsem)\ { \ - spin_lock_init(lock); \ + raw_spin_lock_init(lock); \ rwlock_init(rwlock); \ mutex_init(mutex); \ init_rwsem(rwsem); \ @@ -168,10 +168,10 @@ static void init_shared_classes(void) * Shortcuts for lock/unlock API variants, to keep * the testcases compact: */ -#define L(x) spin_lock(&lock_##x) -#define U(x) spin_unlock(&lock_##x) +#define L(x) raw_spin_lock(&lock_##x) +#define U(x) raw_spin_unlock(&lock_##x) #define LU(x) L(x); U(x) -#define SI(x) spin_lock_init(&lock_##x) +#define SI(x) raw_spin_lock_init(&lock_##x) #define WL(x) write_lock(&rwlock_##x) #define WU(x) write_unlock(&rwlock_##x) @@ -911,7 +911,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #define I2(x) \ do { \ - spin_lock_init(&lock_##x); \ + raw_spin_lock_init(&lock_##x); \ rwlock_init(&rwlock_##x); \ mutex_init(&mutex_##x); \ init_rwsem(&rwsem_##x); \ -- cgit v0.10.2 From bfd94cbd44677dc9af960bdf93ff9bc23bd64fee Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 
15:01:56 +0800 Subject: lockdep: Selftest: Only do hardirq context test for raw spinlock On -rt there is no softirq context any more and rwlock is sleepable, disable softirq context test and rwlock+irq test. Signed-off-by: Yong Zhang Cc: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-3-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index c3eb261..23b8564 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1175,6 +1175,7 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); +#ifndef CONFIG_PREEMPT_RT_FULL /* * irq-context testcases: */ @@ -1187,6 +1188,28 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#else + /* On -rt, we only do hardirq context test for raw spinlock */ + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); -- cgit v0.10.2 From 898b00d8ea7ac3a3123c2e083f033d2039d7ea35 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Jul 2012 22:05:20 +0000 Subject: fs, jbd: pull your plug when waiting for space With an -rt kernel, and a heavy sync IO load, tasks can jam up on journal locks without unplugging, which can lead to terminal IO starvation. Unplug and schedule when waiting for space. Signed-off-by: Mike Galbraith Cc: Steven Rostedt Cc: Theodore Tso Link: http://lkml.kernel.org/r/1341812414.7370.73.camel@marge.simpson.net Signed-off-by: Thomas Gleixner diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 08c0304..95debd7 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -129,6 +129,8 @@ void __log_wait_for_space(journal_t *journal) if (journal->j_flags & JFS_ABORT) return; spin_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); mutex_lock(&journal->j_checkpoint_mutex); /* -- cgit v0.10.2 From 37aa77b810f0d3d7c78b33fba09700b04929df44 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 11 Jul 2012 22:05:21 +0000 Subject: perf: Make swevent hrtimer run in irq instead of softirq Otherwise we get a deadlock like below: [ 1044.042749] BUG: scheduling while atomic: ksoftirqd/21/141/0x00010003 [ 1044.042752] INFO: lockdep is turned off. 
[ 1044.042754] Modules linked in: [ 1044.042757] Pid: 141, comm: ksoftirqd/21 Tainted: G W 3.4.0-rc2-rt3-23676-ga723175-dirty #29 [ 1044.042759] Call Trace: [ 1044.042761] [] __schedule_bug+0x65/0x80 [ 1044.042770] [] __schedule+0x83c/0xa70 [ 1044.042775] [] ? prepare_to_wait+0x32/0xb0 [ 1044.042779] [] schedule+0x2e/0xa0 [ 1044.042782] [] hrtimer_wait_for_timer+0x6d/0xb0 [ 1044.042786] [] ? wake_up_bit+0x40/0x40 [ 1044.042790] [] hrtimer_cancel+0x20/0x40 [ 1044.042794] [] perf_swevent_cancel_hrtimer+0x3c/0x50 [ 1044.042798] [] task_clock_event_stop+0x11/0x40 [ 1044.042802] [] task_clock_event_del+0xe/0x10 [ 1044.042805] [] event_sched_out+0x118/0x1d0 [ 1044.042809] [] group_sched_out+0x29/0x90 [ 1044.042813] [] __perf_event_disable+0x18e/0x200 [ 1044.042817] [] remote_function+0x63/0x70 [ 1044.042821] [] generic_smp_call_function_single_interrupt+0xce/0x120 [ 1044.042826] [] smp_call_function_single_interrupt+0x27/0x40 [ 1044.042831] [] call_function_single_interrupt+0x6c/0x80 [ 1044.042833] [] ? perf_event_overflow+0x20/0x20 [ 1044.042840] [] ? _raw_spin_unlock_irq+0x30/0x70 [ 1044.042844] [] ? _raw_spin_unlock_irq+0x36/0x70 [ 1044.042848] [] run_hrtimer_softirq+0xc2/0x200 [ 1044.042853] [] ? perf_event_overflow+0x20/0x20 [ 1044.042857] [] __do_softirq_common+0xf5/0x3a0 [ 1044.042862] [] __thread_do_softirq+0x15d/0x200 [ 1044.042865] [] run_ksoftirqd+0xfa/0x210 [ 1044.042869] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042873] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042877] [] kthread+0xb6/0xc0 [ 1044.042881] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042886] [] kernel_thread_helper+0x4/0x10 [ 1044.042889] [] ? finish_task_switch+0x8c/0x110 [ 1044.042894] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042897] [] ? retint_restore_args+0xe/0xe [ 1044.042900] [] ? kthreadd+0x1e0/0x1e0 [ 1044.042902] [] ? gs_change+0xb/0xb Signed-off-by: Yong Zhang Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1341476476-5666-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b6646a..02aad6d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5638,6 +5638,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; + hwc->hrtimer.irqsafe = 1; /* * Since hrtimers have a fixed rate, we can do a static freq->period -- cgit v0.10.2 From 083d644e4a2ddc26875b996e88f11c71ce62bda2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Jul 2012 08:07:43 +0000 Subject: cpu/rt: Rework cpu down for PREEMPT_RT Bringing a CPU down is a pain with the PREEMPT_RT kernel because tasks can be preempted in many more places than in non-RT. In order to handle per_cpu variables, tasks may be pinned to a CPU for a while, and even sleep. But these tasks need to be off the CPU if that CPU is going down. Several synchronization methods have been tried, but when stressed they failed. This is a new approach. A sync_tsk thread is still created and tasks may still block on a lock when the CPU is going down, but how that works is a bit different. When cpu_down() starts, it will create the sync_tsk and wait on it to inform that current tasks that are pinned on the CPU are no longer pinned. But new tasks that are about to be pinned will still be allowed to do so at this time. Then the notifiers are called. 
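As a rough outline of the sequence described in this message (and continued below), the cpu_down() side ends up staged as in the sketch that follows. This is a condensed illustration, not the actual code; the numbered steps compress details explained in the next paragraphs.

	/* Condensed, illustrative staging of cpu_down() on RT: */
	static int rt_cpu_down_outline(unsigned int cpu)
	{
		/*
		 * 1. Create sync_unplug/<cpu> and wait until already-pinned
		 *    tasks leave; new tasks may still pin the CPU here.
		 */
		cpu_unplug_begin(cpu);

		/*
		 * 2. Run the CPU_DOWN notifiers; tasks they depend on are
		 *    still allowed to pin the CPU so they cannot deadlock.
		 */

		/*
		 * 3. Now set grab_lock: new entries into pinned sections block
		 *    on the per-CPU hotplug lock, and the remaining pinned
		 *    tasks are drained.
		 */
		cpu_unplug_sync(cpu);

		/* 4. Take the CPU down via stop_machine() as before. */

		cpu_unplug_done(cpu);
		return 0;
	}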
Several notifiers will bring down tasks that will enter these locations. Some of these tasks will take locks of other tasks that are on the CPU. If we don't let those other tasks continue, but make them block until CPU down is done, the tasks that the notifiers are waiting on will never complete as they are waiting for the locks held by the tasks that are blocked. Thus we still let the task pin the CPU until the notifiers are done. After the notifiers run, we then make new tasks entering the pinned CPU sections grab a mutex and wait. This mutex is now a per CPU mutex in the hotplug_pcp descriptor. To help things along, a new function in the scheduler code is created called migrate_me(). This function will try to migrate the current task off the CPU this is going down if possible. When the sync_tsk is created, all tasks will then try to migrate off the CPU going down. There are several cases that this wont work, but it helps in most cases. After the notifiers are called and if a task can't migrate off but enters the pin CPU sections, it will be forced to wait on the hotplug_pcp mutex until the CPU down is complete. Then the scheduler will force the migration anyway. Also, I found that THREAD_BOUND need to also be accounted for in the pinned CPU, and the migrate_disable no longer treats them special. This helps fix issues with ksoftirqd and workqueue that unbind on CPU down. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 747e0a6..e7e4c96 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1977,6 +1977,10 @@ extern void do_set_cpus_allowed(struct task_struct *p, extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +int migrate_me(void); +void tell_sched_cpu_down_begin(int cpu); +void tell_sched_cpu_down_done(int cpu); + #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) @@ -1989,6 +1993,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return -EINVAL; return 0; } +static inline int migrate_me(void) { return 0; } +static inline void tell_sched_cpu_down_begin(int cpu) { } +static inline void tell_sched_cpu_down_done(int cpu) { } #endif #ifdef CONFIG_NO_HZ diff --git a/kernel/cpu.c b/kernel/cpu.c index 2942f54..d44dea3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -51,12 +51,7 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; -#ifdef CONFIG_PREEMPT_RT_FULL - /* Makes the lock keep the task's state */ - spinlock_t lock; -#else struct mutex lock; /* Synchronizes accesses to refcount, */ -#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
@@ -64,28 +59,46 @@ static struct { int refcount; } cpu_hotplug = { .active_writer = NULL, -#ifdef CONFIG_PREEMPT_RT_FULL - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), -#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#endif .refcount = 0, }; -#ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) -#else -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) -#endif - +/** + * hotplug_pcp - per cpu hotplug descriptor + * @unplug: set when pin_current_cpu() needs to sync tasks + * @sync_tsk: the task that waits for tasks to finish pinned sections + * @refcount: counter of tasks in pinned sections + * @grab_lock: set when the tasks entering pinned sections should wait + * @synced: notifier for @sync_tsk to tell cpu_down it's finished + * @mutex: the mutex to make tasks wait (used when @grab_lock is true) + * @mutex_init: zero if the mutex hasn't been initialized yet. + * + * Although @unplug and @sync_tsk may point to the same task, the @unplug + * is used as a flag and still exists after @sync_tsk has exited and + * @sync_tsk set to NULL. + */ struct hotplug_pcp { struct task_struct *unplug; + struct task_struct *sync_tsk; int refcount; + int grab_lock; struct completion synced; +#ifdef CONFIG_PREEMPT_RT_FULL + spinlock_t lock; +#else + struct mutex mutex; +#endif + int mutex_init; }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) +#else +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex) +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) +#endif + static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); /** @@ -99,18 +112,40 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); void pin_current_cpu(void) { struct hotplug_pcp *hp; + int force = 0; retry: hp = &__get_cpu_var(hotplug_pcp); - if (!hp->unplug || hp->refcount || preempt_count() > 1 || + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 || hp->unplug == current || (current->flags & PF_STOMPER)) { hp->refcount++; return; } - preempt_enable(); - hotplug_lock(); - hotplug_unlock(); + + if (hp->grab_lock) { + preempt_enable(); + hotplug_lock(hp); + hotplug_unlock(hp); + } else { + preempt_enable(); + /* + * Try to push this task off of this CPU. + */ + if (!migrate_me()) { + preempt_disable(); + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->grab_lock) { + /* + * Just let it continue it's already pinned + * or about to sleep. + */ + force = 1; + goto retry; + } + preempt_enable(); + } + } preempt_disable(); goto retry; } @@ -132,26 +167,84 @@ void unpin_current_cpu(void) wake_up_process(hp->unplug); } -/* - * FIXME: Is this really correct under all circumstances ? - */ +static void wait_for_pinned_cpus(struct hotplug_pcp *hp) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } +} + static int sync_unplug_thread(void *data) { struct hotplug_pcp *hp = data; preempt_disable(); hp->unplug = current; + wait_for_pinned_cpus(hp); + + /* + * This thread will synchronize the cpu_down() with threads + * that have pinned the CPU. When the pinned CPU count reaches + * zero, we inform the cpu_down code to continue to the next step. 
+ */ set_current_state(TASK_UNINTERRUPTIBLE); - while (hp->refcount) { - schedule_preempt_disabled(); + preempt_enable(); + complete(&hp->synced); + + /* + * If all succeeds, the next step will need tasks to wait till + * the CPU is offline before continuing. To do this, the grab_lock + * is set and tasks going into pin_current_cpu() will block on the + * mutex. But we still need to wait for those that are already in + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop() + * will kick this thread out. + */ + while (!hp->grab_lock && !kthread_should_stop()) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + /* Make sure grab_lock is seen before we see a stale completion */ + smp_mb(); + + /* + * Now just before cpu_down() enters stop machine, we need to make + * sure all tasks that are in pinned CPU sections are out, and new + * tasks will now grab the lock, keeping them from entering pinned + * CPU sections. + */ + if (!kthread_should_stop()) { + preempt_disable(); + wait_for_pinned_cpus(hp); + preempt_enable(); + complete(&hp->synced); + } + + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } set_current_state(TASK_RUNNING); - preempt_enable(); - complete(&hp->synced); + + /* + * Force this thread off this CPU as it's going down and + * we don't want any more work on this CPU. + */ + current->flags &= ~PF_THREAD_BOUND; + do_set_cpus_allowed(current, cpu_present_mask); + migrate_me(); return 0; } +static void __cpu_unplug_sync(struct hotplug_pcp *hp) +{ + wake_up_process(hp->sync_tsk); + wait_for_completion(&hp->synced); +} + /* * Start the sync_unplug_thread on the target cpu and wait for it to * complete. @@ -159,23 +252,83 @@ static int sync_unplug_thread(void *data) static int cpu_unplug_begin(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - struct task_struct *tsk; + int err; + + /* Protected by cpu_hotplug.lock */ + if (!hp->mutex_init) { +#ifdef CONFIG_PREEMPT_RT_FULL + spin_lock_init(&hp->lock); +#else + mutex_init(&hp->mutex); +#endif + hp->mutex_init = 1; + } + + /* Inform the scheduler to migrate tasks off this CPU */ + tell_sched_cpu_down_begin(cpu); init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); - if (IS_ERR(tsk)) - return (PTR_ERR(tsk)); - kthread_bind(tsk, cpu); - wake_up_process(tsk); - wait_for_completion(&hp->synced); + + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); + if (IS_ERR(hp->sync_tsk)) { + err = PTR_ERR(hp->sync_tsk); + hp->sync_tsk = NULL; + return err; + } + kthread_bind(hp->sync_tsk, cpu); + + /* + * Wait for tasks to get out of the pinned sections, + * it's still OK if new tasks enter. Some CPU notifiers will + * wait for tasks that are going to enter these sections and + * we must not have them block. + */ + __cpu_unplug_sync(hp); + return 0; } +static void cpu_unplug_sync(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + init_completion(&hp->synced); + /* The completion needs to be initialzied before setting grab_lock */ + smp_wmb(); + + /* Grab the mutex before setting grab_lock */ + hotplug_lock(hp); + hp->grab_lock = 1; + + /* + * The CPU notifiers have been completed. + * Wait for tasks to get out of pinned CPU sections and have new + * tasks block until the CPU is completely down. 
+ */ + __cpu_unplug_sync(hp); + + /* All done with the sync thread */ + kthread_stop(hp->sync_tsk); + hp->sync_tsk = NULL; +} + static void cpu_unplug_done(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); hp->unplug = NULL; + /* Let all tasks know cpu unplug is finished before cleaning up */ + smp_wmb(); + + if (hp->sync_tsk) + kthread_stop(hp->sync_tsk); + + if (hp->grab_lock) { + hotplug_unlock(hp); + /* protected by cpu_hotplug.lock */ + hp->grab_lock = 0; + } + tell_sched_cpu_down_done(cpu); } void get_online_cpus(void) @@ -183,9 +336,9 @@ void get_online_cpus(void) might_sleep(); if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -195,14 +348,13 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - hotplug_unlock(); - + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -233,11 +385,11 @@ static void cpu_hotplug_begin(void) cpu_hotplug.active_writer = current; for (;;) { - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); schedule(); } } @@ -245,7 +397,7 @@ static void cpu_hotplug_begin(void) static void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } #else /* #if CONFIG_HOTPLUG_CPU */ @@ -421,6 +573,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } smpboot_park_threads(cpu); + /* Notifiers are done. Don't let any more tasks pin this CPU. */ + cpu_unplug_sync(cpu); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c11a9cc..11e027b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2896,7 +2896,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2927,7 +2927,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -4872,6 +4872,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) cpumask_copy(&p->cpus_allowed, new_mask); } +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. 
+ * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or THREAD_BOUND) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. + */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_THREAD_BOUND || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = &__get_cpu_var(sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; +} + /* * This is how migration works: * -- cgit v0.10.2 From 17b4266d4d84308da258f20377866bfda752be5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Aug 2012 20:38:50 +0200 Subject: random: Make it work on rt Delegate the random insertion to the forced threaded interrupt handler. Store the return IP of the hard interrupt handler in the irq descriptor and feed it into the random generator as a source of entropy. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/drivers/char/random.c b/drivers/char/random.c index 50007e1..9d6c416 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -742,18 +742,16 @@ EXPORT_SYMBOL_GPL(add_input_randomness); static DEFINE_PER_CPU(struct fast_pool, irq_randomness); -void add_interrupt_randomness(int irq, int irq_flags) +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) { struct entropy_store *r; struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); - struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; __u32 input[4], cycles = get_cycles(); input[0] = cycles ^ jiffies; input[1] = irq; - if (regs) { - __u64 ip = instruction_pointer(regs); + if (ip) { input[2] = ip; input[3] = ip >> 32; } @@ -767,7 +765,11 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_pool->last = now; r = nonblocking_pool.initialized ? 
&input_pool : &nonblocking_pool; +#ifndef CONFIG_PREEMPT_RT_FULL __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); +#else + mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); +#endif /* * If we don't have a valid cycle counter, and we see * back-to-back timer interrupts, then skip giving credit for diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 623325e..a7edc47 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -52,6 +52,7 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; + u64 random_ip; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP diff --git a/include/linux/random.h b/include/linux/random.h index d984608..f975382 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -12,7 +12,7 @@ extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip); extern void get_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca17..7f50c55 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,8 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { + struct pt_regs *regs = get_irq_regs(); + u64 ip = regs ? instruction_pointer(regs) : 0; irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; @@ -172,7 +174,11 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - add_interrupt_randomness(irq, flags); +#ifndef CONFIG_PREEMPT_RT_FULL + add_interrupt_randomness(irq, flags, ip); +#else + desc->random_ip = ip; +#endif if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5bc2f41..1fba5cb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -879,6 +879,12 @@ static int irq_thread(void *data) if (!noirqdebug) note_interrupt(action->irq, desc, action_ret); +#ifdef CONFIG_PREEMPT_RT_FULL + migrate_disable(); + add_interrupt_randomness(action->irq, 0, + desc->random_ip ^ (unsigned long) action); + migrate_enable(); +#endif wake_threads_waitq(desc); } -- cgit v0.10.2 From ce8eaa5e9bc699791065c2bb30de923623201deb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Oct 2012 11:02:04 -0400 Subject: softirq: Init softirq local lock after per cpu section is set up I discovered this bug when booting 3.4-rt on my powerpc box. It crashed with the following report: ------------[ cut here ]------------ kernel BUG at /work/rt/stable-rt.git/kernel/rtmutex_common.h:75! 
Oops: Exception in kernel mode, sig: 5 [#1] PREEMPT SMP NR_CPUS=64 NUMA PA Semi PWRficient Modules linked in: NIP: c0000000004aa03c LR: c0000000004aa01c CTR: c00000000009b2ac REGS: c00000003e8d7950 TRAP: 0700 Not tainted (3.4.11-test-rt19) MSR: 9000000000029032 CR: 24000082 XER: 20000000 SOFTE: 0 TASK = c00000003e8fdcd0[11] 'ksoftirqd/1' THREAD: c00000003e8d4000 CPU: 1 GPR00: 0000000000000001 c00000003e8d7bd0 c000000000d6cbb0 0000000000000000 GPR04: c00000003e8fdcd0 0000000000000000 0000000024004082 c000000000011454 GPR08: 0000000000000000 0000000080000001 c00000003e8fdcd1 0000000000000000 GPR12: 0000000024000084 c00000000fff0280 ffffffffffffffff 000000003ffffad8 GPR16: ffffffffffffffff 000000000072c798 0000000000000060 0000000000000000 GPR20: 0000000000642741 000000000072c858 000000003ffffaf0 0000000000000417 GPR24: 000000000072dcd0 c00000003e7ff990 0000000000000000 0000000000000001 GPR28: 0000000000000000 c000000000792340 c000000000ccec78 c000000001182338 NIP [c0000000004aa03c] .wakeup_next_waiter+0x44/0xb8 LR [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 Call Trace: [c00000003e8d7bd0] [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 (unreliable) [c00000003e8d7c60] [c0000000004a0320] .rt_spin_lock_slowunlock+0x8c/0xe4 [c00000003e8d7ce0] [c0000000004a07cc] .rt_spin_unlock+0x54/0x64 [c00000003e8d7d60] [c0000000000636bc] .__thread_do_softirq+0x130/0x174 [c00000003e8d7df0] [c00000000006379c] .run_ksoftirqd+0x9c/0x1a4 [c00000003e8d7ea0] [c000000000080b68] .kthread+0xa8/0xb4 [c00000003e8d7f90] [c00000000001c2f8] .kernel_thread+0x54/0x70 Instruction dump: 60000000 e86d01c8 38630730 4bff7061 60000000 ebbf0008 7c7c1b78 e81d0040 7fe00278 7c000074 7800d182 68000001 <0b000000> e88d01c8 387d0010 38840738 The rtmutex_common.h:75 is: rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list_entry); BUG_ON(w->lock != lock); return w; } Where the waiter->lock is corrupted. I saw various other random bugs that all had to with the softirq lock and plist. As plist needs to be initialized before it is used I investigated how this lock is initialized. It's initialized with: void __init softirq_early_init(void) { local_irq_lock_init(local_softirq_lock); } Where: #define local_irq_lock_init(lvar) \ do { \ int __cpu; \ for_each_possible_cpu(__cpu) \ spin_lock_init(&per_cpu(lvar, __cpu).lock); \ } while (0) As the softirq lock is a local_irq_lock, which is a per_cpu lock, the initialization is done to all per_cpu versions of the lock. But lets look at where the softirq_early_init() is called from. In init/main.c: start_kernel() /* * Interrupts are still disabled. Do necessary setups, then * enable them */ softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ One of the first things that is called is the initialization of the softirq lock. But if you look further down, we see the per_cpu areas have not been set up yet. Thus initializing a local_irq_lock() before the per_cpu section is set up, may not work as it is initializing the per cpu locks before the per cpu exists. By moving the softirq_early_init() right after setup_per_cpu_areas(), the kernel boots fine. 
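The constraint the fix enforces can be summarized with the sketch below. It is an illustrative ordering outline, not the actual start_kernel() (the real hunk follows):

	/* Illustrative ordering constraint, not the real start_kernel(): */
	asmlinkage void __init start_kernel_sketch(void)
	{
		boot_cpu_init();
		/* ... */
		setup_per_cpu_areas();	/* per-CPU areas exist from here on */
		softirq_early_init();	/* touches per_cpu(local_softirq_lock, cpu)
					 * for every possible CPU, so it must run
					 * only after the areas are set up */
		smp_prepare_boot_cpu();
		/* ... */
	}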
Signed-off-by: Steven Rostedt Cc: Clark Williams Cc: John Kacur Cc: Carsten Emde Cc: vomlehn@texas.net Link: http://lkml.kernel.org/r/1349362924.6755.18.camel@gandalf.local.home Signed-off-by: Thomas Gleixner diff --git a/init/main.c b/init/main.c index 4df397d..2d80a11 100644 --- a/init/main.c +++ b/init/main.c @@ -493,7 +493,6 @@ asmlinkage void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ - softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); @@ -504,6 +503,7 @@ asmlinkage void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + softirq_early_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); -- cgit v0.10.2 From 22177270e20d57551701d2898c34415838a1ed23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Sep 2012 11:11:46 +0200 Subject: mm: page_alloc: Use local_lock_on() instead of plain spinlock The plain spinlock while sufficient does not update the local_lock internals. Use a proper local_lock function instead to ease debugging. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/locallock.h b/include/linux/locallock.h index 0161fbb..f1804a3 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -137,6 +137,12 @@ static inline int __local_lock_irqsave(struct local_irq_lock *lv) _flags = __get_cpu_var(lvar).flags; \ } while (0) +#define local_lock_irqsave_on(lvar, _flags, cpu) \ + do { \ + __local_lock_irqsave(&per_cpu(lvar, cpu)); \ + _flags = per_cpu(lvar, cpu).flags; \ + } while (0) + static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, unsigned long flags) { @@ -156,6 +162,11 @@ static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, put_local_var(lvar); \ } while (0) +#define local_unlock_irqrestore_on(lvar, flags, cpu) \ + do { \ + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \ + } while (0) + #define local_spin_trylock_irq(lvar, lock) \ ({ \ int __locked; \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 364c3c6..1734913 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -224,9 +224,9 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock); #ifdef CONFIG_PREEMPT_RT_BASE # define cpu_lock_irqsave(cpu, flags) \ - spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) + local_lock_irqsave_on(pa_lock, flags, cpu) # define cpu_unlock_irqrestore(cpu, flags) \ - spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) + local_unlock_irqrestore_on(pa_lock, flags, cpu) #else # define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) # define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) -- cgit v0.10.2 From ae70961f02eea901b8a68fec0f664e2b6e2431d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Sep 2012 10:49:42 +0100 Subject: rt: rwsem/rwlock: lockdep annotations rwlocks and rwsems on RT do not allow multiple readers. Annotate the lockdep acquire functions accordingly. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/rt.c b/kernel/rt.c index 92a16e1..433ae42 100644 --- a/kernel/rt.c +++ b/kernel/rt.c @@ -216,15 +216,17 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) * write locked. 
*/ migrate_disable(); - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { ret = rt_mutex_trylock(lock); - else if (!rwlock->read_depth) + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + } else if (!rwlock->read_depth) { ret = 0; + } - if (ret) { + if (ret) rwlock->read_depth++; - rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); - } else + else migrate_enable(); return ret; @@ -242,13 +244,13 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) { struct rt_mutex *lock = &rwlock->lock; - rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); - /* * recursive read locks succeed when current owns the lock */ - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); + } rwlock->read_depth++; } @@ -264,11 +266,11 @@ EXPORT_SYMBOL(rt_write_unlock); void __lockfunc rt_read_unlock(rwlock_t *rwlock) { - rwlock_release(&rwlock->dep_map, 1, _RET_IP_); - /* Release the lock only when read_depth is down to 0 */ - if (--rwlock->read_depth == 0) + if (--rwlock->read_depth == 0) { + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); __rt_spin_unlock(&rwlock->lock); + } } EXPORT_SYMBOL(rt_read_unlock); @@ -315,9 +317,10 @@ EXPORT_SYMBOL(rt_up_write); void rt_up_read(struct rw_semaphore *rwsem) { - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - if (--rwsem->read_depth == 0) + if (--rwsem->read_depth == 0) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); rt_mutex_unlock(&rwsem->lock); + } } EXPORT_SYMBOL(rt_up_read); @@ -356,6 +359,13 @@ void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) } EXPORT_SYMBOL(rt_down_write_nested); +void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest) +{ + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} + int rt_down_read_trylock(struct rw_semaphore *rwsem) { struct rt_mutex *lock = &rwsem->lock; @@ -366,15 +376,16 @@ int rt_down_read_trylock(struct rw_semaphore *rwsem) * but not when read_depth == 0 which means that the rwsem is * write locked. */ - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { ret = rt_mutex_trylock(&rwsem->lock); - else if (!rwsem->read_depth) + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } else if (!rwsem->read_depth) { ret = 0; + } - if (ret) { + if (ret) rwsem->read_depth++; - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - } return ret; } EXPORT_SYMBOL(rt_down_read_trylock); @@ -383,10 +394,10 @@ static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) { struct rt_mutex *lock = &rwsem->lock; - rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); - - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); rt_mutex_lock(&rwsem->lock); + } rwsem->read_depth++; } -- cgit v0.10.2 From 4adc7d297b019699e307c46f2f2ebfc5afea5194 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Oct 2012 08:56:15 +0100 Subject: sched: Better debug output for might sleep might sleep can tell us where interrupts have been disabled, but we have no idea what disabled preemption. Add some debug infrastructure. 
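The essence of the change is captured by the sketch below (simplified, not the exact hunks that follow). One detail worth noting in the actual diff: the new task_struct field is added and recorded under CONFIG_DEBUG_PREEMPT, while the two new reporting blocks are guarded by plain "#ifdef DEBUG_PREEMPT", which looks like it was meant to be the same CONFIG_ symbol.

	/* Simplified sketch of the idea: remember who disabled preemption. */
	void sketch_add_preempt_count(int val)
	{
		preempt_count() += val;
	#ifdef CONFIG_DEBUG_PREEMPT
		if (preempt_count() == val)	/* first level of disable */
			current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
	#endif
	}

	/* ... and in the __schedule_bug() / __might_sleep() debug paths: */
	#ifdef CONFIG_DEBUG_PREEMPT
		pr_err("Preemption disabled at:");
		print_ip_sym(current->preempt_disable_ip);
		pr_cont("\n");
	#endif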
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index e7e4c96..b0cde41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,6 +1642,9 @@ struct task_struct { pte_t kmap_pte[KM_TYPE_NR]; # endif #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 11e027b..7fb8e28 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2789,8 +2789,13 @@ void __kprobes add_preempt_count(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(add_preempt_count); @@ -2833,6 +2838,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN); } @@ -7308,6 +7320,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); -- cgit v0.10.2 From ff0a1dda48e8a1fce1051070fafd1d1366d36683 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Oct 2012 17:21:53 +0100 Subject: stomp_machine: Use mutex_trylock when called from inactive cpu If the stop machinery is called from inactive CPU we cannot use mutex_lock, because some other stomp machine invokation might be in progress and the mutex can be contended. We cannot schedule from this context, so trylock and loop. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 561ba3a..e98c70b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -158,7 +158,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_done *done) + struct cpu_stop_done *done, bool inactive) { struct cpu_stop_work *work; unsigned int cpu; @@ -175,7 +175,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, * Make sure that all work is queued on all cpus before we * any of the cpus can execute it. */ - mutex_lock(&stopper_lock); + if (!inactive) { + mutex_lock(&stopper_lock); + } else { + while (!mutex_trylock(&stopper_lock)) + cpu_relax(); + } for_each_cpu(cpu, cpumask) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); @@ -188,7 +193,7 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + queue_stop_cpus_work(cpumask, fn, arg, &done, false); wait_for_stop_done(&done); return done.executed ? 
done.ret : -ENOENT; } @@ -601,7 +606,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, set_state(&smdata, STOPMACHINE_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, - &done); + &done, true); ret = stop_machine_cpu_stop(&smdata); /* Busy wait for completion. */ -- cgit v0.10.2 From 9f4b7a42a47b88e7b85c2df219b9f03b589df227 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Sep 2012 16:21:08 +0200 Subject: net: Another local_irq_disable/kmalloc headache Replace it by a local lock. Though that's pretty inefficient :( Signed-off-by: Thomas Gleixner diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 32443eb..39b45c0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock); #define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) #define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) @@ -359,7 +361,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) int order; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(netdev_alloc_lock, flags); nc = &__get_cpu_var(netdev_alloc_cache); if (unlikely(!nc->frag.page)) { refill: @@ -393,7 +395,7 @@ recycle: nc->frag.offset += fragsz; nc->pagecnt_bias--; end: - local_irq_restore(flags); + local_unlock_irqrestore(netdev_alloc_lock, flags); return data; } -- cgit v0.10.2 From af181493b162ac1f0d547cfb12a2ba57f4808dcd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Oct 2012 17:12:35 +0100 Subject: net: Use get_cpu_light() in ip_send_unicast_reply() Signed-off-by: Thomas Gleixner diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e98ed2..253692b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1508,7 +1508,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + get_cpu_light(); + inet = &__get_cpu_var(unicast_sock); inet->tos = arg->tos; sk = &inet->sk; @@ -1532,7 +1533,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ip_push_pending_frames(sk, &fl4); } - put_cpu_var(unicast_sock); + put_cpu_light(); ip_rt_put(rt); } -- cgit v0.10.2 From 06ff214e4febc079cb6ace9bdf4a0c781a8dad74 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Oct 2012 09:03:24 +0100 Subject: crypto: Convert crypto notifier chain to SRCU The crypto notifier deadlocks on RT. Though this can be a real deadlock on mainline as well due to fifo fair rwsems. The involved parties here are: [ 82.172678] swapper/0 S 0000000000000001 0 1 0 0x00000000 [ 82.172682] ffff88042f18fcf0 0000000000000046 ffff88042f18fc80 ffffffff81491238 [ 82.172685] 0000000000011cc0 0000000000011cc0 ffff88042f18c040 ffff88042f18ffd8 [ 82.172688] 0000000000011cc0 0000000000011cc0 ffff88042f18ffd8 0000000000011cc0 [ 82.172689] Call Trace: [ 82.172697] [] ? _raw_spin_unlock_irqrestore+0x6c/0x7a [ 82.172701] [] schedule+0x64/0x66 [ 82.172704] [] schedule_timeout+0x27/0xd0 [ 82.172708] [] ? unpin_current_cpu+0x1a/0x6c [ 82.172713] [] ? migrate_enable+0x12f/0x141 [ 82.172716] [] wait_for_common+0xbb/0x11f [ 82.172719] [] ? 
try_to_wake_up+0x182/0x182 [ 82.172722] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.172726] [] crypto_wait_for_test+0x49/0x6b [ 82.172728] [] crypto_register_alg+0x53/0x5a [ 82.172730] [] crypto_register_algs+0x33/0x72 [ 82.172734] [] ? aes_init+0x12/0x12 [ 82.172737] [] aesni_init+0x64/0x66 [ 82.172741] [] do_one_initcall+0x7f/0x13b [ 82.172744] [] kernel_init+0x199/0x22c [ 82.172747] [] ? loglevel+0x31/0x31 [ 82.172752] [] kernel_thread_helper+0x4/0x10 [ 82.172755] [] ? retint_restore_args+0x13/0x13 [ 82.172759] [] ? start_kernel+0x3ca/0x3ca [ 82.172761] [] ? gs_change+0x13/0x13 [ 82.174186] cryptomgr_test S 0000000000000001 0 41 2 0x00000000 [ 82.174189] ffff88042c971980 0000000000000046 ffffffff81d74830 0000000000000292 [ 82.174192] 0000000000011cc0 0000000000011cc0 ffff88042c96eb80 ffff88042c971fd8 [ 82.174195] 0000000000011cc0 0000000000011cc0 ffff88042c971fd8 0000000000011cc0 [ 82.174195] Call Trace: [ 82.174198] [] schedule+0x64/0x66 [ 82.174201] [] schedule_timeout+0x27/0xd0 [ 82.174204] [] ? unpin_current_cpu+0x1a/0x6c [ 82.174206] [] ? migrate_enable+0x12f/0x141 [ 82.174209] [] wait_for_common+0xbb/0x11f [ 82.174212] [] ? try_to_wake_up+0x182/0x182 [ 82.174215] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.174218] [] cryptomgr_notify+0x280/0x385 [ 82.174221] [] notifier_call_chain+0x6b/0x98 [ 82.174224] [] ? rt_down_read+0x10/0x12 [ 82.174227] [] __blocking_notifier_call_chain+0x70/0x8d [ 82.174230] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174234] [] crypto_probing_notify+0x24/0x50 [ 82.174236] [] crypto_alg_mod_lookup+0x3e/0x74 [ 82.174238] [] crypto_alloc_base+0x36/0x8f [ 82.174241] [] cryptd_alloc_ablkcipher+0x6e/0xb5 [ 82.174243] [] ? kzalloc.clone.5+0xe/0x10 [ 82.174246] [] ablk_init_common+0x1d/0x38 [ 82.174249] [] ablk_ecb_init+0x15/0x17 [ 82.174251] [] __crypto_alloc_tfm+0xc7/0x114 [ 82.174254] [] ? crypto_lookup_skcipher+0x1f/0xe4 [ 82.174256] [] crypto_alloc_ablkcipher+0x60/0xa5 [ 82.174258] [] alg_test_skcipher+0x24/0x9b [ 82.174261] [] ? finish_task_switch+0x3f/0xfa [ 82.174263] [] alg_test+0x16f/0x1d7 [ 82.174267] [] ? cryptomgr_probe+0xac/0xac [ 82.174269] [] cryptomgr_test+0x2c/0x47 [ 82.174272] [] kthread+0x7e/0x86 [ 82.174275] [] ? finish_task_switch+0xaf/0xfa [ 82.174278] [] kernel_thread_helper+0x4/0x10 [ 82.174281] [] ? retint_restore_args+0x13/0x13 [ 82.174284] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174287] [] ? gs_change+0x13/0x13 [ 82.174329] cryptomgr_probe D 0000000000000002 0 47 2 0x00000000 [ 82.174332] ffff88042c991b70 0000000000000046 ffff88042c991bb0 0000000000000006 [ 82.174335] 0000000000011cc0 0000000000011cc0 ffff88042c98ed00 ffff88042c991fd8 [ 82.174338] 0000000000011cc0 0000000000011cc0 ffff88042c991fd8 0000000000011cc0 [ 82.174338] Call Trace: [ 82.174342] [] schedule+0x64/0x66 [ 82.174344] [] __rt_mutex_slowlock+0x85/0xbe [ 82.174347] [] rt_mutex_slowlock+0xec/0x159 [ 82.174351] [] rt_mutex_fastlock.clone.8+0x29/0x2f [ 82.174353] [] rt_mutex_lock+0x33/0x37 [ 82.174356] [] __rt_down_read+0x50/0x5a [ 82.174358] [] ? rt_down_read+0x10/0x12 [ 82.174360] [] rt_down_read+0x10/0x12 [ 82.174363] [] __blocking_notifier_call_chain+0x58/0x8d [ 82.174366] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174369] [] crypto_probing_notify+0x24/0x50 [ 82.174372] [] crypto_wait_for_test+0x22/0x6b [ 82.174374] [] crypto_register_instance+0xb4/0xc0 [ 82.174377] [] cryptd_create+0x378/0x3b6 [ 82.174379] [] ? __crypto_lookup_template+0x5b/0x63 [ 82.174382] [] cryptomgr_probe+0x45/0xac [ 82.174385] [] ? 
crypto_alloc_pcomp+0x1b/0x1b [ 82.174388] [] kthread+0x7e/0x86 [ 82.174391] [] ? finish_task_switch+0xaf/0xfa [ 82.174394] [] kernel_thread_helper+0x4/0x10 [ 82.174398] [] ? retint_restore_args+0x13/0x13 [ 82.174401] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174403] [] ? gs_change+0x13/0x13 cryptomgr_test spawns the cryptomgr_probe thread from the notifier call. The probe thread fires the same notifier as the test thread and deadlocks on the rwsem on RT. Now this is a potential deadlock in mainline as well, because we have fifo fair rwsems. If another thread blocks with a down_write() on the notifier chain before the probe thread issues the down_read() it will block the probe thread and the whole party is dead locked. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/crypto/algapi.c b/crypto/algapi.c index c3b9bfe..3574066 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -683,13 +683,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&crypto_chain, nb); + return srcu_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&crypto_chain, nb); + return srcu_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); diff --git a/crypto/api.c b/crypto/api.c index 033a714..8ff072c 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); -BLOCKING_NOTIFIER_HEAD(crypto_chain); +SRCU_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) @@ -237,10 +237,10 @@ int crypto_probing_notify(unsigned long val, void *v) { int ok; - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); } return ok; diff --git a/crypto/internal.h b/crypto/internal.h index 9ebedae..8cbe3dc 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -48,7 +48,7 @@ struct crypto_larval { extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; -extern struct blocking_notifier_head crypto_chain; +extern struct srcu_notifier_head crypto_chain; #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); @@ -136,7 +136,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg) static inline void crypto_notify(unsigned long val, void *v) { - blocking_notifier_call_chain(&crypto_chain, val, v); + srcu_notifier_call_chain(&crypto_chain, val, v); } #endif /* _CRYPTO_INTERNAL_H */ -- cgit v0.10.2 From 25d3d97b1361fae3e15cb37335479abe8369e0fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 13:32:46 +0100 Subject: x86: perf: Deal with kfree from atomic contexts The x86 perf code allocates memory upfront because it might need it. The detection that it is not needed happens in atomic context and calls kfree from there. RT cant do that. Use kfree_rcu instead. 
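The conversion pattern is the usual kfree_rcu() one, sketched below; struct demo_shared and demo_put() are made-up names, not the perf structures touched by the diff that follows.

	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct demo_shared {
		int refcnt;
		struct rcu_head rcu;	/* added so kfree_rcu() has somewhere to queue */
	};

	static void demo_put(struct demo_shared *s)
	{
		if (--s->refcnt == 0)
			/*
			 * Was kfree(s), which is not allowed from this atomic
			 * context on RT. kfree_rcu() only queues the object;
			 * the actual kfree() runs after a grace period, in a
			 * context where it may sleep.
			 */
			kfree_rcu(s, rcu);
	}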
Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 115c1ea..f50cca1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -108,6 +108,7 @@ struct intel_shared_regs { struct er_account regs[EXTRA_REG_MAX]; int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ + struct rcu_head rcu; }; #define MAX_LBR_ENTRIES 16 diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 4914e94..69662ad 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1715,7 +1715,7 @@ static void intel_pmu_cpu_dying(int cpu) pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); + kfree_rcu(pc, rcu); cpuc->shared_regs = NULL; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index b43200d..85f64b3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2636,7 +2636,7 @@ static void __cpuinit uncore_cpu_dying(int cpu) box = *per_cpu_ptr(pmu->box, cpu); *per_cpu_ptr(pmu->box, cpu) = NULL; if (box && atomic_dec_and_test(&box->refcnt)) - kfree(box); + kfree_rcu(box, rcu); } } } @@ -2666,7 +2666,8 @@ static int __cpuinit uncore_cpu_starting(int cpu) if (exist && exist->phys_id == phys_id) { atomic_inc(&exist->refcnt); *per_cpu_ptr(pmu->box, cpu) = exist; - kfree(box); + if (box) + kfree_rcu(box, rcu); box = NULL; break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index e68a455..c4e1028 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -421,6 +421,7 @@ struct intel_uncore_box { struct hrtimer hrtimer; struct list_head list; struct intel_uncore_extra_reg shared_regs[0]; + struct rcu_head rcu; }; #define UNCORE_BOX_FLAG_INITIATED 0 -- cgit v0.10.2 From 2ca1440b3c62810b25dc2ed08835d88e5494de0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:30:25 +0100 Subject: softirq: Make serving softirqs a task flag Avoid the percpu softirq_runner pointer magic by using a task flag. 
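Reduced to its core, the change that follows amounts to the sketch below; the function names are placeholders, not the identifiers in the diff.

	/* Mark the serving task itself instead of tracking a per-CPU pointer. */
	static void sketch_do_softirq_common(void)
	{
		current->flags |= PF_IN_SOFTIRQ;
		/* ... run the pending softirq vectors ... */
		current->flags &= ~PF_IN_SOFTIRQ;
	}

	int sketch_in_serving_softirq(void)
	{
		/* no preempt_disable()/per-CPU dereference needed any more */
		return current->flags & PF_IN_SOFTIRQ;
	}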
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index b0cde41..2fce5ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,6 +1856,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/softirq.c b/kernel/softirq.c index f52cdbc..0a810e4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -375,7 +375,6 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } * On RT we serialize softirq execution with a cpu local lock */ static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); -static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); static void __do_softirq_common(int need_rcu_bh_qs); @@ -430,22 +429,9 @@ void _local_bh_enable(void) } EXPORT_SYMBOL(_local_bh_enable); -/* For tracing */ -int notrace __in_softirq(void) -{ - if (__get_cpu_var(local_softirq_lock).owner == current) - return __get_cpu_var(local_softirq_lock).nestcnt; - return 0; -} - int in_serving_softirq(void) { - int res; - - preempt_disable(); - res = __get_cpu_var(local_softirq_runner) == current; - preempt_enable(); - return res; + return current->flags & PF_IN_SOFTIRQ; } EXPORT_SYMBOL(in_serving_softirq); @@ -463,7 +449,7 @@ static void __do_softirq_common(int need_rcu_bh_qs) /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - __get_cpu_var(local_softirq_runner) = current; + current->flags |= PF_IN_SOFTIRQ; lockdep_softirq_enter(); @@ -474,7 +460,7 @@ static void __do_softirq_common(int need_rcu_bh_qs) wakeup_softirqd(); lockdep_softirq_exit(); - __get_cpu_var(local_softirq_runner) = NULL; + current->flags &= ~PF_IN_SOFTIRQ; current->softirq_nestcnt--; } -- cgit v0.10.2 From f740b7bf6438f21bc51cdbc6ba637b84db9b32e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 15:33:53 +0100 Subject: softirq: Split handling function Split out the inner handling function, so RT can reuse it. 
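The point of the split, illustrated: once handle_softirq() processes exactly one vector, a caller that knows a specific vector is pending can run just that one instead of open-coding the whole pending loop. The sketch below is illustrative only; sketch_run_single_vector() is not part of the patch.

	static void sketch_run_single_vector(unsigned int vec_nr, int cpu)
	{
		local_irq_disable();
		if (local_softirq_pending() & (1U << vec_nr)) {
			/* clear only this vector's pending bit */
			set_softirq_pending(local_softirq_pending() & ~(1U << vec_nr));
			local_irq_enable();
			handle_softirq(vec_nr, cpu, 0);
		} else {
			local_irq_enable();
		}
	}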
Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 0a810e4..e9167de 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,31 +142,34 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) { - struct softirq_action *h = softirq_vec; + struct softirq_action *h = softirq_vec + vec_nr; unsigned int prev_count = preempt_count(); - local_irq_enable(); - for ( ; pending; h++, pending >>= 1) { - unsigned int vec_nr = h - softirq_vec; + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); - if (!(pending & 1)) - continue; + if (unlikely(prev_count != preempt_count())) { + pr_err("softirq %u %s %p preempt count leak: %08x -> %08x\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + if (need_rcu_bh_qs) + rcu_bh_qs(cpu); +} - kstat_incr_softirqs_this_cpu(vec_nr); - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR - "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, (unsigned int) preempt_count()); - preempt_count() = prev_count; - } - if (need_rcu_bh_qs) - rcu_bh_qs(cpu); +static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +{ + unsigned int vec_nr; + + local_irq_enable(); + for (vec_nr = 0; pending; vec_nr++, pending >>= 1) { + if (pending & 1) + handle_softirq(vec_nr, cpu, need_rcu_bh_qs); } local_irq_disable(); } -- cgit v0.10.2 From 4f86379022a7942a3bfff7fb2110b04e80b11690 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:20:47 +0100 Subject: softirq: Split softirq locks The 3.x RT series removed the split softirq implementation in favour of pushing softirq processing into the context of the thread which raised it. Though this prevents us from handling the various softirqs at different priorities. Now instead of reintroducing the split softirq threads we split the locks which serialize the softirq processing. If a softirq is raised in context of a thread, then the softirq is noted on a per thread field, if the thread is in a bh disabled region. If the softirq is raised from hard interrupt context, then the bit is set in the flag field of ksoftirqd and ksoftirqd is invoked. When a thread leaves a bh disabled region, then it tries to execute the softirqs which have been raised in its own context. It acquires the per softirq / per cpu lock for the softirq and then checks, whether the softirq is still pending in the per cpu local_softirq_pending() field. If yes, it runs the softirq. If no, then some other task executed it already. This allows for zero config softirq elevation in the context of user space tasks or interrupt threads. 
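The core loop described above, take the per-softirq lock, recheck the pending bit, and run the handler only if it is still set, can be modelled roughly in user space. In the sketch below the names raised, pending, vec_lock[] and run_vec() are invented for illustration, and plain pthread mutexes stand in for the per-CPU local_irq_locks; it is a sketch of the algorithm under those assumptions, not the kernel implementation.

#include <pthread.h>
#include <stdio.h>

#define NR_VECS 10

static pthread_mutex_t vec_lock[NR_VECS];	/* models the per-softirq locks */
static unsigned int pending;			/* models local_softirq_pending() */
static __thread unsigned int raised;		/* models current->softirqs_raised */

static void run_vec(unsigned int vec)
{
	printf("vector %u handled\n", vec);
}

static void do_current_softirqs(void)
{
	while (raised) {
		unsigned int vec = __builtin_ctz(raised);
		unsigned int mask = 1u << vec;

		raised &= ~mask;
		/* Serialize against anyone else handling the same vector. */
		pthread_mutex_lock(&vec_lock[vec]);
		/* Recheck: another task may already have processed it. */
		if (pending & mask) {
			pending &= ~mask;
			run_vec(vec);
		}
		pthread_mutex_unlock(&vec_lock[vec]);
	}
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < NR_VECS; i++)
		pthread_mutex_init(&vec_lock[i], NULL);

	raised = pending = (1u << 2) | (1u << 7);
	do_current_softirqs();
	return 0;
}

The point of the recheck under the lock is exactly the race named in the message: between raising the softirq and taking the lock, another task may have executed it already.
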
Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 2fce5ed..354b56e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1635,6 +1635,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; int softirq_nestcnt; + unsigned int softirqs_raised; #endif #ifdef CONFIG_PREEMPT_RT_FULL # if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 diff --git a/kernel/softirq.c b/kernel/softirq.c index e9167de..53e37a9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -162,6 +162,12 @@ static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) rcu_bh_qs(cpu); } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline int ksoftirqd_softirq_pending(void) +{ + return local_softirq_pending(); +} + static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) { unsigned int vec_nr; @@ -174,7 +180,19 @@ static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) local_irq_disable(); } -#ifndef CONFIG_PREEMPT_RT_FULL +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (ksoftirqd_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; + } + local_irq_enable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -367,6 +385,32 @@ asmlinkage void do_softirq(void) #endif +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } static void ksoftirqd_set_sched_params(unsigned int cpu) { } @@ -375,20 +419,78 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ /* - * On RT we serialize softirq execution with a cpu local lock + * On RT we serialize softirq execution with a cpu local lock per softirq */ -static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); + +void __init softirq_early_init(void) +{ + int i; -static void __do_softirq_common(int need_rcu_bh_qs); + for (i = 0; i < NR_SOFTIRQS; i++) + local_irq_lock_init(local_softirq_locks[i]); +} -void __do_softirq(void) +static void lock_softirq(int which) { - __do_softirq_common(0); + __local_lock(&__get_cpu_var(local_softirq_locks[which])); } -void __init softirq_early_init(void) +static void unlock_softirq(int which) +{ + __local_unlock(&__get_cpu_var(local_softirq_locks[which])); +} + +static void do_single_softirq(int which, int need_rcu_bh_qs) { - local_irq_lock_init(local_softirq_lock); + unsigned long old_flags = current->flags; + + current->flags &= ~PF_MEMALLOC; + vtime_account(current); + current->flags |= PF_IN_SOFTIRQ; + lockdep_softirq_enter(); + local_irq_enable(); + handle_softirq(which, smp_processor_id(), need_rcu_bh_qs); + local_irq_disable(); + lockdep_softirq_exit(); + current->flags &= ~PF_IN_SOFTIRQ; + vtime_account(current); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); +} + +/* + * Called with interrupts disabled. Process softirqs which were raised + * in current context (or on behalf of ksoftirqd). + */ +static void do_current_softirqs(int need_rcu_bh_qs) +{ + while (current->softirqs_raised) { + int i = __ffs(current->softirqs_raised); + unsigned int pending, mask = (1U << i); + + current->softirqs_raised &= ~mask; + local_irq_enable(); + + /* + * If the lock is contended, we boost the owner to + * process the softirq or leave the critical section + * now. + */ + lock_softirq(i); + local_irq_disable(); + /* + * Check with the local_softirq_pending() bits, + * whether we need to process this still or if someone + * else took care of it. + */ + pending = local_softirq_pending(); + if (pending & mask) { + set_softirq_pending(pending & ~mask); + do_single_softirq(i, need_rcu_bh_qs); + } + unlock_softirq(i); + WARN_ON(current->softirq_nestcnt != 1); + } } void local_bh_disable(void) @@ -403,17 +505,11 @@ void local_bh_enable(void) if (WARN_ON(current->softirq_nestcnt == 0)) return; - if ((current->softirq_nestcnt == 1) && - local_softirq_pending() && - local_trylock(local_softirq_lock)) { + local_irq_disable(); + if (current->softirq_nestcnt == 1 && current->softirqs_raised) + do_current_softirqs(1); + local_irq_enable(); - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - local_unlock(local_softirq_lock); - WARN_ON(current->softirq_nestcnt != 1); - } current->softirq_nestcnt--; migrate_enable(); } @@ -438,86 +534,82 @@ int in_serving_softirq(void) } EXPORT_SYMBOL(in_serving_softirq); -/* - * Called with bh and local interrupts disabled. For full RT cpu must - * be pinned. 
- */ -static void __do_softirq_common(int need_rcu_bh_qs) +/* Called with preemption disabled */ +static void run_ksoftirqd(unsigned int cpu) { - u32 pending = local_softirq_pending(); - int cpu = smp_processor_id(); - + local_irq_disable(); current->softirq_nestcnt++; - - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - current->flags |= PF_IN_SOFTIRQ; - - lockdep_softirq_enter(); - - handle_pending_softirqs(pending, cpu, need_rcu_bh_qs); - - pending = local_softirq_pending(); - if (pending) - wakeup_softirqd(); - - lockdep_softirq_exit(); - current->flags &= ~PF_IN_SOFTIRQ; - + do_current_softirqs(1); current->softirq_nestcnt--; + rcu_note_context_switch(cpu); + local_irq_enable(); } -static int __thread_do_softirq(int cpu) +/* + * Called from netif_rx_ni(). Preemption enabled, but migration + * disabled. So the cpu can't go away under us. + */ +void thread_do_softirq(void) { + if (!in_serving_softirq() && current->softirqs_raised) { + current->softirq_nestcnt++; + do_current_softirqs(0); + current->softirq_nestcnt--; + } +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); + /* - * Prevent the current cpu from going offline. - * pin_current_cpu() can reenable preemption and block on the - * hotplug mutex. When it returns, the current cpu is - * pinned. It might be the wrong one, but the offline check - * below catches that. + * If we are not in a hard interrupt and inside a bh disabled + * region, we simply raise the flag on current. local_bh_enable() + * will make sure that the softirq is executed. Otherwise we + * delegate it to ksoftirqd. */ - pin_current_cpu(); + if (!in_irq() && current->softirq_nestcnt) + current->softirqs_raised |= (1U << nr); + else if (__this_cpu_read(ksoftirqd)) + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); +} + +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + /* - * If called from ksoftirqd (cpu >= 0) we need to check - * whether we are on the wrong cpu due to cpu offlining. If - * called via thread_do_softirq() no action required. + * If we're in an hard interrupt we let irq return code deal + * with the wakeup of ksoftirqd. */ - if (cpu >= 0 && cpu_is_offline(cpu)) { - unpin_current_cpu(); - return -1; - } - preempt_enable(); - local_lock(local_softirq_lock); - local_irq_disable(); + if (in_irq()) + return; + /* - * We cannot switch stacks on RT as we want to be able to - * schedule! + * If we are in thread context but outside of a bh disabled + * region, we need to wake ksoftirqd as well. + * + * CHECKME: Some of the places which do that could be wrapped + * into local_bh_disable/enable pairs. Though it's unclear + * whether this is worth the effort. To find those places just + * raise a WARN() if the condition is met. */ - if (local_softirq_pending()) - __do_softirq_common(cpu >= 0); - local_unlock(local_softirq_lock); - unpin_current_cpu(); - preempt_disable(); - local_irq_enable(); - return 0; + if (!current->softirq_nestcnt) + wakeup_softirqd(); } -/* - * Called from netif_rx_ni(). Preemption enabled. 
- */ -void thread_do_softirq(void) +void do_raise_softirq_irqoff(unsigned int nr) { - if (!in_serving_softirq()) { - preempt_disable(); - __thread_do_softirq(-1); - preempt_enable(); - } + raise_softirq_irqoff(nr); } -static int ksoftirqd_do_softirq(int cpu) +static inline int ksoftirqd_softirq_pending(void) { - return __thread_do_softirq(cpu); + return current->softirqs_raised; } static inline void local_bh_disable_nort(void) { } @@ -528,6 +620,10 @@ static inline void ksoftirqd_set_sched_params(unsigned int cpu) struct sched_param param = { .sched_priority = 1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); + /* Take over all pending softirqs when starting */ + local_irq_disable(); + current->softirqs_raised = local_softirq_pending(); + local_irq_enable(); } static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) @@ -574,8 +670,14 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } -#else +#else /* PREEMPT_RT_FULL */ + unsigned long flags; + + local_irq_save(flags); + if (__this_cpu_read(ksoftirqd) && + __this_cpu_read(ksoftirqd)->softirqs_raised) wakeup_softirqd(); + local_irq_restore(flags); #endif } @@ -599,26 +701,6 @@ void irq_exit(void) sched_preempt_enable_no_resched(); } -/* - * This function must run with irqs disabled! - */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); -} - void raise_softirq(unsigned int nr) { unsigned long flags; @@ -628,12 +710,6 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} - void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; @@ -1079,20 +1155,7 @@ EXPORT_SYMBOL(tasklet_unlock_wait); static int ksoftirqd_should_run(unsigned int cpu) { - return local_softirq_pending(); -} - -static void run_ksoftirqd(unsigned int cpu) -{ - local_irq_disable(); - if (local_softirq_pending()) { - __do_softirq(); - rcu_note_context_switch(cpu); - local_irq_enable(); - cond_resched(); - return; - } - local_irq_enable(); + return ksoftirqd_softirq_pending(); } #ifdef CONFIG_HOTPLUG_CPU -- cgit v0.10.2 From f2f21d88ff0dda6d5c48590bd8381c1daa9c5911 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Oct 2012 18:36:51 +0200 Subject: rcu: rcutiny: Prevent RCU stall rcu_read_unlock_special() checks in_serving_softirq() and leaves early when true. On RT this is obviously wrong as softirq processing context can be preempted and therefor such a task can be on the gp_tasks list. Leaving early here will leave the task on the list and therefor block RCU processing forever. This cannot happen on mainline because softirq processing context cannot be preempted and therefor this can never happen at all. In fact this check looks quite questionable in general. Neither irq context nor softirq processing context in mainline can ever be preempted in mainline so the special unlock case should not ever be invoked in such context. Now the only explanation might be a rcu_read_unlock() being interrupted and therefor leave the rcu nest count at 0 before the special unlock bit has been cleared. 
That looks fragile. At least it's missing a big fat comment. Paul ???? See mainline commits: ec433f0c5 and 8762705a for further enlightment. Reported-by: Kristian Lehmann Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a..ec3446e 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -560,7 +560,7 @@ void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_cpu_qs(); /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { local_irq_restore(flags); return; } -- cgit v0.10.2 From bfb05712af4376b1189d5bbdccd8e39112a420d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Oct 2012 10:32:35 +0100 Subject: mm: Enable SLUB for RT Make SLUB RT aware and remove the restriction in Kconfig. Signed-off-by: Thomas Gleixner diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9db4825..a58ad34 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -54,7 +54,7 @@ struct kmem_cache_cpu { }; struct kmem_cache_node { - spinlock_t list_lock; /* Protect partial list and nr_partial */ + raw_spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG diff --git a/init/Kconfig b/init/Kconfig index f6a406d..54b18f0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1494,7 +1494,6 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" - depends on !PREEMPT_RT_FULL help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 3d2308f..f8cfc46 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1070,7 +1070,7 @@ static noinline struct kmem_cache_node *free_debug_processing( { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - spin_lock_irqsave(&n->list_lock, *flags); + raw_spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1118,7 +1118,7 @@ out: fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); + raw_spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); return NULL; } @@ -1253,6 +1253,12 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) {} #endif /* CONFIG_SLUB_DEBUG */ +struct slub_free_list { + raw_spinlock_t lock; + struct list_head list; +}; +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); + /* * Slab allocation and freeing */ @@ -1277,7 +1283,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_enable(); flags |= s->allocflags; @@ -1317,7 +1327,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_disable(); if (!page) return NULL; @@ -1414,6 +1428,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_memcg_kmem_pages(page, order); } +static void free_delayed(struct kmem_cache *s, struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_slab(s, page); + } +} + #define 
need_reserve_slab_rcu \ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) @@ -1448,6 +1472,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) } call_rcu(head, rcu_free_slab); + } else if (irqs_disabled()) { + struct slub_free_list *f = &__get_cpu_var(slub_free_list); + + raw_spin_lock(&f->lock); + list_add(&page->lru, &f->list); + raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1549,7 +1579,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t; int available; @@ -1574,7 +1604,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return object; } @@ -1816,7 +1846,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } else { m = M_FULL; @@ -1827,7 +1857,7 @@ redo: * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } @@ -1862,7 +1892,7 @@ redo: goto redo; if (lock) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); @@ -1893,10 +1923,10 @@ static void unfreeze_partials(struct kmem_cache *s, n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); n = n2; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } do { @@ -1925,7 +1955,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -1961,14 +1991,21 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > s->cpu_partial) { + struct slub_free_list *f; unsigned long flags; + LIST_HEAD(tofree); /* * partial array is full. Move the existing * set to the per node partial list. 
*/ local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); oldpage = NULL; pobjects = 0; pages = 0; @@ -2031,7 +2068,22 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { + LIST_HEAD(tofree); + int cpu; + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + for_each_online_cpu(cpu) { + struct slub_free_list *f; + + if (!has_cpu_slab(cpu, s)) + continue; + + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock_irq(&f->lock); + free_delayed(s, &tofree); + } } /* @@ -2059,10 +2111,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return x; } @@ -2205,9 +2257,11 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { + struct slub_free_list *f; void *freelist; struct page *page; unsigned long flags; + LIST_HEAD(tofree); local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -2270,7 +2324,13 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); +out: + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); return freelist; new_slab: @@ -2288,9 +2348,7 @@ new_slab: if (unlikely(!freelist)) { if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); - - local_irq_restore(flags); - return NULL; + goto out; } page = c->page; @@ -2304,8 +2362,7 @@ new_slab: deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); - return freelist; + goto out; } /* @@ -2477,7 +2534,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, do { if (unlikely(n)) { - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; @@ -2507,7 +2564,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. 
*/ - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); } } @@ -2548,7 +2605,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2562,7 +2619,7 @@ slab_empty: /* Slab must be on the full list */ remove_full(s, page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2764,7 +2821,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - spin_lock_init(&n->list_lock); + raw_spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3451,7 +3508,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); /* * Build lists indexed by the items in use in each slab. @@ -3472,7 +3529,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ list_for_each_entry_safe(page, t, slabs_by_inuse, lru) @@ -3642,6 +3699,12 @@ void __init kmem_cache_init(void) boot_kmem_cache_node; int i; int caches = 2; + int cpu; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4033,7 +4096,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { validate_slab_slab(s, page, map); @@ -4056,7 +4119,7 @@ static int validate_slab_node(struct kmem_cache *s, atomic_long_read(&n->nr_slabs)); out: - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4246,12 +4309,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) process_slab(&t, s, page, alloc, map); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { -- cgit v0.10.2 From f8a08e11d20024a5c1931f28e959efb076086195 Mon Sep 17 00:00:00 2001 From: Watanabe Date: Sun, 28 Oct 2012 11:13:44 +0100 Subject: hrtimer: Raise softirq if hrtimer irq stalled When the hrtimer stall detection hits the softirq is not raised. 
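The bug is a plain control-flow issue: the early-return path skipped the raise. A minimal sketch of the fix pattern, using hypothetical stand-in names rather than the real hrtimer code, funnels both exits through one label that performs the raise:

#include <stdio.h>
#include <stdbool.h>

static void raise_pending(void)
{
	printf("softirq raised\n");
}

static void timer_interrupt(bool reprogram_ok, bool raise)
{
	if (reprogram_ok) {
		/* The old code returned here and lost a pending raise. */
		goto out;
	}

	printf("stall detected, backing off\n");
out:
	if (raise)
		raise_pending();
}

int main(void)
{
	timer_interrupt(true, true);	/* the raise now happens on the early path too */
	return 0;
}
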
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a8b8527..b8ba4e3 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1528,11 +1528,7 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; - - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return; + goto out; } /* @@ -1576,6 +1572,9 @@ retry: tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); +out: + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* -- cgit v0.10.2 From 6205c51fcf0bd6cc7957372f997a7cb73011f4a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:26:09 +0000 Subject: rcu: Disable RCU_FAST_NO_HZ on RT This uses a timer_list timer from the irq disabled guts of the idle code. Disable it for now to prevent wreckage. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/init/Kconfig b/init/Kconfig index 54b18f0..d0590c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -579,7 +579,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on NO_HZ && SMP && !PREEMPT_RT_FULL default n help This option causes RCU to attempt to accelerate grace periods in -- cgit v0.10.2 From afe10a673089defcf4d4f77bc691b4a4d3e97031 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 11:18:08 +0100 Subject: net: netfilter: Serialize xt_write_recseq sections on RT The netfilter code relies only on the implicit semantics of local_bh_disable() for serializing wt_write_recseq sections. RT breaks that and needs explicit serialization here. Reported-by: Peter LaDow Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/include/linux/locallock.h b/include/linux/locallock.h index f1804a3..a5eea5d 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -25,6 +25,9 @@ struct local_irq_lock { DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \ + DECLARE_PER_CPU(struct local_irq_lock, lvar) + #define local_irq_lock_init(lvar) \ do { \ int __cpu; \ @@ -220,6 +223,7 @@ static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, #else /* PREEMPT_RT_BASE */ #define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar static inline void local_irq_lock_init(int lvar) { } diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index dd49566..7d083af 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include +#include #include /** @@ -284,6 +285,8 @@ extern void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock); + /** * xt_write_recseq_begin - start of a write section * @@ -298,6 +301,9 @@ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; + /* RT protection */ + local_lock(xt_write_lock); + /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). 
@@ -328,6 +334,7 @@ static inline void xt_write_recseq_end(unsigned int addend) /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); + local_unlock(xt_write_lock); } /* diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a9c488b..c646ec8 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -20,11 +20,17 @@ #include #include #include +#include #include #include #include "nf_internals.h" +#ifdef CONFIG_PREEMPT_RT_BASE +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock); +EXPORT_PER_CPU_SYMBOL(xt_write_lock); +#endif + static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; -- cgit v0.10.2 From 00fee614b331f417c88208b07d7e2be673c92410 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:46:16 +0000 Subject: softirq: Adapt NOHZ softirq pending check to new RT scheme We can't rely on ksoftirqd anymore and we need to check the tasks which run a particular softirq and if such a task is pi blocked ignore the other pending bits of that task as well. Signed-off-by: Thomas Gleixner diff --git a/kernel/softirq.c b/kernel/softirq.c index 53e37a9..8447c8d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -66,46 +66,71 @@ char *softirq_to_name[NR_SOFTIRQS] = { #ifdef CONFIG_NO_HZ # ifdef CONFIG_PREEMPT_RT_FULL + +struct softirq_runner { + struct task_struct *runner[NR_SOFTIRQS]; +}; + +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); + +static inline void softirq_set_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = current; +} + +static inline void softirq_clr_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = NULL; +} + /* - * On preempt-rt a softirq might be blocked on a lock. There might be - * no other runnable task on this CPU because the lock owner runs on - * some other CPU. So we have to go into idle with the pending bit - * set. Therefor we need to check this otherwise we warn about false - * positives which confuses users and defeats the whole purpose of - * this test. + * On preempt-rt a softirq running context might be blocked on a + * lock. There might be no other runnable task on this CPU because the + * lock owner runs on some other CPU. So we have to go into idle with + * the pending bit set. Therefor we need to check this otherwise we + * warn about false positives which confuses users and defeats the + * whole purpose of this test. * * This code is called with interrupts disabled. */ void softirq_check_pending_idle(void) { static int rate_limit; - u32 warnpending = 0, pending; + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + u32 warnpending; + int i; if (rate_limit >= 10) return; - pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; - if (pending) { - struct task_struct *tsk; + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + for (i = 0; i < NR_SOFTIRQS; i++) { + struct task_struct *tsk = sr->runner[i]; - tsk = __get_cpu_var(ksoftirqd); /* * The wakeup code in rtmutex.c wakes up the task * _before_ it sets pi_blocked_on to NULL under * tsk->pi_lock. So we need to check for both: state * and pi_blocked_on. 
*/ - raw_spin_lock(&tsk->pi_lock); - - if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) - warnpending = 1; - - raw_spin_unlock(&tsk->pi_lock); + if (tsk) { + raw_spin_lock(&tsk->pi_lock); + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { + /* Clear all bits pending in that task */ + warnpending &= ~(tsk->softirqs_raised); + warnpending &= ~(1 << i); + } + raw_spin_unlock(&tsk->pi_lock); + } } if (warnpending) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - pending); + warnpending); rate_limit++; } } @@ -125,6 +150,10 @@ void softirq_check_pending_idle(void) } } # endif + +#else /* !NO_HZ */ +static inline void softirq_set_runner(unsigned int sirq) { } +static inline void softirq_clr_runner(unsigned int sirq) { } #endif /* @@ -478,6 +507,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) */ lock_softirq(i); local_irq_disable(); + softirq_set_runner(i); /* * Check with the local_softirq_pending() bits, * whether we need to process this still or if someone @@ -488,6 +518,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) set_softirq_pending(pending & ~mask); do_single_softirq(i, need_rcu_bh_qs); } + softirq_clr_runner(i); unlock_softirq(i); WARN_ON(current->softirq_nestcnt != 1); } @@ -558,7 +589,7 @@ void thread_do_softirq(void) } } -void __raise_softirq_irqoff(unsigned int nr) +static void do_raise_softirq_irqoff(unsigned int nr) { trace_softirq_raise(nr); or_softirq_pending(1UL << nr); @@ -575,12 +606,19 @@ void __raise_softirq_irqoff(unsigned int nr) __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); } +void __raise_softirq_irqoff(unsigned int nr) +{ + do_raise_softirq_irqoff(nr); + if (!in_irq() && !current->softirq_nestcnt) + wakeup_softirqd(); +} + /* * This function must run with irqs disabled! */ void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + do_raise_softirq_irqoff(nr); /* * If we're in an hard interrupt we let irq return code deal @@ -602,11 +640,6 @@ void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -void do_raise_softirq_irqoff(unsigned int nr) -{ - raise_softirq_irqoff(nr); -} - static inline int ksoftirqd_softirq_pending(void) { return current->softirqs_raised; -- cgit v0.10.2 From f1fc8f81db788a40d89e32fc0d8955f2df3be9b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 15:12:49 +0000 Subject: net: Use local_bh_disable in netif_rx_ni() This code triggers the new WARN in __raise_softirq_irqsoff() though it actually looks at the softirq pending bit and calls into the softirq code, but that fits not well with the context related softirq model of RT. It's correct on mainline though, but going through local_bh_disable/enable here is not going to hurt badly. Signed-off-by: Thomas Gleixner diff --git a/net/core/dev.c b/net/core/dev.c index 3250939..3ef02a1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3118,11 +3118,9 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - migrate_disable(); + local_bh_disable(); err = netif_rx(skb); - if (local_softirq_pending()) - thread_do_softirq(); - migrate_enable(); + local_bh_enable(); return err; } -- cgit v0.10.2 From 197d9fd67a616a12de9548945f22de941b1df800 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Oct 2012 18:50:54 +0100 Subject: sched: Add support for lazy preemption It has become an obsession to mitigate the determinism vs. throughput loss of RT. Looking at the mainline semantics of preemption points gives a hint why RT sucks throughput wise for ordinary SCHED_OTHER tasks. 
One major issue is the wakeup of tasks which immediately preempt the waking task while the waking task holds a lock on which the woken task will block right after having preempted the wakee. In mainline this is prevented by the implicit preemption disable of spin/rw_lock held regions. On RT this is not possible due to the fully preemptible nature of sleeping spinlocks. Though for a SCHED_OTHER task preempting another SCHED_OTHER task this is really not a correctness issue. RT folks are concerned about SCHED_FIFO/RR task preemption and not about the purely fairness driven SCHED_OTHER preemption latencies. So I introduced a lazy preemption mechanism which only applies to SCHED_OTHER tasks preempting another SCHED_OTHER task. In addition to the existing preempt_count each task now sports a preempt_lazy_count which is manipulated on lock acquisition and release. This is slightly incorrect as for laziness reasons I coupled this to migrate_disable/enable so some other mechanisms get the same treatment (e.g. get_cpu_light). Now on the scheduler side, instead of setting NEED_RESCHED this sets NEED_RESCHED_LAZY in case of a SCHED_OTHER/SCHED_OTHER preemption and therefore allows the waking task to exit the lock held region before the woken task preempts. That also works better for cross CPU wakeups as the other side can stay in the adaptive spinning loop. For RT class preemption there is no change. This simply sets NEED_RESCHED and forgoes the lazy preemption counter. Initial tests do not expose any observable latency increase, but history shows that I've been proven wrong before :) The lazy preemption mode is on by default, but with CONFIG_SCHED_DEBUG enabled it can be disabled via: # echo NO_PREEMPT_LAZY >/sys/kernel/debug/sched_features and reenabled via # echo PREEMPT_LAZY >/sys/kernel/debug/sched_features The test results so far are very machine and workload dependent, but there is a clear trend that it enhances the non-RT workload performance. 
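The decision logic can be summarised in a small model. The sketch below is a hypothetical user-space rendering of the policy described above; the enum, task fields and function names are invented for illustration and do not map one to one onto the scheduler code.

#include <stdio.h>
#include <stdbool.h>

enum sched_class { CLS_OTHER, CLS_FIFO };

struct task {
	enum sched_class cls;
	bool need_resched;	/* models TIF_NEED_RESCHED */
	bool need_resched_lazy;	/* models TIF_NEED_RESCHED_LAZY */
	int lazy_count;		/* models preempt_lazy_count */
};

/* Waker side: decide how urgently the current task must be preempted. */
static void request_preempt(struct task *curr, const struct task *waking)
{
	if (curr->cls == CLS_OTHER && waking->cls == CLS_OTHER)
		curr->need_resched_lazy = true;	/* defer until locks are dropped */
	else
		curr->need_resched = true;	/* RT wakeup: preempt right away */
}

/* Preemption point: lazy requests are honoured only outside lazy sections. */
static bool should_preempt(const struct task *curr)
{
	if (curr->need_resched)
		return true;
	return curr->need_resched_lazy && curr->lazy_count == 0;
}

int main(void)
{
	struct task curr = { .cls = CLS_OTHER, .lazy_count = 1 };
	struct task other = { .cls = CLS_OTHER };
	struct task rt = { .cls = CLS_FIFO };

	request_preempt(&curr, &other);
	printf("OTHER wakeup inside lock: %d\n", should_preempt(&curr));	/* 0 */
	curr.lazy_count = 0;
	printf("after releasing the lock: %d\n", should_preempt(&curr));	/* 1 */
	request_preempt(&curr, &rt);
	printf("need_resched after RT wakeup: %d\n", curr.need_resched);	/* 1 */
	return 0;
}

The model captures the asymmetry the message describes: only a SCHED_OTHER on SCHED_OTHER wakeup may be deferred, and only while the lazy count is non-zero; RT class preemption is never delayed.
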
Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 806c02b..16ad63d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -51,6 +51,7 @@ struct trace_entry { int pid; unsigned short migrate_disable; unsigned short padding; + unsigned char preempt_lazy_count; }; #define FTRACE_MAX_EVENT \ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5e71285..172bc83 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -23,15 +23,38 @@ #define preempt_count() (current_thread_info()->preempt_count) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +# ifdef CONFIG_PREEMPT_LAZY +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY))) \ + preempt_schedule(); \ +} while (0) +# else #define preempt_check_resched() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0) +# endif #else /* !CONFIG_PREEMPT */ @@ -48,6 +71,12 @@ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ @@ -69,6 +98,13 @@ do { \ preempt_check_resched(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + /* For debugging and tracer internals only! 
*/ #define add_preempt_count_notrace(val) \ do { preempt_count() += (val); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 354b56e..70b821a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2691,6 +2691,52 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +static inline int need_resched(void) +{ + return test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_LAZY); +} +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +static inline int need_resched(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} +#endif + static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); @@ -2722,11 +2768,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. 
The return diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 885efbd..574a855 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -6,6 +6,12 @@ config PREEMPT_RT_BASE bool select PREEMPT +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL + choice prompt "Preemption Model" default PREEMPT_NONE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7fb8e28..f6794dc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -543,6 +543,37 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_task_lazy(struct task_struct *p) +{ + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_task(p); + return; + } + + assert_raw_spin_locked(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + if (test_tsk_need_resched_lazy(p)) + return; + + set_tsk_need_resched_lazy(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -659,6 +690,17 @@ void resched_task(struct task_struct *p) assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_task_lazy(struct task_struct *p) +{ + if (!sched_feat(PREEMPT_LAZY)) { + resched_task(p); + return; + } + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched_lazy(p); +} +#endif #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -1718,6 +1760,9 @@ void sched_fork(struct task_struct *p) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -2926,6 +2971,7 @@ void migrate_disable(void) return; } + preempt_lazy_disable(); pin_current_cpu(); p->migrate_disable = 1; preempt_enable(); @@ -2981,6 +3027,7 @@ void migrate_enable(void) unpin_current_cpu(); preempt_enable(); + preempt_lazy_enable(); } EXPORT_SYMBOL(migrate_enable); #else @@ -3115,6 +3162,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -3251,6 +3299,14 @@ asmlinkage void __sched notrace preempt_schedule(void) if (likely(ti->preempt_count || irqs_disabled())) return; +#ifdef CONFIG_PREEMPT_LAZY + /* + * Check for lazy preemption + */ + if (ti->preempt_lazy_count && !test_thread_flag(TIF_NEED_RESCHED)) + return; +#endif + do { add_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4862,7 +4918,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! 
*/ task_thread_info(idle)->preempt_count = 0; - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81fa536..392fcf3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1827,7 +1827,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -1851,7 +1851,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); } static void @@ -1971,7 +1971,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); return; } /* @@ -2160,7 +2160,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_task_lazy(rq_of(cfs_rq)->curr); } static __always_inline @@ -2745,7 +2745,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_task_lazy(p); return; } @@ -3577,7 +3577,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_task_lazy(curr); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -5772,7 +5772,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. 
*/ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_task_lazy(rq->curr); } se->vruntime -= cfs_rq->min_vruntime; @@ -5797,7 +5797,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_task_lazy(rq->curr); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e5de4db..771b529 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -65,6 +65,9 @@ SCHED_FEAT(NONTASK_POWER, true) SCHED_FEAT(TTWU_QUEUE, true) #else SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif #endif SCHED_FEAT(FORCE_SD_OVERLAP, false) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc88644..d055951 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -897,6 +897,15 @@ extern void init_sched_fair_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_task_lazy(struct task_struct *tsk); +#else +static inline void resched_task_lazy(struct task_struct *tsk) +{ + resched_task(tsk); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a0f978..cf44625 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1166,6 +1166,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; + entry->preempt_lazy_count = preempt_lazy_count(); entry->pid = (tsk) ? tsk->pid : 0; entry->padding = 0; entry->flags = @@ -1176,7 +1177,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + (need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0); entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; } @@ -2031,15 +2033,17 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / _--=> migrate-disable\n"); - seq_puts(m, "# ||||| / delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n"); + seq_puts(m, "# / _-------=> irqs-off \n"); + seq_puts(m, "# | / _------=> need-resched \n"); + seq_puts(m, "# || / _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||| / _----=> hardirq/softirq \n"); + seq_puts(m, "# |||| / _---=> preempt-depth \n"); + seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n"); + seq_puts(m, "# |||||| / _-=> migrate-disable \n"); + seq_puts(m, "# ||||||| / delay \n"); + seq_puts(m, "# cmd pid |||||||| time | caller \n"); + seq_puts(m, "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct trace_array *tr, struct seq_file *m) @@ -2063,13 +2067,16 @@ static void print_func_help_header(struct trace_array *tr, struct seq_file *m) static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) { print_event_info(tr, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-------=> irqs-off \n"); + seq_puts(m, "# / _------=> need-resched \n"); + seq_puts(m, "# |/ _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||/ _----=> hardirq/softirq \n"); + seq_puts(m, "# |||/ _---=> preempt-depth \n"); + seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n"); + seq_puts(m, "# ||||| / _-=> migrate-disable \n"); + seq_puts(m, "# |||||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# ||||||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | ||||||| | |\n"); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d798..49585a3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -116,6 +116,7 @@ struct uprobe_trace_entry_head { * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler + * NEED_RESCHED_LAZY - lazy reschedule is requested */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -123,6 +124,7 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x20, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e36737f..2b0aea4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -564,6 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -578,14 +579,17 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) '.'; need_resched = (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' : softirq ? 
's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) + if (!trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq)) return 0; if (entry->preempt_count) @@ -593,6 +597,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) ret = trace_seq_printf(s, "%x", entry->migrate_disable); else -- cgit v0.10.2 From c03c1683cfe45fc21356715d6b29568cac71983d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 11:03:47 +0100 Subject: x86-preempt-lazy.patch Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 989e191..585e236 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,6 +108,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_PREEMPT_LAZY select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select MODULES_USE_ELF_REL if X86_32 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2d946e6..6b0fc2e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,6 +31,8 @@ struct thread_info { __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => lazy preemptable, + <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -82,6 +84,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ @@ -107,6 +110,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) @@ -157,6 +161,8 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2861082..a36d9cf 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,6 +33,7 @@ void common(void) { OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6ed91d9..218e79a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -364,14 +364,22 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # 
non-zero preempt_count ? jnz restore_all -need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl + jnz 1f + + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? + jnz restore_all + testl $_TIF_NEED_RESCHED_LAZY, %ecx jz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + +1: testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - jmp need_resched + movl TI_flags(%ebp), %ecx # need_resched set ? + testl $_TIF_NEED_RESCHED_MASK, %ecx + jnz 1b + jmp restore_all END(resume_kernel) #endif CFI_ENDPROC @@ -607,7 +615,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jz work_notifysig work_resched: call schedule @@ -620,7 +628,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jnz work_resched work_notifysig: # deal with pending signals and diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3c06da6..0b01d8d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -673,8 +673,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -786,8 +786,8 @@ GLOBAL(int_with_check) /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $_TIF_NEED_RESCHED_MASK,%edx + jz int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1094,8 +1094,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -1128,9 +1128,15 @@ retint_signal: ENTRY(retint_kernel) cmpl $0,TI_preempt_count(%rcx) jnz retint_restore_args - bt $TIF_NEED_RESCHED,TI_flags(%rcx) + bt $TIF_NEED_RESCHED,TI_flags(%rcx) + jc 1f + + cmpl $0,TI_preempt_lazy_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx) jnc retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + +1: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr @@ -1522,7 +1528,7 @@ paranoid_userspace: movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx + testl $_TIF_NEED_RESCHED_MASK,%ebx jnz paranoid_schedule movl %ebx,%edx /* arg3: thread flags */ TRACE_IRQS_ON -- cgit v0.10.2 From 14e27282bb54af3db98aa29fcd83895438a3bc98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2012 12:04:11 +0100 Subject: arm-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4890796..2753534 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -47,6 +47,7 @@ config ARM select HAVE_MEMBLOCK select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_PERF_EVENTS + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index cddda1f..4acacb2 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -50,6 +50,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ @@ -148,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_NEED_RESCHED_LAZY 3 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 #define TIF_SYSCALL_TRACEPOINT 10 @@ -160,6 +162,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index cf10d18..8b1d153 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -50,6 +50,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0f82098..afa746c 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -216,11 +216,18 @@ __irq_svc: #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! 
+ + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -240,6 +247,8 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY moveq pc, r8 @ go again b 1b #endif diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 56f72d2..806416a 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -638,7 +638,8 @@ asmlinkage int do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) { do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) -- cgit v0.10.2 From 18b4ed55ef83e77e8df2c45f1a99ffc6e5c07db6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2013 18:06:20 +0100 Subject: net: make devnet_rename_seq a mutex On RT write_seqcount_begin() disables preemption and device_rename() allocates memory with GFP_KERNEL and grabs later the sysfs_mutex mutex. Since I don't see a reason why this can't be a mutex, make it one. We probably don't have that much reads at the same time in the hot path. Signed-off-by: Sebastian Andrzej Siewior diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b705426..041bc5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1579,7 +1579,7 @@ extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern rwlock_t dev_base_lock; /* Device list lock */ -extern seqcount_t devnet_rename_seq; /* Device rename seq */ +extern struct mutex devnet_rename_mutex; #define for_each_netdev(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index 3ef02a1..454b151 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -203,7 +203,7 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -seqcount_t devnet_rename_seq; +DEFINE_MUTEX(devnet_rename_mutex); static inline void dev_base_seq_inc(struct net *net) { @@ -1093,10 +1093,11 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + + mutex_lock(&devnet_rename_mutex); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return 0; } @@ -1104,7 +1105,7 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return err; } @@ -1112,11 +1113,11 @@ rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); return ret; } - write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); @@ -1135,7 +1136,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; } else { @@ -4214,7 +4215,6 
@@ static int dev_ifname(struct net *net, struct ifreq __user *arg) { struct net_device *dev; struct ifreq ifr; - unsigned seq; /* * Fetch the caller's info block. @@ -4223,19 +4223,18 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { rcu_read_unlock(); + mutex_unlock(&devnet_rename_mutex); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; + mutex_unlock(&devnet_rename_mutex); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; diff --git a/net/core/sock.c b/net/core/sock.c index 45a50ee..2754c99 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -571,7 +571,6 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, struct net *net = sock_net(sk); struct net_device *dev; char devname[IFNAMSIZ]; - unsigned seq; if (sk->sk_bound_dev_if == 0) { len = 0; @@ -582,20 +581,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); rcu_read_lock(); dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); ret = -ENODEV; if (!dev) { rcu_read_unlock(); + mutex_unlock(&devnet_rename_mutex); goto out; } strcpy(devname, dev->name); rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; + mutex_unlock(&devnet_rename_mutex); len = strlen(devname) + 1; -- cgit v0.10.2 From 899802ca7d5c4259ca0aa09dbb2d99f73d3f096f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 10:00:28 +0100 Subject: powerpc/fsl-msi: use a different locklcass for the cascade interrupt lockdep thinks that it might deadlock because it grabs a lock of the same class while calling the generic_irq_handler(). This annotation will inform lockdep that it will not. Signed-off-by: Sebastian Andrzej Siewior diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 6e53d97..b94b478 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -333,6 +333,8 @@ static int fsl_of_msi_remove(struct platform_device *ofdev) return 0; } +static struct lock_class_key fsl_msi_irq_class; + static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, int offset, int irq_index) { @@ -351,7 +353,7 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, dev_err(&dev->dev, "No memory for MSI cascade data\n"); return -ENOMEM; } - + irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class); msi->msi_virqs[irq_index] = virt_msir; cascade_data->index = offset; cascade_data->msi_data = msi; -- cgit v0.10.2 From aa2effa1ca6da72f93fca3158079a10d8a592dac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 11:35:49 +0100 Subject: i2c/omap: drop the lock hard irq context The lock is taken while reading two registers. On RT the first lock is taken in hard irq where it might sleep and in the threaded irq. The threaded irq runs in oneshot mode so the hard irq does not run until the thread the completes so there is no reason to grab the lock. 
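For reference, the hard-irq/threaded-irq split relied on here looks roughly like the sketch below. This is not the omap driver: demo_dev, the DEMO_* register offsets and the "demo" name are made up, only request_threaded_irq(), IRQF_ONESHOT and the IRQ_* return values are real kernel API. With IRQF_ONESHOT the interrupt line stays masked until the threaded handler returns, so the quick check in hard irq context cannot race with the thread and needs no lock.

#include <linux/interrupt.h>
#include <linux/io.h>

#define DEMO_STAT	0x08	/* hypothetical status register offset */
#define DEMO_IE		0x0c	/* hypothetical irq-enable register offset */

struct demo_dev {
	void __iomem *base;
};

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	struct demo_dev *dev = dev_id;
	u32 stat = readl(dev->base + DEMO_STAT);
	u32 mask = readl(dev->base + DEMO_IE);

	/*
	 * No locking: IRQF_ONESHOT keeps the line masked until
	 * demo_thread_fn() completes, so this handler and the thread
	 * never run concurrently.
	 */
	return (stat & mask) ? IRQ_WAKE_THREAD : IRQ_NONE;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* The heavy lifting; runs in a kthread and may sleep, also on RT. */
	return IRQ_HANDLED;
}

static int demo_request_irq(struct demo_dev *dev, int irq)
{
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_ONESHOT, "demo", dev);
}
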
Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 4cc2f05..bb51488 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -881,15 +881,12 @@ omap_i2c_isr(int irq, void *dev_id) u16 mask; u16 stat; - spin_lock(&dev->lock); - mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG); + mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); if (stat & mask) ret = IRQ_WAKE_THREAD; - spin_unlock(&dev->lock); - return ret; } -- cgit v0.10.2 From cc102fc6cd3425a8b56e62665331c67e7ce29149 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 12:46:49 +0100 Subject: spi/omap-mcspi: check condition also after timeout It is possible that the handler gets interrupted after checking the status. After it resumes it the time out is due but the condition it was waiting for might be true. Therefore it is necessary to check the condition in case of an time out to be sure that the condition is not true _and_ the time passed by. Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index b610f52..56c4166 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -285,8 +285,12 @@ static int mcspi_wait_for_reg_bit(void __iomem *reg, unsigned long bit) timeout = jiffies + msecs_to_jiffies(1000); while (!(__raw_readl(reg) & bit)) { - if (time_after(jiffies, timeout)) - return -1; + if (time_after(jiffies, timeout)) { + if (!(__raw_readl(reg) & bit)) + return -ETIMEDOUT; + else + return 0; + } cpu_relax(); } return 0; -- cgit v0.10.2 From 53f1fac42aaed09d684ef36a3b01707db880df5c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 19:01:05 +0100 Subject: HACK: printk: drop the logbuf_lock more often The lock is hold with irgs off. The latency drops 500us+ on my arm bugs with a "full" buffer after executing "dmesg" on the shell. 
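The change follows the usual "drop the lock every few records and revalidate" pattern. A stripped-down sketch with hypothetical names (my_lock, my_head_seq, my_tail_seq) rather than the printk variables:

#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(my_lock);
static u64 my_head_seq;		/* oldest valid record */
static u64 my_tail_seq;		/* one past the newest record */

static int drain_records(void)
{
	int attempts = 0, batch = 0;
	u64 seq;

	raw_spin_lock_irq(&my_lock);
again:
	if (++attempts > 10) {
		raw_spin_unlock_irq(&my_lock);
		return -EBUSY;
	}
	seq = my_head_seq;
	while (seq < my_tail_seq) {
		/* ... consume the record at seq ... */
		seq++;
		if (++batch < 5)
			continue;
		batch = 0;
		/* Open an interrupt window, then revalidate our position. */
		raw_spin_unlock_irq(&my_lock);
		raw_spin_lock_irq(&my_lock);
		if (seq < my_head_seq)	/* writers overtook us, restart */
			goto again;
	}
	raw_spin_unlock_irq(&my_lock);
	return 0;
}

The attempt limit bounds how often a fast writer can force a restart, which is why syslog_print_all() gives up with -EBUSY after ten tries.
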
Signed-off-by: Sebastian Andrzej Siewior diff --git a/kernel/printk.c b/kernel/printk.c index 2609082..6d52c34 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1072,6 +1072,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + int attempts = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) @@ -1083,7 +1084,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; enum log_flags prev; - + int num_msg; +try_again: + attempts++; + if (attempts > 10) { + len = -EBUSY; + goto out; + } + num_msg = 0; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ clear_seq = log_first_seq; @@ -1104,6 +1112,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* move first record forward until length fits into the buffer */ @@ -1117,6 +1133,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* last message fitting into this dump */ @@ -1158,6 +1182,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } +out: raw_spin_unlock_irq(&logbuf_lock); kfree(text); -- cgit v0.10.2 From a99efd248956100fadd75a84239f89396e40ea67 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Apr 2013 17:09:40 -0500 Subject: fs/fscache: done merge spin_lock() in while() Signed-off-by: Sebastian Andrzej Siewior diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ff000e5..5a0c0b6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -796,11 +796,13 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) _enter(""); - while (spin_lock(&cookie->stores_lock), - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG), - n > 0) { + do { + spin_lock(&cookie->stores_lock); + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG); + if (n == 0) + break; for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); @@ -810,7 +812,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) for (i = n - 1; i >= 0; i--) page_cache_release(results[i]); - } + } while (1); spin_unlock(&cookie->stores_lock); _leave(""); -- cgit v0.10.2 From 840c610ed6d1dce6c947cf1ddd7c884bdc7cfa4b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Apr 2013 17:09:40 -0500 Subject: gpu/i915: don't open code these things Signed-off-by: Sebastian Andrzej Siewior diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index de45b60..9b51712 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -91,7 +91,6 @@ i915_gem_wait_for_error(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct completion *x = &dev_priv->error_completion; - unsigned long flags; int ret; if (!atomic_read(&dev_priv->mm.wedged)) @@ -116,9 +115,7 @@ i915_gem_wait_for_error(struct drm_device *dev) * end up waiting 
upon a subsequent completion event that * will never happen. */ - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - spin_unlock_irqrestore(&x->wait.lock, flags); + complete(x); } return 0; } @@ -946,12 +943,9 @@ i915_gem_check_wedge(struct drm_i915_private *dev_priv, if (atomic_read(&dev_priv->mm.wedged)) { struct completion *x = &dev_priv->error_completion; bool recovery_complete; - unsigned long flags; /* Give the error handler a chance to run. */ - spin_lock_irqsave(&x->wait.lock, flags); - recovery_complete = x->done > 0; - spin_unlock_irqrestore(&x->wait.lock, flags); + recovery_complete = completion_done(x); /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ @@ -4366,7 +4360,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task) if (!mutex_is_locked(mutex)) return false; -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES) +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE) return mutex->owner == task; #else /* Since UP may be pre-empted, we cannot assume that we own the lock */ -- cgit v0.10.2 From e3a39bd41aa6a47c7570fdc9ea3130b7e60dd5fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 10:14:11 +0100 Subject: powerpc-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bd0885eb..281256e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -142,6 +142,7 @@ config PPC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_PREEMPT_LAZY select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406b7b9..14c08f1 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,6 +43,8 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ @@ -97,7 +99,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_PERFMON_CTXSW 6 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ -#define TIF_MEMDIE 9 /* is terminating due to OOM killer */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @@ -106,6 +108,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation for stack store? */ +#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< Date: Mon, 12 Dec 2011 12:29:04 +0100 Subject: wait-simple: Simple waitqueue implementation wait_queue is a swiss army knife and in most of the cases the complexity is not needed. For RT waitqueues are a constant source of trouble as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. Provide a slim version, which allows RT to replace wait queues. 
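A minimal consumer of the new API, to show the intended usage; the flag, the kthread and the demo_* names are illustrative, only the swait_* calls come from this patch:

#include <linux/wait-simple.h>
#include <linux/kthread.h>

static DEFINE_SWAIT_HEAD(demo_wq);
static unsigned long demo_work;

static int demo_kthread(void *unused)
{
	while (!kthread_should_stop()) {
		swait_event_interruptible(demo_wq,
					  demo_work || kthread_should_stop());
		if (!demo_work)
			continue;
		demo_work = 0;
		/* ... process the work ... */
	}
	return 0;
}

/*
 * Producer side. The head is protected by a raw spinlock, so waking is
 * also fine from interrupt-disabled or otherwise atomic context on RT.
 */
static void demo_kick(void)
{
	demo_work = 1;
	swait_wake_interruptible(&demo_wq);
}
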
This should go mainline as well, as it lowers memory consumption and runtime overhead. Signed-off-by: Thomas Gleixner diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h new file mode 100644 index 0000000..19ac983 --- /dev/null +++ b/include/linux/wait-simple.h @@ -0,0 +1,231 @@ +#ifndef _LINUX_WAIT_SIMPLE_H +#define _LINUX_WAIT_SIMPLE_H + +#include +#include + +#include + +struct swaiter { + struct task_struct *task; + struct list_head node; +}; + +#define DEFINE_SWAITER(name) \ + struct swaiter name = { \ + .task = current, \ + .node = LIST_HEAD_INIT((name).node), \ + } + +struct swait_head { + raw_spinlock_t lock; + struct list_head list; +}; + +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .list = LIST_HEAD_INIT((name).list), \ + } + +extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); + +#define init_swait_head(swh) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_swait_head((swh), &__key); \ + } while (0) + +/* + * Waiter functions + */ +static inline bool swaiter_enqueued(struct swaiter *w) +{ + return w->task != NULL; +} + +extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish(struct swait_head *head, struct swaiter *w); + +/* + * Adds w to head->list. Must be called with head->lock locked. + */ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); +} + +/* + * Removes w from head->list. Must be called with head->lock locked. + */ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* + * Check whether a head has waiters enqueued + */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + return !list_empty(&h->list); +} + +/* + * Wakeup functions + */ +extern int __swait_wake(struct swait_head *head, unsigned int state); + +static inline int swait_wake(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_NORMAL) : 0; +} + +static inline int swait_wake_interruptible(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_INTERRUPTIBLE) : 0; +} + +/* + * Event API + */ + +#define __swait_event(wq, condition) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. 
+ */ +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_interruptible(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +#define __swait_event_interruptible_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_interruptible - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define swait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __swait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + int __ret = timeout; \ + if (!(condition)) \ + __swait_event_interruptible_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#define __swait_event_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_timeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if the @timeout elapsed, and the remaining + * jiffies if the condition evaluated to true before the timeout elapsed. 
+ */ +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!(condition)) \ + __swait_event_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 0e12fc2..2c7ab7f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o smpboot.o + async.o range.o groups.o lglock.o smpboot.o wait-simple.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/wait-simple.c b/kernel/wait-simple.c new file mode 100644 index 0000000..040d714 --- /dev/null +++ b/kernel/wait-simple.c @@ -0,0 +1,68 @@ +/* + * Simple waitqueues without fancy flags and callbacks + * + * (C) 2011 Thomas Gleixner + * + * Based on kernel/wait.c + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include +#include + +void __init_swait_head(struct swait_head *head, struct lock_class_key *key) +{ + raw_spin_lock_init(&head->lock); + lockdep_set_class(&head->lock, key); + INIT_LIST_HEAD(&head->list); +} +EXPORT_SYMBOL(__init_swait_head); + +void swait_prepare(struct swait_head *head, struct swaiter *w, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&head->lock, flags); + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); + set_current_state(state); + raw_spin_unlock_irqrestore(&head->lock, flags); +} +EXPORT_SYMBOL(swait_prepare); + +void swait_finish(struct swait_head *head, struct swaiter *w) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (w->task) { + raw_spin_lock_irqsave(&head->lock, flags); + __swait_dequeue(w); + raw_spin_unlock_irqrestore(&head->lock, flags); + } +} +EXPORT_SYMBOL(swait_finish); + +int __swait_wake(struct swait_head *head, unsigned int state) +{ + struct swaiter *curr, *next; + unsigned long flags; + int woken = 0; + + raw_spin_lock_irqsave(&head->lock, flags); + + list_for_each_entry_safe(curr, next, &head->list, node) { + if (wake_up_state(curr->task, state)) { + __swait_dequeue(curr); + curr->task = NULL; + woken++; + } + } + + raw_spin_unlock_irqrestore(&head->lock, flags); + return woken; +} +EXPORT_SYMBOL(__swait_wake); -- cgit v0.10.2 From 26738c6d915e43a0581b7cd0801a87c33df289ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Dec 2012 16:25:21 +0100 Subject: rcutiny: Use simple waitqueue Simple waitqueues can be handled from interrupt disabled contexts. Signed-off-by: Thomas Gleixner diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index ec3446e..14cae12 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -26,6 +26,7 @@ #include #include #include +#include /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -260,7 +261,7 @@ static void show_tiny_preempt_stats(struct seq_file *m) /* Controls for rcu_kthread() kthread. 
*/ static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static DEFINE_SWAIT_HEAD(rcu_kthread_wq); static unsigned long have_rcu_kthread_work; /* @@ -713,7 +714,7 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static DEFINE_SWAIT_HEAD(sync_rcu_preempt_exp_wq); static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); @@ -735,7 +736,7 @@ static int rcu_preempted_readers_exp(void) */ static void rcu_report_exp_done(void) { - wake_up(&sync_rcu_preempt_exp_wq); + swait_wake(&sync_rcu_preempt_exp_wq); } /* @@ -787,8 +788,8 @@ void synchronize_rcu_expedited(void) } else { rcu_initiate_boost(); local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); + swait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); } /* Clean up and exit. */ @@ -858,7 +859,7 @@ static void invoke_rcu_callbacks(void) { have_rcu_kthread_work = 1; if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); + swait_wake(&rcu_kthread_wq); } #ifdef CONFIG_RCU_TRACE @@ -888,8 +889,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); + swait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; -- cgit v0.10.2 From 738fc33d010e12e8eba3c0e7662f52031a384221 Mon Sep 17 00:00:00 2001 From: RT patchquilt Date: Thu, 4 Apr 2013 17:09:41 -0500 Subject: treercu-use-simple-waitqueue diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 85fd9a7..7ec834d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1319,7 +1319,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, rsp->gp_flags & RCU_GP_FLAG_INIT); if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && @@ -1338,7 +1338,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } for (;;) { rsp->jiffies_force_qs = jiffies + j; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), @@ -1423,7 +1423,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + swait_wake(&rsp->gp_wq); } /* @@ -1438,7 +1438,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + swait_wake(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -2003,7 +2003,8 @@ static void force_quiescent_state(struct rcu_state *rsp) } rsp->gp_flags |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + /* Memory barrier implied by wake_up() path. 
*/ + swait_wake(&rsp->gp_wq); } /* @@ -2999,7 +3000,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - init_waitqueue_head(&rsp->gp_wq); + init_swait_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291..5cfdff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -397,7 +397,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_head gp_wq; /* Where GP task waits. */ int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ -- cgit v0.10.2 From f257d82a29b61fc0202fe8b5ca82459224ee8ea5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 14:58:00 +0100 Subject: sched: Adjust sched_reset_on_fork when nothing else changes If the policy and priority remain unchanged a possible modification of sched_reset_on_fork gets lost in the early exit path. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6794dc..f107c38 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4117,10 +4117,13 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed + * further, but store a possible modification of + * reset_on_fork. */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } -- cgit v0.10.2 From 5518c6bed940793c5b32f99f1d212af4fccc455b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Dec 2012 08:56:41 +0100 Subject: sched: Queue RT tasks to head when prio drops The following scenario does not work correctly: Runqueue of CPUx contains two runnable and pinned tasks: T1: SCHED_FIFO, prio 80 T2: SCHED_FIFO, prio 80 T1 is on the cpu and executes the following syscalls (classic priority ceiling scenario): sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 90); ... sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 80); ... Now T1 gets preempted by T3 (SCHED_FIFO, prio 95). After T3 goes back to sleep the scheduler picks T2. Surprise! The same happens w/o actual preemption when T1 is forced into the scheduler due to a sporadic NEED_RESCHED event. The scheduler invokes pick_next_task() which returns T2. So T1 gets preempted and scheduled out. This happens because sched_setscheduler() dequeues T1 from the prio 90 list and then enqueues it on the tail of the prio 80 list behind T2. This violates the POSIX spec and surprises user space which relies on the guarantee that SCHED_FIFO tasks are not scheduled out unless they give the CPU up voluntarily or are preempted by a higher priority task. In the latter case the preempted task must get back on the CPU after the preempting task schedules out again. We fixed a similar issue already in commit 60db48c (sched: Queue a deboosted task to the head of the RT prio queue). The same treatment is necessary for sched_setscheduler(). So enqueue to head of the prio bucket list if the priority of the task is lowered. 
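In plain user space C the ceiling sequence above is simply the following (illustrative only, error handling omitted):

#include <sched.h>

static void set_fifo_prio(int prio)
{
	struct sched_param p = { .sched_priority = prio };

	sched_setscheduler(0, SCHED_FIFO, &p);	/* pid 0: calling task */
}

/* T1 is already SCHED_FIFO prio 80, sharing the CPU with T2 (prio 80) */
static void t1_critical_section(void)
{
	set_fifo_prio(90);	/* raise to the resource ceiling */
	/* ... access the protected resource ... */
	set_fifo_prio(80);	/* drop back: T1 must keep the CPU,
				   not be requeued behind T2 */
}

The change below makes the second call enqueue T1 at the head of its priority list, so T2 cannot sneak in ahead of it.
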
It might be possible that existing user space relies on the current behaviour, but it can be considered highly unlikely due to the corner case nature of the application scenario. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f107c38..06660b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4164,8 +4164,13 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From 51ae24cecd6b7641a0e2bcd4e909a31766953173 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 15:13:49 +0100 Subject: sched: Consider pi boosting in setscheduler If a PI boosted task policy/priority is modified by a setscheduler() call we unconditionally dequeue and requeue the task if it is on the runqueue even if the new priority is lower than the current effective boosted priority. This can result in undesired reordering of the priority bucket list. If the new priority is less or equal than the current effective we just store the new parameters in the task struct and leave the scheduler class and the runqueue untouched. This is handled when the task deboosts itself. Only if the new priority is higher than the effective boosted priority we apply the change immediately. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: stable-rt@vger.kernel.org diff --git a/include/linux/sched.h b/include/linux/sched.h index 70b821a..913ec05 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2175,6 +2175,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern int rt_mutex_check_prio(struct task_struct *task, int newprio); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { @@ -2185,6 +2186,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } +static inline int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + return 0; +} # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3bff726..20742e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -124,6 +124,18 @@ int rt_mutex_getprio(struct task_struct *task) } /* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->pi_list_entry.prio <= newprio; +} + +/* * Adjust the priority of a task, after its pi_waiters got modified. * * This can be both boosting and unboosting. task->pi_lock must be held. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06660b8..74e6fe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3764,7 +3764,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). 
* - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3987,20 +3988,25 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +static void __setscheduler_params(struct task_struct *p, int policy, int prio) { p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); + set_load_weight(p); +} + +/* Actually do priority change: must hold rq lock. */ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + __setscheduler_params(p, policy, prio); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - set_load_weight(p); } /* @@ -4022,6 +4028,7 @@ static bool check_same_owner(struct task_struct *p) static int __sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param, bool user) { + int newprio = MAX_RT_PRIO - 1 - param->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; const struct sched_class *prev_class; @@ -4149,6 +4156,25 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, policy, param->sched_priority); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -4156,9 +4182,6 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); @@ -4171,7 +4194,6 @@ recheck: */ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); } - check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v0.10.2 From c6aa8c2474a63147d2fb1a14a8af6543f5ef6fb1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 18:28:26 +0100 Subject: block: Use cpu_chill() for retry loops Retry loops on RT might loop forever when the modifying side was preempted. Steven also observed a live lock when there was a concurrent priority boosting going on. Use cpu_chill() instead of cpu_relax() to let the system make progress. 
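The converted loops all have the same shape; a generic sketch with hypothetical locks (outer_lock/inner_lock). cpu_chill() is provided elsewhere in the RT patch set (not shown here): on RT it sleeps briefly instead of spinning so a preempted lock holder can run, on !RT it falls back to cpu_relax().

#include <linux/spinlock.h>
#include <linux/delay.h>	/* cpu_chill() is declared here in the RT series */

static DEFINE_SPINLOCK(outer_lock);
static DEFINE_SPINLOCK(inner_lock);

static void demo_retry(void)
{
	unsigned long flags;

retry:
	spin_lock_irqsave(&outer_lock, flags);
	if (!spin_trylock(&inner_lock)) {
		/* Lock ordering forbids blocking on inner_lock here. */
		spin_unlock_irqrestore(&outer_lock, flags);
		cpu_chill();	/* cpu_relax() here can livelock on RT */
		goto retry;
	}
	/* ... work with both locks held ... */
	spin_unlock(&inner_lock);
	spin_unlock_irqrestore(&outer_lock, flags);
}
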
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fab4cdd..fb21ad5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include "blk.h" @@ -110,7 +111,7 @@ static void ioc_release_fn(struct work_struct *work) spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); spin_lock_irqsave_nested(&ioc->lock, flags, 1); } } @@ -188,7 +189,7 @@ retry: spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); goto retry; } } -- cgit v0.10.2 From 7d436d5da3abb13fb08f48bb4fd0ec26e1b5897f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 10:33:09 +0100 Subject: mm: bounce: Use local_irq_save_nort kmap_atomic() is preemptible on RT. Signed-off-by: Thomas Gleixner diff --git a/mm/bounce.c b/mm/bounce.c index 0420867..1e78ef7 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -51,11 +51,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ -- cgit v0.10.2 From 708a49a53988eaa915d422816dfb114f8a777b73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:11:12 +0100 Subject: mmci: Remove bogus local_irq_save() On !RT interrupt runs with interrupts disabled. On RT it's in a thread, so no need to disable interrupts at all. Signed-off-by: Thomas Gleixner diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 1507723..724f478 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -930,15 +930,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) struct sg_mapping_iter *sg_miter = &host->sg_miter; struct variant_data *variant = host->variant; void __iomem *base = host->base; - unsigned long flags; u32 status; status = readl(base + MMCISTATUS); dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status); - local_irq_save(flags); - do { unsigned int remain, len; char *buffer; @@ -978,8 +975,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) sg_miter_stop(sg_miter); - local_irq_restore(flags); - /* * If we have less than the fifo 'half-full' threshold to transfer, * trigger a PIO interrupt as soon as any data is available. -- cgit v0.10.2 From 9e7caf47ba85e6658b7e872344c129b235935077 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:08:15 +0100 Subject: slub: Enable irqs for __GFP_WAIT SYSTEM_RUNNING might be too late for enabling interrupts. Allocations with GFP_WAIT can happen before that. So use this as an indicator. 
Signed-off-by: Thomas Gleixner diff --git a/mm/slub.c b/mm/slub.c index f8cfc46..85299b8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1280,14 +1280,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + bool enableirqs; flags &= gfp_allowed_mask; + enableirqs = (flags & __GFP_WAIT) != 0; #ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) + enableirqs |= system_state == SYSTEM_RUNNING; #endif + if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1327,11 +1328,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } -#ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) -#endif + if (enableirqs) local_irq_disable(); if (!page) return NULL; -- cgit v0.10.2 From 6b159f5a12c49cb8ba5fcb0471419c3d6fd6a650 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Apr 2013 17:09:42 -0500 Subject: slub: delay ctor until the object is requested It seems that allocation of plenty objects causes latency on ARM since that code can not be preempted Signed-off-by: Sebastian Andrzej Siewior diff --git a/mm/slub.c b/mm/slub.c index 85299b8..f6871c5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1346,8 +1346,10 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); +#ifndef CONFIG_PREEMPT_RT_FULL if (unlikely(s->ctor)) s->ctor(object); +#endif } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2437,6 +2439,10 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); +#ifdef CONFIG_PREEMPT_RT_FULL + if (unlikely(s->ctor) && object) + s->ctor(object); +#endif slab_post_alloc_hook(s, gfpflags, object); -- cgit v0.10.2 From 3686586e34d3addd28e6cfe02949e5f79ddf854f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 23:03:29 +0100 Subject: sched: Init idle->on_rq in init_idle() Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74e6fe8..7429aef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4941,6 +4941,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif -- cgit v0.10.2 From 3fdeda22a495d6d12870a39495fba3d478a6b90b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 23:34:08 +0100 Subject: sched: Check for idle task in might_sleep() Idle is not allowed to call sleeping functions ever! Signed-off-by: Thomas Gleixner diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7429aef..06f41cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7391,7 +7391,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- cgit v0.10.2 From 69f9bb892241fb441fe72c20a34863c12868a2c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Jan 2013 11:47:35 +0100 Subject: wait-simple: Rework for use with completions Signed-off-by: Thomas Gleixner diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h index 19ac983..4efba4d 100644 --- a/include/linux/wait-simple.h +++ b/include/linux/wait-simple.h @@ -22,12 +22,14 @@ struct swait_head { struct list_head list; }; -#define DEFINE_SWAIT_HEAD(name) \ - struct swait_head name = { \ +#define SWAIT_HEAD_INITIALIZER(name) { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .list = LIST_HEAD_INIT((name).list), \ } +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = SWAIT_HEAD_INITIALIZER(name) + extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); #define init_swait_head(swh) \ @@ -40,59 +42,25 @@ extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); /* * Waiter functions */ -static inline bool swaiter_enqueued(struct swaiter *w) -{ - return w->task != NULL; -} - +extern void swait_prepare_locked(struct swait_head *head, struct swaiter *w); extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish_locked(struct swait_head *head, struct swaiter *w); extern void swait_finish(struct swait_head *head, struct swaiter *w); /* - * Adds w to head->list. Must be called with head->lock locked. - */ -static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) -{ - list_add(&w->node, &head->list); -} - -/* - * Removes w from head->list. Must be called with head->lock locked. - */ -static inline void __swait_dequeue(struct swaiter *w) -{ - list_del_init(&w->node); -} - -/* - * Check whether a head has waiters enqueued - */ -static inline bool swait_head_has_waiters(struct swait_head *h) -{ - return !list_empty(&h->list); -} - -/* * Wakeup functions */ -extern int __swait_wake(struct swait_head *head, unsigned int state); - -static inline int swait_wake(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_NORMAL) : 0; -} +extern unsigned int __swait_wake(struct swait_head *head, unsigned int state, unsigned int num); +extern unsigned int __swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num); -static inline int swait_wake_interruptible(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_INTERRUPTIBLE) : 0; -} +#define swait_wake(head) __swait_wake(head, TASK_NORMAL, 1) +#define swait_wake_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 1) +#define swait_wake_all(head) __swait_wake(head, TASK_NORMAL, 0) +#define swait_wake_all_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 0) /* * Event API */ - #define __swait_event(wq, condition) \ do { \ DEFINE_SWAITER(__wait); \ diff --git a/kernel/wait-simple.c b/kernel/wait-simple.c index 040d714..4b9a0b5 100644 --- a/kernel/wait-simple.c +++ b/kernel/wait-simple.c @@ -12,6 +12,24 @@ #include #include +/* Adds w to head->list. Must be called with head->lock locked. 
*/ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); +} + +/* Removes w from head->list. Must be called with head->lock locked. */ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* Check whether a head has waiters enqueued */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + return !list_empty(&h->list); +} + void __init_swait_head(struct swait_head *head, struct lock_class_key *key) { raw_spin_lock_init(&head->lock); @@ -20,19 +38,31 @@ void __init_swait_head(struct swait_head *head, struct lock_class_key *key) } EXPORT_SYMBOL(__init_swait_head); +void swait_prepare_locked(struct swait_head *head, struct swaiter *w) +{ + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); +} + void swait_prepare(struct swait_head *head, struct swaiter *w, int state) { unsigned long flags; raw_spin_lock_irqsave(&head->lock, flags); - w->task = current; - if (list_empty(&w->node)) - __swait_enqueue(head, w); - set_current_state(state); + swait_prepare_locked(head, w); + __set_current_state(state); raw_spin_unlock_irqrestore(&head->lock, flags); } EXPORT_SYMBOL(swait_prepare); +void swait_finish_locked(struct swait_head *head, struct swaiter *w) +{ + __set_current_state(TASK_RUNNING); + if (w->task) + __swait_dequeue(w); +} + void swait_finish(struct swait_head *head, struct swaiter *w) { unsigned long flags; @@ -46,22 +76,43 @@ void swait_finish(struct swait_head *head, struct swaiter *w) } EXPORT_SYMBOL(swait_finish); -int __swait_wake(struct swait_head *head, unsigned int state) +unsigned int +__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num) { struct swaiter *curr, *next; - unsigned long flags; int woken = 0; - raw_spin_lock_irqsave(&head->lock, flags); - list_for_each_entry_safe(curr, next, &head->list, node) { if (wake_up_state(curr->task, state)) { __swait_dequeue(curr); + /* + * The waiting task can free the waiter as + * soon as curr->task = NULL is written, + * without taking any locks. A memory barrier + * is required here to prevent the following + * store to curr->task from getting ahead of + * the dequeue operation. + */ + smp_wmb(); curr->task = NULL; - woken++; + if (++woken == num) + break; } } + return woken; +} + +unsigned int +__swait_wake(struct swait_head *head, unsigned int state, unsigned int num) +{ + unsigned long flags; + int woken; + if (!swait_head_has_waiters(head)) + return 0; + + raw_spin_lock_irqsave(&head->lock, flags); + woken = __swait_wake_locked(head, state, num); raw_spin_unlock_irqrestore(&head->lock, flags); return woken; } -- cgit v0.10.2 From 7c3f5e412214cff68024291fc4750e34881f185b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jan 2013 11:23:51 +0100 Subject: completion: Use simple wait queues Completions have no long lasting callbacks and therefor do not need the complex waitqueue variant. Use simple waitqueues which reduces the contention on the waitqueue lock. Signed-off-by: Thomas Gleixner diff --git a/include/linux/completion.h b/include/linux/completion.h index 51494e6..ebb6565 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -8,7 +8,7 @@ * See kernel/sched.c for details. 
*/ -#include +#include /* * struct completion - structure used to maintain state for a "completion" @@ -24,11 +24,11 @@ */ struct completion { unsigned int done; - wait_queue_head_t wait; + struct swait_head wait; }; #define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } + { 0, SWAIT_HEAD_INITIALIZER((work).wait) } #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) @@ -73,7 +73,7 @@ struct completion { static inline void init_completion(struct completion *x) { x->done = 0; - init_waitqueue_head(&x->wait); + init_swait_head(&x->wait); } extern void wait_for_completion(struct completion *); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f628a6..ef03e33 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -26,6 +26,7 @@ #include #include +#include struct vm_area_struct; struct mm_struct; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06f41cf..5d82809 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3487,10 +3487,10 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 1); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -3507,10 +3507,10 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 0); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -3518,20 +3518,20 @@ static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_SWAITER(wait); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + swait_prepare_locked(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + swait_finish_locked(&x->wait, &wait); if (!x->done) return timeout; } @@ -3544,9 +3544,9 @@ wait_for_common(struct completion *x, long timeout, int state) { might_sleep(); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); return timeout; } @@ -3677,12 +3677,12 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -3700,10 +3700,10 @@ bool completion_done(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - 
spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v0.10.2 From b6c24069a70f80b9fef49721d424637724116d25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 12:11:43 +0200 Subject: kconfig-disable-a-few-options-rt.patch Disable stuff which is known to have issues on RT Signed-off-by: Thomas Gleixner diff --git a/arch/Kconfig b/arch/Kconfig index 7f8f281..4a93e44 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT_FULL select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 6a70184..0052e52 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -164,6 +164,7 @@ config VXLAN config NETCONSOLE tristate "Network console logging support" + depends on !PREEMPT_RT_FULL ---help--- If you want to log kernel messages over the network, enable this. See for details. diff --git a/mm/Kconfig b/mm/Kconfig index 278e3ab..c6ceefc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -353,7 +353,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v0.10.2 From 4d72f19861e95cf911e0336882f0958f39f5cad0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 14:58:57 +0200 Subject: kconfig-preempt-rt-full.patch Signed-off-by: Thomas Gleixner diff --git a/init/Makefile b/init/Makefile index 7bc47ee..88cf473 100644 --- a/init/Makefile +++ b/init/Makefile @@ -33,4 +33,4 @@ silent_chk_compile.h = : include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)" diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 574a855..38cecfe 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -73,6 +73,14 @@ config PREEMPT_RTB enables changes which are preliminary for the full preemptiple RT kernel. +config PREEMPT_RT_FULL + bool "Fully Preemptible Kernel (RT)" + depends on IRQ_FORCED_THREADING + select PREEMPT_RT_BASE + select PREEMPT_RCU + help + All and everything + endchoice config PREEMPT_COUNT diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index f221ddf..5f44009 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -4,7 +4,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +RT=$5 +CC=$6 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length -- cgit v0.10.2
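For completeness, this is how the new options are typically consumed by C code in the rest of the series; demo_path() is illustrative and not taken from any patch above:

#include <linux/kconfig.h>
#include <linux/printk.h>

static void __maybe_unused demo_path(void)
{
#ifdef CONFIG_PREEMPT_RT_FULL
	/* fully preemptible kernel: sleeping locks, forced irq threads */
#else
	/* stock preemption model */
#endif

	if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE))
		pr_debug("RT base infrastructure enabled\n");
}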