From 0be5c8ff58cf7a66019af2f1236daff731ed318c Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Thu, 24 Jan 2013 17:22:44 +0800 Subject: locking/stat: Fix a typo s/STATS/STAT Signed-off-by: Yuanhan Liu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1359019365-23646-1-git-send-email-yuanhan.liu@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index cef00d4..dd2f7b2 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -65,7 +65,7 @@ that had to wait on lock acquisition. - CONFIGURATION -Lock statistics are enabled via CONFIG_LOCK_STATS. +Lock statistics are enabled via CONFIG_LOCK_STAT. - USAGE -- cgit v0.10.2 From f86f75548233a2be7379ac446e91729710d4a5f7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 8 Jan 2013 18:35:58 +0530 Subject: lockdep: Rename print_unlock_inbalance_bug() to print_unlock_imbalance_bug() Fix the typo in the function name (s/inbalance/imbalance) Signed-off-by: Srivatsa S. Bhat Cc: rostedt@goodmis.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130108130547.32733.79507.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b..5cf12e7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3203,7 +3203,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) @@ -3246,7 +3246,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 0; if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); return 1; } @@ -3317,7 +3317,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3384,7 @@ lock_release_non_nested(struct task_struct *curr, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: if (hlock->instance == lock) -- cgit v0.10.2 From c06b4f1947213cd0902610fafcabac02d49ad728 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 Dec 2012 11:49:44 +0900 Subject: watchdog: Use local_clock for get_timestamp() The get_timestamp() function is always called with current cpu, thus using local_clock() would be more appropriate and it makes the code shorter and cleaner IMHO. Signed-off-by: Namhyung Kim Acked-by: Don Zickus Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1356576585-28782-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3..082ca68 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,9 +112,9 @@ static int get_softlockup_thresh(void) * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. 
*/ -static unsigned long get_timestamp(int this_cpu) +static unsigned long get_timestamp(void) { - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ + return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) @@ -132,9 +132,7 @@ static void set_sample_period(void) /* Commands for resetting the watchdog */ static void __touch_watchdog(void) { - int this_cpu = smp_processor_id(); - - __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); + __this_cpu_write(watchdog_touch_ts, get_timestamp()); } void touch_softlockup_watchdog(void) @@ -195,7 +193,7 @@ static int is_hardlockup(void) static int is_softlockup(unsigned long touch_ts) { - unsigned long now = get_timestamp(smp_processor_id()); + unsigned long now = get_timestamp(); /* Warn about unreasonable delays: */ if (time_after(now, touch_ts + get_softlockup_thresh())) -- cgit v0.10.2 From 5cd3f5affad2109fd1458aab3f6216f2181e26ea Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 24 Jan 2013 21:53:17 +0100 Subject: lockdep: Silence warning if CONFIG_LOCKDEP isn't set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit c9a4962881929df7f1ef6e63e1b9da304faca4dd ("nfsd: make client_lock per net") compiling nfs4state.o without CONFIG_LOCKDEP set, triggers this GCC warning: fs/nfsd/nfs4state.c: In function ‘free_client’: fs/nfsd/nfs4state.c:1051:19: warning: unused variable ‘nn’ [-Wunused-variable] The cause of that warning is that lockdep_assert_held() compiles away if CONFIG_LOCKDEP is not set. Silence this warning by using the argument to lockdep_assert_held() as a nop if CONFIG_LOCKDEP is not set. Signed-off-by: Paul Bolle Cc: Peter Zijlstra Cc: Stanislav Kinsbursky Cc: J. Bruce Fields Link: http://lkml.kernel.org/r/1359060797.1325.33.camel@x61.thuisdomein Signed-off-by: Ingo Molnar -- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 2bca44b..f05631e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -410,7 +410,7 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) -#define lockdep_assert_held(l) do { } while (0) +#define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) -- cgit v0.10.2 From ce6711f3d196f09ca0ed29a24dfad42d83912b20 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 5 Feb 2013 21:11:55 +0800 Subject: rwsem: Implement writer lock-stealing for better scalability Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") changed struct anon_vma::mutex to an rwsem, which caused aim7 fork_test performance to drop by 50%. Yuanhan Liu did the following excellent analysis: https://lkml.org/lkml/2013/1/29/84 and found that the regression is caused by strict, serialized, FIFO sequential write-ownership of rwsems. Ingo suggested implementing opportunistic lock-stealing for the front writer task in the waitqueue. Yuanhan Liu implemented lock-stealing for spinlock-rwsems, which indeed recovered much of the regression - confirming the analysis that the main factor in the regression was the FIFO writer-fairness of rwsems. In this patch we allow lock-stealing to happen when the first waiter is also writer. With that change in place the aim7 fork_test performance is fully recovered on my Intel NHM EP, NHM EX, SNB EP 2S and 4S test-machines. 
Reported-by: lkp@linux.intel.com Reported-by: Yuanhan Liu Signed-off-by: Alex Shi Cc: David Howells Cc: Michel Lespinasse Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Anton Blanchard Cc: Arjan van de Ven Cc: paul.gortmaker@windriver.com Link: https://lkml.org/lkml/2013/1/29/84 Link: http://lkml.kernel.org/r/1360069915-31619-1-git-send-email-alex.shi@intel.com [ Small stylistic fixes, updated changelog. ] Signed-off-by: Ingo Molnar diff --git a/lib/rwsem.c b/lib/rwsem.c index 8337e1b9b..ad5e0df 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -2,6 +2,8 @@ * * Written by David Howells (dhowells@redhat.com). * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi */ #include #include @@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop, adjustment; + signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) @@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) */ goto out; - /* There's a writer at the front of the queue - try to grant it the - * write lock. However, we only wake this writer if we can transition - * the active part of the count from 0 -> 1 - */ - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - if (waiter->list.next == &sem->wait_list) - adjustment -= RWSEM_WAITING_BIAS; - - try_again_write: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (oldcount & RWSEM_ACTIVE_MASK) - /* Someone grabbed the sem already */ - goto undo_write; - - /* We must be careful not to touch 'waiter' after we set ->task = NULL. - * It is an allocated on the waiter's stack and may become invalid at - * any time after that point (due to a wakeup from another source). 
- */ - list_del(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); + /* Wake up the writing waiter and let the task grab the sem: */ + wake_up_process(waiter->task); goto out; readers_only: @@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) out: return sem; +} + +/* Try to get write sem, caller holds sem->wait_lock: */ +static int try_get_writer_sem(struct rw_semaphore *sem, + struct rwsem_waiter *waiter) +{ + struct rwsem_waiter *fwaiter; + long oldcount, adjustment; - /* undo the change to the active count, but check for a transition - * 1->0 */ - undo_write: + /* only steal when first waiter is writing */ + fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) + return 0; + + adjustment = RWSEM_ACTIVE_WRITE_BIAS; + /* Only one waiter in the queue: */ + if (fwaiter == waiter && waiter->list.next == &sem->wait_list) + adjustment -= RWSEM_WAITING_BIAS; + +try_again_write: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (!(oldcount & RWSEM_ACTIVE_MASK)) { + /* No active lock: */ + struct task_struct *tsk = waiter->task; + + list_del(&waiter->list); + smp_mb(); + put_task_struct(tsk); + tsk->state = TASK_RUNNING; + return 1; + } + /* some one grabbed the sem already */ if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) - goto out; + return 0; goto try_again_write; } @@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem, for (;;) { if (!waiter.task) break; + + raw_spin_lock_irq(&sem->wait_lock); + /* Try to get the writer sem, may steal from the head writer: */ + if (flags == RWSEM_WAITING_FOR_WRITE) + if (try_get_writer_sem(sem, &waiter)) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + raw_spin_unlock_irq(&sem->wait_lock); schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); } -- cgit v0.10.2 From c0540606837af79b2ae101e5e7b2206e3844d150 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 6 Feb 2013 10:56:19 -0800 Subject: lockdep: Print more info when MAX_LOCK_DEPTH is exceeded This helps debug cases where a lock is acquired over and over without being released. Suggested-by: Steven Rostedt Signed-off-by: Ben Greear Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1360176979-4421-1-git-send-email-greearb@candelatech.com [ Changed the printout ordering. ] Signed-off-by: Ingo Molnar diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5cf12e7..8a0efac 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); printk("turning off the locking correctness validator.\n"); + + lockdep_print_held_locks(current); + debug_show_all_locks(); dump_stack(); + return 0; } -- cgit v0.10.2 From eece09ec213e93333010bf4c6bb9175b32229c54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:25:03 +0200 Subject: locking: Various static lock initializer fixes The static lock initializers want to be fed the proper name of the lock and not some random string. In mainline random strings are obfuscating the readability of debug output, but for RT they prevent the spinlock substitution. Fix it up. 
Signed-off-by: Thomas Gleixner diff --git a/drivers/char/random.c b/drivers/char/random.c index 85e81ec..594bda9 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -445,7 +445,7 @@ static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], .name = "input", .limit = 1, - .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), .pool = input_pool_data }; @@ -454,7 +454,7 @@ static struct entropy_store blocking_pool = { .name = "blocking", .limit = 1, .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock), .pool = blocking_pool_data }; @@ -462,7 +462,7 @@ static struct entropy_store nonblocking_pool = { .poolinfo = &poolinfo_table[1], .name = "nonblocking", .pull = &input_pool, - .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock), + .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), .pool = nonblocking_pool_data }; diff --git a/drivers/usb/chipidea/debug.c b/drivers/usb/chipidea/debug.c index 3bc244d..a62c4a4 100644 --- a/drivers/usb/chipidea/debug.c +++ b/drivers/usb/chipidea/debug.c @@ -222,7 +222,7 @@ static struct { } dbg_data = { .idx = 0, .tty = 0, - .lck = __RW_LOCK_UNLOCKED(lck) + .lck = __RW_LOCK_UNLOCKED(dbg_data.lck) }; /** diff --git a/fs/file.c b/fs/file.c index 2b3570b..3906d95 100644 --- a/fs/file.c +++ b/fs/file.c @@ -516,7 +516,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; /* diff --git a/include/linux/idr.h b/include/linux/idr.h index de7e190..e5eb125 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -136,7 +136,7 @@ struct ida { struct ida_bitmap *free_bitmap; }; -#define IDA_INIT(name) { .idr = IDR_INIT(name), .free_bitmap = NULL, } +#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); -- cgit v0.10.2 From 066361a7c58cb6c8b18c7ce0ee8527bb1ce58460 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 7 Dec 2011 12:48:42 +0100 Subject: intel_idle: Convert i7300_idle_lock to raw_spinlock 24 core Intel box's first exposure to 3.0.12-rt30-rc3 didn't go well. [ 27.104159] i7300_idle: loaded v1.55 [ 27.104192] BUG: scheduling while atomic: swapper/2/0/0x00000002 [ 27.104309] Pid: 0, comm: swapper/2 Tainted: G N 3.0.12-rt30-rc3-rt #1 [ 27.104317] Call Trace: [ 27.104338] [] dump_trace+0x85/0x2e0 [ 27.104372] [] thread_return+0x12b/0x30b [ 27.104381] [] schedule+0x29/0xb0 [ 27.104389] [] rt_spin_lock_slowlock+0xc5/0x240 [ 27.104401] [] i7300_idle_notifier+0x3f/0x360 [i7300_idle] [ 27.104415] [] notifier_call_chain+0x37/0x70 [ 27.104426] [] __atomic_notifier_call_chain+0x48/0x70 [ 27.104439] [] cpu_idle+0x89/0xb0 [ 27.104449] bad: scheduling from the idle thread! This lock is taken from interrupt disabled context in the guts of idle. Convert it to a raw_spinlock. 
Signed-off-by: Mike Galbraith Cc: Steven Rostedt Cc: Andy Henroid Link: http://lkml.kernel.org/r/1323258522.5057.73.camel@marge.simson.net Signed-off-by: Thomas Gleixner diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index fa080eb..ffeebc7 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -75,7 +75,7 @@ static unsigned long past_skip; static struct pci_dev *fbd_dev; -static spinlock_t i7300_idle_lock; +static raw_spinlock_t i7300_idle_lock; static int i7300_idle_active; static u8 i7300_idle_thrtctl_saved; @@ -457,7 +457,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, idle_begin_time = ktime_get(); } - spin_lock_irqsave(&i7300_idle_lock, flags); + raw_spin_lock_irqsave(&i7300_idle_lock, flags); if (val == IDLE_START) { cpumask_set_cpu(smp_processor_id(), idle_cpumask); @@ -506,7 +506,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, } } end: - spin_unlock_irqrestore(&i7300_idle_lock, flags); + raw_spin_unlock_irqrestore(&i7300_idle_lock, flags); return 0; } @@ -548,7 +548,7 @@ struct debugfs_file_info { static int __init i7300_idle_init(void) { - spin_lock_init(&i7300_idle_lock); + raw_spin_lock_init(&i7300_idle_lock); total_us = 0; if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) -- cgit v0.10.2 From a6c0c943a15d0b3d6ac33760cb8f95c75f395895 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 11:14:55 +0200 Subject: ntp: Make ntp_lock raw seconds_overflow() is called from hard interrupt context even on Preempt-RT. This requires the lock to be a raw_spinlock. Signed-off-by: Thomas Gleixner Cc: John Stultz Signed-off-by: Ingo Molnar diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4..bb1edfa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,7 +22,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -347,7 +347,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -361,7 +361,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -371,9 +371,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -394,7 +394,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. 
If in leap-insert state at the end of the @@ -478,7 +478,7 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } @@ -660,7 +660,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -702,7 +702,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +900,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +913,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -928,7 +928,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -945,7 +945,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v0.10.2 From 65c9d1bbc9f32c568a4f7ad154c9a10b0028df52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:38:22 +0200 Subject: seqlock: Remove unused functions Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 600060e2..cb0599c 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -69,17 +69,6 @@ static inline void write_sequnlock(seqlock_t *sl) spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) -{ - int ret = spin_trylock(&sl->lock); - - if (ret) { - ++sl->sequence; - smp_wmb(); - } - return ret; -} - /* Start of read calculation -- fetch last complete writer token */ static __always_inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -269,14 +258,4 @@ static inline void write_seqcount_barrier(seqcount_t *s) #define write_sequnlock_bh(lock) \ do { write_sequnlock(lock); local_bh_enable(); } while(0) -#define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) - -#define read_seqretry_irqrestore(lock, iv, flags) \ - ({ \ - int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ - ret; \ - }) - #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From 6617feca15c68aad12f4a1edd8d8e2ef8d5b4ae5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Jul 2011 18:40:26 +0200 Subject: seqlock: Use seqcount infrastructure No point in having different implementations for the same thing. Change the macro mess to inline functions where possible. 
Signed-off-by: Thomas Gleixner diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cb0599c..1829905 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -30,81 +30,12 @@ #include #include -typedef struct { - unsigned sequence; - spinlock_t lock; -} seqlock_t; - -/* - * These macros triggered gcc-3.x compile-time problems. We think these are - * OK now. Be cautious. - */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } - -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) - -#define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) - -/* Lock out other writers and update the count. - * Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. - */ -static inline void write_seqlock(seqlock_t *sl) -{ - spin_lock(&sl->lock); - ++sl->sequence; - smp_wmb(); -} - -static inline void write_sequnlock(seqlock_t *sl) -{ - smp_wmb(); - sl->sequence++; - spin_unlock(&sl->lock); -} - -/* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) -{ - unsigned ret; - -repeat: - ret = ACCESS_ONCE(sl->sequence); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - smp_rmb(); - - return ret; -} - -/* - * Test if reader processed invalid data. - * - * If sequence value changed then writer changed data while in section. - */ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) -{ - smp_rmb(); - - return unlikely(sl->sequence != start); -} - - /* * Version using sequence counter only. * This can be used when code has its own mutex protecting the * updating starting before the write_seqcountbeqin() and ending * after the write_seqcount_end(). */ - typedef struct seqcount { unsigned sequence; } seqcount_t; @@ -207,7 +138,6 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_retry(s, start); } @@ -241,21 +171,101 @@ static inline void write_seqcount_barrier(seqcount_t *s) s->sequence+=2; } +typedef struct { + struct seqcount seqcount; + spinlock_t lock; +} seqlock_t; + +/* + * These macros triggered gcc-3.x compile-time problems. We think these are + * OK now. Be cautious. + */ +#define __SEQLOCK_UNLOCKED(lockname) \ + { \ + .seqcount = SEQCNT_ZERO, \ + .lock = __SPIN_LOCK_UNLOCKED(lockname) \ + } + +#define seqlock_init(x) \ + do { \ + seqcount_init(&(x)->seqcount); \ + spin_lock_init(&(x)->lock); \ + } while (0) + +#define DEFINE_SEQLOCK(x) \ + seqlock_t x = __SEQLOCK_UNLOCKED(x) + +/* + * Read side functions for starting and finalizing a read side section. + */ +static inline unsigned read_seqbegin(const seqlock_t *sl) +{ + return read_seqcount_begin(&sl->seqcount); +} + +static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) +{ + return read_seqcount_retry(&sl->seqcount, start); +} + /* - * Possible sw/hw IRQ protected versions of the interfaces. + * Lock out other writers and update the count. + * Acts like a normal spin_lock/unlock. + * Don't need preempt_disable() because that is in the spin_lock already. 
*/ +static inline void write_seqlock(seqlock_t *sl) +{ + spin_lock(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock(&sl->lock); +} + +static inline void write_seqlock_bh(seqlock_t *sl) +{ + spin_lock_bh(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_bh(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_bh(&sl->lock); +} + +static inline void write_seqlock_irq(seqlock_t *sl) +{ + spin_lock_irq(&sl->lock); + write_seqcount_begin(&sl->seqcount); +} + +static inline void write_sequnlock_irq(seqlock_t *sl) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irq(&sl->lock); +} + +static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + spin_lock_irqsave(&sl->lock, flags); + write_seqcount_begin(&sl->seqcount); + return flags; +} + #define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) -#define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) -#define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) + do { flags = __write_seqlock_irqsave(lock); } while (0) -#define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) -#define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) -#define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) +static inline void +write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) +{ + write_seqcount_end(&sl->seqcount); + spin_unlock_irqrestore(&sl->lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- cgit v0.10.2 From 9fb1b90ce0a847a8cc9492a6c1f347b5be1f33ff Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 15:01:55 +0800 Subject: lockdep: Selftest: convert spinlock to raw spinlock To make the lockdep selftest working on RT we need to convert the spinlock tests to a raw spinlock. Otherwise we cannot run the irq context checks. For mainline this is just annotational as spinlocks are mapped to raw_spinlocks anyway. Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 7aae0f2..c3eb261 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -47,10 +47,10 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose); * Normal standalone locks, for the circular and irq-context * dependency tests: */ -static DEFINE_SPINLOCK(lock_A); -static DEFINE_SPINLOCK(lock_B); -static DEFINE_SPINLOCK(lock_C); -static DEFINE_SPINLOCK(lock_D); +static DEFINE_RAW_SPINLOCK(lock_A); +static DEFINE_RAW_SPINLOCK(lock_B); +static DEFINE_RAW_SPINLOCK(lock_C); +static DEFINE_RAW_SPINLOCK(lock_D); static DEFINE_RWLOCK(rwlock_A); static DEFINE_RWLOCK(rwlock_B); @@ -73,12 +73,12 @@ static DECLARE_RWSEM(rwsem_D); * but X* and Y* are different classes. 
We do this so that * we do not trigger a real lockup: */ -static DEFINE_SPINLOCK(lock_X1); -static DEFINE_SPINLOCK(lock_X2); -static DEFINE_SPINLOCK(lock_Y1); -static DEFINE_SPINLOCK(lock_Y2); -static DEFINE_SPINLOCK(lock_Z1); -static DEFINE_SPINLOCK(lock_Z2); +static DEFINE_RAW_SPINLOCK(lock_X1); +static DEFINE_RAW_SPINLOCK(lock_X2); +static DEFINE_RAW_SPINLOCK(lock_Y1); +static DEFINE_RAW_SPINLOCK(lock_Y2); +static DEFINE_RAW_SPINLOCK(lock_Z1); +static DEFINE_RAW_SPINLOCK(lock_Z2); static DEFINE_RWLOCK(rwlock_X1); static DEFINE_RWLOCK(rwlock_X2); @@ -107,10 +107,10 @@ static DECLARE_RWSEM(rwsem_Z2); */ #define INIT_CLASS_FUNC(class) \ static noinline void \ -init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \ - struct rw_semaphore *rwsem) \ +init_class_##class(raw_spinlock_t *lock, rwlock_t *rwlock, \ + struct mutex *mutex, struct rw_semaphore *rwsem)\ { \ - spin_lock_init(lock); \ + raw_spin_lock_init(lock); \ rwlock_init(rwlock); \ mutex_init(mutex); \ init_rwsem(rwsem); \ @@ -168,10 +168,10 @@ static void init_shared_classes(void) * Shortcuts for lock/unlock API variants, to keep * the testcases compact: */ -#define L(x) spin_lock(&lock_##x) -#define U(x) spin_unlock(&lock_##x) +#define L(x) raw_spin_lock(&lock_##x) +#define U(x) raw_spin_unlock(&lock_##x) #define LU(x) L(x); U(x) -#define SI(x) spin_lock_init(&lock_##x) +#define SI(x) raw_spin_lock_init(&lock_##x) #define WL(x) write_lock(&rwlock_##x) #define WU(x) write_unlock(&rwlock_##x) @@ -911,7 +911,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #define I2(x) \ do { \ - spin_lock_init(&lock_##x); \ + raw_spin_lock_init(&lock_##x); \ rwlock_init(&rwlock_##x); \ mutex_init(&mutex_##x); \ init_rwsem(&rwsem_##x); \ -- cgit v0.10.2 From 5042afe7fe32390e79910ecd0a1f0563d4bca38c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: generic: Use raw local irq variant for generic cmpxchg The interrupt disabled region is extremly tiny and therefor not latency relevant. Avoid cluttering the traces with those pointless entries. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index 2533fdd..d8d4c89 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } -- cgit v0.10.2 From fe2b05f7ca9f906be61dced5489f63b8b4d7c770 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2013 09:52:08 +0100 Subject: futex: Revert "futex: Mark get_robust_list as deprecated" This reverts commit ec0c4274e33c0373e476b73e01995c53128f1257. get_robust_list() is in use and a removal would break existing user space. With the permission checks in place it's not longer a security hole. 
Remove the deprecation warnings. Signed-off-by: Thomas Gleixner Cc: Cyrill Gorcunov Cc: Richard Weinberger Cc: akpm@linux-foundation.org Cc: paul.gortmaker@windriver.com Cc: davej@redhat.com Cc: keescook@chromium.org Cc: stable@vger.kernel.org Cc: ebiederm@xmission.com diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089..8879430 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2471,8 +2471,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b..a9642d5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -142,8 +142,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; -- cgit v0.10.2 From 41ef8f826692c8f65882bec0a8211bd4d1d2d19a Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 1 Feb 2013 18:59:16 +0800 Subject: rwsem-spinlock: Implement writer lock-stealing for better scalability We (Linux Kernel Performance project) found a regression introduced by commit: 5a505085f043 mm/rmap: Convert the struct anon_vma::mutex to an rwsem which converted all anon_vma::mutex locks rwsem write locks. The semantics are the same, but the behavioral difference is quite huge in some cases. After investigating it we found the root cause: mutexes support lock stealing while rwsems don't. Here is the link for the detailed regression report: https://lkml.org/lkml/2013/1/29/84 Ingo suggested adding write lock stealing to rwsems: "I think we should allow lock-steal between rwsem writers - that will not hurt fairness as most rwsem fairness concerns relate to reader vs. writer fairness" And here is the rwsem-spinlock version. With this patch, we got a double performance increase in one test box with following aim7 workfile: FILESIZE: 1M POOLSIZE: 10M 10 fork_test /usr/bin/time output w/o patch /usr/bin/time_output with patch -- Percent of CPU this job got: 369% Percent of CPU this job got: 537% Voluntary context switches: 640595016 Voluntary context switches: 157915561 We got a 45% increase in CPU usage and saved about 3/4 voluntary context switches. Reported-by: LKP project Suggested-by: Ingo Molnar Signed-off-by: Yuanhan Liu Cc: Alex Shi Cc: David Howells Cc: Michel Lespinasse Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Anton Blanchard Cc: Arjan van de Ven Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1359716356-23865-1-git-send-email-yuanhan.liu@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7e0d6a5..7542afb 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) goto dont_wake_writers; } - /* if we are allowed to wake writers try to grant a single write lock - * if there's a writer at the front of the queue - * - we leave the 'waiting count' incremented to signify potential - * contention + /* + * as we support write lock stealing, we can't set sem->activity + * to -1 here to indicate we get the lock. Instead, we wake it up + * to let it go get it again. 
*/ if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { - sem->activity = -1; - list_del(&waiter->list); - tsk = waiter->task; - /* Don't touch waiter after ->task has been NULLed */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); + wake_up_process(waiter->task); goto out; } @@ -121,18 +114,10 @@ static inline struct rw_semaphore * __rwsem_wake_one_writer(struct rw_semaphore *sem) { struct rwsem_waiter *waiter; - struct task_struct *tsk; - - sem->activity = -1; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - list_del(&waiter->list); + wake_up_process(waiter->task); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); return sem; } @@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore - * - we increment the waiting count anyway to indicate an exclusive lock */ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) { @@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity = -1; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ + tsk = current; waiter.task = tsk; waiter.flags = RWSEM_WAITING_FOR_WRITE; - get_task_struct(tsk); - list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ + /* wait for someone to release the lock */ for (;;) { - if (!waiter.task) + /* + * That is the key to support write lock stealing: allows the + * task already on CPU to get the lock soon rather than put + * itself into sleep and waiting for system woke it or someone + * else in the head of the wait list up. + */ + if (sem->activity == 0) break; - schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + schedule(); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } + /* got the lock */ + sem->activity = -1; + list_del(&waiter.list); - tsk->state = TASK_RUNNING; - out: - ; + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } void __sched __down_write(struct rw_semaphore *sem) @@ -262,8 +241,8 @@ int __down_write_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ + if (sem->activity == 0) { + /* got the lock */ sem->activity = -1; ret = 1; } -- cgit v0.10.2
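For illustration only, here is a minimal userspace sketch of the writer lock-stealing idea implemented by the two rwsem patches above. It is not kernel code: the struct, the function names and the pthread condition-variable wakeup are stand-ins (assumptions) for sem->wait_lock, the wait list and wake_up_process(). What it demonstrates is the behavioral change the changelogs describe: a writer takes the semaphore as soon as it observes it free while holding the wait lock, instead of waiting for a strict FIFO hand-off from the previous owner.

/*
 * Userspace model (assumption: pthreads, not the kernel API) of write
 * lock-stealing in a spinlock-protected rwsem.  activity > 0 means readers
 * hold it, 0 means free, -1 means a writer holds it.
 */
#include <pthread.h>
#include <stdio.h>

struct rwsem_model {
	pthread_mutex_t wait_lock;	/* models sem->wait_lock */
	pthread_cond_t  wait;		/* models the wait list + wake_up_process() */
	int             activity;
};

static void down_write_model(struct rwsem_model *sem)
{
	pthread_mutex_lock(&sem->wait_lock);
	/*
	 * Lock stealing: sleep while the sem is busy, but grab it the moment
	 * activity == 0 is observed under wait_lock - even if other writers
	 * went to sleep before we arrived.
	 */
	while (sem->activity != 0)
		pthread_cond_wait(&sem->wait, &sem->wait_lock);
	sem->activity = -1;			/* got the write lock */
	pthread_mutex_unlock(&sem->wait_lock);
}

static void up_write_model(struct rwsem_model *sem)
{
	pthread_mutex_lock(&sem->wait_lock);
	sem->activity = 0;
	pthread_cond_broadcast(&sem->wait);	/* wake waiters; they re-check and race */
	pthread_mutex_unlock(&sem->wait_lock);
}

static struct rwsem_model sem = {
	.wait_lock = PTHREAD_MUTEX_INITIALIZER,
	.wait      = PTHREAD_COND_INITIALIZER,
	.activity  = 0,
};

static void *writer(void *arg)
{
	down_write_model(&sem);
	printf("writer %ld got the lock\n", (long)arg);
	up_write_model(&sem);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	long i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, writer, (void *)i);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Because sleeping writers are merely woken and then re-check the state, a newly arriving writer that is already on a CPU can acquire the lock ahead of them - the fairness-for-throughput trade-off that recovered the aim7 fork_test regression in the patches above.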