From 84567995612258c23bc55795575babe7ef605dd9 Mon Sep 17 00:00:00 2001
From: Boqun Feng
Date: Wed, 26 Aug 2015 19:52:46 +0800
Subject: locking/atomics, cmpxchg: Privatize the inclusion of asm/cmpxchg.h

After commit:

  654672d4ba1a ("locking/atomics: Add _{acquire|release|relaxed}() variants
  of some atomic operations")

Architectures may only provide {cmp,}xchg_relaxed definitions in
asm/cmpxchg.h. Other variants, such as {cmp,}xchg, may be built in
linux/atomic.h, which means simply including asm/cmpxchg.h may not get
the definitions of all the {cmp,}xchg variants. Therefore, we should
privatize the inclusions of asm/cmpxchg.h to keep it only included in
arch/*, and replace the inclusions outside with linux/atomic.h.

Signed-off-by: Boqun Feng
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Greg Kroah-Hartman
Acked-by: Will Deacon
Cc: Andrew Morton
Cc: Aybuke Ozdemir
Cc: Chris Brannon
Cc: Davidlohr Bueso
Cc: Greg Kroah-Hartman
Cc: Jonathan Corbet
Cc: Kirk Reiser
Cc: Kishon Vijay Abraham I
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Samuel Thibault
Cc: Shradha Shah
Cc: Thomas Gleixner
Cc: William Hubbs
Cc: devel@driverdev.osuosl.org
Cc: linux-net-drivers@solarflare.com
Cc: speakup@linux-speakup.org
Link: http://lkml.kernel.org/r/1440589966-26280-1-git-send-email-boqun.feng@gmail.com
Signed-off-by: Ingo Molnar

diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index b19fc34..c9d1cac 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -542,6 +542,10 @@ The routines xchg() and cmpxchg() must provide the same exact
 memory-barrier semantics as the atomic and bit operations returning
 values.

+Note: If someone wants to use xchg(), cmpxchg() and their variants,
+linux/atomic.h should be included rather than asm/cmpxchg.h, unless
+the code is in arch/* and can take care of itself.
+
 Spinlocks and rwlocks have memory barrier expectations as well.
 The rule to follow is simple:

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 98d172b..a9b9460 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -9,7 +9,7 @@

 #include
 #include
-#include <asm/cmpxchg.h>
+#include <linux/atomic.h>
 #include "net_driver.h"
 #include "nic.h"
 #include "io.h"

diff --git a/drivers/phy/phy-rcar-gen2.c b/drivers/phy/phy-rcar-gen2.c
index 6e0d9fa..c7a0599 100644
--- a/drivers/phy/phy-rcar-gen2.c
+++ b/drivers/phy/phy-rcar-gen2.c
@@ -17,8 +17,7 @@
 #include
 #include
 #include
-
-#include <asm/cmpxchg.h>
+#include <linux/atomic.h>

 #define USBHS_LPSTS	0x02
 #define USBHS_UGCTRL	0x80

diff --git a/drivers/staging/speakup/selection.c b/drivers/staging/speakup/selection.c
index 98af3b1..aa5ab6c 100644
--- a/drivers/staging/speakup/selection.c
+++ b/drivers/staging/speakup/selection.c
@@ -7,7 +7,7 @@
 #include
 #include
 #include
-#include <asm/cmpxchg.h>
+#include <linux/atomic.h>

 #include "speakup.h"
-- cgit v0.10.2
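For illustration, here is a minimal sketch of the include rule the patch above
documents; the function and variable names are hypothetical, not taken from
the patch:

	/* Correct: generic code includes linux/atomic.h, never asm/cmpxchg.h. */
	#include <linux/atomic.h>

	static int claim_slot(atomic_t *owner, int me)
	{
		/*
		 * linux/atomic.h guarantees that cmpxchg() and all of its
		 * _acquire/_release/_relaxed variants are defined; including
		 * asm/cmpxchg.h directly may only provide the _relaxed forms
		 * on architectures that rely on the generic fallbacks.
		 */
		return atomic_cmpxchg(owner, 0, me) == 0;
	}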
From e58cdf585a38412f9f56c512d20e8e12637d9892 Mon Sep 17 00:00:00 2001
From: Stephen Boyd
Date: Wed, 9 Sep 2015 16:46:38 -0700
Subject: locking/Documentation/lockstat: Fix typo - lokcing -> locking

Signed-off-by: Stephen Boyd
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Jiri Kosina
Cc: Jonathan Corbet
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1441842398-25063-1-git-send-email-sboyd@codeaurora.org
Signed-off-by: Ingo Molnar

diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt
index 568bbba..5786ad2 100644
--- a/Documentation/locking/lockstat.txt
+++ b/Documentation/locking/lockstat.txt
@@ -12,7 +12,7 @@ Because things like lock contention can severely impact performance.
 - HOW

 Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/lokcing/lockdep-design.txt).
+lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
 The graph below shows the relation between the lock functions and the various
 hooks therein.
-- cgit v0.10.2

From 6e1e5196975fb7ecc501b3fe1075b77aea2b7839 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Mon, 14 Sep 2015 00:37:22 -0700
Subject: locking/qrwlock: Rename ->lock to ->wait_lock

... trivial, but reads a little nicer when we name our actual
primitive 'lock'.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Waiman Long
Link: http://lkml.kernel.org/r/1442216244-4409-1-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
index 4d76f24..0abc6b6 100644
--- a/include/asm-generic/qrwlock_types.h
+++ b/include/asm-generic/qrwlock_types.h
@@ -10,12 +10,12 @@
 typedef struct qrwlock {
 	atomic_t	cnts;
-	arch_spinlock_t	lock;
+	arch_spinlock_t	wait_lock;
 } arch_rwlock_t;

 #define	__ARCH_RW_LOCK_UNLOCKED {		\
 	.cnts = ATOMIC_INIT(0),			\
-	.lock = __ARCH_SPIN_LOCK_UNLOCKED,	\
+	.wait_lock = __ARCH_SPIN_LOCK_UNLOCKED,	\
 }

 #endif /* __ASM_GENERIC_QRWLOCK_TYPES_H */

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f17a3e3..fec0823 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	/*
 	 * Put the reader into the wait queue
 	 */
-	arch_spin_lock(&lock->lock);
+	arch_spin_lock(&lock->wait_lock);

 	/*
 	 * The ACQUIRE semantics of the following spinning code ensure
@@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	/*
 	 * Signal the next one in queue to become queue head
 	 */
-	arch_spin_unlock(&lock->lock);
+	arch_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL(queued_read_lock_slowpath);

@@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	u32 cnts;

 	/* Put the writer into the wait queue */
-	arch_spin_lock(&lock->lock);
+	arch_spin_lock(&lock->wait_lock);

 	/* Try to acquire the lock directly if no reader is present */
 	if (!atomic_read(&lock->cnts) &&
@@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 		cpu_relax_lowlatency();
 	}
 unlock:
-	arch_spin_unlock(&lock->lock);
+	arch_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL(queued_write_lock_slowpath);
-- cgit v0.10.2

From c55a6ffa6285e29f874ed403979472631ec70bff Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Mon, 14 Sep 2015 00:37:24 -0700
Subject: locking/osq: Relax atomic semantics

... by using acquire/release for ops around the lock->tail. As such,
weakly ordered archs can benefit from more relaxed use of barriers
when issuing atomics.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Waiman Long
Link: http://lkml.kernel.org/r/1442216244-4409-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee2..d092a0c 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,

 	for (;;) {
 		if (atomic_read(&lock->tail) == curr &&
-		    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+		    atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	node->next = NULL;
 	node->cpu = curr;

-	old = atomic_xchg(&lock->tail, curr);
+	/*
+	 * ACQUIRE semantics, pairs with corresponding RELEASE
+	 * in unlock() uncontended, or fastpath.
+	 */
+	old = atomic_xchg_acquire(&lock->tail, curr);
 	if (old == OSQ_UNLOCKED_VAL)
 		return true;

@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	/*
 	 * Fast path for the uncontended case.
 	 */
-	if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+	if (likely(atomic_cmpxchg_release(&lock->tail, curr,
+					  OSQ_UNLOCKED_VAL) == curr))
 		return;

 	/*
-- cgit v0.10.2

From 93edc8bd7750ff3cae088bfca453ea73dc9004a4 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Fri, 11 Sep 2015 14:37:34 -0400
Subject: locking/pvqspinlock: Kick the PV CPU unconditionally when _Q_SLOW_VAL

If _Q_SLOW_VAL has been set, the vCPU state must have been vcpu_hashed.
The extra check at the end of __pv_queued_spin_unlock() is unnecessary
and can be removed.

Signed-off-by: Waiman Long
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Davidlohr Bueso
Cc: Andrew Morton
Cc: Douglas Hatch
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Scott J Norton
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1441996658-62854-3-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c8e6e9a..f0450ff 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 		}

 		if (!lp) { /* ONCE */
-			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);

 			/*
@@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
 			 * we'll be sure to be able to observe our hash entry.
 			 *
-			 *   [S] pn->state
 			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
 			 *       MB                           RMB
 			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
-			 *   [L] pn->state
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
@@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * vCPU is harmless other than the additional latency in completing
 	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_hashed)
-		pv_kick(node->cpu);
+	pv_kick(node->cpu);
 }

 /*
  * Include the architecture specific callee-save thunk of the
-- cgit v0.10.2

From ac742d37180bee83bc433be087b66a17af2883b9 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes
Date: Wed, 9 Sep 2015 23:36:40 +0200
Subject: futex: Force hot variables into a single cache line

futex_hash() references two global variables: the base pointer
futex_queues and the size of the array futex_hashsize. The latter is
marked __read_mostly, while the former is not, so they are likely to
end up very far from each other. This means that futex_hash() is
likely to encounter two cache misses.

We could mark futex_queues as __read_mostly as well, but that doesn't
guarantee they'll end up next to each other (and even if they do, they
may still end up in different cache lines). So put the two variables
in a small singleton struct with sufficient alignment and mark that as
__read_mostly.

Signed-off-by: Rasmus Villemoes
Cc: Peter Zijlstra
Cc: Davidlohr Bueso
Cc: kbuild test robot
Cc: Sebastian Andrzej Siewior
Link: http://lkml.kernel.org/r/1441834601-13633-1-git-send-email-linux@rasmusvillemoes.dk
Signed-off-by: Thomas Gleixner

diff --git a/kernel/futex.c b/kernel/futex.c
index 6e443ef..dfc86e9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -255,9 +255,18 @@ struct futex_hash_bucket {
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;

-static unsigned long __read_mostly futex_hashsize;
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in hash_futex()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+	struct futex_hash_bucket *queues;
+	unsigned long		  hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues	(__futex_data.queues)
+#define futex_hashsize	(__futex_data.hashsize)

-static struct futex_hash_bucket *futex_queues;

 /*
  * Fault injections for futexes.
-- cgit v0.10.2
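The packing trick above generalizes to any pair of globals that are always
read together. A minimal standalone sketch (illustrative names, not kernel
code):

	struct bucket;

	/*
	 * Two co-accessed globals packed into one anonymous struct. The
	 * struct is 16 bytes on 64-bit, and __aligned(2 * sizeof(long))
	 * keeps it from straddling a cache-line boundary, so one miss
	 * fetches both fields; __read_mostly groups it with other
	 * rarely-written data.
	 */
	static struct {
		struct bucket *table;	/* base pointer */
		unsigned long  size;	/* always read together with ->table */
	} lookup_data __read_mostly __aligned(2 * sizeof(long));

	#define lookup_table (lookup_data.table)
	#define lookup_size  (lookup_data.size)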
From 90fe65148ea76988d8d5acbf3e578aa74129a490 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 18 Sep 2015 15:04:59 +0200
Subject: atomic: Add atomic_long_t bitops

When adding the atomic bitops, I seem to have forgotten about
atomic_long_t, fix this.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar

diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index a94cbeb..c7a5c1a 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -112,19 +112,23 @@ static inline void atomic_long_dec(atomic_long_t *l)
 	ATOMIC_LONG_PFX(_dec)(v);
 }

-static inline void atomic_long_add(long i, atomic_long_t *l)
-{
-	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
-
-	ATOMIC_LONG_PFX(_add)(i, v);
+#define ATOMIC_LONG_OP(op)						\
+static inline void							\
+atomic_long_##op(long i, atomic_long_t *l)				\
+{									\
+	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;		\
+									\
+	ATOMIC_LONG_PFX(_##op)(i, v);					\
 }

-static inline void atomic_long_sub(long i, atomic_long_t *l)
-{
-	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
+ATOMIC_LONG_OP(add)
+ATOMIC_LONG_OP(sub)
+ATOMIC_LONG_OP(and)
+ATOMIC_LONG_OP(or)
+ATOMIC_LONG_OP(xor)
+ATOMIC_LONG_OP(andnot)

-	ATOMIC_LONG_PFX(_sub)(i, v);
-}
+#undef ATOMIC_LONG_OP

 static inline int atomic_long_sub_and_test(long i, atomic_long_t *l)
 {

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 00a5763..29dafa1 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -451,7 +451,6 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 }
 #endif

-#include <asm-generic/atomic-long.h>
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
 #endif
@@ -463,4 +462,6 @@ static inline void atomic64_andnot(long long i, atomic64_t *v)
 }
 #endif

+#include <asm-generic/atomic-long.h>
+
 #endif /* _LINUX_ATOMIC_H */
-- cgit v0.10.2
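For reference, on a 64-bit build -- where ATOMIC_LONG_PFX(x) pastes to
atomic64##x -- ATOMIC_LONG_OP(and) expands to roughly the following
(hand-expanded here for illustration; not part of the patch):

	static inline void atomic_long_and(long i, atomic_long_t *l)
	{
		/* atomic_long_t aliases atomic64_t on 64-bit kernels */
		atomic64_t *v = (atomic64_t *)l;

		atomic64_and(i, v);
	}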
From 62e8a3258bda118f24ff462fe04cfbe75b8189b5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 18 Sep 2015 11:13:10 +0200
Subject: atomic, arch: Audit atomic_{read,set}()

This patch makes sure that atomic_{read,set}() are at least
{READ,WRITE}_ONCE().

We already had the 'requirement' that atomic_read() should use
ACCESS_ONCE(), and most archs had this, but a few were lacking.
All are now converted to use READ_ONCE().

And, by a symmetry and general paranoia argument, upgrade atomic_set()
to use WRITE_ONCE().

Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Dmitry Vyukov
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: james.hogan@imgtec.com
Cc: linux-kernel@vger.kernel.org
Cc: oleg@redhat.com
Cc: will.deacon@arm.com
Signed-off-by: Ingo Molnar

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index e8c9560..572b228 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -17,11 +17,11 @@
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic64_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic64_read(v)	READ_ONCE((v)->counter)

-#define atomic_set(v,i)		((v)->counter = (i))
-#define atomic64_set(v,i)	((v)->counter = (i))
+#define atomic_set(v,i)		WRITE_ONCE((v)->counter, (i))
+#define atomic64_set(v,i)	WRITE_ONCE((v)->counter, (i))

 /*
  * To get proper branch prediction for the main line, we must branch

diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index c3ecda0..7730d30 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -17,11 +17,11 @@
 #include
 #include

-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	READ_ONCE((v)->counter)

 #ifdef CONFIG_ARC_HAS_LLSC

-#define atomic_set(v, i) (((v)->counter) = (i))
+#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))

 #ifdef CONFIG_ARC_STAR_9000923308

@@ -107,7 +107,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v)	\
 #ifndef CONFIG_SMP

 /* violating atomic_xxx API locking protocol in UP for optimization sake */
-#define atomic_set(v, i) (((v)->counter) = (i))
+#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))

 #else

@@ -125,7 +125,7 @@ static inline void atomic_set(atomic_t *v, int i)
 	unsigned long flags;

 	atomic_ops_lock(flags);
-	v->counter = i;
+	WRITE_ONCE(v->counter, i);
 	atomic_ops_unlock(flags);
 }

diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index fe3ef39..2bf80af 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -27,8 +27,8 @@
  * strex/ldrex monitor on some implementations. The reason we can use it for
  * atomic_set() is the clrex or dummy strex done on every exception return.
  */
-#define atomic_read(v)	ACCESS_ONCE((v)->counter)
-#define atomic_set(v,i)	(((v)->counter) = (i))
+#define atomic_read(v)	READ_ONCE((v)->counter)
+#define atomic_set(v,i)	WRITE_ONCE(((v)->counter), (i))

 #if __LINUX_ARM_ARCH__ >= 6

diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 35a6778..1e247ac 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -54,7 +54,7 @@
 #define ATOMIC_INIT(i)	{ (i) }

 #define atomic_read(v)			READ_ONCE((v)->counter)
-#define atomic_set(v, i)		(((v)->counter) = (i))
+#define atomic_set(v, i)		WRITE_ONCE(((v)->counter), (i))
 #define atomic_xchg(v, new)		xchg(&((v)->counter), (new))
 #define atomic_cmpxchg(v, old, new)	cmpxchg(&((v)->counter), (old), (new))

diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index 97c9bdf..d74fd8c 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -19,8 +19,8 @@

 #define ATOMIC_INIT(i)  { (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic_set(v, i)	(((v)->counter) = i)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))

 #define ATOMIC_OP_RETURN(op, asm_op, asm_con)				\
 static inline int __atomic_##op##_return(int i, atomic_t *v)		\

diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index 0da689d..64f02d4 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -32,8 +32,8 @@
  */
 #define ATOMIC_INIT(i)		{ (i) }
-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic_set(v, i)	(((v)->counter) = (i))
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))

 static inline int atomic_inc_return(atomic_t *v)
 {

diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 702ee53..4435a44 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -11,8 +11,8 @@

 #define ATOMIC_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic_set(v, i)	(((v)->counter) = i)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))

 #include

diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 811d61f..55696c4 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -48,7 +48,7 @@ static inline void atomic_set(atomic_t *v, int new)
  *
  * Assumes all word reads on our architecture are atomic.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	READ_ONCE((v)->counter)

 /**
  * atomic_xchg - atomic

diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index be4beeb..8dfb5f6 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -21,11 +21,11 @@
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic64_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic64_read(v)	READ_ONCE((v)->counter)

-#define atomic_set(v,i)		(((v)->counter) = (i))
-#define atomic64_set(v,i)	(((v)->counter) = (i))
+#define atomic_set(v,i)		WRITE_ONCE(((v)->counter), (i))
+#define atomic64_set(v,i)	WRITE_ONCE(((v)->counter), (i))

 #define ATOMIC_OP(op, c_op)						\
 static __inline__ int							\

diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 025e2a1..ea35160 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -28,7 +28,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)	READ_ONCE((v)->counter)

 /**
  * atomic_set - set atomic variable
@@ -37,7 +37,7 @@
  *
  * Atomically sets the value of @v to @i.
  */
-#define atomic_set(v,i)	(((v)->counter) = (i))
+#define atomic_set(v,i)	WRITE_ONCE(((v)->counter), (i))

 #ifdef CONFIG_CHIP_M32700_TS1
 #define __ATOMIC_CLOBBER	, "r4"

diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 039fac1..4858178 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -17,8 +17,8 @@

 #define ATOMIC_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic_set(v, i)	(((v)->counter) = i)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))

 /*
  * The ColdFire parts cannot do some immediate to memory operations,

diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h
index 21c4c26..a625818 100644
--- a/arch/metag/include/asm/atomic_lnkget.h
+++ b/arch/metag/include/asm/atomic_lnkget.h
@@ -3,7 +3,7 @@

 #define ATOMIC_INIT(i)	{ (i) }

-#define atomic_set(v, i)		((v)->counter = (i))
+#define atomic_set(v, i)		WRITE_ONCE((v)->counter, (i))

 #include

diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index f8efe38..0295d9b 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -10,7 +10,7 @@

 static inline int atomic_read(const atomic_t *v)
 {
-	return (v)->counter;
+	return READ_ONCE((v)->counter);
 }

 /*

diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 4c42fd9..f82d3af 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -30,7 +30,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
+#define atomic_read(v)		READ_ONCE((v)->counter)

 /*
  * atomic_set - set atomic variable
@@ -39,7 +39,7 @@
  *
  * Atomically sets the value of @v to @i.
  */
-#define atomic_set(v, i)	((v)->counter = (i))
+#define atomic_set(v, i)	WRITE_ONCE((v)->counter, (i))

 #define ATOMIC_OP(op, c_op, asm_op)					\
 static __inline__ void atomic_##op(int i, atomic_t * v)		\
@@ -315,14 +315,14 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
  * @v: pointer of type atomic64_t
  *
  */
-#define atomic64_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic64_read(v)	READ_ONCE((v)->counter)

 /*
  * atomic64_set - set atomic variable
  * @v: pointer of type atomic64_t
  * @i: required value
  */
-#define atomic64_set(v, i)	((v)->counter = (i))
+#define atomic64_set(v, i)	WRITE_ONCE((v)->counter, (i))

 #define ATOMIC64_OP(op, c_op, asm_op)					\
 static __inline__ void atomic64_##op(long i, atomic64_t * v)		\

diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index 375e591..ce318d5 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -34,7 +34,7 @@
  *
  * Atomically reads the value of @v.  Note that the guaranteed
 */
-#define atomic_read(v)	(ACCESS_ONCE((v)->counter))
+#define atomic_read(v)	READ_ONCE((v)->counter)

 /**
  * atomic_set - set atomic variable
@@ -43,7 +43,7 @@
  *
  * Atomically sets the value of @v to @i.  Note that the guaranteed
 */
-#define atomic_set(v, i) (((v)->counter) = (i))
+#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))

 #define ATOMIC_OP(op)							\
 static inline void atomic_##op(int i, atomic_t *v)			\

diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 2536965..1d10999 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -67,7 +67,7 @@ static __inline__ void atomic_set(atomic_t *v, int i)

 static __inline__ int atomic_read(const atomic_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }

 /* exported interface */

diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 05b9f74..c399e1c 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -14,8 +14,8 @@

 #define ATOMIC_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic_set(v,i)		((v)->counter = (i))
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic_set(v,i)		WRITE_ONCE((v)->counter, (i))

 #if defined(CONFIG_GUSA_RB)
 #include

diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 917084a..f2fbf9e 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -14,11 +14,11 @@
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }

-#define atomic_read(v)		ACCESS_ONCE((v)->counter)
-#define atomic64_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)		READ_ONCE((v)->counter)
+#define atomic64_read(v)	READ_ONCE((v)->counter)

-#define atomic_set(v, i)	(((v)->counter) = i)
-#define atomic64_set(v, i)	(((v)->counter) = i)
+#define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))
+#define atomic64_set(v, i)	WRITE_ONCE(((v)->counter), (i))

 #define ATOMIC_OP(op)							\
 void atomic_##op(int, atomic_t *);					\

diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h
index 7097984..9fc0107 100644
--- a/arch/tile/include/asm/atomic.h
+++ b/arch/tile/include/asm/atomic.h
@@ -34,7 +34,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-	return ACCESS_ONCE(v->counter);
+	return READ_ONCE(v->counter);
 }

 /**

diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 096a56d..51cabc2 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -24,7 +24,7 @@

 /* First, the 32-bit atomic ops that are "real" on our 64-bit platform. */

-#define atomic_set(v, i) ((v)->counter = (i))
+#define atomic_set(v, i) WRITE_ONCE((v)->counter, (i))

 /*
  * The smp_mb() operations throughout are to support the fact that
@@ -82,8 +82,8 @@ static inline void atomic_xor(int i, atomic_t *v)

 #define ATOMIC64_INIT(i)	{ (i) }

-#define atomic64_read(v)	((v)->counter)
-#define atomic64_set(v, i)	((v)->counter = (i))
+#define atomic64_read(v)	READ_ONCE((v)->counter)
+#define atomic64_set(v, i)	WRITE_ONCE((v)->counter, (i))

 static inline void atomic64_add(long i, atomic64_t *v)
 {

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index fb52aa6..ae5fb83 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -24,7 +24,7 @@
  */
 static __always_inline int atomic_read(const atomic_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }

 /**
@@ -36,7 +36,7 @@ static __always_inline int atomic_read(const atomic_t *v)
  */
 static __always_inline void atomic_set(atomic_t *v, int i)
 {
-	v->counter = i;
+	WRITE_ONCE(v->counter, i);
 }

 /**

diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 50e33ef..0373510 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
 */
 static inline long atomic64_read(const atomic64_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }

 /**
@@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v)
 */
 static inline void atomic64_set(atomic64_t *v, long i)
 {
-	v->counter = i;
+	WRITE_ONCE(v->counter, i);
 }

 /**

diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 93795d0..fd8017c 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -47,7 +47,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)	READ_ONCE((v)->counter)

 /**
  * atomic_set - set atomic variable
@@ -56,7 +56,7 @@
  *
  * Atomically sets the value of @v to @i.
  */
-#define atomic_set(v,i)	((v)->counter = (i))
+#define atomic_set(v,i)	WRITE_ONCE((v)->counter, (i))

 #if XCHAL_HAVE_S32C1I
 #define ATOMIC_OP(op)							\

diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index d4d7e33..74f1a37 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -127,7 +127,7 @@ ATOMIC_OP(xor, ^)
  * Atomically reads the value of @v.
  */
 #ifndef atomic_read
-#define atomic_read(v)	ACCESS_ONCE((v)->counter)
+#define atomic_read(v)	READ_ONCE((v)->counter)
 #endif

 /**
@@ -137,7 +137,7 @@ ATOMIC_OP(xor, ^)
  *
  * Atomically sets the value of @v to @i.
  */
-#define atomic_set(v, i) (((v)->counter) = (i))
+#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))

 #include
-- cgit v0.10.2
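A sketch of the failure mode the patch above closes (illustrative code, not
from the patch): with a plain "(v)->counter" load the compiler may hoist the
read out of a polling loop and spin on a stale register copy, or tear the
access; READ_ONCE() forces one real, untorn load per iteration.

	static atomic_t stop = ATOMIC_INIT(0);

	static void wait_for_stop(void)
	{
		/*
		 * atomic_read() now expands to READ_ONCE((&stop)->counter),
		 * so the load below must be re-issued on every pass and
		 * cannot be split into partial accesses.
		 */
		while (!atomic_read(&stop))
			cpu_relax();
	}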
From e3e72ab80a3fac0b88e07d358a2c75724ccd66b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 18 Sep 2015 13:22:52 +0200
Subject: atomic: Implement atomic_read_ctrl()

Provide atomic_read_ctrl() to mirror READ_ONCE_CTRL(), such that we can
more conveniently use atomics in control dependencies.

Since we can assume atomic_read() implies a READ_ONCE(), we must only
emit an extra smp_read_barrier_depends() in order to upgrade to
READ_ONCE_CTRL() semantics.

Requested-by: Dmitry Vyukov
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Will Deacon
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Cc: oleg@redhat.com
Link: http://lkml.kernel.org/r/20150918115637.GM3604@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 2ba8461..41ffd7e 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -637,7 +637,8 @@ as follows:
 	b = p;  /* BUG: Compiler and CPU can both reorder!!! */

 Finally, the READ_ONCE_CTRL() includes an smp_read_barrier_depends()
-that DEC Alpha needs in order to respect control depedencies.
+that DEC Alpha needs in order to respect control depedencies. Alternatively
+use one of atomic{,64}_read_ctrl().

 So don't leave out the READ_ONCE_CTRL().

@@ -796,9 +797,9 @@ site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html.

 In summary:

-  (*) Control dependencies must be headed by READ_ONCE_CTRL().
-      Or, as a much less preferable alternative, interpose
-      smp_read_barrier_depends() between a READ_ONCE() and the
+  (*) Control dependencies must be headed by READ_ONCE_CTRL(),
+      atomic{,64}_read_ctrl(). Or, as a much less preferable alternative,
+      interpose smp_read_barrier_depends() between a READ_ONCE() and the
       control-dependent write.

  (*) Control dependencies can order prior loads against later stores.
@@ -820,10 +821,10 @@ In summary:
      and WRITE_ONCE() can help to preserve the needed conditional.

  (*) Control dependencies require that the compiler avoid reordering the
-     dependency into nonexistence.  Careful use of READ_ONCE_CTRL()
-     or smp_read_barrier_depends() can help to preserve your control
-     dependency.  Please see the Compiler Barrier section for more
-     information.
+     dependency into nonexistence.  Careful use of READ_ONCE_CTRL(),
+     atomic{,64}_read_ctrl() or smp_read_barrier_depends() can help to
+     preserve your control dependency.  Please see the Compiler Barrier
+     section for more information.

  (*) Control dependencies pair normally with other types of barriers.

diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index c7a5c1a..8942cdc 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -35,7 +35,7 @@ typedef atomic_t atomic_long_t;
 #endif

 #define ATOMIC_LONG_READ_OP(mo)						\
-static inline long atomic_long_read##mo(atomic_long_t *l)		\
+static inline long atomic_long_read##mo(const atomic_long_t *l)	\
 {									\
 	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;		\
 									\
@@ -43,6 +43,7 @@ static inline long atomic_long_read##mo(atomic_long_t *l)		\
 }
 ATOMIC_LONG_READ_OP()
 ATOMIC_LONG_READ_OP(_acquire)
+ATOMIC_LONG_READ_OP(_ctrl)

 #undef ATOMIC_LONG_READ_OP

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 29dafa1..e326469 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -4,6 +4,15 @@
 #include <asm/atomic.h>
 #include <asm/barrier.h>

+#ifndef atomic_read_ctrl
+static inline int atomic_read_ctrl(const atomic_t *v)
+{
+	int val = atomic_read(v);
+	smp_read_barrier_depends(); /* Enforce control dependency. */
+	return val;
+}
+#endif
+
 /*
  * Relaxed variants of xchg, cmpxchg and some atomic operations.
  *
@@ -455,6 +464,15 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #include <asm-generic/atomic64.h>
 #endif

+#ifndef atomic64_read_ctrl
+static inline long long atomic64_read_ctrl(const atomic64_t *v)
+{
+	long long val = atomic64_read(v);
+	smp_read_barrier_depends(); /* Enforce control dependency. */
+	return val;
+}
+#endif
+
 #ifndef atomic64_andnot
 static inline void atomic64_andnot(long long i, atomic64_t *v)
 {
-- cgit v0.10.2
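A hedged sketch of how the new helper is meant to be used (variable names are
illustrative): the conditional branch on the value returned by
atomic_read_ctrl() orders that load before the dependent store.

	static atomic_t flag;
	static int payload;

	static void consumer(void)
	{
		/*
		 * The load heads a control dependency: the store below is
		 * ordered after it on all architectures, including Alpha,
		 * because atomic_read_ctrl() adds the needed
		 * smp_read_barrier_depends().
		 */
		if (atomic_read_ctrl(&flag))
			WRITE_ONCE(payload, 1);
	}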
From 63ab7bd0d450b726b88fa4b932f151b98cee2557 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 30 Sep 2015 13:03:11 -0700
Subject: locking/asm-generic: Add _{relaxed|acquire|release}() variants for inc/dec atomics

Similar to what we have for regular add/sub calls. For now, no actual
arch implements them, so everyone falls back to the default atomics...
iow, nothing changes. These will be used in future primitives.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Paul E.McKenney
Cc: Peter Zijlstra
Cc: Will Deacon
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index 8942cdc..f91093c 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -159,19 +159,24 @@ static inline int atomic_long_add_negative(long i, atomic_long_t *l)
 	return ATOMIC_LONG_PFX(_add_negative)(i, v);
 }

-static inline long atomic_long_inc_return(atomic_long_t *l)
-{
-	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
-
-	return (long)ATOMIC_LONG_PFX(_inc_return)(v);
-}
-
-static inline long atomic_long_dec_return(atomic_long_t *l)
-{
-	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
-
-	return (long)ATOMIC_LONG_PFX(_dec_return)(v);
+#define ATOMIC_LONG_INC_DEC_OP(op, mo)					\
+static inline long							\
+atomic_long_##op##_return##mo(atomic_long_t *l)				\
+{									\
+	ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;		\
+									\
+	return (long)ATOMIC_LONG_PFX(_##op##_return##mo)(v);		\
 }
+ATOMIC_LONG_INC_DEC_OP(inc,)
+ATOMIC_LONG_INC_DEC_OP(inc, _relaxed)
+ATOMIC_LONG_INC_DEC_OP(inc, _acquire)
+ATOMIC_LONG_INC_DEC_OP(inc, _release)
+ATOMIC_LONG_INC_DEC_OP(dec,)
+ATOMIC_LONG_INC_DEC_OP(dec, _relaxed)
+ATOMIC_LONG_INC_DEC_OP(dec, _acquire)
+ATOMIC_LONG_INC_DEC_OP(dec, _release)
+
+#undef ATOMIC_LONG_INC_DEC_OP

 static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
 {

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index e326469..27e580d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -90,6 +90,30 @@ static inline int atomic_read_ctrl(const atomic_t *v)
 #endif
 #endif /* atomic_add_return_relaxed */

+/* atomic_inc_return_relaxed */
+#ifndef atomic_inc_return_relaxed
+#define  atomic_inc_return_relaxed	atomic_inc_return
+#define  atomic_inc_return_acquire	atomic_inc_return
+#define  atomic_inc_return_release	atomic_inc_return
+
+#else /* atomic_inc_return_relaxed */
+
+#ifndef atomic_inc_return_acquire
+#define  atomic_inc_return_acquire(...)					\
+	__atomic_op_acquire(atomic_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_inc_return_release
+#define  atomic_inc_return_release(...)					\
+	__atomic_op_release(atomic_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_inc_return
+#define  atomic_inc_return(...)						\
+	__atomic_op_fence(atomic_inc_return, __VA_ARGS__)
+#endif
+#endif /* atomic_inc_return_relaxed */
+
 /* atomic_sub_return_relaxed */
 #ifndef atomic_sub_return_relaxed
 #define  atomic_sub_return_relaxed	atomic_sub_return
@@ -114,6 +138,30 @@ static inline int atomic_read_ctrl(const atomic_t *v)
 #endif
 #endif /* atomic_sub_return_relaxed */

+/* atomic_dec_return_relaxed */
+#ifndef atomic_dec_return_relaxed
+#define  atomic_dec_return_relaxed	atomic_dec_return
+#define  atomic_dec_return_acquire	atomic_dec_return
+#define  atomic_dec_return_release	atomic_dec_return
+
+#else /* atomic_dec_return_relaxed */
+
+#ifndef atomic_dec_return_acquire
+#define  atomic_dec_return_acquire(...)					\
+	__atomic_op_acquire(atomic_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_dec_return_release
+#define  atomic_dec_return_release(...)					\
+	__atomic_op_release(atomic_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_dec_return
+#define  atomic_dec_return(...)						\
+	__atomic_op_fence(atomic_dec_return, __VA_ARGS__)
+#endif
+#endif /* atomic_dec_return_relaxed */
+
 /* atomic_xchg_relaxed */
 #ifndef atomic_xchg_relaxed
 #define  atomic_xchg_relaxed		atomic_xchg
@@ -194,6 +242,31 @@ static inline int atomic_read_ctrl(const atomic_t *v)
 #endif
 #endif /* atomic64_add_return_relaxed */

+/* atomic64_inc_return_relaxed */
+#ifndef atomic64_inc_return_relaxed
+#define  atomic64_inc_return_relaxed	atomic64_inc_return
+#define  atomic64_inc_return_acquire	atomic64_inc_return
+#define  atomic64_inc_return_release	atomic64_inc_return
+
+#else /* atomic64_inc_return_relaxed */
+
+#ifndef atomic64_inc_return_acquire
+#define  atomic64_inc_return_acquire(...)				\
+	__atomic_op_acquire(atomic64_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_inc_return_release
+#define  atomic64_inc_return_release(...)				\
+	__atomic_op_release(atomic64_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_inc_return
+#define  atomic64_inc_return(...)					\
+	__atomic_op_fence(atomic64_inc_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_inc_return_relaxed */
+
+
 /* atomic64_sub_return_relaxed */
 #ifndef atomic64_sub_return_relaxed
 #define  atomic64_sub_return_relaxed	atomic64_sub_return
@@ -218,6 +291,30 @@ static inline int atomic_read_ctrl(const atomic_t *v)
 #endif
 #endif /* atomic64_sub_return_relaxed */

+/* atomic64_dec_return_relaxed */
+#ifndef atomic64_dec_return_relaxed
+#define  atomic64_dec_return_relaxed	atomic64_dec_return
+#define  atomic64_dec_return_acquire	atomic64_dec_return
+#define  atomic64_dec_return_release	atomic64_dec_return
+
+#else /* atomic64_dec_return_relaxed */
+
+#ifndef atomic64_dec_return_acquire
+#define  atomic64_dec_return_acquire(...)				\
+	__atomic_op_acquire(atomic64_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_dec_return_release
+#define  atomic64_dec_return_release(...)				\
+	__atomic_op_release(atomic64_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_dec_return
+#define  atomic64_dec_return(...)					\
+	__atomic_op_fence(atomic64_dec_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_dec_return_relaxed */
+
 /* atomic64_xchg_relaxed */
 #ifndef atomic64_xchg_relaxed
 #define  atomic64_xchg_relaxed		atomic64_xchg
-- cgit v0.10.2

From 81a43adae3b943193fb3afd20a36a7482332f964 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 30 Sep 2015 13:03:12 -0700
Subject: locking/mutex: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking,
asm-generic: Add _{relaxed|acquire|release}() variants for
'atomic_long_t'), weakly ordered archs can benefit from more relaxed
use of barriers when locking and unlocking, instead of regular full
barrier semantics. While currently only arm64 supports such
optimizations, updating corresponding locking primitives serves for
other archs to immediately benefit as well, once the necessary
machinery is implemented of course.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Paul E.McKenney
Cc: Peter Zijlstra
Cc: Will Deacon
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
index d4f9fb4..fd694cf 100644
--- a/include/asm-generic/mutex-dec.h
+++ b/include/asm-generic/mutex-dec.h
@@ -20,7 +20,7 @@
 static inline void
 __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_dec_return(count) < 0))
+	if (unlikely(atomic_dec_return_acquire(count) < 0))
 		fail_fn(count);
 }

@@ -35,7 +35,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count)
 {
-	if (unlikely(atomic_dec_return(count) < 0))
+	if (unlikely(atomic_dec_return_acquire(count) < 0))
 		return -1;
 	return 0;
 }

@@ -56,7 +56,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_inc_return(count) <= 0))
+	if (unlikely(atomic_inc_return_release(count) <= 0))
 		fail_fn(count);
 }

@@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+	if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1))
 		return 1;
 	return 0;
 }

diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
index f169ec0..a6b4a7b 100644
--- a/include/asm-generic/mutex-xchg.h
+++ b/include/asm-generic/mutex-xchg.h
@@ -31,7 +31,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 		 * to ensure that any waiting tasks are woken up by the
 		 * unlock slow path.
 		 */
-	if (likely(atomic_xchg(count, -1) != 1))
+	if (likely(atomic_xchg_acquire(count, -1) != 1))
 		fail_fn(count);
 }

@@ -46,7 +46,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count)
 {
-	if (unlikely(atomic_xchg(count, 0) != 1))
+	if (unlikely(atomic_xchg_acquire(count, 0) != 1))
 		if (likely(atomic_xchg(count, -1) != 1))
 			return -1;
 	return 0;
@@ -67,7 +67,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_xchg(count, 1) != 0))
+	if (unlikely(atomic_xchg_release(count, 1) != 0))
 		fail_fn(count);
 }

@@ -91,7 +91,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	int prev = atomic_xchg(count, 0);
+	int prev = atomic_xchg_acquire(count, 0);

 	if (unlikely(prev < 0)) {
 		/*
@@ -105,7 +105,7 @@ __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 		 * owner's unlock path needlessly, but that's not a problem
 		 * in practice. ]
 		 */
-		prev = atomic_xchg(count, prev);
+		prev = atomic_xchg_acquire(count, prev);
 		if (prev < 0)
 			prev = 0;
 	}

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6..0551c21 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 static inline bool mutex_try_to_acquire(struct mutex *lock)
 {
 	return !mutex_is_locked(lock) &&
-		(atomic_cmpxchg(&lock->count, 1, 0) == 1);
+		(atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
 }

 /*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	 * Once more, try to acquire the lock. Only try-lock the mutex if
 	 * it is unlocked to reduce unnecessary xchg() operations.
 	 */
-	if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+	if (!mutex_is_locked(lock) &&
+	    (atomic_xchg_acquire(&lock->count, 0) == 1))
 		goto skip_wait;

 	debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * non-negative in order to avoid unnecessary xchg operations:
 		 */
 		if (atomic_read(&lock->count) >= 0 &&
-		    (atomic_xchg(&lock->count, -1) == 1))
+		    (atomic_xchg_acquire(&lock->count, -1) == 1))
 			break;

 		/*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)

 	spin_lock_mutex(&lock->wait_lock, flags);

-	prev = atomic_xchg(&lock->count, -1);
+	prev = atomic_xchg_acquire(&lock->count, -1);
 	if (likely(prev == 1)) {
 		mutex_set_owner(lock);
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-- cgit v0.10.2
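The rule applied by the mutex patch above -- and by the rtmutex, mcs and
rwsem patches that follow -- in miniature. A hedged toy sketch, not a kernel
API: the locking operation needs ACQUIRE semantics so the critical section
cannot leak upward past it, the unlocking operation needs RELEASE semantics
so it cannot leak downward, and neither needs a full barrier.

	static atomic_t owner = ATOMIC_INIT(0);

	static void toy_lock(void)
	{
		/* ACQUIRE: later accesses stay after the lock acquisition. */
		while (atomic_cmpxchg_acquire(&owner, 0, 1) != 0)
			cpu_relax();
	}

	static void toy_unlock(void)
	{
		/* RELEASE: earlier accesses stay before the lock release. */
		(void)atomic_xchg_release(&owner, 0);
	}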
From 700318d1d7b38bbfe86813d9c5c18364dd941526 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 30 Sep 2015 13:03:13 -0700
Subject: locking/rtmutex: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking,
asm-generic: Add _{relaxed|acquire|release}() variants for
'atomic_long_t'), weakly ordered archs can benefit from more relaxed
use of barriers when locking and unlocking, instead of regular full
barrier semantics. While currently only arm64 supports such
optimizations, updating corresponding locking primitives serves for
other archs to immediately benefit as well, once the necessary
machinery is implemented of course.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Paul E.McKenney
Cc: Peter Zijlstra
Cc: Will Deacon
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-4-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7781d80..bbb72b4 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
  * set up.
  */
 #ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	unsigned long owner, *p = (unsigned long *) &lock->owner;

 	do {
 		owner = *p;
-	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+	} while (cmpxchg_relaxed(p, owner,
+				 owner | RT_MUTEX_HAS_WAITERS) != owner);
 }

 /*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
 	 *					lock(wait_lock);
 	 *					acquire(lock);
 	 */
-	return rt_mutex_cmpxchg(lock, owner, NULL);
+	return rt_mutex_cmpxchg_release(lock, owner, NULL);
 }

 #else
-# define rt_mutex_cmpxchg(l,c,n)		(0)
+# define rt_mutex_cmpxchg_relaxed(l,c,n)	(0)
+# define rt_mutex_cmpxchg_acquire(l,c,n)	(0)
+# define rt_mutex_cmpxchg_release(l,c,n)	(0)
+
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	lock->owner = (struct task_struct *)
@@ -1321,7 +1333,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
 		  struct hrtimer_sleeper *timeout,
 		  enum rtmutex_chainwalk chwalk))
 {
-	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
@@ -1337,7 +1349,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
 	 enum rtmutex_chainwalk chwalk))
 {
 	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-	    likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	    likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
@@ -1348,7 +1360,7 @@ static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
 	     int (*slowfn)(struct rt_mutex *lock))
 {
-	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 1;
 	}
@@ -1362,7 +1374,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 {
 	WAKE_Q(wake_q);

-	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);

 	} else {
@@ -1484,7 +1496,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
 				   struct wake_q_head *wqh)
 {
-	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);
 		return false;
 	}
-- cgit v0.10.2

From 3552a07a9c4aea32cc092fadf10a186c84ed8a61 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 30 Sep 2015 13:03:14 -0700
Subject: locking/mcs: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking,
asm-generic: Add _{relaxed|acquire|release}() variants for
'atomic_long_t'), weakly ordered archs can benefit from more relaxed
use of barriers when locking and unlocking, instead of regular full
barrier semantics. While currently only arm64 supports such
optimizations, updating corresponding locking primitives serves for
other archs to immediately benefit as well, once the necessary
machinery is implemented of course.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Paul E.McKenney
Cc: Peter Zijlstra
Cc: Will Deacon
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-5-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa..5b9102a 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	node->locked = 0;
 	node->next   = NULL;

-	prev = xchg(lock, node);
+	prev = xchg_acquire(lock, node);
 	if (likely(prev == NULL)) {
 		/*
 		 * Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		/*
 		 * Release the lock by setting it to NULL
 		 */
-		if (likely(cmpxchg(lock, node, NULL) == node))
+		if (likely(cmpxchg_release(lock, node, NULL) == node))
 			return;
 		/* Wait until the next pointer is set */
 		while (!(next = READ_ONCE(node->next)))
-- cgit v0.10.2

From 00eb4bab69db349c3bdc7e0b0f7e9070dafea58c Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 30 Sep 2015 13:03:15 -0700
Subject: locking/rwsem: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking,
asm-generic: Add _{relaxed|acquire|release}() variants for
'atomic_long_t'), weakly ordered archs can benefit from more relaxed
use of barriers when locking and unlocking, instead of regular full
barrier semantics. While currently only arm64 supports such
optimizations, updating corresponding locking primitives serves for
other archs to immediately benefit as well, once the necessary
machinery is implemented of course.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Paul E.McKenney
Cc: Peter Zijlstra
Cc: Will Deacon
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-6-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d48bf5a..d6d5dc9 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -33,7 +33,7 @@
 */
 static inline void __down_read(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_inc_return((atomic_long_t *)&sem->count) <= 0))
+	if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
 		rwsem_down_read_failed(sem);
 }

@@ -42,7 +42,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 	long tmp;

 	while ((tmp = sem->count) >= 0) {
-		if (tmp == cmpxchg(&sem->count, tmp,
+		if (tmp == cmpxchg_acquire(&sem->count, tmp,
 			   tmp + RWSEM_ACTIVE_READ_BIAS)) {
 			return 1;
 		}
@@ -57,7 +57,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
 	long tmp;

-	tmp = atomic_long_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
 				     (atomic_long_t *)&sem->count);
 	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
 		rwsem_down_write_failed(sem);
@@ -72,7 +72,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;

-	tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+	tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
 		      RWSEM_ACTIVE_WRITE_BIAS);
 	return tmp == RWSEM_UNLOCKED_VALUE;
 }
@@ -84,7 +84,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;

-	tmp = atomic_long_dec_return((atomic_long_t *)&sem->count);
+	tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
 	if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
 		rwsem_wake(sem);
 }
@@ -94,7 +94,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 */
 static inline void __up_write(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+	if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
 				 (atomic_long_t *)&sem->count) < 0))
 		rwsem_wake(sem);
 }
@@ -114,7 +114,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 {
 	long tmp;

-	tmp = atomic_long_add_return(-RWSEM_WAITING_BIAS,
+	/*
+	 * When downgrading from exclusive to shared ownership,
+	 * anything inside the write-locked region cannot leak
+	 * into the read side. In contrast, anything in the
+	 * read-locked region is ok to be re-ordered into the
+	 * write side. As such, rely on RELEASE semantics.
+	 */
+	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS,
 				     (atomic_long_t *)&sem->count);
 	if (tmp < 0)
 		rwsem_downgrade_wake(sem);

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f18971..a4d4de0 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 	 * to reduce unnecessary expensive cmpxchg() operations.
 	 */
 	if (count == RWSEM_WAITING_BIAS &&
-	    cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+	    cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
 		    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
 		if (!list_is_singular(&sem->wait_list))
 			rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;

-		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+		old = cmpxchg_acquire(&sem->count, count,
+				      count + RWSEM_ACTIVE_WRITE_BIAS);
 		if (old == count) {
 			rwsem_set_owner(sem);
 			return true;
-- cgit v0.10.2

From 6e490b0106a2118ee4c37c37847454a5c2dc6e32 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Wed, 7 Oct 2015 15:10:38 +0100
Subject: ARM, locking/atomics: Implement _relaxed variants of atomic[64]_{inc,dec}

Now that the core code supports acquire/release/relaxed versions of
the atomic_inc family, implement only the _relaxed flavours in the ARM
backend so that we get all of the others for free.

Signed-off-by: Will Deacon
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Davidlohr Bueso
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1444227038-12533-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar

diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 2bf80af..9e10c45 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -210,8 +210,8 @@ ATOMIC_OP(xor, ^=, eor)

 #define atomic_inc_and_test(v)	(atomic_add_return(1, v) == 0)
 #define atomic_dec_and_test(v)	(atomic_sub_return(1, v) == 0)
-#define atomic_inc_return(v)    (atomic_add_return(1, v))
-#define atomic_dec_return(v)    (atomic_sub_return(1, v))
+#define atomic_inc_return_relaxed(v)    (atomic_add_return_relaxed(1, v))
+#define atomic_dec_return_relaxed(v)    (atomic_sub_return_relaxed(1, v))
 #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)

 #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0)

@@ -442,11 +442,11 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 #define atomic64_inc(v)			atomic64_add(1LL, (v))
-#define atomic64_inc_return(v)		atomic64_add_return(1LL, (v))
+#define atomic64_inc_return_relaxed(v)	atomic64_add_return_relaxed(1LL, (v))
 #define atomic64_inc_and_test(v)	(atomic64_inc_return(v) == 0)
 #define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
-#define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
+#define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1LL, (v))
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1LL, 0LL)
-- cgit v0.10.2
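Why implementing only the _relaxed flavours is enough: the linux/atomic.h
fallbacks added earlier in this series synthesize the remaining orderings
from them. Slightly simplified from the code in this series (the
__atomic_op_*() helpers come from commit 654672d4ba1a, referenced above):

	/* With atomic_inc_return_relaxed() defined by the arch: */
	#define atomic_inc_return_acquire(...)				\
		__atomic_op_acquire(atomic_inc_return, __VA_ARGS__)
	#define atomic_inc_return_release(...)				\
		__atomic_op_release(atomic_inc_return, __VA_ARGS__)
	#define atomic_inc_return(...)					\
		__atomic_op_fence(atomic_inc_return, __VA_ARGS__)

	/* ... where the fully ordered fallback brackets the relaxed op: */
	#define __atomic_op_fence(op, args...)				\
	({								\
		typeof(op##_relaxed(args)) __ret;			\
		smp_mb__before_atomic();				\
		__ret = op##_relaxed(args);				\
		smp_mb__after_atomic();					\
		__ret;							\
	})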