diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-11 22:18:38 (GMT) |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-11 22:18:38 (GMT) |
commit | 24af98c4cf5f5e69266e270c7f3fb34b82ff6656 (patch) | |
tree | 70d71381c841c92b2d28397bf0c5d6a7d9bbbaac /kernel/locking/qspinlock_paravirt.h | |
parent | 9061cbe62adeccf8c986883bcd40f4aeee59ea75 (diff) | |
parent | 337f13046ff03717a9e99675284a817527440a49 (diff) | |
download | linux-24af98c4cf5f5e69266e270c7f3fb34b82ff6656.tar.xz |
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
"So we have a laundry list of locking subsystem changes:
- continuing barrier API and code improvements
- futex enhancements
- atomics API improvements
- pvqspinlock enhancements: in particular lock stealing and adaptive
spinning
- qspinlock micro-enhancements"
* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
futex: Cleanup the goto confusion in requeue_pi()
futex: Remove pointless put_pi_state calls in requeue()
futex: Document pi_state refcounting in requeue code
futex: Rename free_pi_state() to put_pi_state()
futex: Drop refcount if requeue_pi() acquired the rtmutex
locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
lcoking/barriers, arch: Use smp barriers in smp_store_release()
locking/cmpxchg, arch: Remove tas() definitions
locking/pvqspinlock: Queue node adaptive spinning
locking/pvqspinlock: Allow limited lock stealing
locking/pvqspinlock: Collect slowpath lock statistics
sched/core, locking: Document Program-Order guarantees
locking, sched: Introduce smp_cond_acquire() and use it
locking/pvqspinlock, x86: Optimize the PV unlock code path
locking/qspinlock: Avoid redundant read of next pointer
locking/qspinlock: Prefetch the next node cacheline
locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
atomics: Add test for atomic operations with _relaxed variants
Diffstat (limited to 'kernel/locking/qspinlock_paravirt.h')
-rw-r--r-- | kernel/locking/qspinlock_paravirt.h | 252 |
1 files changed, 212 insertions, 40 deletions
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff..87bb235 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. + */ +#define PV_PREV_CHECK_MASK 0xff + +/* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. */ @@ -41,6 +55,94 @@ struct pv_node { }; /* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) } /* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + +/* * Initialize the PV part of the mcs_spinlock node. */ static void pv_init_node(struct mcs_spinlock *node) @@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf. */ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; + int waitcnt = 0; int loop; + bool wait_early; - for (;;) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + for (;; waitcnt++) { + /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. + */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); @@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. */ #include <asm/qspinlock_paravirt.h> +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ |