@@ -35,6 +35,14 @@ typedef struct qspinlock {
/*
* Bitfields in the atomic value:
*
+ * When NR_CPUS < 16K
+ * 0- 7: locked byte
+ * 8: pending
+ * 9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
* 0- 7: locked byte
* 8: pending
* 9-10: tail index
@@ -47,7 +55,11 @@ typedef struct qspinlock {
#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS 8
+#else
#define _Q_PENDING_BITS 1
+#endif
#define _Q_PENDING_MASK _Q_SET_MASK(PENDING)
#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
@@ -58,6 +70,7 @@ typedef struct qspinlock {
#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU)
+#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)
@@ -22,6 +22,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
+#include <asm/byteorder.h>
#include <asm/qspinlock.h>
/*
@@ -54,6 +55,10 @@
* node; whereby avoiding the need to carry a node from lock to unlock, and
* preserving existing lock API. This also makes the unlock code simpler and
* faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
*/
#include "mcs_spinlock.h"
@@ -94,6 +99,64 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ */
+#if _Q_PENDING_BITS == 8
+
+struct __qspinlock {
+ union {
+ atomic_t val;
+ struct {
+#ifdef __LITTLE_ENDIAN
+ u16 locked_pending;
+ u16 tail;
+#else
+ u16 tail;
+ u16 locked_pending;
+#endif
+ };
+ };
+};
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queue spinlock structure
+ * @val : Current value of the queue spinlock 32-bit word
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void
+clear_pending_set_locked(struct qspinlock *lock, u32 val)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ ACCESS_ONCE(l->locked_pending) = _Q_LOCKED_VAL;
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queue spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
/**
* clear_pending_set_locked - take ownership and clear the pending bit.
* @lock: Pointer to queue spinlock structure
@@ -141,6 +204,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
}
return old;
}
+#endif /* _Q_PENDING_BITS == 8 */
/**
* queue_spin_lock_slowpath - acquire the queue spinlock
@@ -215,8 +279,13 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* we're pending, wait for the owner to go away.
*
* *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all clear_pending_set_locked()
+ * implementations imply full barriers.
*/
- while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK)
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
cpu_relax();
/*