2 files changed, 51 insertions, 21 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 
@@ -440,7 +440,7 @@ queue:
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
-	pv_kick_node(next);
+	pv_kick_node(lock, next);
 
 release:
 	/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index ab8b1bb8caa4..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
 		 *     MB			      MB
-		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
+		 * so that pv_wait_head() knows to not also try to hash this lock.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 */
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 	 *
-	 * See the comment in pv_wait_node().
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct qspinlock **lp = NULL;
 	int loop;
 
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(pn->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
 	for (;;) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
@@ -240,9 +266,10 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
 		if (!lp) { /* ONCE */
+			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
+
 			/*
 			 * We must hash before setting _Q_SLOW_VAL, such that
 			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
@@ -333,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
 	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*