From patchwork Wed Aug 21 20:28:09 2024
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 13772039
From: David Woodhouse
To: Sean Christopherson, Paolo Bonzini, "Hussain, Mushahid"
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
    Mingwei Zhang, Maxim Levitsky
Subject: [PATCH v3 1/5] KVM: pfncache: Add needs_invalidation flag to gfn_to_pfn_cache
Date: Wed, 21 Aug 2024 21:28:09 +0100
Message-ID: <20240821202814.711673-1-dwmw2@infradead.org>
From: David Woodhouse

This will be used to allow hva_to_pfn_retry() to be more selective about
its retry loop, which is currently extremely pessimistic. It allows for
invalidations to occur even while the PFN is being mapped (which happens
with the lock dropped), before the GPC is fully valid.

No functional change yet, as the existing mmu_notifier_retry_cache()
function will still return true in all cases where the invalidation may
have triggered.

Signed-off-by: David Woodhouse
---
 include/linux/kvm_types.h |  1 +
 virt/kvm/pfncache.c       | 29 ++++++++++++++++++++++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..4d8fbd87c320 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
 	void *khva;
 	kvm_pfn_t pfn;
 	bool active;
+	bool needs_invalidation;
 	bool valid;
 };
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..7007d32d197a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -32,7 +32,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 		read_lock_irq(&gpc->lock);
 
 		/* Only a single page so no need to care about length */
-		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+		if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
 		    gpc->uhva >= start && gpc->uhva < end) {
 			read_unlock_irq(&gpc->lock);
 
@@ -45,9 +45,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 			 */
 			write_lock_irq(&gpc->lock);
 
-			if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-			    gpc->uhva >= start && gpc->uhva < end)
+			if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
+			    gpc->uhva >= start && gpc->uhva < end) {
+				gpc->needs_invalidation = false;
 				gpc->valid = false;
+			}
 
 			write_unlock_irq(&gpc->lock);
 			continue;
 		}
@@ -93,6 +95,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	if (!gpc->valid)
 		return false;
 
+	/* If it's valid, it needs invalidation! */
+	WARN_ON_ONCE(!gpc->needs_invalidation);
+
 	return true;
 }
 
@@ -175,6 +180,17 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		mmu_seq = gpc->kvm->mmu_invalidate_seq;
 		smp_rmb();
 
+		/*
+		 * The translation made by hva_to_pfn() below could be made
+		 * invalid as soon as it's mapped. But the uhva is already
+		 * known and that's all that gfn_to_pfn_cache_invalidate()
+		 * looks at. So set the 'needs_invalidation' flag to allow
+		 * the GPC to be marked invalid from the moment the lock is
+		 * dropped, before the corresponding PFN is even found (and,
+		 * more to the point, immediately afterwards).
+		 */
+		gpc->needs_invalidation = true;
+
 		write_unlock_irq(&gpc->lock);
 
 		/*
@@ -224,7 +240,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * attempting to refresh.
		 */
 		WARN_ON_ONCE(gpc->valid);
-	} while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+	} while (!gpc->needs_invalidation ||
+		 mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
 
 	gpc->valid = true;
 	gpc->pfn = new_pfn;
@@ -339,6 +356,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
 	 */
 	if (ret) {
 		gpc->valid = false;
+		gpc->needs_invalidation = false;
 		gpc->pfn = KVM_PFN_ERR_FAULT;
 		gpc->khva = NULL;
 	}
@@ -383,7 +401,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 	gpc->gpa = INVALID_GPA;
 	gpc->uhva = KVM_HVA_ERR_BAD;
-	gpc->active = gpc->valid = false;
+	gpc->active = gpc->valid = gpc->needs_invalidation = false;
 }
 
 static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +471,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 		write_lock_irq(&gpc->lock);
 		gpc->active = false;
 		gpc->valid = false;
+		gpc->needs_invalidation = false;
 
 		/*
 		 * Leave the GPA => uHVA cache intact, it's protected by the

From patchwork Wed Aug 21 20:28:10 2024
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 13772043
From: David Woodhouse
To: Sean Christopherson, Paolo Bonzini, "Hussain, Mushahid"
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
    Mingwei Zhang, Maxim Levitsky
Subject: [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry()
Date: Wed, 21 Aug 2024 21:28:10 +0100
Message-ID: <20240821202814.711673-2-dwmw2@infradead.org>
In-Reply-To: <20240821202814.711673-1-dwmw2@infradead.org>
References: <20240821202814.711673-1-dwmw2@infradead.org>

From: David Woodhouse

The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps
retrying forever, not making any progress.

Solve this by being a bit more selective about when to retry. In
addition to the ->needs_invalidation flag added in a previous commit,
check before calling hva_to_pfn() if there is any pending invalidation
(i.e. between invalidate_range_start() and invalidate_range_end()) which
affects the uHVA of the GPC being updated. This catches the case where
hva_to_pfn() would return a soon-to-be-stale mapping of a range for
which the invalidate_range_start() hook had already been called before
the uHVA was even set in the GPC and the ->needs_invalidation flag set.

An invalidation which *starts* after the lock is dropped in the loop is
fine because it will clear the ->needs_invalidation flag and also
trigger a retry.

This is slightly more complex than it needs to be; moving the
invalidation to occur in the invalidate_range_end() hook will make life
simpler, in a subsequent commit.
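As an illustration only (not part of the diff below), the pending-invalidation
test this patch introduces boils down to an overlap check of the GPC's uHVA
against the single accumulated [start, end) range, gated on at least one
invalidation being in flight. A minimal sketch, with hypothetical parameter
names that are not taken from the patch:

/*
 * Illustrative sketch of the logic added below as gpc_invalidations_pending();
 * the names here are made up. The uHVA only needs a retry/wait while at least
 * one invalidation is in flight and the accumulated range covers it.
 */
static bool uhva_invalidation_pending(unsigned long uhva, int active_count,
				      u64 range_start, u64 range_end)
{
	return active_count && range_start <= uhva && uhva < range_end;
}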
Signed-off-by: David Woodhouse
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++
 virt/kvm/pfncache.c      | 60 +++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..1bfe2e8d52cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,8 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..84eb1ebb6f47 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 7007d32d197a..eeb9bf43c04a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
 	/*
-	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_invalidate_in_progress here; but the latter cannot
-	 * be used here because the invalidation of caches in the
-	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-	 * is elevated.
-	 *
-	 * Note, it does not matter that mn_active_invalidate_count
-	 * is not protected by gpc->lock.  It is guaranteed to
-	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_invalidate_seq is updated.
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
 	 */
-	if (kvm->mn_active_invalidate_count)
-		return true;
+	guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
 
-	/*
-	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end() to guarantee either the
-	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_invalidate_seq is observed.
-	 */
-	smp_rmb();
-	return kvm->mmu_invalidate_seq != mmu_seq;
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
 	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
 	void *new_khva = NULL;
-	unsigned long mmu_seq;
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = gpc->kvm->mmu_invalidate_seq;
-		smp_rmb();
-
 		/*
 		 * The translation made by hva_to_pfn() below could be made
 		 * invalid as soon as it's mapped. But the uhva is already
@@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 		write_unlock_irq(&gpc->lock);
 
+		/*
+		 * Invalidation occurs from the invalidate_range_start() hook,
+		 * which could already have happened before __kvm_gpc_refresh()
+		 * (or the previous turn around this loop) took gpc->lock().
+		 * If so, and if the corresponding invalidate_range_end() hook
+		 * hasn't happened yet, hva_to_pfn() could return a mapping
+		 * which is about to be stale and which should not be used. So
+		 * check if there are any currently-running invalidations which
+		 * affect the uHVA of this GPC, and retry if there are. Any
+		 * invalidation which starts after gpc->needs_invalidation is
+		 * set is fine, because it will clear that flag and trigger a
+		 * retry. And any invalidation which *completes* by having its
+		 * invalidate_range_end() hook called immediately prior to this
+		 * check is also fine, because the page tables are guaranteed
+		 * to have been changed already, so hva_to_pfn() won't return
+		 * a stale mapping in that case anyway.
+		 */
+		while (gpc_invalidations_pending(gpc)) {
+			cond_resched();
+			write_lock_irq(&gpc->lock);
+			continue;
+		}
+
 		/*
 		 * If the previous iteration "failed" due to an mmu_notifier
 		 * event, release the pfn and unmap the kernel virtual address
@@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+
 		write_lock_irq(&gpc->lock);
 
 		/*
@@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * attempting to refresh.
		 */
 		WARN_ON_ONCE(gpc->valid);
-	} while (!gpc->needs_invalidation ||
-		 mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+	} while (!gpc->needs_invalidation);
 
 	gpc->valid = true;
 	gpc->pfn = new_pfn;

From patchwork Wed Aug 21 20:28:11 2024
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 13772041
From: David Woodhouse
To: Sean Christopherson, Paolo Bonzini, "Hussain, Mushahid"
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
    Mingwei Zhang, Maxim Levitsky
Subject: [PATCH v3 3/5] KVM: pfncache: Wait for pending invalidations instead of spinning
Date: Wed, 21 Aug 2024 21:28:11 +0100
Message-ID: <20240821202814.711673-3-dwmw2@infradead.org>
In-Reply-To: <20240821202814.711673-1-dwmw2@infradead.org>
References: <20240821202814.711673-1-dwmw2@infradead.org>

From: David Woodhouse

The busy loop in hva_to_pfn_retry() is worse than a normal page fault
retry loop because it spins even while it's waiting for the invalidation
to complete; it isn't just a case of the page being faulted out again
before it's actually accessed.

Introduce a wait queue to be woken when kvm->mn_active_invalidate_count
reaches zero, and wait on it if there is any pending invalidation which
affects the GPC being refreshed.
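Conceptually (not part of the patch), the wait added below is the standard
wait-queue pattern: sleep until the pending-invalidation predicate becomes
false, woken from the invalidate_range_end() path. Ignoring the locking
detail, which is why the patch open-codes prepare_to_wait()/schedule(), it
behaves roughly like:

	/*
	 * Conceptual equivalent only; the real code must sample the predicate
	 * under kvm->mn_invalidate_lock, dropping it around schedule(), hence
	 * the open-coded loop in gpc_wait_for_invalidations() below.
	 */
	wait_event(kvm->gpc_invalidate_wq, !gpc_invalidations_pending(gpc));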
Signed-off-by: David Woodhouse
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      |  5 ++++-
 virt/kvm/pfncache.c      | 32 ++++++++++++++++++++++++++++----
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bfe2e8d52cd..a0739c343da5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -772,6 +772,7 @@ struct kvm {
 	struct list_head gpc_list;
 	u64 mmu_gpc_invalidate_range_start;
 	u64 mmu_gpc_invalidate_range_end;
+	wait_queue_head_t gpc_invalidate_wq;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 84eb1ebb6f47..e04eb700448b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -871,8 +871,10 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
 	 */
-	if (wake)
+	if (wake) {
+		wake_up(&kvm->gpc_invalidate_wq);
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+	}
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1182,6 +1184,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
+	init_waitqueue_head(&kvm->gpc_invalidate_wq);
 	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
 	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index eeb9bf43c04a..fa494eb3d924 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -135,13 +135,38 @@ static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 	 * No need for locking on GPC here because these fields are protected
 	 * by gpc->refresh_lock.
 	 */
-	guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
-
 	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
 		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
 		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
 }
 
+static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+	bool waited = false;
+
+	spin_lock(&gpc->kvm->mn_invalidate_lock);
+	if (gpc_invalidations_pending(gpc)) {
+		DEFINE_WAIT(wait);
+
+		waited = true;
+		for (;;) {
+			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			if (!gpc_invalidations_pending(gpc))
+				break;
+
+			spin_unlock(&gpc->kvm->mn_invalidate_lock);
+			schedule();
+			spin_lock(&gpc->kvm->mn_invalidate_lock);
+		}
+		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+	}
+	spin_unlock(&gpc->kvm->mn_invalidate_lock);
+	return waited;
+}
+
+
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -191,8 +216,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * to have been changed already, so hva_to_pfn() won't return
 		 * a stale mapping in that case anyway.
 		 */
-		while (gpc_invalidations_pending(gpc)) {
-			cond_resched();
+		if (gpc_wait_for_invalidations(gpc)) {
 			write_lock_irq(&gpc->lock);
 			continue;
 		}

From patchwork Wed Aug 21 20:28:12 2024
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 13772040
From: David Woodhouse
To: Sean Christopherson, Paolo Bonzini, "Hussain, Mushahid"
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
    Mingwei Zhang, Maxim Levitsky
Subject: [PATCH v3 4/5] KVM: pfncache: Move invalidation to invalidate_range_end hook
Date: Wed, 21 Aug 2024 21:28:12 +0100
Message-ID: <20240821202814.711673-4-dwmw2@infradead.org>
In-Reply-To: <20240821202814.711673-1-dwmw2@infradead.org>
References: <20240821202814.711673-1-dwmw2@infradead.org>

From: David Woodhouse

By performing the invalidation from the 'end' hook, the process is a lot
cleaner and simpler because it is guaranteed that ->needs_invalidation
will be cleared after hva_to_pfn() has been called, so the only thing
that hva_to_pfn_retry() needs to do is *wait* for there to be no pending
invalidations.

Another false positive on the range-based checks is thus removed as well.

Signed-off-by: David Woodhouse
---
 virt/kvm/kvm_main.c | 20 +++++++----------
 virt/kvm/kvm_mm.h   | 12 +++++-----
 virt/kvm/pfncache.c | 55 ++++++++++++++++++++-------------------------
 3 files changed, 38 insertions(+), 49 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e04eb700448b..cca870f1a78c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -795,18 +795,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	/*
-	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
-	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
-	 * each cache's lock.  There are relatively few caches in existence at
-	 * any given time, and the caches themselves can check for hva overlap,
-	 * i.e. don't need to rely on memslot overlap checks for performance.
-	 * Because this runs without holding mmu_lock, the pfn caches must use
-	 * mn_active_invalidate_count (see above) instead of
-	 * mmu_invalidate_in_progress.
-	 */
-	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
 	/*
 	 * If one or more memslots were found and thus zapped, notify arch code
 	 * that guest memory has been reclaimed.  This needs to be done *after*
@@ -860,6 +848,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	/*
+	 * It's safe to invalidate the gfn_to_pfn_caches from this 'end'
+	 * hook, because hva_to_pfn_retry() will wait until no active
+	 * invalidations could be affecting the corresponding uHVA
+	 * before allowing a newly mapped GPC to become valid.
+	 */
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
 	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..34e4e67f09f8 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
 		     bool *async, bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-				       unsigned long start,
-				       unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+				 unsigned long start,
+				 unsigned long end);
 #else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-						     unsigned long start,
-						     unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+					       unsigned long start,
+					       unsigned long end)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index fa494eb3d924..3f48df8cd6e5 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -20,10 +20,15 @@
 #include "kvm_mm.h"
 
 /*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently with the actual invalidation (with the GPC lock dropped). By
+ * using a separate 'needs_invalidation' flag, the concurrent invalidation
+ * can handle this case, causing hva_to_pfn_retry() to drop its result and
+ * retry correctly.
  */
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-				       unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+				 unsigned long end)
 {
 	struct gfn_to_pfn_cache *gpc;
 
@@ -140,15 +145,12 @@ static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
 }
 
-static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
 {
-	bool waited = false;
-
 	spin_lock(&gpc->kvm->mn_invalidate_lock);
 	if (gpc_invalidations_pending(gpc)) {
 		DEFINE_WAIT(wait);
 
-		waited = true;
 		for (;;) {
 			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
 					TASK_UNINTERRUPTIBLE);
@@ -163,10 +165,8 @@ static bool gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
 		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
 	}
 	spin_unlock(&gpc->kvm->mn_invalidate_lock);
-	return waited;
 }
 
-
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -199,28 +199,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 		write_unlock_irq(&gpc->lock);
 
-		/*
-		 * Invalidation occurs from the invalidate_range_start() hook,
-		 * which could already have happened before __kvm_gpc_refresh()
-		 * (or the previous turn around this loop) took gpc->lock().
-		 * If so, and if the corresponding invalidate_range_end() hook
-		 * hasn't happened yet, hva_to_pfn() could return a mapping
-		 * which is about to be stale and which should not be used. So
-		 * check if there are any currently-running invalidations which
-		 * affect the uHVA of this GPC, and retry if there are. Any
-		 * invalidation which starts after gpc->needs_invalidation is
-		 * set is fine, because it will clear that flag and trigger a
-		 * retry. And any invalidation which *completes* by having its
-		 * invalidate_range_end() hook called immediately prior to this
-		 * check is also fine, because the page tables are guaranteed
-		 * to have been changed already, so hva_to_pfn() won't return
-		 * a stale mapping in that case anyway.
-		 */
-		if (gpc_wait_for_invalidations(gpc)) {
-			write_lock_irq(&gpc->lock);
-			continue;
-		}
-
 		/*
 		 * If the previous iteration "failed" due to an mmu_notifier
 		 * event, release the pfn and unmap the kernel virtual address
@@ -261,6 +239,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+		/*
+		 * Avoid populating a GPC with a user address which is already
+		 * being invalidated, if the invalidate_range_start() notifier
+		 * has already been called. The actual invalidation happens in
+		 * the invalidate_range_end() callback, so wait until there are
+		 * no active invalidations (for the relevant HVA).
+		 */
+		gpc_wait_for_invalidations(gpc);
 
 		write_lock_irq(&gpc->lock);
 
@@ -269,6 +255,13 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * attempting to refresh.
 		 */
 		WARN_ON_ONCE(gpc->valid);
+
+		/*
+		 * Since gfn_to_pfn_cache_invalidate() is called from the
+		 * kvm_mmu_notifier_invalidate_range_end() callback, it can
+		 * invalidate the GPC the moment after hva_to_pfn() returned
+		 * a valid PFN. If that happens, retry.
+		 */
 	} while (!gpc->needs_invalidation);
 
 	gpc->valid = true;

From patchwork Wed Aug 21 20:28:13 2024
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 13772042
From: David Woodhouse
To: Sean Christopherson, Paolo Bonzini, "Hussain, Mushahid"
Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
    Mingwei Zhang, Maxim Levitsky
Subject: [PATCH v3 5/5] DO NOT MERGE: Hack-a-test to verify gpc invalidation+refresh
Date: Wed, 21 Aug 2024 21:28:13 +0100
Message-ID: <20240821202814.711673-5-dwmw2@infradead.org>
In-Reply-To: <20240821202814.711673-1-dwmw2@infradead.org>
References: <20240821202814.711673-1-dwmw2@infradead.org>

From: Sean Christopherson

Add a VM-wide gfn=>pfn cache and a fake MSR to let userspace control the
cache. On writes, reflect the value of the MSR into the backing page of
a gfn=>pfn cache so that userspace can detect if a value was written to
the wrong page, i.e. to a stale mapping.

Spin up 16 vCPUs (arbitrary) to use/refresh the cache, and another
thread to trigger mmu_notifier events and memslot updates.
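For orientation only (not part of the patch): the property the selftest checks
at the end of main() is essentially the predicate below; the helper name is
made up here.

/*
 * Illustrative helper, not in the patch: after the stress run, the backing
 * page for each vCPU must hold either 0 (the write never landed, e.g. the
 * cache was invalidated first) or that vCPU's own GPA value. Anything else
 * means an MSR write landed in a stale mapping.
 */
static inline bool gpc_backing_page_ok(uint64_t page_val, uint64_t vcpu_gpa)
{
	return page_val == 0 || page_val == vcpu_gpa;
}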
Co-authored-by: David Woodhouse
Not-signed-off-by: Sean Christopherson
Not-signed-off-by: David Woodhouse
---
 arch/x86/kvm/x86.c                     |  28 ++++
 include/linux/kvm_host.h               |   2 +
 tools/testing/selftests/kvm/Makefile   |   1 +
 tools/testing/selftests/kvm/gpc_test.c | 215 +++++++++++++++++++++++++
 4 files changed, 246 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/gpc_test.c

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70219e406987..910fef96409a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3767,6 +3767,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		return kvm_xen_write_hypercall_page(vcpu, data);
 
 	switch (msr) {
+	case 0xdeadbeefu: {
+		struct gfn_to_pfn_cache *gpc = &vcpu->kvm->test_cache;
+
+		if (kvm_gpc_activate(gpc, data, 8))
+			break;
+
+		read_lock(&gpc->lock);
+		if (kvm_gpc_check(gpc, 8))
+			*(u64 *)(gpc->khva) = data;
+		read_unlock(&gpc->lock);
+		break;
+	}
+
 	case MSR_AMD64_NB_CFG:
 	case MSR_IA32_UCODE_WRITE:
 	case MSR_VM_HSAVE_PA:
@@ -4206,6 +4219,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	switch (msr_info->index) {
+	case 0xdeadbeefu: {
+		struct gfn_to_pfn_cache *gpc = &vcpu->kvm->test_cache;
+
+		read_lock(&gpc->lock);
+		if (kvm_gpc_check(gpc, 8))
+			msr_info->data = gpc->gpa;
+		else
+			msr_info->data = 0xdeadbeefu;
+		read_unlock(&gpc->lock);
+		return 0;
+	}
+
 	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_LASTBRANCHFROMIP:
@@ -12693,6 +12718,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm_hv_init_vm(kvm);
 	kvm_xen_init_vm(kvm);
 
+	kvm_gpc_init(&kvm->test_cache, kvm);
+
 	return 0;
 
 out_uninit_mmu:
@@ -12840,6 +12867,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
 	kvm_hv_destroy_vm(kvm);
+	kvm_gpc_deactivate(&kvm->test_cache);
 }
 
 static void memslot_rmap_free(struct kvm_memory_slot *slot)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a0739c343da5..d4d155091bfd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -741,6 +741,8 @@ struct kvm {
 
 	struct mutex slots_lock;
 
+	struct gfn_to_pfn_cache test_cache;
+
 	/*
 	 * Protects the arch-specific fields of struct kvm_memory_slots in
 	 * use by the VM.  To be used under the slots_lock (above) or in a
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 48d32c5aa3eb..1c9f601d7f7d 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -65,6 +65,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
 
 # Compiled test targets
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
+TEST_GEN_PROGS_x86_64 += gpc_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
diff --git a/tools/testing/selftests/kvm/gpc_test.c b/tools/testing/selftests/kvm/gpc_test.c
new file mode 100644
index 000000000000..611d2342d7d4
--- /dev/null
+++ b/tools/testing/selftests/kvm/gpc_test.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define NR_VCPUS	16
+
+#define NR_ITERATIONS	1000
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE	0x100000
+#endif
+
+static const uint64_t gpa_base = (4ull * (1 << 30));
+
+static struct kvm_vm *vm;
+
+static pthread_t memory_thread;
+static pthread_t vcpu_threads[NR_VCPUS];
+struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+static bool fight;
+
+static uint64_t per_vcpu_gpa_aligned(int vcpu_id)
+{
+	return gpa_base + (vcpu_id * PAGE_SIZE);
+}
+
+static uint64_t per_vcpu_gpa(int vcpu_id)
+{
+	return per_vcpu_gpa_aligned(vcpu_id) + vcpu_id;
+}
+
+static void guest_code(int vcpu_id)
+{
+	uint64_t this_vcpu_gpa;
+	int i;
+
+	this_vcpu_gpa = per_vcpu_gpa(vcpu_id);
+
+	for (i = 0; i < NR_ITERATIONS; i++)
+		wrmsr(0xdeadbeefu, this_vcpu_gpa);
+	GUEST_SYNC(0);
+}
+
+static void *memory_worker(void *ign)
+{
+	int i, x, r, k;
+	uint64_t *hva;
+	uint64_t gpa;
+	void *mem;
+
+	while (!READ_ONCE(fight))
+		cpu_relax();
+
+	for (k = 0; k < 50; k++) {
+		i = (unsigned int)random() % NR_VCPUS;
+
+		gpa = per_vcpu_gpa_aligned(i);
+		hva = (void *)gpa;
+
+		x = (unsigned int)random() % 5;
+		switch (x) {
+		case 0:
+			r = munmap(hva, PAGE_SIZE);
+			TEST_ASSERT(!r, "Failed to munmap (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+
+			mem = mmap(hva, PAGE_SIZE, PROT_READ | PROT_WRITE,
+				   MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+			TEST_ASSERT(mem != MAP_FAILED && mem == hva,
+				    "Failed to mmap (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+			break;
+		case 1:
+			vm_set_user_memory_region(vm, i + 1, KVM_MEM_LOG_DIRTY_PAGES,
+						  gpa, PAGE_SIZE, hva);
+			vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva);
+			break;
+		case 2:
+			r = mprotect(hva, PAGE_SIZE, PROT_NONE);
+			TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+
+			r = mprotect(hva, PAGE_SIZE, PROT_READ | PROT_WRITE);
+			TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+			break;
+		case 3:
+			r = mprotect(hva, PAGE_SIZE, PROT_READ);
+			TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+
+			r = mprotect(hva, PAGE_SIZE, PROT_READ | PROT_WRITE);
+			TEST_ASSERT(!r, "Failed to mprotect (hva = %lx), errno = %d (%s)",
+				    (unsigned long)hva, errno, strerror(errno));
+			break;
+		case 4:
+			vm_set_user_memory_region(vm, i + 1, 0, gpa, 0, 0);
+			vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE,
+						  (void *)per_vcpu_gpa_aligned(NR_VCPUS));
+			vm_set_user_memory_region(vm, i + 1, 0, gpa, 0, 0);
+			vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva);
+			break;
+		}
+	}
+	return NULL;
+}
+
+static void sync_guest(int vcpu_id)
+{
+	struct ucall uc;
+
+	switch (get_ucall(vcpus[vcpu_id], &uc)) {
+	case UCALL_SYNC:
+		TEST_ASSERT(uc.args[1] == 0,
+			    "Unexpected sync ucall, got %lx", uc.args[1]);
+		break;
+	case UCALL_ABORT:
+		TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx",
+			  (const char *)uc.args[0],
+			  __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+		break;
+	default:
+		TEST_FAIL("Unexpected userspace exit, reason = %s\n",
+			  exit_reason_str(vcpus[vcpu_id]->run->exit_reason));
+		break;
+	}
+}
+
+static void *vcpu_worker(void *data)
+{
+	int vcpu_id = (unsigned long)data;
+
+	vcpu_args_set(vcpus[vcpu_id], 1, vcpu_id);
+
+	while (!READ_ONCE(fight))
+		cpu_relax();
+
+	usleep(10);
+
+	vcpu_run(vcpus[vcpu_id]);
+
+	sync_guest(vcpu_id);
+
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	uint64_t *hva;
+	uint64_t gpa;
+	void *r;
+	int i;
+
+	srandom(time(0));
+
+	vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus);
+
+	pthread_create(&memory_thread, NULL, memory_worker, 0);
+
+	for (i = 0; i < NR_VCPUS; i++) {
+		pthread_create(&vcpu_threads[i], NULL, vcpu_worker, (void *)(unsigned long)i);
+
+		gpa = per_vcpu_gpa_aligned(i);
+		hva = (void *)gpa;
+		r = mmap(hva, PAGE_SIZE, PROT_READ | PROT_WRITE,
+			 MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		TEST_ASSERT(r != MAP_FAILED, "mmap() '%lx' failed, errno = %d (%s)",
+			    gpa, errno, strerror(errno));
+
+		vm_set_user_memory_region(vm, i + 1, 0, gpa, PAGE_SIZE, hva);
+	}
+
+	WRITE_ONCE(fight, true);
+
+	for (i = 0; i < NR_VCPUS; i++)
+		pthread_join(vcpu_threads[i], NULL);
+
+	pthread_join(memory_thread, NULL);
+
+	for (i = 0; i < NR_VCPUS; i++) {
+		gpa = per_vcpu_gpa(i);
+		hva = (void *)gpa;
+
+		TEST_ASSERT(*hva == 0 || *hva == gpa,
+			    "Want '0' or '%lx', got '%lx'\n", gpa, *hva);
+	}
+
+	gpa = vcpu_get_msr(vcpus[0], 0xdeadbeefu);
+	hva = (void *)gpa;
+	if (gpa != 0xdeadbeefu)
+		TEST_ASSERT(*hva == gpa, "Want '%lx', got '%lx'\n", gpa, *hva);
+
+	kvm_vm_free(vm);
+
+	return 0;
+}