diff mbox series

[v2,3/4] KVM: arm64: Factor out pKVM hyp vcpu creation to separate function

Message ID 20250226215520.1759218-4-tabba@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Fix initializing HCRX_EL2 and other traps in pKVM | expand

Commit Message

Fuad Tabba Feb. 26, 2025, 9:55 p.m. UTC
Move the code that creates and initializes the hyp view of a vcpu
in pKVM to its own function. This is meant to make the transition
to initializing every vcpu individually clearer.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/arm64/kvm/pkvm.c | 48 ++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

Comments

Quentin Perret Feb. 28, 2025, 7:44 p.m. UTC | #1
On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
>  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
>  {
> -	size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> +	size_t pgd_sz, hyp_vm_sz;
>  	struct kvm_vcpu *host_vcpu;
> -	pkvm_handle_t handle;
>  	void *pgd, *hyp_vm;
>  	unsigned long idx;
>  	int ret;
> @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
>  	if (ret < 0)
>  		goto free_vm;
>  
> -	handle = ret;
> +	WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);

What's the reason to make this a WRITE_ONCE? Does it mean we should
update the readers to be READ_ONCE()?

> -	host_kvm->arch.pkvm.handle = handle;
> -
> -	/* Donate memory for the vcpus at hyp and initialize it. */
> -	hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
>  	kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
> -		void *hyp_vcpu;
> -
> -		/* Indexing of the vcpus to be sequential starting at 0. */
> -		if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
> -			ret = -EINVAL;
> -			goto destroy_vm;
> -		}
> -
> -		hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
> -		if (!hyp_vcpu) {
> -			ret = -ENOMEM;
> -			goto destroy_vm;
> -		}
> -
> -		ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
> -					hyp_vcpu);
> -		if (ret) {
> -			free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
> +		ret = __pkvm_create_hyp_vcpu(host_vcpu);
> +		if (ret)
>  			goto destroy_vm;
> -		}
>  	}
>  
>  	return 0;
> -- 
> 2.48.1.711.g2feabab25a-goog
>
Fuad Tabba March 3, 2025, 7:57 a.m. UTC | #2
Hi Quentin,

On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
>
> On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> >  {
> > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > +     size_t pgd_sz, hyp_vm_sz;
> >       struct kvm_vcpu *host_vcpu;
> > -     pkvm_handle_t handle;
> >       void *pgd, *hyp_vm;
> >       unsigned long idx;
> >       int ret;
> > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> >       if (ret < 0)
> >               goto free_vm;
> >
> > -     handle = ret;
> > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
>
> What's the reason to make this a WRITE_ONCE? Does it mean we should
> update the readers to be READ_ONCE()?

I don't remember the original reason, to be honest. In this case, it
was to make it consistent with downstream code in Android. That said,
I plan on revising all of these soon and fixing this (and related
code) in light of Will's comment regarding potential specter gadgets:

https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/

Cheers,
/fuad

> > -     host_kvm->arch.pkvm.handle = handle;
> > -
> > -     /* Donate memory for the vcpus at hyp and initialize it. */
> > -     hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
> >       kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
> > -             void *hyp_vcpu;
> > -
> > -             /* Indexing of the vcpus to be sequential starting at 0. */
> > -             if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
> > -                     ret = -EINVAL;
> > -                     goto destroy_vm;
> > -             }
> > -
> > -             hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
> > -             if (!hyp_vcpu) {
> > -                     ret = -ENOMEM;
> > -                     goto destroy_vm;
> > -             }
> > -
> > -             ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
> > -                                     hyp_vcpu);
> > -             if (ret) {
> > -                     free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
> > +             ret = __pkvm_create_hyp_vcpu(host_vcpu);
> > +             if (ret)
> >                       goto destroy_vm;
> > -             }
> >       }
> >
> >       return 0;
> > --
> > 2.48.1.711.g2feabab25a-goog
> >
Will Deacon March 3, 2025, 7:18 p.m. UTC | #3
On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> >
> > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > >  {
> > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > +     size_t pgd_sz, hyp_vm_sz;
> > >       struct kvm_vcpu *host_vcpu;
> > > -     pkvm_handle_t handle;
> > >       void *pgd, *hyp_vm;
> > >       unsigned long idx;
> > >       int ret;
> > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > >       if (ret < 0)
> > >               goto free_vm;
> > >
> > > -     handle = ret;
> > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> >
> > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > update the readers to be READ_ONCE()?
> 
> I don't remember the original reason, to be honest. In this case, it
> was to make it consistent with downstream code in Android. That said,
> I plan on revising all of these soon and fixing this (and related
> code) in light of Will's comment regarding potential specter gadgets:
> 
> https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/

I'm not sure the spectre stuff changes the concurrency aspects here, so
Quentin's question presumably still stands even after that.

Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
in pkvm_is_hyp_created() which is called without the config_lock held
by kvm_arch_prepare_memory_region(). However, given that
pkvm_is_hyp_created() is only testing against 0, I don't think the
_ONCE() accessors are doing anything useful in that case.

The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
directly without the 'config_lock' held. The MMU notifiers look like
they can do that, so I wonder if there's a theoretical race where they
can race with first run and issue TLB invalidation with the wrong handle?
That would apply equally to the upstream code, I think.

Will
Fuad Tabba March 3, 2025, 7:21 p.m. UTC | #4
Hi Will,

On Mon, 3 Mar 2025 at 19:18, Will Deacon <will@kernel.org> wrote:
>
> On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> > On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> > >
> > > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > >  {
> > > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > > +     size_t pgd_sz, hyp_vm_sz;
> > > >       struct kvm_vcpu *host_vcpu;
> > > > -     pkvm_handle_t handle;
> > > >       void *pgd, *hyp_vm;
> > > >       unsigned long idx;
> > > >       int ret;
> > > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > >       if (ret < 0)
> > > >               goto free_vm;
> > > >
> > > > -     handle = ret;
> > > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> > >
> > > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > > update the readers to be READ_ONCE()?
> >
> > I don't remember the original reason, to be honest. In this case, it
> > was to make it consistent with downstream code in Android. That said,
> > I plan on revising all of these soon and fixing this (and related
> > code) in light of Will's comment regarding potential specter gadgets:
> >
> > https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/
>
> I'm not sure the spectre stuff changes the concurrency aspects here, so
> Quentin's question presumably still stands even after that.
>
> Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
> in pkvm_is_hyp_created() which is called without the config_lock held
> by kvm_arch_prepare_memory_region(). However, given that
> pkvm_is_hyp_created() is only testing against 0, I don't think the
> _ONCE() accessors are doing anything useful in that case.
>
> The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
> directly without the 'config_lock' held. The MMU notifiers look like
> they can do that, so I wonder if there's a theoretical race where they
> can race with first run and issue TLB invalidation with the wrong handle?
> That would apply equally to the upstream code, I think.

Would using _ONCE accessors with the MMU notifiers be enough to avoid
the race, or do we need to reconsider the lock protecting the handle
and apply it to the notifiers?

Cheers,
/fuad

> Will
Will Deacon March 3, 2025, 9:49 p.m. UTC | #5
On Mon, Mar 03, 2025 at 07:21:33PM +0000, Fuad Tabba wrote:
> On Mon, 3 Mar 2025 at 19:18, Will Deacon <will@kernel.org> wrote:
> > On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> > > On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> > > > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > > > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > >  {
> > > > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > > > +     size_t pgd_sz, hyp_vm_sz;
> > > > >       struct kvm_vcpu *host_vcpu;
> > > > > -     pkvm_handle_t handle;
> > > > >       void *pgd, *hyp_vm;
> > > > >       unsigned long idx;
> > > > >       int ret;
> > > > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > >       if (ret < 0)
> > > > >               goto free_vm;
> > > > >
> > > > > -     handle = ret;
> > > > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> > > >
> > > > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > > > update the readers to be READ_ONCE()?
> > >
> > > I don't remember the original reason, to be honest. In this case, it
> > > was to make it consistent with downstream code in Android. That said,
> > > I plan on revising all of these soon and fixing this (and related
> > > code) in light of Will's comment regarding potential specter gadgets:
> > >
> > > https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/
> >
> > I'm not sure the spectre stuff changes the concurrency aspects here, so
> > Quentin's question presumably still stands even after that.
> >
> > Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
> > in pkvm_is_hyp_created() which is called without the config_lock held
> > by kvm_arch_prepare_memory_region(). However, given that
> > pkvm_is_hyp_created() is only testing against 0, I don't think the
> > _ONCE() accessors are doing anything useful in that case.
> >
> > The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
> > directly without the 'config_lock' held. The MMU notifiers look like
> > they can do that, so I wonder if there's a theoretical race where they
> > can race with first run and issue TLB invalidation with the wrong handle?
> > That would apply equally to the upstream code, I think.
> 
> Would using _ONCE accessors with the MMU notifiers be enough to avoid
> the race, or do we need to reconsider the lock protecting the handle
> and apply it to the notifiers?

I'm not entirely sure... if we used _ONCE() then we'd get either the
correct handle or zero, but we presumably need to order that against the
page-table somehow. The 'mmu_lock' looks like it gives us that, but I
don't think the notifiers are expecting an uninitialised handle in the
case where the page-table is empty (i.e. if they fire before first run).

Given that the handle is necessary for TLB invalidation, I'd be inclined
to make sure that the handle is allocated and published _before_ the
kvm_pgtable pointer checked by the notifiers is set, but that means
moving the handle allocation into kvm_arch_init_vm(). Is that do-able?

Will
Fuad Tabba March 4, 2025, 12:33 p.m. UTC | #6
Hi Will,

On Mon, 3 Mar 2025 at 21:49, Will Deacon <will@kernel.org> wrote:
>
> On Mon, Mar 03, 2025 at 07:21:33PM +0000, Fuad Tabba wrote:
> > On Mon, 3 Mar 2025 at 19:18, Will Deacon <will@kernel.org> wrote:
> > > On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> > > > On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> > > > > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > > > > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > >  {
> > > > > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > > > > +     size_t pgd_sz, hyp_vm_sz;
> > > > > >       struct kvm_vcpu *host_vcpu;
> > > > > > -     pkvm_handle_t handle;
> > > > > >       void *pgd, *hyp_vm;
> > > > > >       unsigned long idx;
> > > > > >       int ret;
> > > > > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > >       if (ret < 0)
> > > > > >               goto free_vm;
> > > > > >
> > > > > > -     handle = ret;
> > > > > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> > > > >
> > > > > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > > > > update the readers to be READ_ONCE()?
> > > >
> > > > I don't remember the original reason, to be honest. In this case, it
> > > > was to make it consistent with downstream code in Android. That said,
> > > > I plan on revising all of these soon and fixing this (and related
> > > > code) in light of Will's comment regarding potential specter gadgets:
> > > >
> > > > https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/
> > >
> > > I'm not sure the spectre stuff changes the concurrency aspects here, so
> > > Quentin's question presumably still stands even after that.
> > >
> > > Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
> > > in pkvm_is_hyp_created() which is called without the config_lock held
> > > by kvm_arch_prepare_memory_region(). However, given that
> > > pkvm_is_hyp_created() is only testing against 0, I don't think the
> > > _ONCE() accessors are doing anything useful in that case.
> > >
> > > The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
> > > directly without the 'config_lock' held. The MMU notifiers look like
> > > they can do that, so I wonder if there's a theoretical race where they
> > > can race with first run and issue TLB invalidation with the wrong handle?
> > > That would apply equally to the upstream code, I think.
> >
> > Would using _ONCE accessors with the MMU notifiers be enough to avoid
> > the race, or do we need to reconsider the lock protecting the handle
> > and apply it to the notifiers?
>
> I'm not entirely sure... if we used _ONCE() then we'd get either the
> correct handle or zero, but we presumably need to order that against the
> page-table somehow. The 'mmu_lock' looks like it gives us that, but I
> don't think the notifiers are expecting an uninitialised handle in the
> case where the page-table is empty (i.e. if they fire before first run).
>
> Given that the handle is necessary for TLB invalidation, I'd be inclined
> to make sure that the handle is allocated and published _before_ the
> kvm_pgtable pointer checked by the notifiers is set, but that means
> moving the handle allocation into kvm_arch_init_vm(). Is that do-able?

It is doable, but it won't be a simple patch. Up until now, the
creation of the hyp view of a vm is associated with the first run of
the vm, rather than its allocation at the host. Part of the reason for
that is that much of the vm state (mainly the vcpu state) isn't
finalized until then.

This patch series fixes this in part, by decoupling the initialization
of the individual vcpus from the vm. Because of that, it would be
doable, but it might be better as another patch series that we could
test and analyze separately.

If you agree, I could whip up something and send it soon.

Cheers,
/fuad
> Will
Will Deacon March 12, 2025, 3:29 p.m. UTC | #7
On Tue, Mar 04, 2025 at 12:33:27PM +0000, Fuad Tabba wrote:
> On Mon, 3 Mar 2025 at 21:49, Will Deacon <will@kernel.org> wrote:
> >
> > On Mon, Mar 03, 2025 at 07:21:33PM +0000, Fuad Tabba wrote:
> > > On Mon, 3 Mar 2025 at 19:18, Will Deacon <will@kernel.org> wrote:
> > > > On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> > > > > On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> > > > > > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > > > > > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > > >  {
> > > > > > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > > > > > +     size_t pgd_sz, hyp_vm_sz;
> > > > > > >       struct kvm_vcpu *host_vcpu;
> > > > > > > -     pkvm_handle_t handle;
> > > > > > >       void *pgd, *hyp_vm;
> > > > > > >       unsigned long idx;
> > > > > > >       int ret;
> > > > > > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > > >       if (ret < 0)
> > > > > > >               goto free_vm;
> > > > > > >
> > > > > > > -     handle = ret;
> > > > > > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> > > > > >
> > > > > > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > > > > > update the readers to be READ_ONCE()?
> > > > >
> > > > > I don't remember the original reason, to be honest. In this case, it
> > > > > was to make it consistent with downstream code in Android. That said,
> > > > > I plan on revising all of these soon and fixing this (and related
> > > > > code) in light of Will's comment regarding potential specter gadgets:
> > > > >
> > > > > https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/
> > > >
> > > > I'm not sure the spectre stuff changes the concurrency aspects here, so
> > > > Quentin's question presumably still stands even after that.
> > > >
> > > > Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
> > > > in pkvm_is_hyp_created() which is called without the config_lock held
> > > > by kvm_arch_prepare_memory_region(). However, given that
> > > > pkvm_is_hyp_created() is only testing against 0, I don't think the
> > > > _ONCE() accessors are doing anything useful in that case.
> > > >
> > > > The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
> > > > directly without the 'config_lock' held. The MMU notifiers look like
> > > > they can do that, so I wonder if there's a theoretical race where they
> > > > can race with first run and issue TLB invalidation with the wrong handle?
> > > > That would apply equally to the upstream code, I think.
> > >
> > > Would using _ONCE accessors with the MMU notifiers be enough to avoid
> > > the race, or do we need to reconsider the lock protecting the handle
> > > and apply it to the notifiers?
> >
> > I'm not entirely sure... if we used _ONCE() then we'd get either the
> > correct handle or zero, but we presumably need to order that against the
> > page-table somehow. The 'mmu_lock' looks like it gives us that, but I
> > don't think the notifiers are expecting an uninitialised handle in the
> > case where the page-table is empty (i.e. if they fire before first run).
> >
> > Given that the handle is necessary for TLB invalidation, I'd be inclined
> > to make sure that the handle is allocated and published _before_ the
> > kvm_pgtable pointer checked by the notifiers is set, but that means
> > moving the handle allocation into kvm_arch_init_vm(). Is that do-able?
> 
> It is doable, but it won't be a simple patch. Up until now, the
> creation of the hyp view of a vm is associated with the first run of
> the vm, rather than its allocation at the host. Part of the reason for
> that is that much of the vm state (mainly the vcpu state) isn't
> finalized until then.
> 
> This patch series fixes this in part, by decoupling the initialization
> of the individual vcpus from the vm. Because of that, it would be
> doable, but it might be better as another patch series that we could
> test and analyze separately.
> 
> If you agree, I could whip up something and send it soon.

Yeah, I think we should fix this as a separate series and drop the
WRITE_ONCE() from this patch.

Will
Fuad Tabba March 12, 2025, 3:31 p.m. UTC | #8
Hi Will,

On Wed, 12 Mar 2025 at 15:29, Will Deacon <will@kernel.org> wrote:
>
> On Tue, Mar 04, 2025 at 12:33:27PM +0000, Fuad Tabba wrote:
> > On Mon, 3 Mar 2025 at 21:49, Will Deacon <will@kernel.org> wrote:
> > >
> > > On Mon, Mar 03, 2025 at 07:21:33PM +0000, Fuad Tabba wrote:
> > > > On Mon, 3 Mar 2025 at 19:18, Will Deacon <will@kernel.org> wrote:
> > > > > On Mon, Mar 03, 2025 at 07:57:00AM +0000, Fuad Tabba wrote:
> > > > > > On Fri, 28 Feb 2025 at 19:44, Quentin Perret <qperret@google.com> wrote:
> > > > > > > On Wednesday 26 Feb 2025 at 21:55:19 (+0000), Fuad Tabba wrote:
> > > > > > > >  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > > > >  {
> > > > > > > > -     size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
> > > > > > > > +     size_t pgd_sz, hyp_vm_sz;
> > > > > > > >       struct kvm_vcpu *host_vcpu;
> > > > > > > > -     pkvm_handle_t handle;
> > > > > > > >       void *pgd, *hyp_vm;
> > > > > > > >       unsigned long idx;
> > > > > > > >       int ret;
> > > > > > > > @@ -161,33 +178,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
> > > > > > > >       if (ret < 0)
> > > > > > > >               goto free_vm;
> > > > > > > >
> > > > > > > > -     handle = ret;
> > > > > > > > +     WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
> > > > > > >
> > > > > > > What's the reason to make this a WRITE_ONCE? Does it mean we should
> > > > > > > update the readers to be READ_ONCE()?
> > > > > >
> > > > > > I don't remember the original reason, to be honest. In this case, it
> > > > > > was to make it consistent with downstream code in Android. That said,
> > > > > > I plan on revising all of these soon and fixing this (and related
> > > > > > code) in light of Will's comment regarding potential specter gadgets:
> > > > > >
> > > > > > https://lore.kernel.org/all/20250218092705.GA17030@willie-the-truck/
> > > > >
> > > > > I'm not sure the spectre stuff changes the concurrency aspects here, so
> > > > > Quentin's question presumably still stands even after that.
> > > > >
> > > > > Looking at the Android code, this WRITE_ONCE() pairs with a READ_ONCE()
> > > > > in pkvm_is_hyp_created() which is called without the config_lock held
> > > > > by kvm_arch_prepare_memory_region(). However, given that
> > > > > pkvm_is_hyp_created() is only testing against 0, I don't think the
> > > > > _ONCE() accessors are doing anything useful in that case.
> > > > >
> > > > > The more confusing stuff is where 'kvm->arch.pkvm.handle' is read
> > > > > directly without the 'config_lock' held. The MMU notifiers look like
> > > > > they can do that, so I wonder if there's a theoretical race where they
> > > > > can race with first run and issue TLB invalidation with the wrong handle?
> > > > > That would apply equally to the upstream code, I think.
> > > >
> > > > Would using _ONCE accessors with the MMU notifiers be enough to avoid
> > > > the race, or do we need to reconsider the lock protecting the handle
> > > > and apply it to the notifiers?
> > >
> > > I'm not entirely sure... if we used _ONCE() then we'd get either the
> > > correct handle or zero, but we presumably need to order that against the
> > > page-table somehow. The 'mmu_lock' looks like it gives us that, but I
> > > don't think the notifiers are expecting an uninitialised handle in the
> > > case where the page-table is empty (i.e. if they fire before first run).
> > >
> > > Given that the handle is necessary for TLB invalidation, I'd be inclined
> > > to make sure that the handle is allocated and published _before_ the
> > > kvm_pgtable pointer checked by the notifiers is set, but that means
> > > moving the handle allocation into kvm_arch_init_vm(). Is that do-able?
> >
> > It is doable, but it won't be a simple patch. Up until now, the
> > creation of the hyp view of a vm is associated with the first run of
> > the vm, rather than its allocation at the host. Part of the reason for
> > that is that much of the vm state (mainly the vcpu state) isn't
> > finalized until then.
> >
> > This patch series fixes this in part, by decoupling the initialization
> > of the individual vcpus from the vm. Because of that, it would be
> > doable, but it might be better as another patch series that we could
> > test and analyze separately.
> >
> > If you agree, I could whip up something and send it soon.
>
> Yeah, I think we should fix this as a separate series and drop the
> WRITE_ONCE() from this patch.

I'll send a fix soon.

Oliver, would you like me to respin this series, or can you drop the
WRITE_ONCE()?

Thanks,
/fuad

> Will
Will Deacon March 14, 2025, 11:14 a.m. UTC | #9
On Wed, Mar 12, 2025 at 03:31:17PM +0000, Fuad Tabba wrote:
> On Wed, 12 Mar 2025 at 15:29, Will Deacon <will@kernel.org> wrote:
> > On Tue, Mar 04, 2025 at 12:33:27PM +0000, Fuad Tabba wrote:
> > > If you agree, I could whip up something and send it soon.
> >
> > Yeah, I think we should fix this as a separate series and drop the
> > WRITE_ONCE() from this patch.
> 
> I'll send a fix soon.
> 
> Oliver, would you like me to respin this series, or can you drop the
> WRITE_ONCE()?

With the WRITE_ONCE() dropped:

Acked-by: Will Deacon <will@kernel.org>

Cheers,

Will
diff mbox series

Patch

diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 930b677eb9b0..06665cf221bf 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -113,6 +113,24 @@  static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 	free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
 }
 
+static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
+{
+	size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
+	pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
+	void *hyp_vcpu;
+	int ret;
+
+	hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
+	if (!hyp_vcpu)
+		return -ENOMEM;
+
+	ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
+	if (ret)
+		free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+
+	return ret;
+}
+
 /*
  * Allocates and donates memory for hypervisor VM structs at EL2.
  *
@@ -125,9 +143,8 @@  static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
  */
 static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
 {
-	size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
+	size_t pgd_sz, hyp_vm_sz;
 	struct kvm_vcpu *host_vcpu;
-	pkvm_handle_t handle;
 	void *pgd, *hyp_vm;
 	unsigned long idx;
 	int ret;
@@ -161,33 +178,12 @@  static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
 	if (ret < 0)
 		goto free_vm;
 
-	handle = ret;
+	WRITE_ONCE(host_kvm->arch.pkvm.handle, ret);
 
-	host_kvm->arch.pkvm.handle = handle;
-
-	/* Donate memory for the vcpus at hyp and initialize it. */
-	hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
 	kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
-		void *hyp_vcpu;
-
-		/* Indexing of the vcpus to be sequential starting at 0. */
-		if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
-			ret = -EINVAL;
-			goto destroy_vm;
-		}
-
-		hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
-		if (!hyp_vcpu) {
-			ret = -ENOMEM;
-			goto destroy_vm;
-		}
-
-		ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
-					hyp_vcpu);
-		if (ret) {
-			free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
+		ret = __pkvm_create_hyp_vcpu(host_vcpu);
+		if (ret)
 			goto destroy_vm;
-		}
 	}
 
 	return 0;