
[v2,8/8] KVM: x86: hyper-v: Deactivate APICv only when AutoEOI feature is in use

Message ID 20210713142023.106183-9-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series My AVIC patch queue

Commit Message

Maxim Levitsky July 13, 2021, 2:20 p.m. UTC
From: Vitaly Kuznetsov <vkuznets@redhat.com>

APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
however, possible to track whether the feature was actually used by the
guest and only inhibit APICv/AVIC when needed.

TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
Windows know that AutoEOI feature should be avoided. While it's up to
KVM userspace to set the flag, KVM can help a bit by exposing global
APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
is still preferred.

Maxim:
   - added SRCU lock drop around call to kvm_request_apicv_update
   - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
     since this feature can be used regardless of AVIC

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

Comments

Maxim Levitsky July 18, 2021, 12:13 p.m. UTC | #1
On Tue, 2021-07-13 at 17:20 +0300, Maxim Levitsky wrote:
> From: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
> SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
> however, possible to track whether the feature was actually used by the
> guest and only inhibit APICv/AVIC when needed.
> 
> TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
> Windows know that AutoEOI feature should be avoided. While it's up to
> KVM userspace to set the flag, KVM can help a bit by exposing global
> APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
> is still preferred.

> 
> Maxim:
>    - added SRCU lock drop around call to kvm_request_apicv_update
>    - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
>      since this feature can be used regardless of AVIC
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 +++
>  arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
>  2 files changed, 31 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e11d64aa0bcd..f900dca58af8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -956,6 +956,9 @@ struct kvm_hv {
>  	/* How many vCPUs have VP index != vCPU index */
>  	atomic_t num_mismatched_vp_indexes;
>  
> +	/* How many SynICs use 'AutoEOI' feature */
> +	atomic_t synic_auto_eoi_used;
> +
>  	struct hv_partition_assist_pg *hv_pa_pg;
>  	struct kvm_hv_syndbg hv_syndbg;
>  };
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index b07592ca92f0..6bf47a583d0e 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -85,9 +85,22 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
>  	return false;
>  }
>  
> +
> +static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
> +{
> +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> +	kvm_request_apicv_update(vcpu->kvm, activate,
> +			APICV_INHIBIT_REASON_HYPERV);
> +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +}

Well, turns out that this patch still doesn't work (this
weekend I found out that all my AVIC-enabled VMs hang on reboot).

I finally found out what prompted me back then to make srcu lock drop
in synic_update_vector conditional on whether the write was done
by the host.
 
Turns out that while KVM_SET_MSRS does take the kvm->srcu lock,
it stores the returned srcu index in a local variable and not
in vcpu->srcu_idx, thus the lock drop in synic_toggle_avic
doesn't work.
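
For reference, that ioctl path looks roughly like this (paraphrased from
kvm_arch_vcpu_ioctl() in arch/x86/kvm/x86.c, not the verbatim code):

	case KVM_SET_MSRS: {
		/* The SRCU index lands in a local variable... */
		int idx = srcu_read_lock(&vcpu->kvm->srcu);

		/* ...while msr_io() can reach synic_update_vector(), */
		r = msr_io(vcpu, argp, do_set_msr, 0);
		/* ...so only 'idx' is valid here, not vcpu->srcu_idx. */
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
		break;
	}

so the srcu_read_unlock(..., vcpu->srcu_idx) in synic_toggle_avic()
operates on a stale index.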
 
So it is likely that I had seen it not work and blamed
KVM_SET_MSRS for not taking the srcu lock, which was a wrong assumption.
 
I am more inclined to fix this by just tracking if we hold the srcu
lock on each VCPU manually, just as we track the srcu index anyway,
and then kvm_request_apicv_update can use this to drop the srcu
lock when needed.
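
Something along these lines; the 'srcu_locked' field and the helper names
below are illustrative only, nothing like this exists in the tree yet:

	/* New field in struct kvm_vcpu, next to the existing 'srcu_idx'. */
	bool srcu_locked;

	static void vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
	{
		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		vcpu->srcu_locked = true;
	}

	static void vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu)
	{
		vcpu->srcu_locked = false;
		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
	}

	/* kvm_request_apicv_update() would then drop the lock only when held: */
	if (vcpu->srcu_locked)
		vcpu_srcu_read_unlock(vcpu);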


> +
>  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>  				int vector)
>  {
> +	struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
> +	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
> +	int auto_eoi_old, auto_eoi_new;
> +
>  	if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
>  		return;
>  
> @@ -96,10 +109,23 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>  	else
>  		__clear_bit(vector, synic->vec_bitmap);
>  
> +	auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
> +
>  	if (synic_has_vector_auto_eoi(synic, vector))
>  		__set_bit(vector, synic->auto_eoi_bitmap);
>  	else
>  		__clear_bit(vector, synic->auto_eoi_bitmap);
> +
> +	auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
> +
> +	/* Hyper-V SynIC auto EOI SINTs are not compatible with APICV */
> +	if (!auto_eoi_old && auto_eoi_new) {
> +		if (atomic_inc_return(&hv->synic_auto_eoi_used) == 1)
> +			synic_toggle_avic(vcpu, false);
> +	} else if (!auto_eoi_new && auto_eoi_old) {
> +		if (atomic_dec_return(&hv->synic_auto_eoi_used) == 0)
> +			synic_toggle_avic(vcpu, true);
> +	}
>  }
>  
>  static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
> @@ -933,12 +959,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
>  
>  	synic = to_hv_synic(vcpu);
>  
> -	/*
> -	 * Hyper-V SynIC auto EOI SINT's are
> -	 * not compatible with APICV, so request
> -	 * to deactivate APICV permanently.
> -	 */
> -	kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
>  	synic->active = true;
>  	synic->dont_zero_synic_pages = dont_zero_synic_pages;
>  	synic->control = HV_SYNIC_CONTROL_ENABLE;
> @@ -2466,6 +2486,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>  				ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
>  			if (!cpu_smt_possible())
>  				ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
> +
> +			ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;

Vitaly, I would like to hear your opinion on this change I also made to your code.
I think that we should always expose HV_DEPRECATING_AEOI_RECOMMENDED as a supported
HV cpuid bit regardless of AVIC, so that qemu could set it regardless of AVIC
in the kernel, even if this is not optimal.


Best regards,
	Maxim Levitsky

>  			/*
>  			 * Default number of spinlock retry attempts, matches
>  			 * HyperV 2016.
Vitaly Kuznetsov July 19, 2021, 7:47 a.m. UTC | #2
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Tue, 2021-07-13 at 17:20 +0300, Maxim Levitsky wrote:
>> From: Vitaly Kuznetsov <vkuznets@redhat.com>
>> 
>> APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
>> SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
>> however, possible to track whether the feature was actually used by the
>> guest and only inhibit APICv/AVIC when needed.
>> 
>> TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
>> Windows know that AutoEOI feature should be avoided. While it's up to
>> KVM userspace to set the flag, KVM can help a bit by exposing global
>> APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
>> is still preferred.
>
>> 
>> Maxim:
>>    - added SRCU lock drop around call to kvm_request_apicv_update
>>    - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
>>      since this feature can be used regardless of AVIC
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |  3 +++
>>  arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
>>  2 files changed, 31 insertions(+), 6 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index e11d64aa0bcd..f900dca58af8 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -956,6 +956,9 @@ struct kvm_hv {
>>  	/* How many vCPUs have VP index != vCPU index */
>>  	atomic_t num_mismatched_vp_indexes;
>>  
>> +	/* How many SynICs use 'AutoEOI' feature */
>> +	atomic_t synic_auto_eoi_used;
>> +
>>  	struct hv_partition_assist_pg *hv_pa_pg;
>>  	struct kvm_hv_syndbg hv_syndbg;
>>  };
>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> index b07592ca92f0..6bf47a583d0e 100644
>> --- a/arch/x86/kvm/hyperv.c
>> +++ b/arch/x86/kvm/hyperv.c
>> @@ -85,9 +85,22 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
>>  	return false;
>>  }
>>  
>> +
>> +static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
>> +{
>> +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
>> +	kvm_request_apicv_update(vcpu->kvm, activate,
>> +			APICV_INHIBIT_REASON_HYPERV);
>> +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
>> +}
>
> Well, turns out that this patch still doesn't work (this
> weekend I found out that all my AVIC-enabled VMs hang on reboot).
>
> I finally found out what prompted me back then to make srcu lock drop
> in synic_update_vector conditional on whether the write was done
> by the host.
>  
> Turns out that while KVM_SET_MSRS does take the kvm->srcu lock,
> it stores the returned srcu index in a local variable and not
> in vcpu->srcu_idx, thus the lock drop in synic_toggle_avic
> doesn't work.
>  
> So it is likely that I had seen it not work and blamed
> KVM_SET_MSRS for not taking the srcu lock, which was a wrong assumption.
>  
> I am more inclined to fix this by just tracking if we hold the srcu
> lock on each VCPU manually, just as we track the srcu index anyway,
> and then kvm_request_apicv_update can use this to drop the srcu
> lock when needed.
>

Would it be possible to use some magic value in 'vcpu->srcu_idx' and not
introduce a new 'srcu_is_locked' flag?

>
>> +
>>  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>>  				int vector)
>>  {
>> +	struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
>> +	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
>> +	int auto_eoi_old, auto_eoi_new;
>> +
>>  	if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
>>  		return;
>>  
>> @@ -96,10 +109,23 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>>  	else
>>  		__clear_bit(vector, synic->vec_bitmap);
>>  
>> +	auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
>> +
>>  	if (synic_has_vector_auto_eoi(synic, vector))
>>  		__set_bit(vector, synic->auto_eoi_bitmap);
>>  	else
>>  		__clear_bit(vector, synic->auto_eoi_bitmap);
>> +
>> +	auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
>> +
>> +	/* Hyper-V SynIC auto EOI SINTs are not compatible with APICV */
>> +	if (!auto_eoi_old && auto_eoi_new) {
>> +		if (atomic_inc_return(&hv->synic_auto_eoi_used) == 1)
>> +			synic_toggle_avic(vcpu, false);
>> +	} else if (!auto_eoi_new && auto_eoi_old) {
>> +		if (atomic_dec_return(&hv->synic_auto_eoi_used) == 0)
>> +			synic_toggle_avic(vcpu, true);
>> +	}
>>  }
>>  
>>  static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
>> @@ -933,12 +959,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
>>  
>>  	synic = to_hv_synic(vcpu);
>>  
>> -	/*
>> -	 * Hyper-V SynIC auto EOI SINT's are
>> -	 * not compatible with APICV, so request
>> -	 * to deactivate APICV permanently.
>> -	 */
>> -	kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
>>  	synic->active = true;
>>  	synic->dont_zero_synic_pages = dont_zero_synic_pages;
>>  	synic->control = HV_SYNIC_CONTROL_ENABLE;
>> @@ -2466,6 +2486,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>>  				ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
>>  			if (!cpu_smt_possible())
>>  				ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
>> +
>> +			ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
>
> Vitaly, I would like to hear your opinion on this change I also made to your code.
> I think that we should always expose HV_DEPRECATING_AEOI_RECOMMENDED as a supported
> HV cpuid bit regardless of AVIC, so that qemu could set it regardless of AVIC
> in the kernel, even if this is not optimal.

Generally, I'm OK with the change. The meaning of the bit changes
slightly depending on whether we expose it conditionally or not.

If HV_DEPRECATING_AEOI_RECOMMENDED is always exposed it just means that
the patch in question is in, nothing else. VMM has to figure out if
APICv/AVIC is supported by some other means.

If HV_DEPRECATING_AEOI_RECOMMENDED is exposed conditionally, then VMM
can be stupid and solely rely on this information by just copying the
bit to user visible CPUIDs (e.g. QEMU's 'hv-passthrough' mode).
Maxim Levitsky July 19, 2021, 9 a.m. UTC | #3
On Mon, 2021-07-19 at 09:47 +0200, Vitaly Kuznetsov wrote:
> Maxim Levitsky <mlevitsk@redhat.com> writes:
> 
> > On Tue, 2021-07-13 at 17:20 +0300, Maxim Levitsky wrote:
> > > From: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > 
> > > APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
> > > SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
> > > however, possible to track whether the feature was actually used by the
> > > guest and only inhibit APICv/AVIC when needed.
> > > 
> > > TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
> > > Windows know that AutoEOI feature should be avoided. While it's up to
> > > KVM userspace to set the flag, KVM can help a bit by exposing global
> > > APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
> > > is still preferred.
> > > Maxim:
> > >    - added SRCU lock drop around call to kvm_request_apicv_update
> > >    - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
> > >      since this feature can be used regardless of AVIC
> > > 
> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > ---
> > >  arch/x86/include/asm/kvm_host.h |  3 +++
> > >  arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
> > >  2 files changed, 31 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index e11d64aa0bcd..f900dca58af8 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -956,6 +956,9 @@ struct kvm_hv {
> > >  	/* How many vCPUs have VP index != vCPU index */
> > >  	atomic_t num_mismatched_vp_indexes;
> > >  
> > > +	/* How many SynICs use 'AutoEOI' feature */
> > > +	atomic_t synic_auto_eoi_used;
> > > +
> > >  	struct hv_partition_assist_pg *hv_pa_pg;
> > >  	struct kvm_hv_syndbg hv_syndbg;
> > >  };
> > > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> > > index b07592ca92f0..6bf47a583d0e 100644
> > > --- a/arch/x86/kvm/hyperv.c
> > > +++ b/arch/x86/kvm/hyperv.c
> > > @@ -85,9 +85,22 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
> > >  	return false;
> > >  }
> > >  
> > > +
> > > +static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
> > > +{
> > > +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > +	kvm_request_apicv_update(vcpu->kvm, activate,
> > > +			APICV_INHIBIT_REASON_HYPERV);
> > > +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > > +}
> > 
> > Well, turns out that this patch still doesn't work (this
> > weekend I found out that all my AVIC-enabled VMs hang on reboot).
> > 
> > I finally found out what prompted me back then to make srcu lock drop
> > in synic_update_vector conditional on whether the write was done
> > by the host.
> >  
> > Turns out that while KVM_SET_MSRS does take the kvm->srcu lock,
> > it stores the returned srcu index in a local variable and not
> > in vcpu->srcu_idx, thus the lock drop in synic_toggle_avic
> > doesn't work.
> >  
> > So it is likely that I had seen it not work and blamed
> > KVM_SET_MSRS for not taking the srcu lock, which was a wrong assumption.
> >  
> > I am more inclined to fix this by just tracking if we hold the srcu
> > lock on each VCPU manually, just as we track the srcu index anyway,
> > and then kvm_request_apicv_update can use this to drop the srcu
> > lock when needed.
> > 
> 
> Would it be possible to use some magic value in 'vcpu->srcu_idx' and not
> introduce a new 'srcu_is_locked' flag?

Well, currently the returned index value from srcu_read_lock is opaque 
(and we have two SRCU implementations and both I think return small positive numbers, 
but I haven't studied them in depth).
 
We can ask the people that maintain SRCU to reserve a number (like -1)
or so.
I'll probably first add the 'srcu_is_locked' flag though, and then as a follow-up patch
remove it if they agree.
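
Roughly, with KVM_SRCU_IDX_NONE being a made-up name for such a reserved value:

	#define KVM_SRCU_IDX_NONE	(-1)

	/* Drop the lock only when an index is actually recorded: */
	if (vcpu->srcu_idx != KVM_SRCU_IDX_NONE) {
		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
		vcpu->srcu_idx = KVM_SRCU_IDX_NONE;
	}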

> 
> > > +
> > >  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
> > >  				int vector)
> > >  {
> > > +	struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
> > > +	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
> > > +	int auto_eoi_old, auto_eoi_new;
> > > +
> > >  	if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> > >  		return;
> > >  
> > > @@ -96,10 +109,23 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
> > >  	else
> > >  		__clear_bit(vector, synic->vec_bitmap);
> > >  
> > > +	auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
> > > +
> > >  	if (synic_has_vector_auto_eoi(synic, vector))
> > >  		__set_bit(vector, synic->auto_eoi_bitmap);
> > >  	else
> > >  		__clear_bit(vector, synic->auto_eoi_bitmap);
> > > +
> > > +	auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
> > > +
> > > +	/* Hyper-V SynIC auto EOI SINTs are not compatible with APICV */
> > > +	if (!auto_eoi_old && auto_eoi_new) {
> > > +		if (atomic_inc_return(&hv->synic_auto_eoi_used) == 1)
> > > +			synic_toggle_avic(vcpu, false);
> > > +	} else if (!auto_eoi_new && auto_eoi_old) {
> > > +		if (atomic_dec_return(&hv->synic_auto_eoi_used) == 0)
> > > +			synic_toggle_avic(vcpu, true);
> > > +	}
> > >  }
> > >  
> > >  static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
> > > @@ -933,12 +959,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
> > >  
> > >  	synic = to_hv_synic(vcpu);
> > >  
> > > -	/*
> > > -	 * Hyper-V SynIC auto EOI SINT's are
> > > -	 * not compatible with APICV, so request
> > > -	 * to deactivate APICV permanently.
> > > -	 */
> > > -	kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
> > >  	synic->active = true;
> > >  	synic->dont_zero_synic_pages = dont_zero_synic_pages;
> > >  	synic->control = HV_SYNIC_CONTROL_ENABLE;
> > > @@ -2466,6 +2486,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
> > >  				ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
> > >  			if (!cpu_smt_possible())
> > >  				ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
> > > +
> > > +			ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
> > 
> > Vitaly, I would like to hear your opinion on this change I also made to your code.
> > I think that we should always expose HV_DEPRECATING_AEOI_RECOMMENDED as a supported
> > HV cpuid bit regardless of AVIC, so that qemu could set it regardless of AVIC
> > in the kernel, even if this is not optimal.
> 
> Generally, I'm OK with the change. The meaning of the bit changes
> slightly depending on whether we expose it conditionally or not.
> 
> If HV_DEPRECATING_AEOI_RECOMMENDED is always exposed it just means that
> the patch in question is in, nothing else. VMM has to figure out if
> APICv/AVIC is supported by some other means.
> 
> If HV_DEPRECATING_AEOI_RECOMMENDED is exposed conditionally, then VMM
> can be stupid and solely rely on this information by just copying the
> bit to user visible CPUIDs (e.g. QEMU's 'hv-passthrough' mode).

I understand what you mean.
In regard to AVIC, though, 'hv-passthrough' doesn't work at all
anyway, since it also makes qemu enable the hv-vapic bit, which
makes the guest not use the APIC....

So it seems that not qemu but a management layer above it should
figure out that it wants AVIC, and then enable all the bits that 'should'
make it work; even then, AVIC might still not work for some reason.

Currently these are the flags that are needed to make AVIC work on Windows:

 -cpu -svm,-x2apic,-hv-vapic,hv-aeoi-deprecation (I added this locally)
 -global kvm-pit.lost_tick_policy=discard

Thanks,
Best regards,
	Maxim Levitsky
 

>
Vitaly Kuznetsov July 19, 2021, 9:23 a.m. UTC | #4
Maxim Levitsky <mlevitsk@redhat.com> writes:

> On Mon, 2021-07-19 at 09:47 +0200, Vitaly Kuznetsov wrote:
>> Maxim Levitsky <mlevitsk@redhat.com> writes:
>> 
>> > On Tue, 2021-07-13 at 17:20 +0300, Maxim Levitsky wrote:
>> > > From: Vitaly Kuznetsov <vkuznets@redhat.com>
>> > > 
>> > > APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
>> > > SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
>> > > however, possible to track whether the feature was actually used by the
>> > > guest and only inhibit APICv/AVIC when needed.
>> > > 
>> > > TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
>> > > Windows know that AutoEOI feature should be avoided. While it's up to
>> > > KVM userspace to set the flag, KVM can help a bit by exposing global
>> > > APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
>> > > is still preferred.
>> > > Maxim:
>> > >    - added SRCU lock drop around call to kvm_request_apicv_update
>> > >    - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
>> > >      since this feature can be used regardless of AVIC
>> > > 
>> > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
>> > > ---
>> > >  arch/x86/include/asm/kvm_host.h |  3 +++
>> > >  arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
>> > >  2 files changed, 31 insertions(+), 6 deletions(-)
>> > > 
>> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> > > index e11d64aa0bcd..f900dca58af8 100644
>> > > --- a/arch/x86/include/asm/kvm_host.h
>> > > +++ b/arch/x86/include/asm/kvm_host.h
>> > > @@ -956,6 +956,9 @@ struct kvm_hv {
>> > >  	/* How many vCPUs have VP index != vCPU index */
>> > >  	atomic_t num_mismatched_vp_indexes;
>> > >  
>> > > +	/* How many SynICs use 'AutoEOI' feature */
>> > > +	atomic_t synic_auto_eoi_used;
>> > > +
>> > >  	struct hv_partition_assist_pg *hv_pa_pg;
>> > >  	struct kvm_hv_syndbg hv_syndbg;
>> > >  };
>> > > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> > > index b07592ca92f0..6bf47a583d0e 100644
>> > > --- a/arch/x86/kvm/hyperv.c
>> > > +++ b/arch/x86/kvm/hyperv.c
>> > > @@ -85,9 +85,22 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
>> > >  	return false;
>> > >  }
>> > >  
>> > > +
>> > > +static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
>> > > +{
>> > > +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
>> > > +	kvm_request_apicv_update(vcpu->kvm, activate,
>> > > +			APICV_INHIBIT_REASON_HYPERV);
>> > > +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
>> > > +}
>> > 
>> > Well, turns out that this patch still doesn't work (this
>> > weekend I found out that all my AVIC-enabled VMs hang on reboot).
>> > 
>> > I finally found out what prompted me back then to make srcu lock drop
>> > in synic_update_vector conditional on whether the write was done
>> > by the host.
>> >  
>> > Turns out that while KVM_SET_MSRS does take the kvm->srcu lock,
>> > it stores the returned srcu index in a local variable and not
>> > in vcpu->srcu_idx, thus the lock drop in synic_toggle_avic
>> > doesn't work.
>> >  
>> > So it is likely that I had seen it not work and blamed
>> > KVM_SET_MSRS for not taking the srcu lock, which was a wrong assumption.
>> >  
>> > I am more inclined to fix this by just tracking if we hold the srcu
>> > lock on each VCPU manually, just as we track the srcu index anyway,
>> > and then kvm_request_apicv_update can use this to drop the srcu
>> > lock when needed.
>> > 
>> 
>> Would it be possible to use some magic value in 'vcpu->srcu_idx' and not
>> introduce a new 'srcu_is_locked' flag?
>
> Well, currently the returned index value from srcu_read_lock is opaque 
> (and we have two SRCU implementations and both I think return small positive numbers, 
> but I haven't studied them in depth).
>  
> We can ask the people that maintain SRCU to reserve a number (like -1)
> or so.
> I'll probably first add the 'srcu_is_locked' flag though, and then as a follow-up patch
> remove it if they agree.
>

Ah, OK. BTW, I've just discovered srcu_read_lock_held() which sounds
like the function we need but unfortunately it is not.
Maxim Levitsky July 19, 2021, 9:58 a.m. UTC | #5
On Mon, 2021-07-19 at 11:23 +0200, Vitaly Kuznetsov wrote:
> Maxim Levitsky <mlevitsk@redhat.com> writes:
> 
> > On Mon, 2021-07-19 at 09:47 +0200, Vitaly Kuznetsov wrote:
> > > Maxim Levitsky <mlevitsk@redhat.com> writes:
> > > 
> > > > On Tue, 2021-07-13 at 17:20 +0300, Maxim Levitsky wrote:
> > > > > From: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > > > 
> > > > > APICV_INHIBIT_REASON_HYPERV is currently unconditionally forced upon
> > > > > SynIC activation as SynIC's AutoEOI is incompatible with APICv/AVIC. It is,
> > > > > however, possible to track whether the feature was actually used by the
> > > > > guest and only inhibit APICv/AVIC when needed.
> > > > > 
> > > > > TLFS suggests a dedicated 'HV_DEPRECATING_AEOI_RECOMMENDED' flag to let
> > > > > Windows know that AutoEOI feature should be avoided. While it's up to
> > > > > KVM userspace to set the flag, KVM can help a bit by exposing global
> > > > > APICv/AVIC enablement: in case APICv/AVIC usage is impossible, AutoEOI
> > > > > is still preferred.
> > > > > Maxim:
> > > > >    - added SRCU lock drop around call to kvm_request_apicv_update
> > > > >    - always set HV_DEPRECATING_AEOI_RECOMMENDED in kvm_get_hv_cpuid,
> > > > >      since this feature can be used regardless of AVIC
> > > > > 
> > > > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > > > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > > > ---
> > > > >  arch/x86/include/asm/kvm_host.h |  3 +++
> > > > >  arch/x86/kvm/hyperv.c           | 34 +++++++++++++++++++++++++++------
> > > > >  2 files changed, 31 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > > index e11d64aa0bcd..f900dca58af8 100644
> > > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > > @@ -956,6 +956,9 @@ struct kvm_hv {
> > > > >  	/* How many vCPUs have VP index != vCPU index */
> > > > >  	atomic_t num_mismatched_vp_indexes;
> > > > >  
> > > > > +	/* How many SynICs use 'AutoEOI' feature */
> > > > > +	atomic_t synic_auto_eoi_used;
> > > > > +
> > > > >  	struct hv_partition_assist_pg *hv_pa_pg;
> > > > >  	struct kvm_hv_syndbg hv_syndbg;
> > > > >  };
> > > > > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> > > > > index b07592ca92f0..6bf47a583d0e 100644
> > > > > --- a/arch/x86/kvm/hyperv.c
> > > > > +++ b/arch/x86/kvm/hyperv.c
> > > > > @@ -85,9 +85,22 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
> > > > >  	return false;
> > > > >  }
> > > > >  
> > > > > +
> > > > > +static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
> > > > > +{
> > > > > +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > > > +	kvm_request_apicv_update(vcpu->kvm, activate,
> > > > > +			APICV_INHIBIT_REASON_HYPERV);
> > > > > +	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > > > > +}
> > > > 
> > > > Well, turns out that this patch still doesn't work (this
> > > > weekend I found out that all my AVIC-enabled VMs hang on reboot).
> > > > 
> > > > I finally found out what prompted me back then to make srcu lock drop
> > > > in synic_update_vector conditional on whether the write was done
> > > > by the host.
> > > >  
> > > > Turns out that while KVM_SET_MSRS does take the kvm->srcu lock,
> > > > it stores the returned srcu index in a local variable and not
> > > > in vcpu->srcu_idx, thus the lock drop in synic_toggle_avic
> > > > doesn't work.
> > > >  
> > > > So it is likely that I had seen it not work and blamed
> > > > KVM_SET_MSRS for not taking the srcu lock, which was a wrong assumption.
> > > >  
> > > > I am more inclined to fix this by just tracking if we hold the srcu
> > > > lock on each VCPU manually, just as we track the srcu index anyway,
> > > > and then kvm_request_apicv_update can use this to drop the srcu
> > > > lock when needed.
> > > > 
> > > 
> > > Would it be possible to use some magic value in 'vcpu->srcu_idx' and not
> > > introduce a new 'srcu_is_locked' flag?
> > 
> > Well, currently the returned index value from srcu_read_lock is opaque 
> > (and we have two SRCU implementations and both I think return small positive numbers, 
> > but I haven't studied them in depth).
> >  
> > We can ask the people that maintain SRCU to reserve a number (like -1)
> > or so.
> > I'll probably first add the 'srcu_is_locked' flag though, and then as a follow-up patch
> > remove it if they agree.
> > 
> 
> Ah, OK. BTW, I've just discovered srcu_read_lock_held() which sounds
> like the function we need but unfortunately it is not.

Yea, exactly this. :-(

Best regards,
	Maxim Levitsky

>
Sean Christopherson July 19, 2021, 6:49 p.m. UTC | #6
On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> I am more inclined to fix this by just tracking if we hold the srcu
> lock on each VCPU manually, just as we track the srcu index anyway,
> and then kvm_request_apicv_update can use this to drop the srcu
> lock when needed.

The entire approach of dynamically adding/removing the memslot seems doomed to
failure, and is likely responsible for the performance issues with AVIC, e.g. a
single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
and again on re-enable.

Rather than pile on more gunk, what about special casing the APIC access page
memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
also give us a good excuse to rename try_async_pf() :-)

If lack of MMIO caching is a performance problem, an alternative solution would
be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
clear their cache.
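
E.g. something like this completely untested sketch, where
KVM_REQ_CLEAR_MMIO_CACHE would be a new request:

	static void kvm_zap_apic_access_mmio(struct kvm *kvm)
	{
		gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;

		/* Drop any MMIO SPTE covering the APIC base... */
		kvm_zap_gfn_range(kvm, gfn, gfn + 1);
		/* ...and tell every vCPU to drop its cached MMIO info. */
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLEAR_MMIO_CACHE);
	}

with each vCPU then handling the request in vcpu_enter_guest():

	if (kvm_check_request(KVM_REQ_CLEAR_MMIO_CACHE, vcpu))
		vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);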

It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
less awful than the current memslot+SRCU mess.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f4d35289f59e..ea434d76cfb0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3767,9 +3767,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }

-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
-                        bool write, bool *writable)
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+                           gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+                           bool write, bool *writable, int *r)
 {
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
@@ -3780,13 +3780,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
         * be zapped before KVM inserts a new MMIO SPTE for the gfn.
         */
        if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
-               return true;
+               goto out_retry;

-       /* Don't expose private memslots to L2. */
-       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
-               *pfn = KVM_PFN_NOSLOT;
-               *writable = false;
-               return false;
+       if (!kvm_is_visible_memslot(slot)) {
+               /* Don't expose private memslots to L2. */
+               if (is_guest_mode(vcpu)) {
+                       *pfn = KVM_PFN_NOSLOT;
+                       *writable = false;
+                       return false;
+               }
+               /*
+                * If the APIC access page exists but is disabled, go directly
+                * to emulation without caching the MMIO access or creating a
+                * MMIO SPTE.  That way the cache doesn't need to be purged
+                * when the AVIC is re-enabled.
+                */
+               if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+                   !vcpu->kvm->arch.apic_access_memslot_enabled) {
+                       *r = RET_PF_EMULATE;
+                       return true;
+               }
        }

        async = false;
@@ -3800,14 +3813,19 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
                        trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-                       return true;
-               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
-                       return true;
+                       goto out_retry;
+               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) {
+                       goto out_retry;
+               }
        }

        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
                                    write, writable, hva);
        return false;
+
+out_retry:
+       *r = RET_PF_RETRY;
+       return true;
 }

 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -3839,9 +3857,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();

-       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
-                        write, &map_writable))
-               return RET_PF_RETRY;
+       if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, write,
+                           &map_writable, &r))
+               return r;

        if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
                return r;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 490a028ddabe..9747124b877d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
-                        write_fault, &map_writable))
-               return RET_PF_RETRY;
+       if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+                           write_fault, &map_writable, &r))
+               return r;
 
        if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
                return r;
Maxim Levitsky July 20, 2021, 9:40 a.m. UTC | #7
On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > I am more inclined to fix this by just tracking if we hold the srcu
> > lock on each VCPU manually, just as we track the srcu index anyway,
> > and then kvm_request_apicv_update can use this to drop the srcu
> > lock when needed.
> 
> The entire approach of dynamically adding/removing the memslot seems doomed to
> failure, and is likely responsible for the performance issues with AVIC, e.g. a
> single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> and again on re-enable.
100% agree, especially about the fact that this approach is doomed to fail.
 
There are two reasons why I didn't explore the direction you propose:
 
(when I talked about this on IRC with Paolo, I did suggest trying to explore 
this direction once)
 
One is that practically speaking AVIC inhibition is not something that happens often,
and in all of my tests it didn't happen after the VM had finished booting.
 
There are the following cases when AVIC is inhibited:
 
1.Configurations that are not possible to support
 
  (PIT reinject mode, x2apic mode, autoeoi mode of Hyper-V SYNIC)
 
  With the theoretical exception of the SYNIC autoeoi mode, this is something that happens
  only once, when qemu configures the VM.
 
  In all of my tests the SYNIC autoeoi mode was also enabled once on each vCPU and 
  stayed that way.
 
 
2.Configurations that are not yet supported (nesting)
  Something that will go away eventually, and that also fires only once,
  at least currently.
 
3.Dynamically, when the guest dares to use the local APIC's virtual wire mode to
  send PIC-connected interrupts via the LAPIC. Since this can't be done using AVIC
  (in this mode the interrupt must not affect local APIC registers such as IRR and ISR),
  a normal EVENTINJ injection has to be done.
  This sometimes requires detecting the start of the interrupt window, which is not
  possible to do when AVIC is enabled, because AVIC makes the CPU ignore the so-called
  virtual interrupts which we inject and intercept to detect it.

  This case only happens on Windows and in OVMF (both are silly enough to use this mode),
  and thankfully Windows only uses this mode during boot.

Thus even a gross performance issue isn't an issue yet, but it would
indeed be nice to eliminate it if we ever need to deal with a guest which
somehow ends up enabling and disabling AVIC many times per second.

> 
> Rather than pile on more gunk, what about special casing the APIC access page
> memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> also give us a good excuse to rename try_async_pf() :-)

I understand what you mean about renaming try_async_pf :-)
 
As for the other reason, it is simple: fear ;-)
While I do know KVM's MMU an order of magnitude better than I did a year ago,
I still don't have the confidence needed to add hacks to it.

I do agree that a special AVIC hack in the MMU, as you propose, is miles better than
the dynamic disable hack of the AVIC memslot.

> 
> If lack of MMIO caching is a performance problem, an alternative solution would
> be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> clear their cache.

In theory this can be an issue: LAPIC MMIO has the ICR register, which is split in two;
thus, lack of caching should hurt performance.

With that said, a management layer (e.g. libvirt) these days always enables X2APIC,
and KVM exposes it as supported in CPUID regardless of host support for it,
as it still has better performance than unaccelerated xAPIC.
 
This means that a management layer can be expected to only disable X2APIC
when it enables AVIC and sets everything up so that AVIC is actually used, and therefore
no performance impact will be felt.
 
(the time during which AVIC could be dynamically inhibited is negligible, as I explained above).

> 
> It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> less awful than the current memslot+SRCU mess.
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index f4d35289f59e..ea434d76cfb0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3767,9 +3767,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
>  }
> 
> -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> -                        bool write, bool *writable)
> +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> +                           gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> +                           bool write, bool *writable, int *r)
>  {
>         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>         bool async;
> @@ -3780,13 +3780,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
>          */
>         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
> -               return true;
> +               goto out_retry;
> 
> -       /* Don't expose private memslots to L2. */
> -       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
> -               *pfn = KVM_PFN_NOSLOT;
> -               *writable = false;
> -               return false;
> +       if (!kvm_is_visible_memslot(slot)) {
> +               /* Don't expose private memslots to L2. */
> +               if (is_guest_mode(vcpu)) {
> +                       *pfn = KVM_PFN_NOSLOT;
> +                       *writable = false;
> +                       return false;
> +               }
> +               /*
> +                * If the APIC access page exists but is disabled, go directly
> +                * to emulation without caching the MMIO access or creating a
> +                * MMIO SPTE.  That way the cache doesn't need to be purged
> +                * when the AVIC is re-enabled.
> +                */
> +               if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> +                   !vcpu->kvm->arch.apic_access_memslot_enabled) {
> +                       *r = RET_PF_EMULATE;
> +                       return true;
> +               }
>         }
> 
>         async = false;
> @@ -3800,14 +3813,19 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
>                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
>                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
> -                       return true;
> -               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
> -                       return true;
> +                       goto out_retry;
> +               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) {
> +                       goto out_retry;
> +               }
>         }
> 
>         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
>                                     write, writable, hva);
>         return false;
> +
> +out_retry:
> +       *r = RET_PF_RETRY;
> +       return true;
>  }
> 
>  static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> @@ -3839,9 +3857,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
> 
> -       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
> -                        write, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, write,
> +                           &map_writable, &r))
> +               return r;
> 
>         if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
>                 return r;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 490a028ddabe..9747124b877d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
>  
> -       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> -                        write_fault, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> +                           write_fault, &map_writable, &r))
> +               return r;
>  
>         if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
>                 return r;
> 


The above approach looks very good.
Paolo, what do you think?

Best regards,
	Maxim Levitsky
Maxim Levitsky July 22, 2021, 9:12 a.m. UTC | #8
On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > I am more inclined to fix this by just tracking if we hold the srcu
> > lock on each VCPU manually, just as we track the srcu index anyway,
> > and then kvm_request_apicv_update can use this to drop the srcu
> > lock when needed.
> 
> The entire approach of dynamically adding/removing the memslot seems doomed to
> failure, and is likely responsible for the performance issues with AVIC, e.g. a
> single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> and again on re-enable.
> 
> Rather than pile on more gunk, what about special casing the APIC access page
> memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> also give us a good excuse to rename try_async_pf() :-)
> 
> If lack of MMIO caching is a performance problem, an alternative solution would
> be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> clear their cache.
> 
> It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> less awful than the current memslot+SRCU mess

Hi Sean, Paolo and everyone else:

I am exploring the approach that you proposed, and I noticed that we have very inconsistent
code handling APIC base relocation for the in-kernel local APIC.
I do know that APIC base relocation is not supported, and I don't have anything against
this as long as VMs don't use that feature, but I do want this to be consistent.

I did a study of the code that is involved in this mess and I would like to hear your opinion:

There are basically 3 modes of operation of the in-kernel local APIC:

Regular unaccelerated local apic:

-> The APIC MMIO base address is stored in 'apic->base_address' and updated in
   kvm_lapic_set_base, which is called from the MSR write path of 'MSR_IA32_APICBASE'
   (by both SVM and VMX).
   The value is even migrated.

-> APIC MMIO reads/writes are done from the MMU when we access an MMIO page:
	vcpu_mmio_write always calls apic_mmio_write, which checks whether the write is in
	the apic->base_address page and, if so, forwards the write to the local APIC with
	the offset within that page (see the sketch at the end of this section).

-> The APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
   thus it must contain no guest memslots.
   If the guest relocates the APIC base somewhere where we have a memslot,
   the memslot will take priority, while on real hardware the LAPIC is likely to
   take priority.
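
For reference, the range check mentioned above is roughly the following,
paraphrased from apic_mmio_in_range() in arch/x86/kvm/lapic.c:

	static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
	{
		/* The LAPIC claims exactly one page at apic->base_address. */
		return addr >= apic->base_address &&
			addr < apic->base_address + LAPIC_MMIO_LENGTH;
	}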

APICv:

-> The default APIC MMIO base (0xfee00000) is covered by a dummy page which is
   allocated from qemu's process using __x86_set_memory_region.

   This is done once in alloc_apic_access_page, which is called on vcpu creation
   (but only once, when the memslot is not yet enabled).

-> To avoid pinning this page into qemu's memory, the reference to it
   is dropped in alloc_apic_access_page.
   (see commit c24ae0dcd3e8695efa43e71704d1fc4bc7e29e9b)

-> kvm_arch_mmu_notifier_invalidate_range checks whether we are invalidating GPA 0xfee00000
   and, if so, raises the KVM_REQ_APIC_PAGE_RELOAD request.

-> kvm_vcpu_reload_apic_access_page handles the KVM_REQ_APIC_PAGE_RELOAD request by calling
   kvm_x86_ops.set_apic_access_page_addr which is only implemented on VMX
   (vmx_set_apic_access_page_addr)

-> vmx_set_apic_access_page_addr does gfn_to_page on 0xfee00000 GPA,
   and if the result is valid, writes the physical address of this page to APIC_ACCESS_ADDR vmcs field.

   (This is a major difference from AVIC: AVIC's avic_vapic_bar is a *GPA*, while APIC_ACCESS_ADDR
   is a host physical address which the hypervisor is supposed to map at the APIC MMIO GPA using EPT.)

   Note that if we have an error here, we might end up with an invalid APIC_ACCESS_ADDR field.

-> writes to the HPA of that special page (which has a GPA of 0xfee00000 and is mapped via EPT) go to
   APICv or cause special VM exits: (EXIT_REASON_APIC_ACCESS, EXIT_REASON_APIC_WRITE)

   * EXIT_REASON_APIC_ACCESS (which is used for the older, limited 'flexpriority' mode, which practically only emulates TPR)
     actually emulates the instruction to know the written value,
     but we have a special case in vcpu_is_mmio_gpa which makes the emulation treat the access to the default
     apic base as MMIO.
   
   * EXIT_REASON_APIC_WRITE is a trap VMexit which comes with full APICv, and since it also has offset
     qualification and the value is already in the apic page, this info is just passed to kvm_lapic_reg_write


-> If the APIC base is relocated, APICv isn't aware of it, and writes to the new APIC base
   (assuming that we have no memslots covering it) will go through standard APIC MMIO emulation,
   and *might* create a mess.

AVIC:

-> The default APIC MMIO base (0xfee00000)
   is also covered by a dummy page which is allocated from qemu's process using __x86_set_memory_region
   in avic_update_access_page, which is also called on vcpu creation (also only once)
   and from SVM's dynamic AVIC inhibition.

-> The reference to this page is not dropped, thus there is no KVM_REQ_APIC_PAGE_RELOAD handler.
   I think we should do the same here as we do for APICv?

-> avic_vapic_bar is a GPA and thus contains 0xfee00000, but writes to MSR_IA32_APICBASE do update it
   (via avic_update_vapic_bar, which is called from the MSR_IA32_APICBASE write in SVM code);

   thus if the guest relocates the APIC base to a writable memory page, AVIC would actually happen to work
   (opposite of the stock xAPIC handling, which only works when the APIC base is in an MMIO area).

-> writes to the GPA in avic_vapic_bar are first checked in the NPT (but the HPA written there is ignored),
   and then either go to AVIC or cause SVM_EXIT_AVIC_UNACCELERATED_ACCESS, which has the offset of the write
   in exit_info_1
   (there is also SVM_EXIT_AVIC_INCOMPLETE_IPI which is called on some ICR writes)


As far as I know, the only good reason to relocate the APIC base is to access it from real mode,
which is not something modern BIOSes do these days.

I vote to make it read-only (#GP on an MSR_IA32_APICBASE write when a non-default base is set and the APIC is enabled)
and remove all remains of the support for a variable APIC base
(we already have a warning when the APIC base is set to a non-default value).
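
Something along these lines at the top of kvm_set_apic_base() could do it
(just a sketch; the interaction with the existing reserved-bit validation
is not shown):

	/*
	 * Refuse to move the APIC base away from the default while the
	 * APIC is enabled; returning non-zero from the MSR write path
	 * injects #GP into the guest.
	 */
	if ((msr_info->data & MSR_IA32_APICBASE_BASE) != APIC_DEFAULT_PHYS_BASE &&
	    (msr_info->data & MSR_IA32_APICBASE_ENABLE))
		return 1;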


Best regards,
	Maxim Levitsky
Maxim Levitsky July 22, 2021, 5:35 p.m. UTC | #9
On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > I am more inclined to fix this by just tracking if we hold the srcu
> > lock on each VCPU manually, just as we track the srcu index anyway,
> > and then kvm_request_apicv_update can use this to drop the srcu
> > lock when needed.
> 
> The entire approach of dynamically adding/removing the memslot seems doomed to
> failure, and is likely responsible for the performance issues with AVIC, e.g. a
> single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> and again on re-enable.
> 
> Rather than pile on more gunk, what about special casing the APIC access page
> memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> also give us a good excuse to rename try_async_pf() :-)
> 
> If lack of MMIO caching is a performance problem, an alternative solution would
> be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> clear their cache.
> 
> It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> less awful than the current memslot+SRCU mess.

Hi!

I am testing your approach and it actually works very well! I can't seem to break it.

Could you explain why I need to do something with kvm_{inc,dec}_notifier_count()?

I just use your patch as is, plus the changes that are needed in kvm_request_apicv_update.

With AVIC, unlike APICv, the page in this memslot is still pinned in physical memory
(AVIC is missing the code that APICv has in this regard).
That should be fixed in the future, though.

Best regards,
	Maxim Levitsky

> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index f4d35289f59e..ea434d76cfb0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3767,9 +3767,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
>  }
> 
> -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> -                        bool write, bool *writable)
> +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> +                           gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> +                           bool write, bool *writable, int *r)
>  {
>         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>         bool async;
> @@ -3780,13 +3780,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
>          */
>         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
> -               return true;
> +               goto out_retry;
> 
> -       /* Don't expose private memslots to L2. */
> -       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
> -               *pfn = KVM_PFN_NOSLOT;
> -               *writable = false;
> -               return false;
> +       if (!kvm_is_visible_memslot(slot)) {
> +               /* Don't expose private memslots to L2. */
> +               if (is_guest_mode(vcpu)) {
> +                       *pfn = KVM_PFN_NOSLOT;
> +                       *writable = false;
> +                       return false;
> +               }
> +               /*
> +                * If the APIC access page exists but is disabled, go directly
> +                * to emulation without caching the MMIO access or creating a
> +                * MMIO SPTE.  That way the cache doesn't need to be purged
> +                * when the AVIC is re-enabled.
> +                */
> +               if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> +                   !vcpu->kvm->arch.apic_access_memslot_enabled) {
> +                       *r = RET_PF_EMULATE;
> +                       return true;
> +               }
>         }
> 
>         async = false;
> @@ -3800,14 +3813,19 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
>                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
>                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
> -                       return true;
> -               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
> -                       return true;
> +                       goto out_retry;
> +               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) {
> +                       goto out_retry;
> +               }
>         }
> 
>         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
>                                     write, writable, hva);
>         return false;
> +
> +out_retry:
> +       *r = RET_PF_RETRY;
> +       return true;
>  }
> 
>  static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> @@ -3839,9 +3857,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
> 
> -       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
> -                        write, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, write,
> +                           &map_writable, &r))
> +               return r;
> 
>         if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
>                 return r;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 490a028ddabe..9747124b877d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
>  
> -       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> -                        write_fault, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> +                           write_fault, &map_writable, &r))
> +               return r;
>  
>         if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
>                 return r;
>
Sean Christopherson July 22, 2021, 7:06 p.m. UTC | #10
+Ben

On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > I am more inclined to fix this by just tracking if we hold the srcu
> > > lock on each VCPU manually, just as we track the srcu index anyway,
> > > and then kvm_request_apicv_update can use this to drop the srcu
> > > lock when needed.
> > 
> > The entire approach of dynamically adding/removing the memslot seems doomed to
> > failure, and is likely responsible for the performance issues with AVIC, e.g. a
> > single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> > and again on re-enable.
> > 
> > Rather than pile on more gunk, what about special casing the APIC access page
> > memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> > disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> > fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> > also give us a good excuse to rename try_async_pf() :-)
> > 
> > If lack of MMIO caching is a performance problem, an alternative solution would
> > be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> > clear their cache.
> > 
> > It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> > less awful than the current memslot+SRCU mess.
> 
> Hi!
> 
> I am testing your approach and it actually works very well! I can't seem to break it.
> 
> Could you explain why I need to do something with kvm_{inc,dec}_notifier_count()?

Glad you asked, there's one more change needed.  kvm_zap_gfn_range() currently
takes mmu_lock for read, but it needs to take mmu_lock for write for this case
(more on that, way below).

The existing users, update_mtrr() and kvm_post_set_cr0(), are a bit sketchy.  The
whole thing is a grey area because KVM is trying to ensure it honors the guest's
UC memtype for non-coherent DMA, but the inputs (CR0 and MTRRs) are per-vCPU,
i.e. for it to work correctly, the guest has to ensure all running vCPUs do the
same transition.  So in practice there's likely no observable bug, but it also
means that taking mmu_lock for read is likely pointless, because for things to
work the guest has to serialize all running vCPUs.

Ben, any objection to taking mmu_lock for write in kvm_zap_gfn_range()?  It would
effectively revert commit 6103bc074048 ("KVM: x86/mmu: Allow zap gfn range to
operate under the mmu read lock"); see attached patch.  And we could even bump
the notifier count in that helper, e.g. on top of the attached:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b607e8763aa2..7174058e982b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5568,6 +5568,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)

        write_lock(&kvm->mmu_lock);

+       kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+
        if (kvm_memslots_have_rmaps(kvm)) {
                for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                        slots = __kvm_memslots(kvm, i);
@@ -5598,6 +5600,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        if (flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);

+       kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+
        write_unlock(&kvm->mmu_lock);
 }
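
(Note: kvm_{inc,dec}_notifier_count() don't exist as standalone helpers yet;
the idea is to carve them out of kvm_mmu_notifier_invalidate_range_start/end().
Roughly, as a sketch -- treat the names and the exact range bookkeeping as
assumptions, not final code:

	void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
				    unsigned long end)
	{
		/*
		 * A non-zero count tells page faults on an overlapping range
		 * to bail instead of installing a possibly-stale SPTE.
		 */
		kvm->mmu_notifier_count++;
		if (likely(kvm->mmu_notifier_count == 1)) {
			kvm->mmu_notifier_range_start = start;
			kvm->mmu_notifier_range_end = end;
		} else {
			kvm->mmu_notifier_range_start =
				min(kvm->mmu_notifier_range_start, start);
			kvm->mmu_notifier_range_end =
				max(kvm->mmu_notifier_range_end, end);
		}
	}

	void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
				    unsigned long end)
	{
		/*
		 * Bump the sequence _before_ dropping the count so that a
		 * fault which sampled the old sequence is forced to retry.
		 */
		kvm->mmu_notifier_seq++;
		smp_wmb();
		kvm->mmu_notifier_count--;
	}

This mirrors what the mmu_notifier invalidate_range_start/end callbacks
already do under mmu_lock.)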




Back to Maxim's original question...

Elevating mmu_notifier_count and bumping mmu_notifier_seq will handle the case
where APICv is being disabled while a different vCPU is concurrently faulting in a
new mapping for the APIC page.  E.g. it handles this race:

 vCPU0                                 vCPU1
                                       apic_access_memslot_enabled = true;
 			               #NPF on APIC
			               apic_access_memslot_enabled==true, proceed with #NPF
 apic_access_memslot_enabled = false 
 kvm_zap_gfn_range(APIC);
                                       __direct_map(APIC)

 mov [APIC], 0 <-- succeeds, but KVM wants to intercept to emulate



The elevated mmu_notifier_count and/or changed mmu_notifier_seq will cause vCPU1
to bail and resume the guest without fixing the #NPF.  After acquiring mmu_lock,
vCPU1 will see the elevated mmu_notifier_count (if kvm_zap_gfn_range() is about
to be called, or just finished) and/or a modified mmu_notifier_seq (after the
count was decremented).
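
For reference, the retry check those paths use looks roughly like this
(paraphrasing the kvm_host.h helper, so treat it as a sketch):

	static inline int mmu_notifier_retry_hva(struct kvm *kvm,
						 unsigned long mmu_seq,
						 unsigned long hva)
	{
		lockdep_assert_held(&kvm->mmu_lock);
		/*
		 * An invalidation (or the proposed zap) is in flight for
		 * this hva; bail and let the vCPU re-fault.
		 */
		if (unlikely(kvm->mmu_notifier_count) &&
		    hva >= kvm->mmu_notifier_range_start &&
		    hva < kvm->mmu_notifier_range_end)
			return 1;
		/* An invalidation completed after mmu_seq was sampled. */
		if (kvm->mmu_notifier_seq != mmu_seq)
			return 1;
		return 0;
	}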

This is why kvm_zap_gfn_range() needs to take mmu_lock for write.  If it's allowed
to run in parallel with the page fault handler, there's no guarantee that the
correct apic_access_memslot_enabled will be observed.

	if (is_tdp_mmu_fault)
		read_lock(&vcpu->kvm->mmu_lock);
	else
		write_lock(&vcpu->kvm->mmu_lock);

	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) <--- look here!
		goto out_unlock;

	if (is_tdp_mmu_fault)
		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
				    pfn, prefault);
	else
		r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
				 prefault, is_tdp);
Maxim Levitsky July 27, 2021, 1:05 p.m. UTC | #11
On Thu, 2021-07-22 at 19:06 +0000, Sean Christopherson wrote:
> +Ben
> 
> On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> > On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > > I am more inclined to fix this by just tracking if we hold the srcu
> > > > lock on each VCPU manually, just as we track the srcu index anyway,
> > > > and then kvm_request_apicv_update can use this to drop the srcu
> > > > lock when needed.
> > > 
> > > The entire approach of dynamically adding/removing the memslot seems doomed to
> > > failure, and is likely responsible for the performance issues with AVIC, e.g. a
> > > single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> > > and again on re-enable.
> > > 
> > > Rather than pile on more gunk, what about special casing the APIC access page
> > > memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> > > disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> > > fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> > > also give us a good excuse to rename try_async_pf() :-)
> > > 
> > > If lack of MMIO caching is a performance problem, an alternative solution would
> > > be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> > > clear their cache.
> > > 
> > > It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> > > less awful than the current memslot+SRCU mess.
> > 
> > Hi!
> > 
> > I am testing your approach and it actually works very well! I can't seem to break it.
> > 
> > Could you explain why I need to do something with kvm_{inc,dec}_notifier_count()?
> 
> Glad you asked, there's one more change needed.  kvm_zap_gfn_range() currently
> takes mmu_lock for read, but it needs to take mmu_lock for write for this case
> (more way below).
> 
> The existing users, update_mtrr() and kvm_post_set_cr0(), are a bit sketchy.  The
> whole thing is a grey area because KVM is trying to ensure it honors the guest's
> UC memtype for non-coherent DMA, but the inputs (CR0 and MTRRs) are per-vCPU,
> i.e. for it to work correctly, the guest has to ensure all running vCPUs do the
> same transition.  So in practice there's likely no observable bug, but it also
> means that taking mmu_lock for read is likely pointless, because for things to
> work the guest has to serialize all running vCPUs.
> 
> Ben, any objection to taking mmu_lock for write in kvm_zap_gfn_range()?  It would
> effectively revert commit 6103bc074048 ("KVM: x86/mmu: Allow zap gfn range to
> operate under the mmu read lock"); see attached patch.  And we could even bump
> the notifier count in that helper, e.g. on top of the attached:
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index b607e8763aa2..7174058e982b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5568,6 +5568,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
> 
>         write_lock(&kvm->mmu_lock);
> 
> +       kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
> +
>         if (kvm_memslots_have_rmaps(kvm)) {
>                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>                         slots = __kvm_memslots(kvm, i);
> @@ -5598,6 +5600,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
>         if (flush)
>                 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
> 
> +       kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
> +
>         write_unlock(&kvm->mmu_lock);
>  }
> 

I understand what you mean now. I thought that I needed to change the code of
kvm_inc_notifier_count/kvm_dec_notifier_count.




> 
> 
> 
> Back to Maxim's original question...
> 
> Elevating mmu_notifier_count and bumping mmu_notifier_seq will handle the case
> where APICv is being disabled while a different vCPU is concurrently faulting in a
> new mapping for the APIC page.  E.g. it handles this race:
> 
>  vCPU0                                 vCPU1
>                                        apic_access_memslot_enabled = true;
>  			               #NPF on APIC
> 			               apic_access_memslot_enabled==true, proceed with #NPF
>  apic_access_memslot_enabled = false 
>  kvm_zap_gfn_range(APIC);
>                                        __direct_map(APIC)
> 
>  mov [APIC], 0 <-- succeeds, but KVM wants to intercept to emulate

I understand this now. I guess this can't happen with the original memslot disable,
which I guess has the needed locking and flushing to avoid this.
(I didn't study the code in depth though.)

> 
> 
> 
> The elevated mmu_notifier_count and/or changed mmu_notifier_seq will cause vCPU1
> to bail and resume the guest without fixing the #NPF.  After acquiring mmu_lock,
> vCPU1 will see the elevated mmu_notifier_count (if kvm_zap_gfn_range() is about
> to be called, or just finished) and/or a modified mmu_notifier_seq (after the
> count was decremented).
> 
> This is why kvm_zap_gfn_range() needs to take mmu_lock for write.  If it's allowed
> to run in parallel with the page fault handler, there's no guarantee that the
> correct apic_access_memslot_enabled will be observed.

I understand now.

So, Paolo, Ben Gardon, what do you think? Do you think this approach is feasible?
Do you agree to revert the usage of the read lock?

I will post a new series using this approach very soon, since I already have
most of the code done.

Best regards,
	Maxim Levitsky

> 
> 	if (is_tdp_mmu_fault)
> 		read_lock(&vcpu->kvm->mmu_lock);
> 	else
> 		write_lock(&vcpu->kvm->mmu_lock);
> 
> 	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) <--- look here!
> 		goto out_unlock;
> 
> 	if (is_tdp_mmu_fault)
> 		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
> 				    pfn, prefault);
> 	else
> 		r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
> 				 prefault, is_tdp);
Ben Gardon July 27, 2021, 5:48 p.m. UTC | #12
On Tue, Jul 27, 2021 at 6:06 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Thu, 2021-07-22 at 19:06 +0000, Sean Christopherson wrote:
> > +Ben
> >
> > On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> > > On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > > > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > > > I am more inclined to fix this by just tracking if we hold the srcu
> > > > > lock on each VCPU manually, just as we track the srcu index anyway,
> > > > > and then kvm_request_apicv_update can use this to drop the srcu
> > > > > lock when needed.
> > > >
> > > > The entire approach of dynamically adding/removing the memslot seems doomed to
> > > > failure, and is likely responsible for the performance issues with AVIC, e.g. a
> > > > single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> > > > and again on re-enable.
> > > >
> > > > Rather than pile on more gunk, what about special casing the APIC access page
> > > > memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> > > > disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> > > > fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> > > > also give us a good excuse to rename try_async_pf() :-)
> > > >
> > > > If lack of MMIO caching is a performance problem, an alternative solution would
> > > > be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> > > > clear their cache.
> > > >
> > > > It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> > > > less awful than the current memslot+SRCU mess.
> > >
> > > Hi!
> > >
> > > I am testing your approach and it actually works very well! I can't seem to break it.
> > >
> > > Could you explain why I need to do something with kvm_{inc,dec}_notifier_count()?
> >
> > Glad you asked, there's one more change needed.  kvm_zap_gfn_range() currently
> > takes mmu_lock for read, but it needs to take mmu_lock for write for this case
> > (more way below).
> >
> > The existing users, update_mtrr() and kvm_post_set_cr0(), are a bit sketchy.  The
> > whole thing is a grey area because KVM is trying to ensure it honors the guest's
> > UC memtype for non-coherent DMA, but the inputs (CR0 and MTRRs) are per-vCPU,
> > i.e. for it to work correctly, the guest has to ensure all running vCPUs do the
> > same transition.  So in practice there's likely no observable bug, but it also
> > means that taking mmu_lock for read is likely pointless, because for things to
> > work the guest has to serialize all running vCPUs.
> >
> > Ben, any objection to taking mmu_lock for write in kvm_zap_gfn_range()?  It would
> > effectively revert commit 6103bc074048 ("KVM: x86/mmu: Allow zap gfn range to
> > operate under the mmu read lock"); see attached patch.  And we could even bump
> > the notifier count in that helper, e.g. on top of the attached:
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index b607e8763aa2..7174058e982b 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -5568,6 +5568,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
> >
> >         write_lock(&kvm->mmu_lock);
> >
> > +       kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
> > +
> >         if (kvm_memslots_have_rmaps(kvm)) {
> >                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> >                         slots = __kvm_memslots(kvm, i);
> > @@ -5598,6 +5600,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
> >         if (flush)
> >                 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
> >
> > +       kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
> > +
> >         write_unlock(&kvm->mmu_lock);
> >  }
> >
>
> I understand what you mean now. I thought that I needed to change the code of
> kvm_inc_notifier_count/kvm_dec_notifier_count.
>
>
>
>
> >
> >
> >
> > Back to Maxim's original question...
> >
> > Elevating mmu_notifier_count and bumping mmu_notifier_seq will handle the case
> > where APICv is being disabled while a different vCPU is concurrently faulting in a
> > new mapping for the APIC page.  E.g. it handles this race:
> >
> >  vCPU0                                 vCPU1
> >                                        apic_access_memslot_enabled = true;
> >                                      #NPF on APIC
> >                                      apic_access_memslot_enabled==true, proceed with #NPF
> >  apic_access_memslot_enabled = false
> >  kvm_zap_gfn_range(APIC);
> >                                        __direct_map(APIC)
> >
> >  mov [APIC], 0 <-- succeeds, but KVM wants to intercept to emulate
>
> I understand this now. I guess this can't happen with the original memslot disable,
> which I guess has the needed locking and flushing to avoid this.
> (I didn't study the code in depth though.)
>
> >
> >
> >
> > The elevated mmu_notifier_count and/or changed mmu_notifier_seq will cause vCPU1
> > to bail and resume the guest without fixing the #NPF.  After acquiring mmu_lock,
> > vCPU1 will see the elevated mmu_notifier_count (if kvm_zap_gfn_range() is about
> > to be called, or just finished) and/or a modified mmu_notifier_seq (after the
> > count was decremented).
> >
> > This is why kvm_zap_gfn_range() needs to take mmu_lock for write.  If it's allowed
> > to run in parallel with the page fault handler, there's no guarantee that the
> > correct apic_access_memslot_enabled will be observed.
>
> I understand now.
>
> So, Paolo, Ben Gardon, what do you think? Do you think this approach is feasible?
> Do you agree to revert the usage of the read lock?
>
> I will post a new series using this approach very soon, since I already have
> most of the code done.
>
> Best regards,
>         Maxim Levitsky

From reading through this thread, it seems like switching from read
lock to write lock is only necessary for a small range of GFNs
(i.e. the APIC access page), is that correct?
My initial reaction was that switching kvm_zap_gfn_range back to the
write lock would be terrible for performance, but given its only two
callers, I think it would actually be fine.
If you do that though, you should pass shared=false to
kvm_tdp_mmu_zap_gfn_range in that function, so that it knows it's
operating with exclusive access to the MMU lock.
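Something like this in kvm_zap_gfn_range() (a sketch; the exact signature
may differ):

	if (is_tdp_mmu_enabled(kvm))
		/* mmu_lock is now held for write, so don't run "shared". */
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
						  gfn_end, flush,
						  false /* shared */);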

>
> >
> >       if (is_tdp_mmu_fault)
> >               read_lock(&vcpu->kvm->mmu_lock);
> >       else
> >               write_lock(&vcpu->kvm->mmu_lock);
> >
> >       if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) <--- look here!
> >               goto out_unlock;
> >
> >       if (is_tdp_mmu_fault)
> >               r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
> >                                   pfn, prefault);
> >       else
> >               r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
> >                                prefault, is_tdp);
>
>
Sean Christopherson July 27, 2021, 6:17 p.m. UTC | #13
On Tue, Jul 27, 2021, Ben Gardon wrote:
> On Tue, Jul 27, 2021 at 6:06 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> >
> > On Thu, 2021-07-22 at 19:06 +0000, Sean Christopherson wrote:
> > > The elevated mmu_notifier_count and/or changed mmu_notifier_seq will cause vCPU1
> > > to bail and resume the guest without fixing the #NPF.  After acquiring mmu_lock,
> > > vCPU1 will see the elevated mmu_notifier_count (if kvm_zap_gfn_range() is about
> > > to be called, or just finished) and/or a modified mmu_notifier_seq (after the
> > > count was decremented).
> > >
> > > This is why kvm_zap_gfn_range() needs to take mmu_lock for write.  If it's allowed
> > > to run in parallel with the page fault handler, there's no guarantee that the
> > > correct apic_access_memslot_enabled will be observed.
> >
> > I understand now.
> >
> > So, Paolo, Ben Gardon, what do you think? Do you think this approach is feasible?
> > Do you agree to revert the usage of the read lock?
> >
> > I will post a new series using this approach very soon, since I already have
> > most of the code done.
> >
> > Best regards,
> >         Maxim Levitsky
> 
> From reading through this thread, it seems like switching from read
> lock to write lock is only necessary for a small range of GFNs
> (i.e. the APIC access page), is that correct?

For the APICv case, yes, literally a single GFN (the default APIC base).

> My initial reaction was that switching kvm_zap_gfn_range back to the
> write lock would be terrible for performance, but given its only two
> callers, I think it would actually be fine.

And more importantly, the two callers are gated by kvm_arch_has_noncoherent_dma()
and are very rare flows for the guest (updating MTRRs, toggling CR0.CD).

> If you do that though, you should pass shared=false to
> kvm_tdp_mmu_zap_gfn_range in that function, so that it knows it's
> operating with exclusive access to the MMU lock.

Ya, my suggested revert was to drop @shared entirely since kvm_zap_gfn_range() is
the only caller that passes @shared=true.
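
I.e. after the revert the TDP MMU helper would shrink back to something like
this (a sketch):

	/* All callers hold mmu_lock for write; no @shared needed. */
	bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
				       gfn_t start, gfn_t end, bool flush);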
Maxim Levitsky July 29, 2021, 2:10 p.m. UTC | #14
On Tue, 2021-07-27 at 18:17 +0000, Sean Christopherson wrote:
> On Tue, Jul 27, 2021, Ben Gardon wrote:
> > On Tue, Jul 27, 2021 at 6:06 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > > On Thu, 2021-07-22 at 19:06 +0000, Sean Christopherson wrote:
> > > > The elevated mmu_notifier_count and/or changed mmu_notifier_seq will cause vCPU1
> > > > to bail and resume the guest without fixing the #NPF.  After acquiring mmu_lock,
> > > > vCPU1 will see the elevated mmu_notifier_count (if kvm_zap_gfn_range() is about
> > > > to be called, or just finished) and/or a modified mmu_notifier_seq (after the
> > > > count was decremented).
> > > > 
> > > > This is why kvm_zap_gfn_range() needs to take mmu_lock for write.  If it's allowed
> > > > to run in parallel with the page fault handler, there's no guarantee that the
> > > > correct apic_access_memslot_enabled will be observed.
> > > 
> > > I understand now.
> > > 
> > > So, Paolo, Ben Gardon, what do you think? Do you think this approach is feasible?
> > > Do you agree to revert the usage of the read lock?
> > > 
> > > I will post a new series using this approach very soon, since I already have
> > > most of the code done.
> > > 
> > > Best regards,
> > >         Maxim Levitsky
> > 
> > From reading through this thread, it seems like switching from read
> > lock to write lock is only necessary for a small range of GFNs
> > (i.e. the APIC access page), is that correct?
> 
> For the APICv case, yes, literally a single GFN (the default APIC base).
> 
> > My initial reaction was that switching kvm_zap_gfn_range back to the
> > write lock would be terrible for performance, but given its only two
> > callers, I think it would actually be fine.
> 
> And more importantly, the two callers are gated by kvm_arch_has_noncoherent_dma()
> and are very rare flows for the guest (updating MTRRs, toggling CR0.CD).
> 
> > If you do that though, you should pass shared=false to
> > kvm_tdp_mmu_zap_gfn_range in that function, so that it knows it's
> > operating with exclusive access to the MMU lock.
> 
> Ya, my suggested revert was to drop @shared entirely since kvm_zap_gfn_range() is
> the only caller that passes @shared=true.
> 


Just one question:
Should I submit the patches for the MMU changes that you described,
with my AVIC patches on top of them?

Should I worry about the new TDP MMU?

Best regards,
	Maxim Levitsky
Maxim Levitsky Aug. 2, 2021, 9:20 a.m. UTC | #15
On Thu, 2021-07-22 at 12:12 +0300, Maxim Levitsky wrote:
> On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > I am more inclined to fix this by just tracking if we hold the srcu
> > > lock on each VCPU manually, just as we track the srcu index anyway,
> > > and then kvm_request_apicv_update can use this to drop the srcu
> > > lock when needed.
> > 
> > The entire approach of dynamically adding/removing the memslot seems doomed to
> > failure, and is likely responsible for the performance issues with AVIC, e.g. a
> > single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> > and again on re-enable.
> > 
> > Rather than pile on more gunk, what about special casing the APIC access page
> > memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> > disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> > fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> > also give us a good excuse to rename try_async_pf() :-)
> > 
> > If lack of MMIO caching is a performance problem, an alternative solution would
> > be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> > clear their cache.
> > 
> > It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> > less awful than the current memslot+SRCU mess
> 
> Hi Sean, Paolo and everyone else:
> 
> I am exploring the approach that you proposed and I noticed that we have very inconsistent
> code that handles the APIC base relocation for the in-kernel local APIC.
> I do know that APIC base relocation is not supported, and I don't have anything against
> this as long as VMs don't use that feature, but I do want this to be consistent.
> 
> I did a study of the code that is involved in this mess and I would like to hear your opinion:
> 
> There are basically 3 modes of operation of the in-kernel local apic:
> 
> Regular unaccelerated local apic:
> 
> -> The APIC MMIO base address is stored at 'apic->base_address', and updated in
>    kvm_lapic_set_base, which is called from the MSR write of 'MSR_IA32_APICBASE'
>    (both by SVM and VMX).
>    The value is even migrated.
> 
> -> APIC MMIO read/write is done from the MMU, when we access an MMIO page:
> 	vcpu_mmio_write always calls apic_mmio_write, which checks if the write is in
> 	the apic->base_address page and if so forwards the write to the local APIC with
> 	the offset in this page.
> 
> -> APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
>    thus must contain no guest memslots.
>    If the guest relocates the APIC base somewhere where we have a memslot, 
>    memslot will take priority, while on real hardware, LAPIC is likely to
>    take priority.
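> 
>    For reference, the range check backing apic_mmio_read/write is roughly
>    the following (paraphrasing lapic.c, so treat it as a sketch):
> 
> 	static bool apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
> 	{
> 		return addr >= apic->base_address &&
> 		       addr < apic->base_address + LAPIC_MMIO_LENGTH;
> 	}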
> 
> APICv:
> 
> -> The default apic MMIO base (0xfee00000) is covered by a dummy page which is
>    allocated from qemu's process  using __x86_set_memory_region.
>    
>    This is done once in alloc_apic_access_page, which is called on vcpu creation
>    (but allocates only when the memslot is not yet enabled).
> 
> -> to avoid pinning this page into qemu's memory, reference to it
>    is dropped in alloc_apic_access_page.
>    (see commit c24ae0dcd3e8695efa43e71704d1fc4bc7e29e9b)
> 
> -> kvm_arch_mmu_notifier_invalidate_range -> checks if we invalidate GPA 0xfee00000 
>    and if so, raises KVM_REQ_APIC_PAGE_RELOAD request.
> 
> -> kvm_vcpu_reload_apic_access_page handles the KVM_REQ_APIC_PAGE_RELOAD request by calling
>    kvm_x86_ops.set_apic_access_page_addr which is only implemented on VMX
>    (vmx_set_apic_access_page_addr)
> 
> -> vmx_set_apic_access_page_addr does gfn_to_page on GPA 0xfee00000,
>    and if the result is valid, writes the physical address of this page to the APIC_ACCESS_ADDR vmcs field.
> 
>    (This is a major difference from AVIC - AVIC's avic_vapic_bar is a *GPA*, while APIC_ACCESS_ADDR
>    is a host physical address which the hypervisor is supposed to map at the APIC MMIO GPA using EPT.)
> 
>    Note that if we have an error here, we might end up with an invalid APIC_ACCESS_ADDR field.
> 
> -> writes to the  HPA of that special page (which has GPA of 0xfee00000, and mapped via EPT) go to
>    APICv or cause special VM exits: (EXIT_REASON_APIC_ACCESS, EXIT_REASON_APIC_WRITE)
> 
>    * EXIT_REASON_APIC_ACCESS (which is used for the older, limited 'flexpriority' mode that practically only emulates the TPR)
>      actually emulates the instruction to learn the written value,
>      but we have a special case in vcpu_is_mmio_gpa which makes the emulation treat the access to the default
>      apic base as MMIO.
>    
>    * EXIT_REASON_APIC_WRITE is a trap VMexit which comes with full APICv, and since it has the offset
>      in its exit qualification and the value is already in the apic page, this info is just passed to kvm_lapic_reg_write
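> 
>      Roughly, paraphrasing the VMX handler (a sketch, not the verbatim code):
> 
> 	static int handle_apic_write(struct kvm_vcpu *vcpu)
> 	{
> 		/* Trap-like exit: the write has already hit the APIC page. */
> 		u32 offset = vmx_get_exit_qual(vcpu) & 0xfff;
> 
> 		kvm_apic_write_nodecode(vcpu, offset);
> 		return 1;
> 	}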
> 
> 
> -> If the APIC base is relocated, APICv isn't aware of it, and writes to the new APIC base
>    (assuming that we have no memslots covering it) will go through standard APIC MMIO emulation,
>    and *might* create a mess.
> 
> AVIC:
> 
> -> The default apic MMIO base (0xfee00000) 
>    is also covered by a dummy page which is allocated from qemu's process using __x86_set_memory_region 
>    in avic_update_access_page, which is also called on vcpu creation (also only once),
>    and from SVM's dynamic AVIC inhibition.
> 
> -> The reference to this page is not dropped, thus there is no KVM_REQ_APIC_PAGE_RELOAD handler.
>    I think we should do the same as we do for APICv here?
> 
> -> avic_vapic_bar is a GPA and thus contains 0xfee00000, but writes to MSR_IA32_APICBASE do update it
>    (via avic_update_vapic_bar, which is called from the MSR_IA32_APICBASE write in SVM code);
> 
>    thus if the guest relocates the APIC base to a writable memory page, AVIC would actually happen to work
>    (the opposite of the stock xAPIC handling, which only works when the apic base is in an MMIO area).
> 
> -> writes to the GPA in avic_vapic_bar are first checked in the NPT (but the HPA written there is ignored),
>    and then either go to AVIC or cause SVM_EXIT_AVIC_UNACCELERATED_ACCESS, which has the offset of the write
>    in exit_info_1
>    (there is also SVM_EXIT_AVIC_INCOMPLETE_IPI, which is raised on some ICR writes).
> 
> 
> As far as I know the only good reason to relocate the APIC base is to access it from real mode,
> which is not something that is done these days by modern BIOSes.
> 
> I vote to make it read only (#GP on MSR_IA32_APICBASE write when non default base is set and apic enabled) 
> and remove all remains of the support for variable APIC base.
> (we already have a warning when APIC base is set to non default value)
> 
> 
> Best regards,
> 	Maxim Levitsky
> 
Ping.

Best regards,
	Maxim Levitsky
Sean Christopherson Aug. 6, 2021, 9:55 p.m. UTC | #16
On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> -> APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
>    thus must contain no guest memslots.
>    If the guest relocates the APIC base somewhere where we have a memslot, 
>    memslot will take priority, while on real hardware, LAPIC is likely to
>    take priority.

Yep.  The thing that really bites us is that other vCPUs should still be able to
access the memory defined by the memslot, e.g. to make it work we'd have to run
the vCPU with a completely different MMU root.

> As far as I know the only good reason to relocate APIC base is to access it
> from real mode, which is not something that is done these days by modern
> BIOSes.
> 
> I vote to make it read only (#GP on MSR_IA32_APICBASE write when non default
> base is set and apic enabled) and remove all remains of the support for
> variable APIC base.

Making up our own behavior is almost never the right approach.  E.g. _best_ case
scenario for an unexpected #GP is the guest immediately terminates.  Worst case
scenario is the guest eats the #GP and continues on, which is basically the status
quo, except it's guaranteed to now work, whereas today's behavior can at least let
the guest function, for some definitions of "function".

I think the only viable "solution" is to exit to userspace on the guilty WRMSR.
Whether or not we can do that without breaking userspace is probably the big
question.  Fully emulating APIC base relocation would be a tremendous amount of
effort and complexity for practically zero benefit.
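
E.g. for userspace that opts into MSR exits, something along these lines
(a sketch that reuses the existing KVM_EXIT_X86_WRMSR plumbing; how the
return value is threaded out of the MSR write path is elided):

	/* Deflect the unsupported APIC base relocation to userspace. */
	vcpu->run->exit_reason = KVM_EXIT_X86_WRMSR;
	vcpu->run->msr.reason = KVM_MSR_EXIT_REASON_INVAL;
	vcpu->run->msr.index = MSR_IA32_APICBASE;
	vcpu->run->msr.data = data;
	return 0;	/* exit to userspace instead of completing WRMSR */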

> (we already have a warning when APIC base is set to non default value)

FWIW, that warning is worthless because it's _once(), i.e. won't help detect a
misbehaving guest unless it's the first guest to misbehave on a particular
instantiation of KVM.   _ratelimited() would improve the situation, but not
completely eliminate the possibility of a misbehaving guest going unnoticed.
Anything else isn't an option because it's obviously guest triggerable.
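
For reference, the warning in question is the one in kvm_lapic_set_base(),
roughly (a sketch):

	/* _once() fires only for the first offender per KVM instance. */
	if ((value & MSR_IA32_APICBASE_BASE) != APIC_DEFAULT_PHYS_BASE)
		pr_warn_once("APIC base relocation is unsupported by KVM");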
Maxim Levitsky Aug. 9, 2021, 9:40 a.m. UTC | #17
On Fri, 2021-08-06 at 21:55 +0000, Sean Christopherson wrote:
> On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> > On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > -> APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
> >    thus must contain no guest memslots.
> >    If the guest relocates the APIC base somewhere where we have a memslot, 
> >    memslot will take priority, while on real hardware, LAPIC is likely to
> >    take priority.
> 
> Yep.  The thing that really bites us is that other vCPUs should still be able to
> access the memory defined by the memslot, e.g. to make it work we'd have to run
> the vCPU with a completely different MMU root.
That is something I hadn't taken into account.
Complexity of supporting this indeed isn't worth it.

> 
> > As far as I know the only good reason to relocate APIC base is to access it
> > from real mode, which is not something that is done these days by modern
> > BIOSes.
> > 
> > I vote to make it read only (#GP on MSR_IA32_APICBASE write when non default
> > base is set and apic enabled) and remove all remains of the support for
> > variable APIC base.
> 
> Making up our own behavior is almost never the right approach.  E.g. _best_ case
> scenario for an unexpected #GP is the guest immediately terminates.  Worst case
> scenario is the guest eats the #GP and continues on, which is basically the status
> quo, except it's guaranteed to now work, whereas todays behavior can at least let
> the guest function, for some definitions of "function".

Well, at least Intel's PRM does state that APIC base relocation is not guaranteed
to work on all CPUs, so giving the guest a #GP is like telling it that the current CPU doesn't
support it. In theory, a very well-behaved guest can catch the exception and
fall back to the default base.

I don't understand what you mean by 'guaranteed to now work'. If the guest
ignores this #GP and still thinks that APIC base relocation worked, it is its fault.
A well-behaved guest should never assume that an MSR write that failed with #GP
worked.


> 
> I think the only viable "solution" is to exit to userspace on the guilty WRMSR.
> Whether or not we can do that without breaking userspace is probably the big
> question.  Fully emulating APIC base relocation would be a tremendous amount of
> effort and complexity for practically zero benefit.

I have nothing against this either, although I kind of like the #GP approach a bit more,
and knowing that there are barely any reasons
to relocate the APIC base, and that it doesn't work well, there is a good chance
that no one does it anyway (except our kvm unit tests, but that isn't an issue).
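
Concretely, I was thinking of something like this in the APIC base write path
(an untested sketch; a non-zero return from the MSR write path becomes a #GP
in the WRMSR emulation):

	/* Refuse to relocate the APIC base while the APIC is enabled. */
	if ((data & MSR_IA32_APICBASE_BASE) != APIC_DEFAULT_PHYS_BASE &&
	    (data & MSR_IA32_APICBASE_ENABLE))
		return 1;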

> 
> > (we already have a warning when APIC base is set to non default value)
> 
> FWIW, that warning is worthless because it's _once(), i.e. won't help detect a
> misbehaving guest unless it's the first guest to misbehave on a particular
> instantiation of KVM.   _ratelimited() would improve the situation, but not
> completely eliminate the possibility of a misbehaving guest going unnoticed.
> Anything else isn't an option because it's obviously guest triggerable.

100% agree.

I'll say I would first make it _ratelimited() for a few KVM versions, and then
if nobody complains, make it a KVM internal error / #GP, and remove all the leftovers
from the code that pretends that it can work.

And add a comment explaining *why*, as you explained, supporting APIC base relocation
isn't worth it.

Best regards,
	Maxim Levitsky

>
Sean Christopherson Aug. 9, 2021, 3:57 p.m. UTC | #18
On Mon, Aug 09, 2021, Maxim Levitsky wrote:
> On Fri, 2021-08-06 at 21:55 +0000, Sean Christopherson wrote:
> > Making up our own behavior is almost never the right approach.  E.g. _best_ case
> > scenario for an unexpected #GP is the guest immediately terminates.  Worst case
> > scenario is the guest eats the #GP and continues on, which is basically the status
> > quo, except it's guaranteed to now work, whereas todays behavior can at least let
> > the guest function, for some definitions of "function".
> 
> Well, at least Intel's PRM does state that APIC base relocation is not guaranteed
> to work on all CPUs, so giving the guest a #GP is like telling it that the current CPU doesn't
> support it. In theory, a very well-behaved guest can catch the exception and
> fall back to the default base.
> 
> I don't understand what you mean by 'guaranteed to now work'. If the guest

Doh, typo, it should be "not", i.e. "guaranteed to not work".  As in, allowing the
unsupported WRMSR could work depending on what features KVM is using and what the
guest is doing, whereas injecting #GP is guaranteed to break the guest.

> ignores this #GP and still thinks that APIC base relocation worked, it is its fault.
> A well-behaved guest should never assume that an MSR write that failed with #GP
> worked.
> 
> > 
> > I think the only viable "solution" is to exit to userspace on the guilty WRMSR.
> > Whether or not we can do that without breaking userspace is probably the big
> > question.  Fully emulating APIC base relocation would be a tremendous amount of
> > effort and complexity for practically zero benefit.
> 
> I have nothing against this either, although I kind of like the #GP approach
> a bit more, and knowing that there are barely any reasons to relocate the
> APIC base, and that it doesn't work well, there is a good chance that no one
> does it anyway (except our kvm unit tests, but that isn't an issue).

Injecting an exception that architecturally should not happen is simply not
acceptable.  Silently (and partially) ignoring the WRMSR isn't acceptable either,
but we can't travel back in time to fix that so we're stuck with it unless we can
change the behavior without anyone complaining.

> > > (we already have a warning when APIC base is set to non default value)
> > 
> > FWIW, that warning is worthless because it's _once(), i.e. won't help detect a
> > misbehaving guest unless it's the first guest to misbehave on a particular
> > instantiation of KVM.   _ratelimited() would improve the situation, but not
> > completely eliminate the possibility of a misbehaving guest going unnoticed.
> > Anything else isn't an option because it's obviously guest triggerable.
> 
> 100% agree.
> 
> I'll say I would first make it _ratelimited() for a few KVM versions, and then
> if nobody complains, make it a KVM internal error / #GP, and remove all the
> leftovers from the code that pretends that it can work.

I don't see any point in temporarily making it _ratelimited(): (1) the odds of someone
running a guest that relies on APIC base relocation are very low, (2) the odds of
that someone noticing a _ratelimited() and not a _once() are even lower, and (3) the
odds of that prompting a bug report are even lower still.

> And add a comment explaining *why*, as you explained, supporting APIC base relocation
> isn't worth it.
> 
> Best regards,
> 	Maxim Levitsky
> 
> > 
> 
>
Jim Mattson Aug. 9, 2021, 4:47 p.m. UTC | #19
On Mon, Aug 9, 2021 at 2:40 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Fri, 2021-08-06 at 21:55 +0000, Sean Christopherson wrote:
> > On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> > > On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > > > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > -> APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
> > >    thus must contain no guest memslots.
> > >    If the guest relocates the APIC base somewhere where we have a memslot,
> > >    memslot will take priority, while on real hardware, LAPIC is likely to
> > >    take priority.
> >
> > Yep.  The thing that really bites us is that other vCPUs should still be able to
> > access the memory defined by the memslot, e.g. to make it work we'd have to run
> > the vCPU with a completely different MMU root.
> That is something I hadn't taken into account.
> Complexity of supporting this indeed isn't worth it.
>
> >
> > > As far as I know the only good reason to relocate APIC base is to access it
> > > from real mode, which is not something that is done these days by modern
> > > BIOSes.
> > >
> > > I vote to make it read only (#GP on MSR_IA32_APICBASE write when non default
> > > base is set and apic enabled) and remove all remains of the support for
> > > variable APIC base.
> >
> > Making up our own behavior is almost never the right approach.  E.g. _best_ case
> > scenario for an unexpected #GP is the guest immediately terminates.  Worst case
> > scenario is the guest eats the #GP and continues on, which is basically the status
> > quo, except it's guaranteed to now work, whereas todays behavior can at least let
> > the guest function, for some definitions of "function".
>
> Well, at least Intel's PRM does state that APIC base relocation is not guaranteed
> to work on all CPUs, so giving the guest a #GP is like telling it that the current CPU doesn't
> support it. In theory, a very well-behaved guest can catch the exception and
> fall back to the default base.
>
> I don't understand what you mean by 'guaranteed to now work'. If the guest
> ignores this #GP and still thinks that APIC base relocation worked, it is its fault.
> A well-behaved guest should never assume that an MSR write that failed with #GP
> worked.
>
>
> >
> > I think the only viable "solution" is to exit to userspace on the guilty WRMSR.
> > Whether or not we can do that without breaking userspace is probably the big
> > question.  Fully emulating APIC base relocation would be a tremendous amount of
> > effort and complexity for practically zero benefit.
>
> I have nothing against this either, although I kind of like the #GP approach a bit more,
> and knowing that there are barely any reasons
> to relocate the APIC base, and that it doesn't work well, there is a good chance
> that no one does it anyway (except our kvm unit tests, but that isn't an issue).
>
> >
> > > (we already have a warning when APIC base is set to non default value)
> >
> > FWIW, that warning is worthless because it's _once(), i.e. won't help detect a
> > misbehaving guest unless it's the first guest to misbehave on a particular
> > instantiation of KVM.   _ratelimited() would improve the situation, but not
> > completely eliminate the possibility of a misbehaving guest going unnoticed.
> > Anything else isn't an option because it's obviously guest triggerable.
>
> 100% agree.
>
> I'll say I would first make it _ratelimited() for a few KVM versions, and then
> if nobody complains, make it a KVM internal error / #GP, and remove all the leftovers
> from the code that pretends that it can work.

Printing things to syslog is not very helpful. Any time that kvm
violates the architectural specification, it should provide
information about the emulation error to userspace.
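
E.g. something along these lines (a sketch):

	/* Report the bad WRMSR to userspace instead of just syslog. */
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
	vcpu->run->internal.ndata = 1;
	vcpu->run->internal.data[0] = data;	/* attempted APIC base */
	return 0;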
Maxim Levitsky Aug. 10, 2021, 8:42 p.m. UTC | #20
On Mon, 2021-08-09 at 09:47 -0700, Jim Mattson wrote:
> On Mon, Aug 9, 2021 at 2:40 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > On Fri, 2021-08-06 at 21:55 +0000, Sean Christopherson wrote:
> > > On Thu, Jul 22, 2021, Maxim Levitsky wrote:
> > > > On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> > > > > On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > > > -> APIC MMIO area has to be MMIO for 'apic_mmio_write' to be called,
> > > >    thus must contain no guest memslots.
> > > >    If the guest relocates the APIC base somewhere where we have a memslot,
> > > >    memslot will take priority, while on real hardware, LAPIC is likely to
> > > >    take priority.
> > > 
> > > Yep.  The thing that really bites us is that other vCPUs should still be able to
> > > access the memory defined by the memslot, e.g. to make it work we'd have to run
> > > the vCPU with a completely different MMU root.
> > That is something I hadn't taken into account.
> > Complexity of supporting this indeed isn't worth it.
> > 
> > > > As far as I know the only good reason to relocate APIC base is to access it
> > > > from real mode, which is not something that is done these days by modern
> > > > BIOSes.
> > > > 
> > > > I vote to make it read only (#GP on MSR_IA32_APICBASE write when non default
> > > > base is set and apic enabled) and remove all remains of the support for
> > > > variable APIC base.
> > > 
> > > Making up our own behavior is almost never the right approach.  E.g. _best_ case
> > > scenario for an unexpected #GP is the guest immediately terminates.  Worst case
> > > scenario is the guest eats the #GP and continues on, which is basically the status
> > > quo, except it's guaranteed to now work, whereas todays behavior can at least let
> > > the guest function, for some definitions of "function".
> > 
> > Well, at least Intel's PRM does state that APIC base relocation is not guaranteed
> > to work on all CPUs, so giving the guest a #GP is like telling it that the current CPU doesn't
> > support it. In theory, a very well-behaved guest can catch the exception and
> > fall back to the default base.
> > 
> > I don't understand what you mean by 'guaranteed to now work'. If the guest
> > ignores this #GP and still thinks that APIC base relocation worked, it is its fault.
> > A well-behaved guest should never assume that an MSR write that failed with #GP
> > worked.
> > 
> > 
> > > I think the only viable "solution" is to exit to userspace on the guilty WRMSR.
> > > Whether or not we can do that without breaking userspace is probably the big
> > > question.  Fully emulating APIC base relocation would be a tremendous amount of
> > > effort and complexity for practically zero benefit.
> > 
> > I have nothing against this either, although I kind of like the #GP approach a bit more,
> > and knowing that there are barely any reasons
> > to relocate the APIC base, and that it doesn't work well, there is a good chance
> > that no one does it anyway (except our kvm unit tests, but that isn't an issue).
> > 
> > > > (we already have a warning when APIC base is set to non default value)
> > > 
> > > FWIW, that warning is worthless because it's _once(), i.e. won't help detect a
> > > misbehaving guest unless it's the first guest to misbehave on a particular
> > > instantiation of KVM.   _ratelimited() would improve the situation, but not
> > > completely eliminate the possibility of a misbehaving guest going unnoticed.
> > > Anything else isn't an option because it's obviously guest triggerable.
> > 
> > 100% agree.
> > 
> > I'll say I would first make it _ratelimited() for a few KVM versions, and then
> > if nobody complains, make it a KVM internal error / #GP, and remove all the leftovers
> > from the code that pretends that it can work.
> 
> Printing things to syslog is not very helpful. Any time that kvm
> violates the architectural specification, it should provide
> information about the emulation error to userspace.
> 
Paolo, what do you think?

My personal opinion is that we should indeed cause a KVM internal error
on all attempts to change the APIC base.

If someone complains, then we can look at their use-case.

My view is that any half-working feature is bound to bitrot
and cause harm and confusion.

Best regards,
	Maxim Levitsky
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e11d64aa0bcd..f900dca58af8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -956,6 +956,9 @@  struct kvm_hv {
 	/* How many vCPUs have VP index != vCPU index */
 	atomic_t num_mismatched_vp_indexes;
 
+	/* How many SynICs use 'AutoEOI' feature */
+	atomic_t synic_auto_eoi_used;
+
 	struct hv_partition_assist_pg *hv_pa_pg;
 	struct kvm_hv_syndbg hv_syndbg;
 };
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b07592ca92f0..6bf47a583d0e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -85,9 +85,22 @@  static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
 	return false;
 }
 
+
+static void synic_toggle_avic(struct kvm_vcpu *vcpu, bool activate)
+{
+	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+	kvm_request_apicv_update(vcpu->kvm, activate,
+			APICV_INHIBIT_REASON_HYPERV);
+	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+}
+
 static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
 				int vector)
 {
+	struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+	int auto_eoi_old, auto_eoi_new;
+
 	if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
 		return;
 
@@ -96,10 +109,23 @@  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
 	else
 		__clear_bit(vector, synic->vec_bitmap);
 
+	auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
 	if (synic_has_vector_auto_eoi(synic, vector))
 		__set_bit(vector, synic->auto_eoi_bitmap);
 	else
 		__clear_bit(vector, synic->auto_eoi_bitmap);
+
+	auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
+	/* Hyper-V SynIC auto EOI SINTs are not compatible with APICV */
+	if (!auto_eoi_old && auto_eoi_new) {
+		if (atomic_inc_return(&hv->synic_auto_eoi_used) == 1)
+			synic_toggle_avic(vcpu, false);
+	} else if (!auto_eoi_new && auto_eoi_old) {
+		if (atomic_dec_return(&hv->synic_auto_eoi_used) == 0)
+			synic_toggle_avic(vcpu, true);
+	}
 }
 
 static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -933,12 +959,6 @@  int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
 
 	synic = to_hv_synic(vcpu);
 
-	/*
-	 * Hyper-V SynIC auto EOI SINT's are
-	 * not compatible with APICV, so request
-	 * to deactivate APICV permanently.
-	 */
-	kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
 	synic->active = true;
 	synic->dont_zero_synic_pages = dont_zero_synic_pages;
 	synic->control = HV_SYNIC_CONTROL_ENABLE;
@@ -2466,6 +2486,8 @@  int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 				ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
 			if (!cpu_smt_possible())
 				ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+			ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
 			/*
 			 * Default number of spinlock retry attempts, matches
 			 * HyperV 2016.