diff mbox series

[v2,4/6] KVM: arm: vgic-v2: Only use the virtual state when userspace accesses pending bits

Message ID 20200417083319.3066217-5-maz@kernel.org (mailing list archive)
State New, archived
Headers show
Series KVM: arm: vgic fixes for 5.7 | expand

Commit Message

Marc Zyngier April 17, 2020, 8:33 a.m. UTC
There is no point in accessing the HW when writing to any of the
ISPENDR/ICPENDR registers from userspace, as only the guest should
be allowed to change the HW state.

Introduce new userspace-specific accessors that deal solely with
the virtual state. Note that the API differs from that of GICv3,
where userspace exclusively uses ISPENDR to set the state. Too
bad we can't reuse it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 virt/kvm/arm/vgic/vgic-mmio-v2.c |  6 +++--
 virt/kvm/arm/vgic/vgic-mmio.c    | 41 ++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic-mmio.h    |  8 +++++++
 3 files changed, 53 insertions(+), 2 deletions(-)

Comments

James Morse April 17, 2020, 11:22 a.m. UTC | #1
Hi Marc,

On 17/04/2020 09:33, Marc Zyngier wrote:
> There is no point in accessing the HW when writing to any of the
> ISPENDR/ICPENDR registers from userspace, as only the guest should
> be allowed to change the HW state.
> 
> Introduce new userspace-specific accessors that deal solely with
> the virtual state. Note that the API differs from that of GICv3,
> where userspace exclusively uses ISPENDR to set the state. Too
> bad we can't reuse it.

> diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
> index f51c6e939c76..a016f07adc28 100644
> --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
> +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
> @@ -417,10 +417,12 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
>  		NULL, vgic_uaccess_write_cenable, 1,
>  		VGIC_ACCESS_32bit),
>  	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
> -		vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
> +		vgic_mmio_read_pending, vgic_mmio_write_spending,
> +		NULL, vgic_uaccess_write_spending, 1,
>  		VGIC_ACCESS_32bit),

vgic_mmio_write_spending() has some homebrew detection for is_uaccess, which causes
vgic_hw_irq_spending() to do nothing. Isn't that now dead-code with this change?


> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
> index 6e30034d1464..f1927ae02d2e 100644
> --- a/virt/kvm/arm/vgic/vgic-mmio.c
> +++ b/virt/kvm/arm/vgic/vgic-mmio.c
> @@ -321,6 +321,27 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,

> +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
> +				gpa_t addr, unsigned int len,
> +				unsigned long val)
> +{
> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> +	int i;
> +	unsigned long flags;
> +
> +	for_each_set_bit(i, &val, len * 8) {
> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);

vgic_mmio_write_spending() has:
|	/* GICD_ISPENDR0 SGI bits are WI *

and bales out early. Is GIC_DIST_PENDING_SET the same register?
(If so, shouldn't that be true for PPI too?)


> +		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> +		irq->pending_latch = true;
> +		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
> +
> +		vgic_put_irq(vcpu->kvm, irq);
> +	}
> +
> +	return 0;
> +}

> @@ -390,6 +411,26 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,

> +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
> +				gpa_t addr, unsigned int len,
> +				unsigned long val)
> +{
> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> +	int i;
> +	unsigned long flags;
> +
> +	for_each_set_bit(i, &val, len * 8) {
> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);

Same dumb question about GICD_ICPENDR0!?

> +		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> +		irq->pending_latch = false;
> +		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> +		vgic_put_irq(vcpu->kvm, irq);
> +	}
> +
> +	return 0;
> +}


Thanks,

James
Marc Zyngier April 17, 2020, 12:41 p.m. UTC | #2
On Fri, 17 Apr 2020 12:22:10 +0100
James Morse <james.morse@arm.com> wrote:

Hi James,

> Hi Marc,
> 
> On 17/04/2020 09:33, Marc Zyngier wrote:
> > There is no point in accessing the HW when writing to any of the
> > ISPENDR/ICPENDR registers from userspace, as only the guest should
> > be allowed to change the HW state.
> > 
> > Introduce new userspace-specific accessors that deal solely with
> > the virtual state. Note that the API differs from that of GICv3,
> > where userspace exclusively uses ISPENDR to set the state. Too
> > bad we can't reuse it.  
> 
> > diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
> > index f51c6e939c76..a016f07adc28 100644
> > --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
> > +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
> > @@ -417,10 +417,12 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
> >  		NULL, vgic_uaccess_write_cenable, 1,
> >  		VGIC_ACCESS_32bit),
> >  	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
> > -		vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
> > +		vgic_mmio_read_pending, vgic_mmio_write_spending,
> > +		NULL, vgic_uaccess_write_spending, 1,
> >  		VGIC_ACCESS_32bit),  
> 
> vgic_mmio_write_spending() has some homebrew detection for is_uaccess, which causes
> vgic_hw_irq_spending() to do nothing. Isn't that now dead-code with this change?

Very good point, this deserves a cleanup.

> > diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
> > index 6e30034d1464..f1927ae02d2e 100644
> > --- a/virt/kvm/arm/vgic/vgic-mmio.c
> > +++ b/virt/kvm/arm/vgic/vgic-mmio.c
> > @@ -321,6 +321,27 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,  
> 
> > +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
> > +				gpa_t addr, unsigned int len,
> > +				unsigned long val)
> > +{
> > +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> > +	int i;
> > +	unsigned long flags;
> > +
> > +	for_each_set_bit(i, &val, len * 8) {
> > +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);  
> 
> vgic_mmio_write_spending() has:
> |	/* GICD_ISPENDR0 SGI bits are WI *
> 
> and bales out early. Is GIC_DIST_PENDING_SET the same register?
> (If so, shouldn't that be true for PPI too?)

Hmmm. It's a bit more complicated (surprisingly).

Yes, the SGI pending bits are WI from the guest perspective (as
required by the spec). But we still need to be able to restore them
from userspace, and I bet 82e40f558de56 ("KVM: arm/arm64: vgic-v2:
Handle SGI bits in GICD_I{S,C}PENDR0 as WI") has broken migration with
GICv2 (if you migrated with a pending SGI, you cannot restore it...).

Now, there is still a bug here, in the sense that we need to indicate
which vcpu is the source of the SGI (this is a GICv2-special).
Unfortunately, we don't have a way to communicate this architecturally.
The only option we have is to make it up (as a self-SGI, for example).
But this is pretty broken at the architectural level TBH.

On the other hand, PPIs are just fine.

> 
> 
> > +		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> > +		irq->pending_latch = true;
> > +		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
> > +
> > +		vgic_put_irq(vcpu->kvm, irq);
> > +	}
> > +
> > +	return 0;
> > +}  
> 
> > @@ -390,6 +411,26 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,  
> 
> > +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
> > +				gpa_t addr, unsigned int len,
> > +				unsigned long val)
> > +{
> > +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> > +	int i;
> > +	unsigned long flags;
> > +
> > +	for_each_set_bit(i, &val, len * 8) {
> > +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);  
> 
> Same dumb question about GICD_ICPENDR0!?

Not dumb at all! Given that we previously allowed this to be accessed
from userspace (well, before we broke it again), it should be able to
clear *something*. If we adopt the self-SGI behaviour as above, we will
get away with it.

Here's what I'm proposing to add to this patch, together with a
Fixes: 82e40f558de56 ("KVM: arm/arm64: vgic-v2: Handle SGI bits in GICD_I{S,C}PENDR0 as WI")

Nobody is using GICv2, obviously... :-/

Thanks,

	M.

diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index f1927ae02d2e..974cdcf2f232 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -261,17 +261,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 	return value;
 }
 
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				 bool is_uaccess)
-{
-	if (is_uaccess)
-		return;
-
-	irq->pending_latch = true;
-	vgic_irq_set_phys_active(irq, true);
-}
-
 static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
 {
 	return (vgic_irq_is_sgi(irq->intid) &&
@@ -282,7 +271,6 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
-	bool is_uaccess = !kvm_get_running_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -312,10 +300,10 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			continue;
 		}
 
+		irq->pending_latch = true;
 		if (irq->hw)
-			vgic_hw_irq_spending(vcpu, irq, is_uaccess);
-		else
-			irq->pending_latch = true;
+			vgic_irq_set_phys_active(irq, true);
+
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -334,6 +322,15 @@ int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
 
 		raw_spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = true;
+
+		/*
+		 * GICv2 SGIs are terribly broken. We can't restore
+		 * the source of the interrupt, so just pick the vcpu
+		 * itself as the source...
+		 */
+		if (is_vgic_v2_sgi(vcpu, irq))
+			irq->source |= BIT(vcpu->vcpu_id);
+
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 
 		vgic_put_irq(vcpu->kvm, irq);
@@ -343,12 +340,8 @@ int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
 }
 
 /* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				 bool is_uaccess)
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
 {
-	if (is_uaccess)
-		return;
-
 	irq->pending_latch = false;
 
 	/*
@@ -371,7 +364,6 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
-	bool is_uaccess = !kvm_get_running_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -402,7 +394,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 		}
 
 		if (irq->hw)
-			vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
+			vgic_hw_irq_cpending(vcpu, irq);
 		else
 			irq->pending_latch = false;
 
@@ -423,7 +415,22 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = false;
+		/*
+		 * More fun with GICv2 SGIs! If we're clearing one of them
+		 * from userspace, which source vcpu to clear?  Let's pick
+		 * the target vcpu itself (consistent whith the way we
+		 * populate them on the ISPENDR side), and only clear the
+		 * pending state if no sources are left (insert expletive
+		 * here).
+		 */
+		if (is_vgic_v2_sgi(vcpu, irq)) {
+			irq->source &= ~BIT(vcpu->vcpu_id);
+			if (!irq->source)
+				irq->pending_latch = false;
+		} else {
+			irq->pending_latch = false;
+		}
+
 		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 		vgic_put_irq(vcpu->kvm, irq);
James Morse April 17, 2020, 4:48 p.m. UTC | #3
Hi Marc,

On 17/04/2020 13:41, Marc Zyngier wrote:
> On Fri, 17 Apr 2020 12:22:10 +0100 James Morse <james.morse@arm.com> wrote:
>> On 17/04/2020 09:33, Marc Zyngier wrote:
>>> There is no point in accessing the HW when writing to any of the
>>> ISPENDR/ICPENDR registers from userspace, as only the guest should
>>> be allowed to change the HW state.
>>>
>>> Introduce new userspace-specific accessors that deal solely with
>>> the virtual state. Note that the API differs from that of GICv3,
>>> where userspace exclusively uses ISPENDR to set the state. Too
>>> bad we can't reuse it.  

>>> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
>>> index 6e30034d1464..f1927ae02d2e 100644
>>> --- a/virt/kvm/arm/vgic/vgic-mmio.c
>>> +++ b/virt/kvm/arm/vgic/vgic-mmio.c
>>> @@ -321,6 +321,27 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,  
>>
>>> +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
>>> +				gpa_t addr, unsigned int len,
>>> +				unsigned long val)
>>> +{
>>> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
>>> +	int i;
>>> +	unsigned long flags;
>>> +
>>> +	for_each_set_bit(i, &val, len * 8) {
>>> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);  
>>
>> vgic_mmio_write_spending() has:
>> |	/* GICD_ISPENDR0 SGI bits are WI *
>>
>> and bales out early. Is GIC_DIST_PENDING_SET the same register?
>> (If so, shouldn't that be true for PPI too?)
> 
> Hmmm. It's a bit more complicated (surprisingly).
> 
> Yes, the SGI pending bits are WI from the guest perspective (as
> required by the spec).

> But we still need to be able to restore them
> from userspace, and I bet 82e40f558de56 ("KVM: arm/arm64: vgic-v2:
> Handle SGI bits in GICD_I{S,C}PENDR0 as WI") has broken migration with
> GICv2 (if you migrated with a pending SGI, you cannot restore it...).

Fun! It looks like the ioctl() would succeed, but nothing happened. Once you restart the
guest one CPU may wait forever for the victim to respond.


> Now, there is still a bug here, in the sense that we need to indicate
> which vcpu is the source of the SGI (this is a GICv2-special).
> Unfortunately, we don't have a way to communicate this architecturally.
> The only option we have is to make it up (as a self-SGI, for example).
> But this is pretty broken at the architectural level TBH.
> On the other hand, PPIs are just fine.

Yup, wrong spec, I was looking at the same register in GICv3! It looks like the GICv3 text
is there because those registers live in the redistributor instead... duh!


>>> @@ -390,6 +411,26 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,  
>>
>>> +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
>>> +				gpa_t addr, unsigned int len,
>>> +				unsigned long val)
>>> +{
>>> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
>>> +	int i;
>>> +	unsigned long flags;
>>> +
>>> +	for_each_set_bit(i, &val, len * 8) {
>>> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);  
>>
>> Same dumb question about GICD_ICPENDR0!?
> 
> Not dumb at all! Given that we previously allowed this to be accessed
> from userspace (well, before we broke it again), it should be able to
> clear *something*. If we adopt the self-SGI behaviour as above, we will
> get away with it.
> 
> Here's what I'm proposing to add to this patch, together with a
> Fixes: 82e40f558de56 ("KVM: arm/arm64: vgic-v2: Handle SGI bits in GICD_I{S,C}PENDR0 as WI")
> 
> Nobody is using GICv2, obviously... :-/

> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
> index f1927ae02d2e..974cdcf2f232 100644
> --- a/virt/kvm/arm/vgic/vgic-mmio.c
> +++ b/virt/kvm/arm/vgic/vgic-mmio.c

> @@ -334,6 +322,15 @@ int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
>  
>  		raw_spin_lock_irqsave(&irq->irq_lock, flags);
>  		irq->pending_latch = true;
> +
> +		/*
> +		 * GICv2 SGIs are terribly broken. We can't restore
> +		 * the source of the interrupt, so just pick the vcpu
> +		 * itself as the source...

Makes sense, this way you can't have an SGI coming from an offline CPU!


> +		 */
> +		if (is_vgic_v2_sgi(vcpu, irq))
> +			irq->source |= BIT(vcpu->vcpu_id);
> +
>  		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
>  
>  		vgic_put_irq(vcpu->kvm, irq);

> @@ -423,7 +415,22 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
>  		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
>  
>  		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> -		irq->pending_latch = false;
> +		/*
> +		 * More fun with GICv2 SGIs! If we're clearing one of them
> +		 * from userspace, which source vcpu to clear?  Let's pick
> +		 * the target vcpu itself (consistent whith the way we
> +		 * populate them on the ISPENDR side), and only clear the
> +		 * pending state if no sources are left (insert expletive
> +		 * here).

But I'm not so sure about this. Doesn't this mean that user-space can't clear pending-SGI?
Only if its pending due to self-SGI. I'm not sure when user-space would want to do this,
so it may not matter.

Using ffs() you could clear the lowest pending source, I assume if its pending, there is
likely only one source. If not, user-space can eventually clear pending SGI with at most
nr-vcpu calls ... and ffs() could double up as the missing expletive!

(but if user-space never actually does this, then we should do the simplest thing)


> +		 */
> +		if (is_vgic_v2_sgi(vcpu, irq)) {
> +			irq->source &= ~BIT(vcpu->vcpu_id);
> +			if (!irq->source)
> +				irq->pending_latch = false;
> +		} else {
> +			irq->pending_latch = false;
> +		}
> +
>  		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
>  
>  		vgic_put_irq(vcpu->kvm, irq);

Otherwise looks good to me,


Thanks,

James

[0]
https://static.docs.arm.com/ihi0069/f/IHI0069F_gic_architecture_specification_v3_and_v4.1.pdf
Marc Zyngier April 20, 2020, 10:03 a.m. UTC | #4
On Fri, 17 Apr 2020 17:48:34 +0100
James Morse <james.morse@arm.com> wrote:

Hi James,

> Hi Marc,
> 
> On 17/04/2020 13:41, Marc Zyngier wrote:
> > On Fri, 17 Apr 2020 12:22:10 +0100 James Morse <james.morse@arm.com> wrote:  
> >> On 17/04/2020 09:33, Marc Zyngier wrote:  
> >>> There is no point in accessing the HW when writing to any of the
> >>> ISPENDR/ICPENDR registers from userspace, as only the guest should
> >>> be allowed to change the HW state.
> >>>
> >>> Introduce new userspace-specific accessors that deal solely with
> >>> the virtual state. Note that the API differs from that of GICv3,
> >>> where userspace exclusively uses ISPENDR to set the state. Too
> >>> bad we can't reuse it.    
> 
> >>> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
> >>> index 6e30034d1464..f1927ae02d2e 100644
> >>> --- a/virt/kvm/arm/vgic/vgic-mmio.c
> >>> +++ b/virt/kvm/arm/vgic/vgic-mmio.c
> >>> @@ -321,6 +321,27 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,    
> >>  
> >>> +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
> >>> +				gpa_t addr, unsigned int len,
> >>> +				unsigned long val)
> >>> +{
> >>> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> >>> +	int i;
> >>> +	unsigned long flags;
> >>> +
> >>> +	for_each_set_bit(i, &val, len * 8) {
> >>> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);    
> >>
> >> vgic_mmio_write_spending() has:
> >> |	/* GICD_ISPENDR0 SGI bits are WI *
> >>
> >> and bales out early. Is GIC_DIST_PENDING_SET the same register?
> >> (If so, shouldn't that be true for PPI too?)  
> > 
> > Hmmm. It's a bit more complicated (surprisingly).
> > 
> > Yes, the SGI pending bits are WI from the guest perspective (as
> > required by the spec).  
> 
> > But we still need to be able to restore them
> > from userspace, and I bet 82e40f558de56 ("KVM: arm/arm64: vgic-v2:
> > Handle SGI bits in GICD_I{S,C}PENDR0 as WI") has broken migration with
> > GICv2 (if you migrated with a pending SGI, you cannot restore it...).  
> 
> Fun! It looks like the ioctl() would succeed, but nothing happened. Once you restart the
> guest one CPU may wait forever for the victim to respond.

Yup. I can only see two reason for this not being reported: nobody
tests live migration with GICv2 (most probable), or we're incredibly
lucky by having never take a snapshot of a pending SGI. Either way,
this needs fixing.

> > Now, there is still a bug here, in the sense that we need to indicate
> > which vcpu is the source of the SGI (this is a GICv2-special).
> > Unfortunately, we don't have a way to communicate this architecturally.
> > The only option we have is to make it up (as a self-SGI, for example).
> > But this is pretty broken at the architectural level TBH.
> > On the other hand, PPIs are just fine.  
> 
> Yup, wrong spec, I was looking at the same register in GICv3! It looks like the GICv3 text
> is there because those registers live in the redistributor instead... duh!
> 
> 
> >>> @@ -390,6 +411,26 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,    
> >>  
> >>> +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
> >>> +				gpa_t addr, unsigned int len,
> >>> +				unsigned long val)
> >>> +{
> >>> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
> >>> +	int i;
> >>> +	unsigned long flags;
> >>> +
> >>> +	for_each_set_bit(i, &val, len * 8) {
> >>> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);    
> >>
> >> Same dumb question about GICD_ICPENDR0!?  
> > 
> > Not dumb at all! Given that we previously allowed this to be accessed
> > from userspace (well, before we broke it again), it should be able to
> > clear *something*. If we adopt the self-SGI behaviour as above, we will
> > get away with it.
> > 
> > Here's what I'm proposing to add to this patch, together with a
> > Fixes: 82e40f558de56 ("KVM: arm/arm64: vgic-v2: Handle SGI bits in GICD_I{S,C}PENDR0 as WI")
> > 
> > Nobody is using GICv2, obviously... :-/  
> 
> > diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
> > index f1927ae02d2e..974cdcf2f232 100644
> > --- a/virt/kvm/arm/vgic/vgic-mmio.c
> > +++ b/virt/kvm/arm/vgic/vgic-mmio.c  
> 
> > @@ -334,6 +322,15 @@ int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
> >  
> >  		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> >  		irq->pending_latch = true;
> > +
> > +		/*
> > +		 * GICv2 SGIs are terribly broken. We can't restore
> > +		 * the source of the interrupt, so just pick the vcpu
> > +		 * itself as the source...  
> 
> Makes sense, this way you can't have an SGI coming from an offline CPU!
> 
> 
> > +		 */
> > +		if (is_vgic_v2_sgi(vcpu, irq))
> > +			irq->source |= BIT(vcpu->vcpu_id);
> > +
> >  		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
> >  
> >  		vgic_put_irq(vcpu->kvm, irq);  
> 
> > @@ -423,7 +415,22 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
> >  		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
> >  
> >  		raw_spin_lock_irqsave(&irq->irq_lock, flags);
> > -		irq->pending_latch = false;
> > +		/*
> > +		 * More fun with GICv2 SGIs! If we're clearing one of them
> > +		 * from userspace, which source vcpu to clear?  Let's pick
> > +		 * the target vcpu itself (consistent whith the way we
> > +		 * populate them on the ISPENDR side), and only clear the
> > +		 * pending state if no sources are left (insert expletive
> > +		 * here).  
> 
> But I'm not so sure about this. Doesn't this mean that user-space can't clear pending-SGI?
> Only if its pending due to self-SGI. I'm not sure when user-space would want to do this,
> so it may not matter.

In general, userspace just sets the pending bit, and doesn't bother
clearing anything (because by default, there is nothing to clear).

> Using ffs() you could clear the lowest pending source, I assume if its pending, there is
> likely only one source. If not, user-space can eventually clear pending SGI with at most
> nr-vcpu calls ... and ffs() could double up as the missing expletive!

;-)

> (but if user-space never actually does this, then we should do the simplest thing)

A third way would be to align on what GICv3 does, which is that ISPENDR
is used for both setting and clearing in one go. Given that the current
state it broken (and has been for some time now), I'm tempted to adopt
the same behaviour...

What do you think?

	M.
James Morse April 22, 2020, 3:55 p.m. UTC | #5
Hi Marc,

On 20/04/2020 11:03, Marc Zyngier wrote:
> On Fri, 17 Apr 2020 17:48:34 +0100
> James Morse <james.morse@arm.com> wrote:
>> On 17/04/2020 13:41, Marc Zyngier wrote:
>>> On Fri, 17 Apr 2020 12:22:10 +0100 James Morse <james.morse@arm.com> wrote:  
>>>> On 17/04/2020 09:33, Marc Zyngier wrote:  
>>>>> There is no point in accessing the HW when writing to any of the
>>>>> ISPENDR/ICPENDR registers from userspace, as only the guest should
>>>>> be allowed to change the HW state.
>>>>>
>>>>> Introduce new userspace-specific accessors that deal solely with
>>>>> the virtual state. Note that the API differs from that of GICv3,
>>>>> where userspace exclusively uses ISPENDR to set the state. Too
>>>>> bad we can't reuse it.    
>>
>>>>> diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
>>>>> index 6e30034d1464..f1927ae02d2e 100644
>>>>> --- a/virt/kvm/arm/vgic/vgic-mmio.c
>>>>> +++ b/virt/kvm/arm/vgic/vgic-mmio.c
>>>>> @@ -321,6 +321,27 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,    
>>>>  
>>>>> +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
>>>>> +				gpa_t addr, unsigned int len,
>>>>> +				unsigned long val)
>>>>> +{
>>>>> +	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
>>>>> +	int i;
>>>>> +	unsigned long flags;
>>>>> +
>>>>> +	for_each_set_bit(i, &val, len * 8) {
>>>>> +		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);    
>>>>
>>>> vgic_mmio_write_spending() has:
>>>> |	/* GICD_ISPENDR0 SGI bits are WI *
>>>>
>>>> and bales out early. Is GIC_DIST_PENDING_SET the same register?
>>>> (If so, shouldn't that be true for PPI too?)  
>>>
>>> Hmmm. It's a bit more complicated (surprisingly).
>>>
>>> Yes, the SGI pending bits are WI from the guest perspective (as
>>> required by the spec).  
>>
>>> But we still need to be able to restore them
>>> from userspace, and I bet 82e40f558de56 ("KVM: arm/arm64: vgic-v2:
>>> Handle SGI bits in GICD_I{S,C}PENDR0 as WI") has broken migration with
>>> GICv2 (if you migrated with a pending SGI, you cannot restore it...).  
>>
>> Fun! It looks like the ioctl() would succeed, but nothing happened. Once you restart the
>> guest one CPU may wait forever for the victim to respond.
> 
> Yup. I can only see two reason for this not being reported: nobody
> tests live migration with GICv2 (most probable), or we're incredibly
> lucky by having never take a snapshot of a pending SGI. Either way,
> this needs fixing.
> 
>>> Now, there is still a bug here, in the sense that we need to indicate
>>> which vcpu is the source of the SGI (this is a GICv2-special).
>>> Unfortunately, we don't have a way to communicate this architecturally.
>>> The only option we have is to make it up (as a self-SGI, for example).
>>> But this is pretty broken at the architectural level TBH.
>>> On the other hand, PPIs are just fine.  

[...]

>>> Not dumb at all! Given that we previously allowed this to be accessed
>>> from userspace (well, before we broke it again), it should be able to
>>> clear *something*. If we adopt the self-SGI behaviour as above, we will
>>> get away with it.

>>> @@ -423,7 +415,22 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
>>>  		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
>>>  
>>>  		raw_spin_lock_irqsave(&irq->irq_lock, flags);
>>> -		irq->pending_latch = false;
>>> +		/*
>>> +		 * More fun with GICv2 SGIs! If we're clearing one of them
>>> +		 * from userspace, which source vcpu to clear?  Let's pick
>>> +		 * the target vcpu itself (consistent whith the way we
>>> +		 * populate them on the ISPENDR side), and only clear the
>>> +		 * pending state if no sources are left (insert expletive
>>> +		 * here).  
>>
>> But I'm not so sure about this. Doesn't this mean that user-space can't clear pending-SGI?
>> Only if its pending due to self-SGI. I'm not sure when user-space would want to do this,
>> so it may not matter.
> 
> In general, userspace just sets the pending bit, and doesn't bother
> clearing anything (because by default, there is nothing to clear).

>> (but if user-space never actually does this, then we should do the simplest thing)

Adding printk() to this combined patch and using 'loadvm' on the qemu console, I see Qemu
writing '0xffffffff' into cpending to clear all 16 SGIs. I guess it is 'resetting' the
in-kernel state to replace it with the state read from disk.


> A third way would be to align on what GICv3 does, which is that ISPENDR
> is used for both setting and clearing in one go. Given that the current
> state it broken (and has been for some time now), I'm tempted to adopt
> the same behaviour...

> What do you think?

I think Qemu is expecting the bank of cpending writes to clear whatever the kernel has
stored, so that it can replay the new state. Ignoring the cpending writes means the kernel
keeps an interrupt pending if nothing else in that 64bit group was set. Its not what Qemu
expects, it looks like we'd get away with it, but I don't think we should do it!

I think we should let user-space write to those WI registers, and clearing the SGIs should
clear all sources of SGI...

(N.B. I hit the original issue by typing 'loadvm' in the wrong terminal)


Thanks,

James
Marc Zyngier April 22, 2020, 4:02 p.m. UTC | #6
Hi James,

On 2020-04-22 16:55, James Morse wrote:
> Hi Marc,
> 
> On 20/04/2020 11:03, Marc Zyngier wrote:
>> On Fri, 17 Apr 2020 17:48:34 +0100
>> James Morse <james.morse@arm.com> wrote:

[...]

>>> (but if user-space never actually does this, then we should do the 
>>> simplest thing)
> 
> Adding printk() to this combined patch and using 'loadvm' on the qemu
> console, I see Qemu
> writing '0xffffffff' into cpending to clear all 16 SGIs. I guess it is
> 'resetting' the
> in-kernel state to replace it with the state read from disk.
> 
> 
>> A third way would be to align on what GICv3 does, which is that 
>> ISPENDR
>> is used for both setting and clearing in one go. Given that the 
>> current
>> state it broken (and has been for some time now), I'm tempted to adopt
>> the same behaviour...
> 
>> What do you think?
> 
> I think Qemu is expecting the bank of cpending writes to clear
> whatever the kernel has
> stored, so that it can replay the new state. Ignoring the cpending
> writes means the kernel
> keeps an interrupt pending if nothing else in that 64bit group was
> set. Its not what Qemu
> expects, it looks like we'd get away with it, but I don't think we 
> should do it!
> 
> I think we should let user-space write to those WI registers, and
> clearing the SGIs should clear all sources of SGI...

I'd be happy with that. Let me rework the patch, and I'll post the 
series again
shortly.

Thanks,

         M.
diff mbox series

Patch

diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index f51c6e939c76..a016f07adc28 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -417,10 +417,12 @@  static const struct vgic_register_region vgic_v2_dist_registers[] = {
 		NULL, vgic_uaccess_write_cenable, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
-		vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		NULL, vgic_uaccess_write_spending, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		NULL, vgic_uaccess_write_cpending, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
 		vgic_mmio_read_active, vgic_mmio_write_sactive,
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 6e30034d1464..f1927ae02d2e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -321,6 +321,27 @@  void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 	}
 }
 
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = true;
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
 /* Must be called with irq->irq_lock held */
 static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 				 bool is_uaccess)
@@ -390,6 +411,26 @@  void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 	}
 }
 
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = false;
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
 
 /*
  * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 327d0a6938e4..fefcca2b14dc 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -157,6 +157,14 @@  void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val);
 
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val);
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val);
+
 unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 				    gpa_t addr, unsigned int len);