diff mbox

[v4,07/11] KVM: arm/arm64: vgic: Allow HW interrupts to be queued to a guest

Message ID 1438962347-17541-8-git-send-email-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marc Zyngier Aug. 7, 2015, 3:45 p.m. UTC
To allow a HW interrupt to be injected into a guest, we lookup the
guest virtual interrupt in the irq_phys_map list, and if we have
a match, encode both interrupts in the LR.

We also mark the interrupt as "active" at the host distributor level.

On guest EOI on the virtual interrupt, the host interrupt will be
deactivated.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

Comments

Andre Przywara Sept. 23, 2015, 5:55 p.m. UTC | #1
Salut Marc,

I know that this patch is already merged, but ....

On 07/08/15 16:45, Marc Zyngier wrote:
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 51c9900..9d009d2 100644
...
> @@ -1364,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>  	return level_pending;
>  }
>  
> +/*
> + * Save the physical active state, and reset it to inactive.
> + *
> + * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
> + */
> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
> +{
> +	struct irq_phys_map *map;
> +	int ret;
> +
> +	if (!(vlr.state & LR_HW))
> +		return 0;
> +
> +	map = vgic_irq_map_search(vcpu, vlr.irq);
> +	BUG_ON(!map || !map->active);
> +
> +	ret = irq_get_irqchip_state(map->irq,
> +				    IRQCHIP_STATE_ACTIVE,
> +				    &map->active);
> +
> +	WARN_ON(ret);
> +
> +	if (map->active) {
> +		ret = irq_set_irqchip_state(map->irq,
> +					    IRQCHIP_STATE_ACTIVE,
> +					    false);
> +		WARN_ON(ret);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
>  /* Sync back the VGIC state after a guest run */
>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  {
> @@ -1378,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  	elrsr = vgic_get_elrsr(vcpu);
>  	elrsr_ptr = u64_to_bitmask(&elrsr);
>  
> -	/* Clear mappings for empty LRs */
> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
>  		struct vgic_lr vlr;
>  
> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
> +		if (!test_bit(lr, vgic_cpu->lr_used))
>  			continue;
>  
>  		vlr = vgic_get_lr(vcpu, lr);
> +		if (vgic_sync_hwirq(vcpu, vlr)) {
> +			/*
> +			 * So this is a HW interrupt that the guest
> +			 * EOI-ed. Clean the LR state and allow the
> +			 * interrupt to be sampled again.
> +			 */
> +			vlr.state = 0;
> +			vlr.hwirq = 0;
> +			vgic_set_lr(vcpu, lr, vlr);
> +			vgic_irq_clear_queued(vcpu, vlr.irq);

Isn't this line altering common VGIC state without holding the lock?
Eric removed the coarse dist->lock around the whole
__kvm_vgic_sync_hwstate() function, we take it now in
vgic_process_maintenance(), but don't hold it here AFAICT.
As long as we are only dealing with private timer IRQs this is probably
not a problem, but the IRQ number could be a SPI as well, right?

Cheers,
Andre.

> +			set_bit(lr, elrsr_ptr);
> +		}
> +
> +		if (!test_bit(lr, elrsr_ptr))
> +			continue;
> +
> +		clear_bit(lr, vgic_cpu->lr_used);
>  
>  		BUG_ON(vlr.irq >= dist->nr_irqs);
>  		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoffer Dall Sept. 29, 2015, 1:44 p.m. UTC | #2
On Wed, Sep 23, 2015 at 06:55:04PM +0100, Andre Przywara wrote:
> Salut Marc,
> 
> I know that this patch is already merged, but ....
> 
> On 07/08/15 16:45, Marc Zyngier wrote:
> > diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> > index 51c9900..9d009d2 100644
> ...
> > @@ -1364,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
> >  	return level_pending;
> >  }
> >  
> > +/*
> > + * Save the physical active state, and reset it to inactive.
> > + *
> > + * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
> > + */
> > +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
> > +{
> > +	struct irq_phys_map *map;
> > +	int ret;
> > +
> > +	if (!(vlr.state & LR_HW))
> > +		return 0;
> > +
> > +	map = vgic_irq_map_search(vcpu, vlr.irq);
> > +	BUG_ON(!map || !map->active);
> > +
> > +	ret = irq_get_irqchip_state(map->irq,
> > +				    IRQCHIP_STATE_ACTIVE,
> > +				    &map->active);
> > +
> > +	WARN_ON(ret);
> > +
> > +	if (map->active) {
> > +		ret = irq_set_irqchip_state(map->irq,
> > +					    IRQCHIP_STATE_ACTIVE,
> > +					    false);
> > +		WARN_ON(ret);
> > +		return 0;
> > +	}
> > +
> > +	return 1;
> > +}
> > +
> >  /* Sync back the VGIC state after a guest run */
> >  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
> >  {
> > @@ -1378,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
> >  	elrsr = vgic_get_elrsr(vcpu);
> >  	elrsr_ptr = u64_to_bitmask(&elrsr);
> >  
> > -	/* Clear mappings for empty LRs */
> > -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
> > +	/* Deal with HW interrupts, and clear mappings for empty LRs */
> > +	for (lr = 0; lr < vgic->nr_lr; lr++) {
> >  		struct vgic_lr vlr;
> >  
> > -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
> > +		if (!test_bit(lr, vgic_cpu->lr_used))
> >  			continue;
> >  
> >  		vlr = vgic_get_lr(vcpu, lr);
> > +		if (vgic_sync_hwirq(vcpu, vlr)) {
> > +			/*
> > +			 * So this is a HW interrupt that the guest
> > +			 * EOI-ed. Clean the LR state and allow the
> > +			 * interrupt to be sampled again.
> > +			 */
> > +			vlr.state = 0;
> > +			vlr.hwirq = 0;
> > +			vgic_set_lr(vcpu, lr, vlr);
> > +			vgic_irq_clear_queued(vcpu, vlr.irq);
> 
> Isn't this line altering common VGIC state without holding the lock?
> Eric removed the coarse dist->lock around the whole
> __kvm_vgic_sync_hwstate() function, we take it now in
> vgic_process_maintenance(), but don't hold it here AFAICT.
> As long as we are only dealing with private timer IRQs this is probably
> not a problem, but the IRQ number could be a SPI as well, right?
> 
I don't see a problematic race with this though, as all we're doing is
to clear a bit in a bitmap, which is always checked atomically, so
adding a lock around this really doesn't change anything as far as I can
tell.

Nevertheless, my rework series also addresses this.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andre Przywara Sept. 30, 2015, 2:06 p.m. UTC | #3
Hi Christoffer,

On 29/09/15 14:44, Christoffer Dall wrote:
> On Wed, Sep 23, 2015 at 06:55:04PM +0100, Andre Przywara wrote:
>> Salut Marc,
>>
>> I know that this patch is already merged, but ....
>>
>> On 07/08/15 16:45, Marc Zyngier wrote:
>>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>>> index 51c9900..9d009d2 100644
>> ...
>>> @@ -1364,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>>  	return level_pending;
>>>  }
>>>  
>>> +/*
>>> + * Save the physical active state, and reset it to inactive.
>>> + *
>>> + * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
>>> + */
>>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>>> +{
>>> +	struct irq_phys_map *map;
>>> +	int ret;
>>> +
>>> +	if (!(vlr.state & LR_HW))
>>> +		return 0;
>>> +
>>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
>>> +	BUG_ON(!map || !map->active);
>>> +
>>> +	ret = irq_get_irqchip_state(map->irq,
>>> +				    IRQCHIP_STATE_ACTIVE,
>>> +				    &map->active);
>>> +
>>> +	WARN_ON(ret);
>>> +
>>> +	if (map->active) {
>>> +		ret = irq_set_irqchip_state(map->irq,
>>> +					    IRQCHIP_STATE_ACTIVE,
>>> +					    false);
>>> +		WARN_ON(ret);
>>> +		return 0;
>>> +	}
>>> +
>>> +	return 1;
>>> +}
>>> +
>>>  /* Sync back the VGIC state after a guest run */
>>>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>>>  {
>>> @@ -1378,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>>>  	elrsr = vgic_get_elrsr(vcpu);
>>>  	elrsr_ptr = u64_to_bitmask(&elrsr);
>>>  
>>> -	/* Clear mappings for empty LRs */
>>> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
>>> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
>>> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
>>>  		struct vgic_lr vlr;
>>>  
>>> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
>>> +		if (!test_bit(lr, vgic_cpu->lr_used))
>>>  			continue;
>>>  
>>>  		vlr = vgic_get_lr(vcpu, lr);
>>> +		if (vgic_sync_hwirq(vcpu, vlr)) {
>>> +			/*
>>> +			 * So this is a HW interrupt that the guest
>>> +			 * EOI-ed. Clean the LR state and allow the
>>> +			 * interrupt to be sampled again.
>>> +			 */
>>> +			vlr.state = 0;
>>> +			vlr.hwirq = 0;
>>> +			vgic_set_lr(vcpu, lr, vlr);
>>> +			vgic_irq_clear_queued(vcpu, vlr.irq);
>>
>> Isn't this line altering common VGIC state without holding the lock?
>> Eric removed the coarse dist->lock around the whole
>> __kvm_vgic_sync_hwstate() function, we take it now in
>> vgic_process_maintenance(), but don't hold it here AFAICT.
>> As long as we are only dealing with private timer IRQs this is probably
>> not a problem, but the IRQ number could be a SPI as well, right?
>>
> I don't see a problematic race with this though, as all we're doing is
> to clear a bit in a bitmap, which is always checked atomically, so
> adding a lock around this really doesn't change anything as far as I can
> tell.

Indeed I found a similar comment in some older revisions of the code.

But isn't it that other code holding the lock (thinking about
kvm_vgic_flush_hwstate() in particular) assumes that no-one else tinkers
with the VGIC state while it holds the lock?
So couldn't we (potentially) run into inconsistent state because we
cleared the queued bit while the flushing code runs over all interrupts?
Maybe not in this particular case, but in general?

Haven't looked at your new series yet, but will do this ASAP.

Cheers,
Andre.

> 
> Nevertheless, my rework series also addresses this.
> 
> -Christoffer
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoffer Dall Oct. 1, 2015, 10:25 a.m. UTC | #4
On Wed, Sep 30, 2015 at 03:06:32PM +0100, Andre Przywara wrote:
> Hi Christoffer,
> 
> On 29/09/15 14:44, Christoffer Dall wrote:
> > On Wed, Sep 23, 2015 at 06:55:04PM +0100, Andre Przywara wrote:
> >> Salut Marc,
> >>
> >> I know that this patch is already merged, but ....
> >>
> >> On 07/08/15 16:45, Marc Zyngier wrote:
> >>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> >>> index 51c9900..9d009d2 100644
> >> ...
> >>> @@ -1364,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
> >>>  	return level_pending;
> >>>  }
> >>>  
> >>> +/*
> >>> + * Save the physical active state, and reset it to inactive.
> >>> + *
> >>> + * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
> >>> + */
> >>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
> >>> +{
> >>> +	struct irq_phys_map *map;
> >>> +	int ret;
> >>> +
> >>> +	if (!(vlr.state & LR_HW))
> >>> +		return 0;
> >>> +
> >>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
> >>> +	BUG_ON(!map || !map->active);
> >>> +
> >>> +	ret = irq_get_irqchip_state(map->irq,
> >>> +				    IRQCHIP_STATE_ACTIVE,
> >>> +				    &map->active);
> >>> +
> >>> +	WARN_ON(ret);
> >>> +
> >>> +	if (map->active) {
> >>> +		ret = irq_set_irqchip_state(map->irq,
> >>> +					    IRQCHIP_STATE_ACTIVE,
> >>> +					    false);
> >>> +		WARN_ON(ret);
> >>> +		return 0;
> >>> +	}
> >>> +
> >>> +	return 1;
> >>> +}
> >>> +
> >>>  /* Sync back the VGIC state after a guest run */
> >>>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
> >>>  {
> >>> @@ -1378,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
> >>>  	elrsr = vgic_get_elrsr(vcpu);
> >>>  	elrsr_ptr = u64_to_bitmask(&elrsr);
> >>>  
> >>> -	/* Clear mappings for empty LRs */
> >>> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
> >>> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
> >>> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
> >>>  		struct vgic_lr vlr;
> >>>  
> >>> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
> >>> +		if (!test_bit(lr, vgic_cpu->lr_used))
> >>>  			continue;
> >>>  
> >>>  		vlr = vgic_get_lr(vcpu, lr);
> >>> +		if (vgic_sync_hwirq(vcpu, vlr)) {
> >>> +			/*
> >>> +			 * So this is a HW interrupt that the guest
> >>> +			 * EOI-ed. Clean the LR state and allow the
> >>> +			 * interrupt to be sampled again.
> >>> +			 */
> >>> +			vlr.state = 0;
> >>> +			vlr.hwirq = 0;
> >>> +			vgic_set_lr(vcpu, lr, vlr);
> >>> +			vgic_irq_clear_queued(vcpu, vlr.irq);
> >>
> >> Isn't this line altering common VGIC state without holding the lock?
> >> Eric removed the coarse dist->lock around the whole
> >> __kvm_vgic_sync_hwstate() function, we take it now in
> >> vgic_process_maintenance(), but don't hold it here AFAICT.
> >> As long as we are only dealing with private timer IRQs this is probably
> >> not a problem, but the IRQ number could be a SPI as well, right?
> >>
> > I don't see a problematic race with this though, as all we're doing is
> > to clear a bit in a bitmap, which is always checked atomically, so
> > adding a lock around this really doesn't change anything as far as I can
> > tell.
> 
> Indeed I found a similar comment in some older revisions of the code.
> 
> But isn't it that other code holding the lock (thinking about
> kvm_vgic_flush_hwstate() in particular) assumes that no-one else tinkers
> with the VGIC state while it holds the lock?
> So couldn't we (potentially) run into inconsistent state because we
> cleared the queued bit while the flushing code runs over all interrupts?
> Maybe not in this particular case, but in general?

In general, yes, you should lock operations accessing the distributor.

It just feels silly to do 

spin_lock();
vgic_irq_clear_queued(...);
spin_unlock();

because vgic_irq_clear_queued just clears a bit and I don't see the
race.

> 
> Haven't looked at your new series yet, but will do this ASAP.
> 
Thanks, much appreciated.  Based on your comment on the previous
version, the whole thing is now wrapped in a spinlock so this point is
moot.  Unless there's a clear race to be fixed here, I would prefer if
we focus our energy on getting the other series merged.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 51c9900..9d009d2 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1140,6 +1140,39 @@  static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
 	if (!vgic_irq_is_edge(vcpu, irq))
 		vlr.state |= LR_EOI_INT;
 
+	if (vlr.irq >= VGIC_NR_SGIS) {
+		struct irq_phys_map *map;
+		map = vgic_irq_map_search(vcpu, irq);
+
+		/*
+		 * If we have a mapping, and the virtual interrupt is
+		 * being injected, then we must set the state to
+		 * active in the physical world. Otherwise the
+		 * physical interrupt will fire and the guest will
+		 * exit before processing the virtual interrupt.
+		 */
+		if (map) {
+			int ret;
+
+			BUG_ON(!map->active);
+			vlr.hwirq = map->phys_irq;
+			vlr.state |= LR_HW;
+			vlr.state &= ~LR_EOI_INT;
+
+			ret = irq_set_irqchip_state(map->irq,
+						    IRQCHIP_STATE_ACTIVE,
+						    true);
+			WARN_ON(ret);
+
+			/*
+			 * Make sure we're not going to sample this
+			 * again, as a HW-backed interrupt cannot be
+			 * in the PENDING_ACTIVE stage.
+			 */
+			vgic_irq_set_queued(vcpu, irq);
+		}
+	}
+
 	vgic_set_lr(vcpu, lr_nr, vlr);
 	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
@@ -1364,6 +1397,39 @@  static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 	return level_pending;
 }
 
+/*
+ * Save the physical active state, and reset it to inactive.
+ *
+ * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
+ */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+	struct irq_phys_map *map;
+	int ret;
+
+	if (!(vlr.state & LR_HW))
+		return 0;
+
+	map = vgic_irq_map_search(vcpu, vlr.irq);
+	BUG_ON(!map || !map->active);
+
+	ret = irq_get_irqchip_state(map->irq,
+				    IRQCHIP_STATE_ACTIVE,
+				    &map->active);
+
+	WARN_ON(ret);
+
+	if (map->active) {
+		ret = irq_set_irqchip_state(map->irq,
+					    IRQCHIP_STATE_ACTIVE,
+					    false);
+		WARN_ON(ret);
+		return 0;
+	}
+
+	return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -1378,14 +1444,31 @@  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	elrsr = vgic_get_elrsr(vcpu);
 	elrsr_ptr = u64_to_bitmask(&elrsr);
 
-	/* Clear mappings for empty LRs */
-	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+	/* Deal with HW interrupts, and clear mappings for empty LRs */
+	for (lr = 0; lr < vgic->nr_lr; lr++) {
 		struct vgic_lr vlr;
 
-		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+		if (!test_bit(lr, vgic_cpu->lr_used))
 			continue;
 
 		vlr = vgic_get_lr(vcpu, lr);
+		if (vgic_sync_hwirq(vcpu, vlr)) {
+			/*
+			 * So this is a HW interrupt that the guest
+			 * EOI-ed. Clean the LR state and allow the
+			 * interrupt to be sampled again.
+			 */
+			vlr.state = 0;
+			vlr.hwirq = 0;
+			vgic_set_lr(vcpu, lr, vlr);
+			vgic_irq_clear_queued(vcpu, vlr.irq);
+			set_bit(lr, elrsr_ptr);
+		}
+
+		if (!test_bit(lr, elrsr_ptr))
+			continue;
+
+		clear_bit(lr, vgic_cpu->lr_used);
 
 		BUG_ON(vlr.irq >= dist->nr_irqs);
 		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;