diff mbox

[07/10] KVM: arm/arm64: vgic: Allow HW interrupts to be queued to a guest

Message ID 1433783045-8002-8-git-send-email-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marc Zyngier June 8, 2015, 5:04 p.m. UTC
To allow a HW interrupt to be injected into a guest, we look up the
guest virtual interrupt in the irq_phys_map rbtree, and if we have
a match, encode both interrupts in the LR.

We also mark the interrupt as "active" at the host distributor level.

On guest EOI on the virtual interrupt, the host interrupt will be
deactivated.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 3 deletions(-)

Comments

Andre Przywara June 11, 2015, 8:44 a.m. UTC | #1
Hi Marc,

On 06/08/2015 06:04 PM, Marc Zyngier wrote:
> To allow a HW interrupt to be injected into a guest, we lookup the
> guest virtual interrupt in the irq_phys_map rbtree, and if we have
> a match, encode both interrupts in the LR.
> 
> We also mark the interrupt as "active" at the host distributor level.
> 
> On guest EOI on the virtual interrupt, the host interrupt will be
> deactivated.
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  virt/kvm/arm/vgic.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 68 insertions(+), 3 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index c6604f2..495ac7d 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1120,6 +1120,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>  	if (!vgic_irq_is_edge(vcpu, irq))
>  		vlr.state |= LR_EOI_INT;
>  
> +	if (vlr.irq >= VGIC_NR_SGIS) {
> +		struct irq_phys_map *map;
> +		map = vgic_irq_map_search(vcpu, irq);
> +
> +		if (map) {
> +			int ret;
> +
> +			BUG_ON(!map->active);
> +			vlr.hwirq = map->phys_irq;
> +			vlr.state |= LR_HW;
> +			vlr.state &= ~LR_EOI_INT;
> +
> +			ret = irq_set_irqchip_state(map->irq,
> +						    IRQCHIP_STATE_ACTIVE,
> +						    true);
> +			vgic_irq_set_queued(vcpu, irq);
> +			WARN_ON(ret);
> +		}
> +	}
> +
>  	vgic_set_lr(vcpu, lr_nr, vlr);
>  	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
>  }
> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>  	return level_pending;
>  }
>  
> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
> +{
> +	struct irq_phys_map *map;
> +	int ret;
> +
> +	if (!(vlr.state & LR_HW))
> +		return 0;
> +
> +	map = vgic_irq_map_search(vcpu, vlr.irq);

I wonder if it's safe to rely on that mapping here. Are we sure that
this hasn't changed while the VCPU was running? If I got this correctly,
currently only vcpu_reset will actually add a map entry, but I guess in
the future there will be more users.
Also we rely on the irqdomain mapping to be still the same, but that is
probably a safe assumption.

But I'd still find it more natural to use the hwirq number from the LR
at this point. Can't we use irq_find_mapping() here to learn Linux'
(current) irq number from that?

Or am I too paranoid here?

Cheers,
Andre.

> +	BUG_ON(!map || !map->active);
> +
> +	ret = irq_get_irqchip_state(map->irq,
> +				    IRQCHIP_STATE_ACTIVE,
> +				    &map->active);
> +
> +	WARN_ON(ret);
> +
> +	if (map->active) {
> +		ret = irq_set_irqchip_state(map->irq,
> +					    IRQCHIP_STATE_ACTIVE,
> +					    false);
> +		WARN_ON(ret);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
>  /* Sync back the VGIC state after a guest run */
>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  {
> @@ -1358,14 +1407,30 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  	elrsr = vgic_get_elrsr(vcpu);
>  	elrsr_ptr = u64_to_bitmask(&elrsr);
>  
> -	/* Clear mappings for empty LRs */
> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
>  		struct vgic_lr vlr;
>  
> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
> +		if (!test_bit(lr, vgic_cpu->lr_used))
>  			continue;
>  
>  		vlr = vgic_get_lr(vcpu, lr);
> +		if (vgic_sync_hwirq(vcpu, vlr)) {
> +			/*
> +			 * So this is a HW interrupt that the guest
> +			 * EOI-ed. Clean the LR state and allow the
> +			 * interrupt to be queued again.
> +			 */
> +			vlr.state &= ~LR_HW;
> +			vlr.hwirq = 0;
> +			vgic_set_lr(vcpu, lr, vlr);
> +			vgic_irq_clear_queued(vcpu, vlr.irq);
> +		}
> +
> +		if (!test_bit(lr, elrsr_ptr))
> +			continue;
> +
> +		clear_bit(lr, vgic_cpu->lr_used);
>  
>  		BUG_ON(vlr.irq >= dist->nr_irqs);
>  		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Zyngier June 11, 2015, 9:15 a.m. UTC | #2
On 11/06/15 09:44, Andre Przywara wrote:
> Hi Marc,
> 
> On 06/08/2015 06:04 PM, Marc Zyngier wrote:
>> To allow a HW interrupt to be injected into a guest, we lookup the
>> guest virtual interrupt in the irq_phys_map rbtree, and if we have
>> a match, encode both interrupts in the LR.
>>
>> We also mark the interrupt as "active" at the host distributor level.
>>
>> On guest EOI on the virtual interrupt, the host interrupt will be
>> deactivated.
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  virt/kvm/arm/vgic.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 68 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index c6604f2..495ac7d 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1120,6 +1120,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>>  	if (!vgic_irq_is_edge(vcpu, irq))
>>  		vlr.state |= LR_EOI_INT;
>>  
>> +	if (vlr.irq >= VGIC_NR_SGIS) {
>> +		struct irq_phys_map *map;
>> +		map = vgic_irq_map_search(vcpu, irq);
>> +
>> +		if (map) {
>> +			int ret;
>> +
>> +			BUG_ON(!map->active);
>> +			vlr.hwirq = map->phys_irq;
>> +			vlr.state |= LR_HW;
>> +			vlr.state &= ~LR_EOI_INT;
>> +
>> +			ret = irq_set_irqchip_state(map->irq,
>> +						    IRQCHIP_STATE_ACTIVE,
>> +						    true);
>> +			vgic_irq_set_queued(vcpu, irq);
>> +			WARN_ON(ret);
>> +		}
>> +	}
>> +
>>  	vgic_set_lr(vcpu, lr_nr, vlr);
>>  	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
>>  }
>> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>  	return level_pending;
>>  }
>>  
>> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>> +{
>> +	struct irq_phys_map *map;
>> +	int ret;
>> +
>> +	if (!(vlr.state & LR_HW))
>> +		return 0;
>> +
>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
> 
> I wonder if it's safe to rely on that mapping here. Are we sure that
> this hasn't changed while the VCPU was running? If I got this correctly,
> currently only vcpu_reset will actually add a map entry, but I guess in
> the future there will be more users.

How can the guest interrupt change? This is HW, as far as the guest is
concerned. An actual interrupt line. We don't reconfigure the HW live.

> Also we rely on the irqdomain mapping to be still the same, but that is
> probably a safe assumption.

Like I said before, this *cannot* change.

> But I'd still find it more natural to use the hwirq number from the LR
> at this point. Can't we use irq_find_mapping() here to learn Linux'
> (current) irq number from that?

I think you're confused.

- The guest irq (vlr.irq) is entirely made up, and has no connection
with reality. It is stable, and cannot change during the lifetime of the
guest (think of it as a HW irq line).

- The host hwirq (vlr.hwirq) is stable as well, for the same reason.

- The Linux IRQ cannot change because we've been given it by the kernel,
and that's what we use for *everything* as far as the kernel is
concerned. Its mapping to hwirq is stable as well because this is how we
talk to the HW.

- irq_find_mapping gives you the *reverse* mapping (from hwirq to Linux
irq), and for that to work, you need the domain on which you want to
apply the translation. This is only useful when actually taking the
interrupt (i.e. in an interrupt controller driver). I can't see how that
could make sense here.

The purpose of this mapping is to find, given the guest irq (because
that's what we inject), what the other values are:
- hwirq: to provide GICH with the interrupt to deactivate
- Linux irq: to control the active state through the irqchip state API.

> Or am I too paranoid here?

Hope it makes more sense to you now.

Thanks,

	M.
Andre Przywara June 11, 2015, 9:44 a.m. UTC | #3
On 06/11/2015 10:15 AM, Marc Zyngier wrote:
> On 11/06/15 09:44, Andre Przywara wrote:
>> On 06/08/2015 06:04 PM, Marc Zyngier wrote:
...
>>> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>>  	return level_pending;
>>>  }
>>>  
>>> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
>>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>>> +{
>>> +	struct irq_phys_map *map;
>>> +	int ret;
>>> +
>>> +	if (!(vlr.state & LR_HW))
>>> +		return 0;
>>> +
>>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
>>
>> I wonder if it's safe to rely on that mapping here. Are we sure that
>> this hasn't changed while the VCPU was running? If I got this correctly,
>> currently only vcpu_reset will actually add a map entry, but I guess in
>> the future there will be more users.
> 
> How can the guest interrupt change? This is HW, as far as the guest is
> concerned. An actual interrupt line. We don't reconfigure the HW live.

I was thinking about the rbtree mapping we introduced. There we map a
guest interrupt to a hardware interrupt. Are we sure that no one tears
down that mapping while we have an LR populated with this pair?
I am not talking about the timer here, but more about future users.

>> Also we rely on the irqdomain mapping to be still the same, but that is
>> probably a safe assumption.
> 
> Like I said before, this *cannot* change.

OK, got it.

> 
>> But I'd still find it more natural to use the hwirq number from the LR
>> at this point. Can't we use irq_find_mapping() here to learn Linux'
>> (current) irq number from that?
> 
> I think you're confused.
> 
> - The guest irq (vlr.irq) is entirely made up, and has no connection
> with reality. it is stable, and cannot change during the lifetime of the
> guest (think of it as a HW irq line).
> 
> - The host hwirq (vlr.hwirq) is stable as well, for the same reason.
> 
> - The Linux IRQ cannot change because we've been given it by the kernel,
> and that's what we use for *everything* as far as the kernel is
> concerned. Its mapping to hwirq is stable as well because this is how we
> talk to the HW.

Not disputing any of them, but:

> - irq_find_mapping gives you the *reverse* mapping (from hwirq to Linux
> irq), and for that to work, you need the domain on which you want to
> apply the translation. This is only useful when actually taking the
> interrupt (i.e. in an interrupt controller driver). I can't see how that
> could make sense here.

So if the guest has acked/EOIed its IRQ, the GIC at the same time
acked/EOIed the hardware IRQ it found in the LR. Now we assume that this
is the very same as the HW IRQ we found doing our rbtree traversal.
I just wanted to be sure that this is always true and that this mapping
didn't change while the VCPU was running.
If you are sure of this, fine, I was just concerned that someone breaks
this assumption in the future by more dynamically mapping/unmapping
entries (say some irq forwarding user) and we will not notice.

Cheers,
Andre.

> 
> The purpose of this mapping is to, given the guest irq (because that's
> what we inject), what the other values are:
> - hwirq: to provide GICH with the interrupt to deactivate
> - Linux irq: to control the active state through the irqchip state API.
> 
>> Or am I too paranoid here?
> 
> Hope it makes more sense to you now.
> 
> Thanks,
> 
> 	M.
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Zyngier June 11, 2015, 10:02 a.m. UTC | #4
On 11/06/15 10:44, Andre Przywara wrote:
> On 06/11/2015 10:15 AM, Marc Zyngier wrote:
>> On 11/06/15 09:44, Andre Przywara wrote:
>>> On 06/08/2015 06:04 PM, Marc Zyngier wrote:
> ...
>>>> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>>>  	return level_pending;
>>>>  }
>>>>  
>>>> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
>>>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>>>> +{
>>>> +	struct irq_phys_map *map;
>>>> +	int ret;
>>>> +
>>>> +	if (!(vlr.state & LR_HW))
>>>> +		return 0;
>>>> +
>>>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
>>>
>>> I wonder if it's safe to rely on that mapping here. Are we sure that
>>> this hasn't changed while the VCPU was running? If I got this correctly,
>>> currently only vcpu_reset will actually add a map entry, but I guess in
>>> the future there will be more users.
>>
>> How can the guest interrupt change? This is HW, as far as the guest is
>> concerned. An actual interrupt line. We don't reconfigure the HW live.
> 
> I was thinking about the rbtree mapping we introduced. There we map a
> guest interrupt to a hardware interrupt. Are we sure that no one tears
> down that mapping while we have an LR populated with this pair?
> I am not talking about the timer here, but more about future users.
> 
>>> Also we rely on the irqdomain mapping to be still the same, but that is
>>> probably a safe assumption.
>>
>> Like I said before, this *cannot* change.
> 
> OK, got it.
> 
>>
>>> But I'd still find it more natural to use the hwirq number from the LR
>>> at this point. Can't we use irq_find_mapping() here to learn Linux'
>>> (current) irq number from that?
>>
>> I think you're confused.
>>
>> - The guest irq (vlr.irq) is entirely made up, and has no connection
>> with reality. it is stable, and cannot change during the lifetime of the
>> guest (think of it as a HW irq line).
>>
>> - The host hwirq (vlr.hwirq) is stable as well, for the same reason.
>>
>> - The Linux IRQ cannot change because we've been given it by the kernel,
>> and that's what we use for *everything* as far as the kernel is
>> concerned. Its mapping to hwirq is stable as well because this is how we
>> talk to the HW.
> 
> Not disputing any of them, but:
> 
>> - irq_find_mapping gives you the *reverse* mapping (from hwirq to Linux
>> irq), and for that to work, you need the domain on which you want to
>> apply the translation. This is only useful when actually taking the
>> interrupt (i.e. in an interrupt controller driver). I can't see how that
>> could make sense here.
> 
> So if the guest has acked/EOIed it's IRQ, the GIC at the same time
> acked/EOIed the hardware IRQ it found in the LR. Now we assume that this
> is the very same as the HW IRQ we found doing our rbtree traversal.
> I just wanted to be sure that this is always true and that this mapping
> didn't change while the VCPU was running.
> If you are sure of this, fine, I was just concerned that someone breaks
> this assumption in the future by more dynamically mapping/unmapping
> entries (say some irq forwarding user) and we will not notice.

How can the mapping change? Are you thinking of an unmap/map operation
being done while the guest is running, replacing a HW device with
another? That's not an option, and not only for the interrupts.

	M.
Eric Auger June 15, 2015, 4:11 p.m. UTC | #5
On 06/11/2015 12:02 PM, Marc Zyngier wrote:
> On 11/06/15 10:44, Andre Przywara wrote:
>> On 06/11/2015 10:15 AM, Marc Zyngier wrote:
>>> On 11/06/15 09:44, Andre Przywara wrote:
>>>> On 06/08/2015 06:04 PM, Marc Zyngier wrote:
>> ...
>>>>> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>>>>  	return level_pending;
>>>>>  }
>>>>>  
>>>>> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
>>>>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>>>>> +{
>>>>> +	struct irq_phys_map *map;
>>>>> +	int ret;
>>>>> +
>>>>> +	if (!(vlr.state & LR_HW))
>>>>> +		return 0;
>>>>> +
>>>>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
>>>>
>>>> I wonder if it's safe to rely on that mapping here. Are we sure that
>>>> this hasn't changed while the VCPU was running? If I got this correctly,
>>>> currently only vcpu_reset will actually add a map entry, but I guess in
>>>> the future there will be more users.
>>>
>>> How can the guest interrupt change? This is HW, as far as the guest is
>>> concerned. An actual interrupt line. We don't reconfigure the HW live.
>>
>> I was thinking about the rbtree mapping we introduced. There we map a
>> guest interrupt to a hardware interrupt. Are we sure that no one tears
>> down that mapping while we have an LR populated with this pair?
>> I am not talking about the timer here, but more about future users.
>>
>>>> Also we rely on the irqdomain mapping to be still the same, but that is
>>>> probably a safe assumption.
>>>
>>> Like I said before, this *cannot* change.
>>
>> OK, got it.
>>
>>>
>>>> But I'd still find it more natural to use the hwirq number from the LR
>>>> at this point. Can't we use irq_find_mapping() here to learn Linux'
>>>> (current) irq number from that?
>>>
>>> I think you're confused.
>>>
>>> - The guest irq (vlr.irq) is entirely made up, and has no connection
>>> with reality. it is stable, and cannot change during the lifetime of the
>>> guest (think of it as a HW irq line).
>>>
>>> - The host hwirq (vlr.hwirq) is stable as well, for the same reason.
>>>
>>> - The Linux IRQ cannot change because we've been given it by the kernel,
>>> and that's what we use for *everything* as far as the kernel is
>>> concerned. Its mapping to hwirq is stable as well because this is how we
>>> talk to the HW.
>>
>> Not disputing any of them, but:
>>
>>> - irq_find_mapping gives you the *reverse* mapping (from hwirq to Linux
>>> irq), and for that to work, you need the domain on which you want to
>>> apply the translation. This is only useful when actually taking the
>>> interrupt (i.e. in an interrupt controller driver). I can't see how that
>>> could make sense here.
>>
>> So if the guest has acked/EOIed it's IRQ, the GIC at the same time
>> acked/EOIed the hardware IRQ it found in the LR. Now we assume that this
>> is the very same as the HW IRQ we found doing our rbtree traversal.
>> I just wanted to be sure that this is always true and that this mapping
>> didn't change while the VCPU was running.
>> If you are sure of this, fine, I was just concerned that someone breaks
>> this assumption in the future by more dynamically mapping/unmapping
>> entries (say some irq forwarding user) and we will not notice.
> 
> How can the mapping change? Are you thinking of an unmap/map operation
> being done while the guest is running, replacing a HW device with
> another? That's not an option, and not only for the interrupts.

Well that's what we achieved I think with the kvm-vfio integration. The
requirement was: since we allow the user-space to turn forwarding on,
through the kvm-vfio device, we should offer the inverse operation and
this should never fail. This was achieved by forcing the guest exit,
checking the HW state of the IRQ, and quite a lot of pain ...

At that time the kvm-vfio integration seemed to be the most appropriate
approach. Now it seems this is put into question again with the Intel posted
IRQ API series review (https://lkml.org/lkml/2015/6/12/595). I think you
will be happy. Not sure I can say the same ;-)

Best Regards

Eric

> 
> 	M.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Auger June 17, 2015, 11:51 a.m. UTC | #6
Hi Marc,
On 06/08/2015 07:04 PM, Marc Zyngier wrote:
> To allow a HW interrupt to be injected into a guest, we lookup the
> guest virtual interrupt in the irq_phys_map rbtree, and if we have
> a match, encode both interrupts in the LR.
> 
> We also mark the interrupt as "active" at the host distributor level.
> 
> On guest EOI on the virtual interrupt, the host interrupt will be
> deactivated.
a "standard" physical IRQ would be first handled by the host handler
which would ack and deactivate it a first time. Here, if my
understanding is correct, the virtual counter PPI never hits. Instead we
"emulate" it on world-switch by directly setting the dist state. Is that
correct? If yes it is quite a specific handling of an "HW" IRQ.

> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  virt/kvm/arm/vgic.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 68 insertions(+), 3 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index c6604f2..495ac7d 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1120,6 +1120,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>  	if (!vgic_irq_is_edge(vcpu, irq))
>  		vlr.state |= LR_EOI_INT;
>  
> +	if (vlr.irq >= VGIC_NR_SGIS) {
> +		struct irq_phys_map *map;
> +		map = vgic_irq_map_search(vcpu, irq);
> +
> +		if (map) {
> +			int ret;
> +
> +			BUG_ON(!map->active);
> +			vlr.hwirq = map->phys_irq;
> +			vlr.state |= LR_HW;
> +			vlr.state &= ~LR_EOI_INT;
> +
> +			ret = irq_set_irqchip_state(map->irq,
> +						    IRQCHIP_STATE_ACTIVE,
> +						    true);
> +			vgic_irq_set_queued(vcpu, irq);
The queued state was used for level sensitive IRQs only. Forwarded or "HW"
IRQs theoretically can be edge or level sensitive, right? If yes, it may be
worth justifying the usage of the queued state for forwarded IRQs. Also,
today vgic_irq_set_queued is rather called in the parent, vgic_queue_hwirq.

> +			WARN_ON(ret);
> +		}
> +	}
> +
>  	vgic_set_lr(vcpu, lr_nr, vlr);
>  	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
>  }
> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>  	return level_pending;
>  }
>  
> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
> +{
> +	struct irq_phys_map *map;
> +	int ret;
> +
> +	if (!(vlr.state & LR_HW))
> +		return 0;
> +
> +	map = vgic_irq_map_search(vcpu, vlr.irq);
> +	BUG_ON(!map || !map->active);
> +
> +	ret = irq_get_irqchip_state(map->irq,
> +				    IRQCHIP_STATE_ACTIVE,
> +				    &map->active);
Doesn't it work because the virtual timer was disabled during the world
switch? Does it characterize all "shared" devices? It is difficult for me to
understand how much of this is specific to the arch timer integration.
> +
> +	WARN_ON(ret);
> +
> +	if (map->active) {
> +		ret = irq_set_irqchip_state(map->irq,
> +					    IRQCHIP_STATE_ACTIVE,
> +					    false);
> +		WARN_ON(ret);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
>  /* Sync back the VGIC state after a guest run */
>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  {
> @@ -1358,14 +1407,30 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  	elrsr = vgic_get_elrsr(vcpu);
>  	elrsr_ptr = u64_to_bitmask(&elrsr);
>  
> -	/* Clear mappings for empty LRs */
> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
>  		struct vgic_lr vlr;
>  
> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
> +		if (!test_bit(lr, vgic_cpu->lr_used))
>  			continue;
>  
>  		vlr = vgic_get_lr(vcpu, lr);
> +		if (vgic_sync_hwirq(vcpu, vlr)) {
> +			/*
> +			 * So this is a HW interrupt that the guest
> +			 * EOI-ed. Clean the LR state and allow the
> +			 * interrupt to be queued again.
> +			 */
> +			vlr.state &= ~LR_HW;
> +			vlr.hwirq = 0;
> +			vgic_set_lr(vcpu, lr, vlr);
> +			vgic_irq_clear_queued(vcpu, vlr.irq)
not necessarily a level sensitive IRQ?

- Eric
> +		}
> +
> +		if (!test_bit(lr, elrsr_ptr))
> +			continue;
> +
> +		clear_bit(lr, vgic_cpu->lr_used);
>  
>  		BUG_ON(vlr.irq >= dist->nr_irqs);
>  		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Zyngier June 17, 2015, 12:23 p.m. UTC | #7
Hi Eric,

On 17/06/15 12:51, Eric Auger wrote:
> Hi Marc,
> On 06/08/2015 07:04 PM, Marc Zyngier wrote:
>> To allow a HW interrupt to be injected into a guest, we lookup the
>> guest virtual interrupt in the irq_phys_map rbtree, and if we have
>> a match, encode both interrupts in the LR.
>>
>> We also mark the interrupt as "active" at the host distributor level.
>>
>> On guest EOI on the virtual interrupt, the host interrupt will be
>> deactivated.
>
> a "standard" physical IRQ would be first handled by the host handler
> which would ack and deactivate it a first time. Here, if my
> understanding is correct, the virtual counter PPI never hits. Instead we
> "emulate" it on world-switch by directly setting the dist state. Is that
> correct? If yes it is quite a specific handling of an "HW" IRQ.

This is (mostly) correct. Because we deal with HW that is shared between
guests, we absolutely need to make that HW quiescent before getting back
to the host. Setting the active bit in the distributor allows us to
restore the HW in a state that shows a pending interrupt at the guest
level, but ensure that the interrupt doesn't fire at the host level.

As for the "specificity", this is how the architecture has been
designed, and the way we're expected to deal with this kind of shared
HW. Rest assured I didn't come up with that on my own! ;-)

> 
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  virt/kvm/arm/vgic.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 68 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index c6604f2..495ac7d 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1120,6 +1120,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>>  	if (!vgic_irq_is_edge(vcpu, irq))
>>  		vlr.state |= LR_EOI_INT;
>>  
>> +	if (vlr.irq >= VGIC_NR_SGIS) {
>> +		struct irq_phys_map *map;
>> +		map = vgic_irq_map_search(vcpu, irq);
>> +
>> +		if (map) {
>> +			int ret;
>> +
>> +			BUG_ON(!map->active);
>> +			vlr.hwirq = map->phys_irq;
>> +			vlr.state |= LR_HW;
>> +			vlr.state &= ~LR_EOI_INT;
>> +
>> +			ret = irq_set_irqchip_state(map->irq,
>> +						    IRQCHIP_STATE_ACTIVE,
>> +						    true);
>> +			vgic_irq_set_queued(vcpu, irq);
>
> queued state was used for level sensitive IRQs only. Forwarded or "HW"
> IRQs theoretically can be edge or sensitive, right? If yes may be worth
> to justify the usage of queued state for forwarded IRQ? Also

That's because it is illegal to set a HW interrupt to be PENDING+ACTIVE,
which means we have to prevent the interrupt from being injected multiple
times. The behaviour is sufficiently close to what we do for a level
interrupt that we use the same state.

> vgic_irq_set_queued rather was called in parent vgic_queue_hwirq today.

I tried to keep the HW bit madness as localized as possible. Letting it
spread further away seems to make the code more difficult to read IMHO.

> 
>> +			WARN_ON(ret);
>> +		}
>> +	}
>> +
>>  	vgic_set_lr(vcpu, lr_nr, vlr);
>>  	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
>>  }
>> @@ -1344,6 +1364,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
>>  	return level_pending;
>>  }
>>  
>> +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
>> +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
>> +{
>> +	struct irq_phys_map *map;
>> +	int ret;
>> +
>> +	if (!(vlr.state & LR_HW))
>> +		return 0;
>> +
>> +	map = vgic_irq_map_search(vcpu, vlr.irq);
>> +	BUG_ON(!map || !map->active);
>> +
>> +	ret = irq_get_irqchip_state(map->irq,
>> +				    IRQCHIP_STATE_ACTIVE,
>> +				    &map->active);
>
> Doesn't it work because the virtual timer was disabled during the world
> switch. Does it characterize all "shared" devices? Difficult for me to
> understand how much this is specific to arch timer integration?

Shared devices cannot be left running when the guest is not running
because (a) we have lost the context (the guest), and (b) we need to
give it to another guest. This is a fundamental property of this kind of
resource.

This is by no means specific to the timer, BTW. The VGIC itself is a
shared resource, and we nuke it on each exit, for the same reason. The
only difference is that we don't propagate the VGIC interrupt to a guest.

>> +
>> +	WARN_ON(ret);
>> +
>> +	if (map->active) {
>> +		ret = irq_set_irqchip_state(map->irq,
>> +					    IRQCHIP_STATE_ACTIVE,
>> +					    false);
>> +		WARN_ON(ret);
>> +		return 0;
>> +	}
>> +
>> +	return 1;
>> +}
>> +
>>  /* Sync back the VGIC state after a guest run */
>>  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>>  {
>> @@ -1358,14 +1407,30 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>>  	elrsr = vgic_get_elrsr(vcpu);
>>  	elrsr_ptr = u64_to_bitmask(&elrsr);
>>  
>> -	/* Clear mappings for empty LRs */
>> -	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
>> +	/* Deal with HW interrupts, and clear mappings for empty LRs */
>> +	for (lr = 0; lr < vgic->nr_lr; lr++) {
>>  		struct vgic_lr vlr;
>>  
>> -		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
>> +		if (!test_bit(lr, vgic_cpu->lr_used))
>>  			continue;
>>  
>>  		vlr = vgic_get_lr(vcpu, lr);
>> +		if (vgic_sync_hwirq(vcpu, vlr)) {
>> +			/*
>> +			 * So this is a HW interrupt that the guest
>> +			 * EOI-ed. Clean the LR state and allow the
>> +			 * interrupt to be queued again.
>> +			 */
>> +			vlr.state &= ~LR_HW;
>> +			vlr.hwirq = 0;
>> +			vgic_set_lr(vcpu, lr, vlr);
>> +			vgic_irq_clear_queued(vcpu, vlr.irq)
>
> not necessarily a level sensitive IRQ?

As explained above, we have the same requirements when an interrupt is
forwarded to a guest.

Thanks,

	M.
diff mbox

Patch

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index c6604f2..495ac7d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1120,6 +1120,26 @@  static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
 	if (!vgic_irq_is_edge(vcpu, irq))
 		vlr.state |= LR_EOI_INT;
 
+	if (vlr.irq >= VGIC_NR_SGIS) {
+		struct irq_phys_map *map;
+		map = vgic_irq_map_search(vcpu, irq);
+
+		if (map) {
+			int ret;
+
+			BUG_ON(!map->active);
+			vlr.hwirq = map->phys_irq;
+			vlr.state |= LR_HW;
+			vlr.state &= ~LR_EOI_INT;
+
+			ret = irq_set_irqchip_state(map->irq,
+						    IRQCHIP_STATE_ACTIVE,
+						    true);
+			vgic_irq_set_queued(vcpu, irq);
+			WARN_ON(ret);
+		}
+	}
+
 	vgic_set_lr(vcpu, lr_nr, vlr);
 	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
@@ -1344,6 +1364,35 @@  static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 	return level_pending;
 }
 
+/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+	struct irq_phys_map *map;
+	int ret;
+
+	if (!(vlr.state & LR_HW))
+		return 0;
+
+	map = vgic_irq_map_search(vcpu, vlr.irq);
+	BUG_ON(!map || !map->active);
+
+	ret = irq_get_irqchip_state(map->irq,
+				    IRQCHIP_STATE_ACTIVE,
+				    &map->active);
+
+	WARN_ON(ret);
+
+	if (map->active) {
+		ret = irq_set_irqchip_state(map->irq,
+					    IRQCHIP_STATE_ACTIVE,
+					    false);
+		WARN_ON(ret);
+		return 0;
+	}
+
+	return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -1358,14 +1407,30 @@  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	elrsr = vgic_get_elrsr(vcpu);
 	elrsr_ptr = u64_to_bitmask(&elrsr);
 
-	/* Clear mappings for empty LRs */
-	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+	/* Deal with HW interrupts, and clear mappings for empty LRs */
+	for (lr = 0; lr < vgic->nr_lr; lr++) {
 		struct vgic_lr vlr;
 
-		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+		if (!test_bit(lr, vgic_cpu->lr_used))
 			continue;
 
 		vlr = vgic_get_lr(vcpu, lr);
+		if (vgic_sync_hwirq(vcpu, vlr)) {
+			/*
+			 * So this is a HW interrupt that the guest
+			 * EOI-ed. Clean the LR state and allow the
+			 * interrupt to be queued again.
+			 */
+			vlr.state &= ~LR_HW;
+			vlr.hwirq = 0;
+			vgic_set_lr(vcpu, lr, vlr);
+			vgic_irq_clear_queued(vcpu, vlr.irq);
+		}
+
+		if (!test_bit(lr, elrsr_ptr))
+			continue;
+
+		clear_bit(lr, vgic_cpu->lr_used);
 
 		BUG_ON(vlr.irq >= dist->nr_irqs);
 		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;