diff mbox series

[RFC,v2,2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

Message ID 20210104081613.100-3-lushenming@huawei.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Add VLPI migration support on GICv4.1 | expand

Commit Message

Shenming Lu Jan. 4, 2021, 8:16 a.m. UTC
After pausing all vCPUs and devices capable of interrupting, in order
to save the information of all interrupts, besides flushing the pending
states in kvm’s vgic, we also try to flush the states of VLPIs in the
virtual pending tables into guest RAM, but we need to have GICv4.1 and
safely unmap the vPEs first.

Signed-off-by: Shenming Lu <lushenming@huawei.com>
---
 arch/arm64/kvm/vgic/vgic-v3.c | 58 +++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 6 deletions(-)

Comments

Marc Zyngier Jan. 5, 2021, 9:13 a.m. UTC | #1
On 2021-01-04 08:16, Shenming Lu wrote:
> After pausing all vCPUs and devices capable of interrupting, in order
> to save the information of all interrupts, besides flushing the pending
> states in kvm’s vgic, we also try to flush the states of VLPIs in the
> virtual pending tables into guest RAM, but we need to have GICv4.1 and
> safely unmap the vPEs first.
> 
> Signed-off-by: Shenming Lu <lushenming@huawei.com>
> ---
>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++++++++++++++++++++++++++++++----
>  1 file changed, 52 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c 
> b/arch/arm64/kvm/vgic/vgic-v3.c
> index 9cdf39a94a63..a58c94127cb0 100644
> --- a/arch/arm64/kvm/vgic/vgic-v3.c
> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
> @@ -1,6 +1,8 @@
>  // SPDX-License-Identifier: GPL-2.0-only
> 
>  #include <linux/irqchip/arm-gic-v3.h>
> +#include <linux/irq.h>
> +#include <linux/irqdomain.h>
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
>  #include <kvm/arm_vgic.h>
> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
> *kvm, struct vgic_irq *irq)
>  	return 0;
>  }
> 
> +/*
> + * The deactivation of the doorbell interrupt will trigger the
> + * unmapping of the associated vPE.
> + */
> +static void unmap_all_vpes(struct vgic_dist *dist)
> +{
> +	struct irq_desc *desc;
> +	int i;
> +
> +	if (!kvm_vgic_global_state.has_gicv4_1)
> +		return;
> +
> +	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
> +		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
> +		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
> +	}
> +}
> +
> +static void map_all_vpes(struct vgic_dist *dist)
> +{
> +	struct irq_desc *desc;
> +	int i;
> +
> +	if (!kvm_vgic_global_state.has_gicv4_1)
> +		return;
> +
> +	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
> +		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
> +		irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
> +	}
> +}
> +
>  /**
>   * vgic_v3_save_pending_tables - Save the pending tables into guest 
> RAM
>   * kvm lock and all vcpu lock must be held
> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>  	struct vgic_dist *dist = &kvm->arch.vgic;
>  	struct vgic_irq *irq;
>  	gpa_t last_ptr = ~(gpa_t)0;
> -	int ret;
> +	int ret = 0;
>  	u8 val;
> 
> +	/* As a preparation for getting any VLPI states. */
> +	unmap_all_vpes(dist);

What if the VPEs are not mapped yet? Is it possible to snapshot a VM
that has not run at all?

> +
>  	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>  		int byte_offset, bit_nr;
>  		struct kvm_vcpu *vcpu;
>  		gpa_t pendbase, ptr;
>  		bool stored;
> +		bool is_pending = irq->pending_latch;
> 
>  		vcpu = irq->target_vcpu;
>  		if (!vcpu)
> @@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>  		if (ptr != last_ptr) {
>  			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
>  			if (ret)
> -				return ret;
> +				goto out;
>  			last_ptr = ptr;
>  		}
> 
>  		stored = val & (1U << bit_nr);
> -		if (stored == irq->pending_latch)
> +
> +		if (irq->hw)
> +			vgic_v4_get_vlpi_state(irq, &is_pending);

You don't check the return value here, so I wonder why the checks
in vgic_v4_get_vlpi_state().

Another thing that worries me is that vgic_v4_get_vlpi_state() doesn't
have any cache invalidation, and can end-up hitting in the CPU cache
(there is no guarantee of coherency between the GIC and the CPU, only
that the GIC will have flushed its caches).

I'd expect this to happen at unmap time, though, in order to avoid
repeated single byte invalidations.

> +
> +		if (stored == is_pending)
>  			continue;
> 
> -		if (irq->pending_latch)
> +		if (is_pending)
>  			val |= 1 << bit_nr;
>  		else
>  			val &= ~(1 << bit_nr);
> 
>  		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
>  		if (ret)
> -			return ret;
> +			goto out;
>  	}
> -	return 0;
> +
> +out:
> +	map_all_vpes(dist);
> +
> +	return ret;
>  }
> 
>  /**

Thanks,

         M.
Marc Zyngier Jan. 5, 2021, 11:40 a.m. UTC | #2
On 2021-01-05 09:13, Marc Zyngier wrote:
> On 2021-01-04 08:16, Shenming Lu wrote:
>> After pausing all vCPUs and devices capable of interrupting, in order
>> to save the information of all interrupts, besides flushing the 
>> pending
>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>> safely unmap the vPEs first.
>> 
>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>> ---
>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 
>> +++++++++++++++++++++++++++++++----
>>  1 file changed, 52 insertions(+), 6 deletions(-)
>> 
>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-v3.c
>> index 9cdf39a94a63..a58c94127cb0 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>> @@ -1,6 +1,8 @@
>>  // SPDX-License-Identifier: GPL-2.0-only
>> 
>>  #include <linux/irqchip/arm-gic-v3.h>
>> +#include <linux/irq.h>
>> +#include <linux/irqdomain.h>
>>  #include <linux/kvm.h>
>>  #include <linux/kvm_host.h>
>>  #include <kvm/arm_vgic.h>
>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>> *kvm, struct vgic_irq *irq)
>>  	return 0;
>>  }
>> 
>> +/*
>> + * The deactivation of the doorbell interrupt will trigger the
>> + * unmapping of the associated vPE.
>> + */
>> +static void unmap_all_vpes(struct vgic_dist *dist)
>> +{
>> +	struct irq_desc *desc;
>> +	int i;
>> +
>> +	if (!kvm_vgic_global_state.has_gicv4_1)
>> +		return;
>> +
>> +	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>> +	}
>> +}
>> +
>> +static void map_all_vpes(struct vgic_dist *dist)
>> +{
>> +	struct irq_desc *desc;
>> +	int i;
>> +
>> +	if (!kvm_vgic_global_state.has_gicv4_1)
>> +		return;
>> +
>> +	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +		irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>> +	}
>> +}
>> +
>>  /**
>>   * vgic_v3_save_pending_tables - Save the pending tables into guest 
>> RAM
>>   * kvm lock and all vcpu lock must be held
>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  	struct vgic_dist *dist = &kvm->arch.vgic;
>>  	struct vgic_irq *irq;
>>  	gpa_t last_ptr = ~(gpa_t)0;
>> -	int ret;
>> +	int ret = 0;
>>  	u8 val;
>> 
>> +	/* As a preparation for getting any VLPI states. */
>> +	unmap_all_vpes(dist);
> 
> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
> that has not run at all?

More questions: what happens to vSGIs that were mapped to the VPEs?
Can they safely be restarted? The spec is not saying much on the 
subject.

Once the unmap has taken place, it won't be possible to read their state
via GICR_VSGIRPEND, and only the memory state can be used. This probably
needs to be tracked as well.

Thanks,

         M.
Shenming Lu Jan. 5, 2021, 1:02 p.m. UTC | #3
On 2021/1/5 17:13, Marc Zyngier wrote:
> On 2021-01-04 08:16, Shenming Lu wrote:
>> After pausing all vCPUs and devices capable of interrupting, in order
>> to save the information of all interrupts, besides flushing the pending
>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>> safely unmap the vPEs first.
>>
>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>> ---
>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++++++++++++++++++++++++++++++----
>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>> index 9cdf39a94a63..a58c94127cb0 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>> @@ -1,6 +1,8 @@
>>  // SPDX-License-Identifier: GPL-2.0-only
>>
>>  #include <linux/irqchip/arm-gic-v3.h>
>> +#include <linux/irq.h>
>> +#include <linux/irqdomain.h>
>>  #include <linux/kvm.h>
>>  #include <linux/kvm_host.h>
>>  #include <kvm/arm_vgic.h>
>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>> *kvm, struct vgic_irq *irq)
>>      return 0;
>>  }
>>
>> +/*
>> + * The deactivation of the doorbell interrupt will trigger the
>> + * unmapping of the associated vPE.
>> + */
>> +static void unmap_all_vpes(struct vgic_dist *dist)
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +        return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +        irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>> +    }
>> +}
>> +
>> +static void map_all_vpes(struct vgic_dist *dist)
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +        return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +        irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>> +    }
>> +}
>> +
>>  /**
>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>   * kvm lock and all vcpu lock must be held
>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>      struct vgic_dist *dist = &kvm->arch.vgic;
>>      struct vgic_irq *irq;
>>      gpa_t last_ptr = ~(gpa_t)0;
>> -    int ret;
>> +    int ret = 0;
>>      u8 val;
>>
>> +    /* As a preparation for getting any VLPI states. */
>> +    unmap_all_vpes(dist);
> 
> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
> that has not run at all?

What I see in QEMU is that the saving of the pending tables would only be
called when stopping the VM and it needs the current VM state to be RUNNING.

> 
>> +
>>      list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>>          int byte_offset, bit_nr;
>>          struct kvm_vcpu *vcpu;
>>          gpa_t pendbase, ptr;
>>          bool stored;
>> +        bool is_pending = irq->pending_latch;
>>
>>          vcpu = irq->target_vcpu;
>>          if (!vcpu)
>> @@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>          if (ptr != last_ptr) {
>>              ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
>>              if (ret)
>> -                return ret;
>> +                goto out;
>>              last_ptr = ptr;
>>          }
>>
>>          stored = val & (1U << bit_nr);
>> -        if (stored == irq->pending_latch)
>> +
>> +        if (irq->hw)
>> +            vgic_v4_get_vlpi_state(irq, &is_pending);
> 
> You don't check the return value here, so I wonder why the checks
> in vgic_v4_get_vlpi_state().

Since I have already checked the condition and reported in save_its_tables
(patch 4), I just check in get_vlpi_state and don't report again here.

> 
> Another thing that worries me is that vgic_v4_get_vlpi_state() doesn't
> have any cache invalidation, and can end-up hitting in the CPU cache
> (there is no guarantee of coherency between the GIC and the CPU, only
> that the GIC will have flushed its caches).
> 
> I'd expect this to happen at unmap time, though, in order to avoid
> repeated single byte invalidations.

Ok, I will add a cache invalidation at unmap time.

> 
>> +
>> +        if (stored == is_pending)
>>              continue;
>>
>> -        if (irq->pending_latch)
>> +        if (is_pending)
>>              val |= 1 << bit_nr;
>>          else
>>              val &= ~(1 << bit_nr);
>>
>>          ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
>>          if (ret)
>> -            return ret;
>> +            goto out;
>>      }
>> -    return 0;
>> +
>> +out:
>> +    map_all_vpes(dist);
>> +
>> +    return ret;
>>  }
>>
>>  /**
> 
> Thanks,
> 
>         M.
Marc Zyngier Jan. 5, 2021, 1:47 p.m. UTC | #4
On 2021-01-05 13:02, Shenming Lu wrote:
> On 2021/1/5 17:13, Marc Zyngier wrote:
>> On 2021-01-04 08:16, Shenming Lu wrote:
>>> After pausing all vCPUs and devices capable of interrupting, in order
>>> to save the information of all interrupts, besides flushing the 
>>> pending
>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>> virtual pending tables into guest RAM, but we need to have GICv4.1 
>>> and
>>> safely unmap the vPEs first.
>>> 
>>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>>> ---
>>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 
>>> +++++++++++++++++++++++++++++++----
>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>> 
>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c 
>>> b/arch/arm64/kvm/vgic/vgic-v3.c
>>> index 9cdf39a94a63..a58c94127cb0 100644
>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>> @@ -1,6 +1,8 @@
>>>  // SPDX-License-Identifier: GPL-2.0-only
>>> 
>>>  #include <linux/irqchip/arm-gic-v3.h>
>>> +#include <linux/irq.h>
>>> +#include <linux/irqdomain.h>
>>>  #include <linux/kvm.h>
>>>  #include <linux/kvm_host.h>
>>>  #include <kvm/arm_vgic.h>
>>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>> *kvm, struct vgic_irq *irq)
>>>      return 0;
>>>  }
>>> 
>>> +/*
>>> + * The deactivation of the doorbell interrupt will trigger the
>>> + * unmapping of the associated vPE.
>>> + */
>>> +static void unmap_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +        return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +        irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>> +    }
>>> +}
>>> +
>>> +static void map_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +        return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +        irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>>> +    }
>>> +}
>>> +
>>>  /**
>>>   * vgic_v3_save_pending_tables - Save the pending tables into guest 
>>> RAM
>>>   * kvm lock and all vcpu lock must be held
>>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm 
>>> *kvm)
>>>      struct vgic_dist *dist = &kvm->arch.vgic;
>>>      struct vgic_irq *irq;
>>>      gpa_t last_ptr = ~(gpa_t)0;
>>> -    int ret;
>>> +    int ret = 0;
>>>      u8 val;
>>> 
>>> +    /* As a preparation for getting any VLPI states. */
>>> +    unmap_all_vpes(dist);
>> 
>> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
>> that has not run at all?
> 
> What I see in QEMU is that the saving of the pending tables would only 
> be
> called when stopping the VM and it needs the current VM state to be 
> RUNNING.

Sure, but that's what QEMU does, and a different userspace could well do
something different. It looks to me that I should be able to start (or
even restore) a guest, and snapshot it immediately. Here, I'm pretty
sure this wouldn't do the right thing (I have the suspicion that the
doorbells are not allocated, and that we'll end-up with an Oops at unmap
time, though I haven't investigated it to be sure).

>> 
>>> +
>>>      list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>>>          int byte_offset, bit_nr;
>>>          struct kvm_vcpu *vcpu;
>>>          gpa_t pendbase, ptr;
>>>          bool stored;
>>> +        bool is_pending = irq->pending_latch;
>>> 
>>>          vcpu = irq->target_vcpu;
>>>          if (!vcpu)
>>> @@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm 
>>> *kvm)
>>>          if (ptr != last_ptr) {
>>>              ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
>>>              if (ret)
>>> -                return ret;
>>> +                goto out;
>>>              last_ptr = ptr;
>>>          }
>>> 
>>>          stored = val & (1U << bit_nr);
>>> -        if (stored == irq->pending_latch)
>>> +
>>> +        if (irq->hw)
>>> +            vgic_v4_get_vlpi_state(irq, &is_pending);
>> 
>> You don't check the return value here, so I wonder why the checks
>> in vgic_v4_get_vlpi_state().
> 
> Since I have already checked the condition and reported in 
> save_its_tables
> (patch 4), I just check in get_vlpi_state and don't report again here.

Sure, but why the checks and the return value then? I'd rather you check 
all
the relevant conditions in one place.

> 
>> 
>> Another thing that worries me is that vgic_v4_get_vlpi_state() doesn't
>> have any cache invalidation, and can end-up hitting in the CPU cache
>> (there is no guarantee of coherency between the GIC and the CPU, only
>> that the GIC will have flushed its caches).
>> 
>> I'd expect this to happen at unmap time, though, in order to avoid
>> repeated single byte invalidations.
> 
> Ok, I will add a cache invalidation at unmap time.

I guess a sensible place to do that would be at deactivation time.
I came up with the following hack, completely untested.

If that works for you, I'll turn it into a proper patch that you
can carry with the series (I may turn it into a __inval_dcache_area
call if I can find the equivalent 32bit).

Thanks,

         M.

diff --git a/drivers/irqchip/irq-gic-v3-its.c 
b/drivers/irqchip/irq-gic-v3-its.c
index 7db602434ac5..2dbef127ca15 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -4552,6 +4552,10 @@ static void its_vpe_irq_domain_deactivate(struct 
irq_domain *domain,

  		its_send_vmapp(its, vpe, false);
  	}
+
+	if (find_4_1_its() && !atomic_read(vpe->vmapp_count))
+		gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+					LPI_PENDBASE_SZ);
  }

  static const struct irq_domain_ops its_vpe_domain_ops = {
Shenming Lu Jan. 6, 2021, 5:48 a.m. UTC | #5
On 2021/1/5 19:40, Marc Zyngier wrote:
> On 2021-01-05 09:13, Marc Zyngier wrote:
>> On 2021-01-04 08:16, Shenming Lu wrote:
>>> After pausing all vCPUs and devices capable of interrupting, in order
>>> to save the information of all interrupts, besides flushing the pending
>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>>> safely unmap the vPEs first.
>>>
>>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>>> ---
>>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++++++++++++++++++++++++++++++----
>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>>> index 9cdf39a94a63..a58c94127cb0 100644
>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>> @@ -1,6 +1,8 @@
>>>  // SPDX-License-Identifier: GPL-2.0-only
>>>
>>>  #include <linux/irqchip/arm-gic-v3.h>
>>> +#include <linux/irq.h>
>>> +#include <linux/irqdomain.h>
>>>  #include <linux/kvm.h>
>>>  #include <linux/kvm_host.h>
>>>  #include <kvm/arm_vgic.h>
>>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>> *kvm, struct vgic_irq *irq)
>>>      return 0;
>>>  }
>>>
>>> +/*
>>> + * The deactivation of the doorbell interrupt will trigger the
>>> + * unmapping of the associated vPE.
>>> + */
>>> +static void unmap_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +        return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +        irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>> +    }
>>> +}
>>> +
>>> +static void map_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +        return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +        irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>>> +    }
>>> +}
>>> +
>>>  /**
>>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>>   * kvm lock and all vcpu lock must be held
>>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>>      struct vgic_dist *dist = &kvm->arch.vgic;
>>>      struct vgic_irq *irq;
>>>      gpa_t last_ptr = ~(gpa_t)0;
>>> -    int ret;
>>> +    int ret = 0;
>>>      u8 val;
>>>
>>> +    /* As a preparation for getting any VLPI states. */
>>> +    unmap_all_vpes(dist);
>>
>> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
>> that has not run at all?
> 
> More questions: what happens to vSGIs that were mapped to the VPEs?
> Can they safely be restarted? The spec is not saying much on the subject.

Since we have already paused all vCPUs, there would be no more vSGIs generated,
and also no vSGI would be delivered to the vPE. And the unmapping of the
vPE would not affect the (already) stored vSGI states... I think they could
be safely restarted.

> 
> Once the unmap has taken place, it won't be possible to read their state
> via GICR_VSGIRPEND, and only the memory state can be used. This probably
> needs to be tracked as well.

Yes, since we will map the vPEs back, could we assume that the saving of the
vLPI and vSGI states happen serially? In fact that's what QEMU does.

> 
> Thanks,
> 
>         M.
Shenming Lu Jan. 6, 2021, 7:14 a.m. UTC | #6
On 2021/1/5 21:47, Marc Zyngier wrote:
> On 2021-01-05 13:02, Shenming Lu wrote:
>> On 2021/1/5 17:13, Marc Zyngier wrote:
>>> On 2021-01-04 08:16, Shenming Lu wrote:
>>>> After pausing all vCPUs and devices capable of interrupting, in order
>>>> to save the information of all interrupts, besides flushing the pending
>>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>>>> safely unmap the vPEs first.
>>>>
>>>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++++++++++++++++++++++++++++++----
>>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> index 9cdf39a94a63..a58c94127cb0 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> @@ -1,6 +1,8 @@
>>>>  // SPDX-License-Identifier: GPL-2.0-only
>>>>
>>>>  #include <linux/irqchip/arm-gic-v3.h>
>>>> +#include <linux/irq.h>
>>>> +#include <linux/irqdomain.h>
>>>>  #include <linux/kvm.h>
>>>>  #include <linux/kvm_host.h>
>>>>  #include <kvm/arm_vgic.h>
>>>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>>> *kvm, struct vgic_irq *irq)
>>>>      return 0;
>>>>  }
>>>>
>>>> +/*
>>>> + * The deactivation of the doorbell interrupt will trigger the
>>>> + * unmapping of the associated vPE.
>>>> + */
>>>> +static void unmap_all_vpes(struct vgic_dist *dist)
>>>> +{
>>>> +    struct irq_desc *desc;
>>>> +    int i;
>>>> +
>>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>>> +        return;
>>>> +
>>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>>> +        irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>>> +    }
>>>> +}
>>>> +
>>>> +static void map_all_vpes(struct vgic_dist *dist)
>>>> +{
>>>> +    struct irq_desc *desc;
>>>> +    int i;
>>>> +
>>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>>> +        return;
>>>> +
>>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>>> +        desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>>> +        irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>>>> +    }
>>>> +}
>>>> +
>>>>  /**
>>>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>>>   * kvm lock and all vcpu lock must be held
>>>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>>>      struct vgic_dist *dist = &kvm->arch.vgic;
>>>>      struct vgic_irq *irq;
>>>>      gpa_t last_ptr = ~(gpa_t)0;
>>>> -    int ret;
>>>> +    int ret = 0;
>>>>      u8 val;
>>>>
>>>> +    /* As a preparation for getting any VLPI states. */
>>>> +    unmap_all_vpes(dist);
>>>
>>> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
>>> that has not run at all?
>>
>> What I see in QEMU is that the saving of the pending tables would only be
>> called when stopping the VM and it needs the current VM state to be RUNNING.
> 
> Sure, but that's what QEMU does, and a different userspace could well do
> something different. It looks to me that I should be able to start (or
> even restore) a guest, and snapshot it immediately. Here, I'm pretty
> sure this wouldn't do the right thing (I have the suspicion that the
> doorbells are not allocated, and that we'll end-up with an Oops at unmap
> time, though I haven't investigated it to be sure).
>

If we can't rely on the userspace, could we check whether it is allowed
(at the right time) before the unmapping? Maybe have a look at vmapp_count?
Although I think snapshot a VM that has not been started is almost impossible...

>>>
>>>> +
>>>>      list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>>>>          int byte_offset, bit_nr;
>>>>          struct kvm_vcpu *vcpu;
>>>>          gpa_t pendbase, ptr;
>>>>          bool stored;
>>>> +        bool is_pending = irq->pending_latch;
>>>>
>>>>          vcpu = irq->target_vcpu;
>>>>          if (!vcpu)
>>>> @@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>>>          if (ptr != last_ptr) {
>>>>              ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
>>>>              if (ret)
>>>> -                return ret;
>>>> +                goto out;
>>>>              last_ptr = ptr;
>>>>          }
>>>>
>>>>          stored = val & (1U << bit_nr);
>>>> -        if (stored == irq->pending_latch)
>>>> +
>>>> +        if (irq->hw)
>>>> +            vgic_v4_get_vlpi_state(irq, &is_pending);
>>>
>>> You don't check the return value here, so I wonder why the checks
>>> in vgic_v4_get_vlpi_state().
>>
>> Since I have already checked the condition and reported in save_its_tables
>> (patch 4), I just check in get_vlpi_state and don't report again here.
> 
> Sure, but why the checks and the return value then? I'd rather you check all
> the relevant conditions in one place.

Yeah, it seems that the return value is unnecessary, I can change vgic_v4_get_vlpi_state()
to be void. And does the check in one place mean that we check all the relevant
conditions at the beginning of vgic_v3_save_pending_tables (in unmap_all_vpes())
and set a variable maybe called hw_avail?

> 
>>
>>>
>>> Another thing that worries me is that vgic_v4_get_vlpi_state() doesn't
>>> have any cache invalidation, and can end-up hitting in the CPU cache
>>> (there is no guarantee of coherency between the GIC and the CPU, only
>>> that the GIC will have flushed its caches).
>>>
>>> I'd expect this to happen at unmap time, though, in order to avoid
>>> repeated single byte invalidations.
>>
>> Ok, I will add a cache invalidation at unmap time.
> 
> I guess a sensible place to do that would be at deactivation time.
> I came up with the following hack, completely untested.
> 
> If that works for you, I'll turn it into a proper patch that you
> can carry with the series (I may turn it into a __inval_dcache_area
> call if I can find the equivalent 32bit).

It looks good to me :-), thanks.

> 
> Thanks,
> 
>         M.
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 7db602434ac5..2dbef127ca15 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -4552,6 +4552,10 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
> 
>          its_send_vmapp(its, vpe, false);
>      }
> +
> +    if (find_4_1_its() && !atomic_read(vpe->vmapp_count))
> +        gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
> +                    LPI_PENDBASE_SZ);
>  }
> 
>  static const struct irq_domain_ops its_vpe_domain_ops = {
> 
>
diff mbox series

Patch

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 9cdf39a94a63..a58c94127cb0 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/arm_vgic.h>
@@ -356,6 +358,38 @@  int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
 	return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+	struct irq_desc *desc;
+	int i;
+
+	if (!kvm_vgic_global_state.has_gicv4_1)
+		return;
+
+	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+	}
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+	struct irq_desc *desc;
+	int i;
+
+	if (!kvm_vgic_global_state.has_gicv4_1)
+		return;
+
+	for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+		desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+		irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+	}
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,14 +399,18 @@  int vgic_v3_save_pending_tables(struct kvm *kvm)
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_irq *irq;
 	gpa_t last_ptr = ~(gpa_t)0;
-	int ret;
+	int ret = 0;
 	u8 val;
 
+	/* As a preparation for getting any VLPI states. */
+	unmap_all_vpes(dist);
+
 	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
 		int byte_offset, bit_nr;
 		struct kvm_vcpu *vcpu;
 		gpa_t pendbase, ptr;
 		bool stored;
+		bool is_pending = irq->pending_latch;
 
 		vcpu = irq->target_vcpu;
 		if (!vcpu)
@@ -387,24 +425,32 @@  int vgic_v3_save_pending_tables(struct kvm *kvm)
 		if (ptr != last_ptr) {
 			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
 			if (ret)
-				return ret;
+				goto out;
 			last_ptr = ptr;
 		}
 
 		stored = val & (1U << bit_nr);
-		if (stored == irq->pending_latch)
+
+		if (irq->hw)
+			vgic_v4_get_vlpi_state(irq, &is_pending);
+
+		if (stored == is_pending)
 			continue;
 
-		if (irq->pending_latch)
+		if (is_pending)
 			val |= 1 << bit_nr;
 		else
 			val &= ~(1 << bit_nr);
 
 		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 		if (ret)
-			return ret;
+			goto out;
 	}
-	return 0;
+
+out:
+	map_all_vpes(dist);
+
+	return ret;
 }
 
 /**