
[v4,15/19] arm/arm64: KVM: add virtual GICv3 distributor emulation

Message ID 1415959683-26027-16-git-send-email-andre.przywara@arm.com (mailing list archive)
State New, archived

Commit Message

Andre Przywara Nov. 14, 2014, 10:07 a.m. UTC
With everything separated and prepared, we implement a model of a
GICv3 distributor and redistributors by using the existing framework
to provide handler functions for each register group.

Currently we limit the emulation to a model enforcing a single
security state, with SRE==1 (forcing system register access) and
ARE==1 (allowing more than 8 VCPUs).

We share some of the functions provided for GICv2 emulation, but take
the different ways of addressing (v)CPUs into account.
Save and restore is currently not implemented.
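
As an illustration of the two addressing schemes (a sketch only, not
code from this patch, with made-up helper names):

    #include <stdbool.h>
    #include <stdint.h>

    /* GICv2: per-SPI 8-bit target mask, bit i meaning "vCPU i" */
    static bool v2_spi_targets_vcpu(uint8_t targets, int vcpu)
    {
            return targets & (1u << vcpu); /* caps a VM at 8 vCPUs */
    }

    /* GICv3 with ARE==1: per-SPI affinity value, compared against
     * each vCPU's MPIDR affinity, so more than 8 vCPUs are
     * addressable */
    static bool v3_spi_targets_vcpu(uint32_t route_aff, uint32_t vcpu_aff)
    {
            return route_aff == vcpu_aff;
    }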

Similar to the split-off of the GICv2 specific code, the new emulation
code goes into a new file (vgic-v3-emul.c).

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
Changelog v3...v4:
- remove ICC_SGI1R_EL1 register handling (moved into later patch)
- add definitions for single security state
- document emulation limitations in vgic-v3-emul.c header
- move CTLR, TYPER and IIDR handling into separate functions
- add RAO/WI handling for IGROUPRn registers
- remove unneeded offset masking on calling vgic_reg_access()
- rework handle_mmio_route_reg() to only handle SPIs
- refine IROUTERn register range
- use non-atomic bitops functions (__clear_bit() and __set_bit())
- rename vgic_dist_ranges[] to vgic_v3_dist_ranges[]
- add (RAZ/WI) implementation of GICD_STATUSR
- add (RAZ/WI) implementations of MBI registers
- adapt to new private passing (in struct kvm_exit_mmio instead of a parameter)
- fix vcpu_id calculation bug in the CFG register handling
- always use hexadecimal numbers for .len member
- simplify vgic_v3_handle_mmio()
- add vgic_v3_create() and vgic_v3_destroy()
- swap vgic_v3_[sg]et_attr() code location
- add and improve comments
- (adaptations to changes from earlier patches)

 arch/arm64/kvm/Makefile            |    1 +
 include/kvm/arm_vgic.h             |    9 +-
 include/linux/irqchip/arm-gic-v3.h |   32 ++
 include/linux/kvm_host.h           |    1 +
 include/uapi/linux/kvm.h           |    2 +
 virt/kvm/arm/vgic-v3-emul.c        |  904 ++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic.c                |   11 +-
 virt/kvm/arm/vgic.h                |    3 +
 8 files changed, 960 insertions(+), 3 deletions(-)
 create mode 100644 virt/kvm/arm/vgic-v3-emul.c

Comments

Christoffer Dall Nov. 14, 2014, 11:07 a.m. UTC | #1
On Fri, Nov 14, 2014 at 10:07:59AM +0000, Andre Przywara wrote:
> With everything separated and prepared, we implement a model of a
> GICv3 distributor and redistributors by using the existing framework
> to provide handler functions for each register group.
> 
> Currently we limit the emulation to a model enforcing a single
> security state, with SRE==1 (forcing system register access) and
> ARE==1 (allowing more than 8 VCPUs).
> 
> We share some of the functions provided for GICv2 emulation, but take
> the different ways of addressing (v)CPUs into account.
> Save and restore is currently not implemented.
> 
> Similar to the split-off of the GICv2 specific code, the new emulation
> code goes into a new file (vgic-v3-emul.c).
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>

[...]
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 335ffe0..b7de0f8 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  	struct kvm_vcpu *vcpu;
>  	int edge_triggered, level_triggered;
>  	int enabled;
> -	bool ret = true;
> +	bool ret = true, can_inject = true;
>  
>  	spin_lock(&dist->lock);
>  
> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
>  		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
> +		if (cpuid == VCPU_NOT_ALLOCATED) {
> +			/* Pretend we use CPU0, and prevent injection */
> +			cpuid = 0;
> +			can_inject = false;
> +		}
>  		vcpu = kvm_get_vcpu(kvm, cpuid);
>  	}
>  
> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	enabled = vgic_irq_is_enabled(vcpu, irq_num);
>  
> -	if (!enabled) {
> +	if (!enabled || !can_inject) {
>  		ret = false;
>  		goto out;
>  	}

As I wrote, I think this is wrong.  What happened here?

-Christoffer
Andre Przywara Nov. 17, 2014, 1:58 p.m. UTC | #2
Hej Christoffer,

maybe I just don't see the wood for the trees, so I will explain my view
on the things below. Please correct me / explain your view!

On 14/11/14 11:07, Christoffer Dall wrote:
> On Fri, Nov 14, 2014 at 10:07:59AM +0000, Andre Przywara wrote:
>> With everything separated and prepared, we implement a model of a
>> GICv3 distributor and redistributors by using the existing framework
>> to provide handler functions for each register group.
>>
>> Currently we limit the emulation to a model enforcing a single
>> security state, with SRE==1 (forcing system register access) and
>> ARE==1 (allowing more than 8 VCPUs).
>>
>> We share some of the functions provided for GICv2 emulation, but take
>> the different ways of addressing (v)CPUs into account.
>> Save and restore is currently not implemented.
>>
>> Similar to the split-off of the GICv2 specific code, the new emulation
>> code goes into a new file (vgic-v3-emul.c).
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> 
> [...]
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index 335ffe0..b7de0f8 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>  	struct kvm_vcpu *vcpu;
>>  	int edge_triggered, level_triggered;
>>  	int enabled;
>> -	bool ret = true;
>> +	bool ret = true, can_inject = true;
>>  
>>  	spin_lock(&dist->lock);
>>  
>> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>  
>>  	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
>>  		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
>> +		if (cpuid == VCPU_NOT_ALLOCATED) {
>> +			/* Pretend we use CPU0, and prevent injection */
>> +			cpuid = 0;
>> +			can_inject = false;
>> +		}
>>  		vcpu = kvm_get_vcpu(kvm, cpuid);
>>  	}
>>  
>> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>  
>>  	enabled = vgic_irq_is_enabled(vcpu, irq_num);
>>  
>> -	if (!enabled) {
>> +	if (!enabled || !can_inject) {
>>  		ret = false;
>>  		goto out;
>>  	}
> 
> As I wrote, I think this is wrong.  What happened here?

can_inject is just a way of stopping "non-targeted" SPIs from being
_injected_. The spec says in "5.3.4. GICD_IROUTERn":

"If this register is programmed to forward the corresponding interrupt
to a specific processor (i.e. IRM is zero) and the affinity path does
not correspond to an implemented processor, then if the corresponding
interrupt becomes pending it will not be forwarded to any processor and
will remain pending."

So we can happily make these SPIs pending, but an illegal irq_spi_cpu[]
entry will just prevent them from being injected, right?
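
In pseudo-code, the behaviour I have in mind is roughly this (invented
helper names; the real code is the vgic_update_irq_pending() hunk
quoted above):

    set_spi_pending(dist, spi);     /* becoming pending always works */
    if (dist->irq_spi_cpu[spi] == VCPU_NOT_ALLOCATED)
            return false;           /* ...but it is never forwarded */
    return try_inject(kvm_get_vcpu(kvm, dist->irq_spi_cpu[spi]), spi);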

What am I missing?

Do you want a comment here explaining this?

Cheers,
Andre.
Christoffer Dall Nov. 17, 2014, 11:46 p.m. UTC | #3
On Mon, Nov 17, 2014 at 01:58:05PM +0000, Andre Przywara wrote:
> Hej Christoffer,
> 
> maybe I just don't see the wood for the trees, so I will explain my view
> on the things below. Please correct me / explain your view!

I think this argument should have taken place as a reply to my e-mail on
the last version of this patch, but ok.

> 
> On 14/11/14 11:07, Christoffer Dall wrote:
> > On Fri, Nov 14, 2014 at 10:07:59AM +0000, Andre Przywara wrote:
> >> With everything separated and prepared, we implement a model of a
> >> GICv3 distributor and redistributors by using the existing framework
> >> to provide handler functions for each register group.
> >>
> >> Currently we limit the emulation to a model enforcing a single
> >> security state, with SRE==1 (forcing system register access) and
> >> ARE==1 (allowing more than 8 VCPUs).
> >>
> >> We share some of the functions provided for GICv2 emulation, but take
> >> the different ways of addressing (v)CPUs into account.
> >> Save and restore is currently not implemented.
> >>
> >> Similar to the split-off of the GICv2 specific code, the new emulation
> >> code goes into a new file (vgic-v3-emul.c).
> >>
> >> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> > 
> > [...]
> >> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> >> index 335ffe0..b7de0f8 100644
> >> --- a/virt/kvm/arm/vgic.c
> >> +++ b/virt/kvm/arm/vgic.c
> >> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
> >>  	struct kvm_vcpu *vcpu;
> >>  	int edge_triggered, level_triggered;
> >>  	int enabled;
> >> -	bool ret = true;
> >> +	bool ret = true, can_inject = true;
> >>  
> >>  	spin_lock(&dist->lock);
> >>  
> >> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
> >>  
> >>  	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
> >>  		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
> >> +		if (cpuid == VCPU_NOT_ALLOCATED) {
> >> +			/* Pretend we use CPU0, and prevent injection */
> >> +			cpuid = 0;
> >> +			can_inject = false;
> >> +		}
> >>  		vcpu = kvm_get_vcpu(kvm, cpuid);
> >>  	}
> >>  
> >> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
> >>  
> >>  	enabled = vgic_irq_is_enabled(vcpu, irq_num);
> >>  
> >> -	if (!enabled) {
> >> +	if (!enabled || !can_inject) {
> >>  		ret = false;
> >>  		goto out;
> >>  	}
> > 
> > As I wrote, I think this is wrong.  What happened here?
> 
> can_inject is just a way of stopping "non-targeted" SPIs from being
> _injected_. The spec says in "5.3.4. GICD_IROUTERn":
> 
> "If this register is programmed to forward the corresponding interrupt
> to a specific processor (i.e. IRM is zero) and the affinity path does
> not correspond to an implemented processor, then if the corresponding
> interrupt becomes pending it will not be forwarded to any processor and
> will remain pending."
> 
> So we can happily make these SPIs pending, but an illegal irq_spi_cpu[]
> entry will just prevent them from being injected, right?
> 
> What am I missing?
> 
> Do you want a comment here explaining this?
> 
No, I missed something.  What I missed was that we consider
irq_spi_target in the compute_pending_for_cpu() so that this actually
ends up working.

We really shouldn't be trying to take this "precompute cpu pending
bitmaps instead of calling compute_pending_for_cpu()" shortcut here; at
least I'm beginning to have trouble following the flow.  That's for
another time and place though.
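
For the archives, the piece I had missed, condensed (a simplified
sketch with made-up names, not the actual code):

    /*
     * An SPI only counts as pending on a vCPU if it is both pending
     * in the distributor and set in that vCPU's irq_spi_target map.
     * Routes to unallocated MPIDRs leave irq_spi_target[] empty, so
     * such an SPI stays pending but is never presented to any vCPU.
     */
    pend = spi_is_pending(dist, spi) &&
           test_bit(spi, vgic_bitmap_get_shared_map(
                            &dist->irq_spi_target[vcpu_id]));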

I'll review the rest of this version of the series when I get some cycles.

-Christoffer
Eric Auger Nov. 18, 2014, 3:57 p.m. UTC | #4
On 11/14/2014 11:07 AM, Andre Przywara wrote:
> With everything separated and prepared, we implement a model of a
> GICv3 distributor and redistributors by using the existing framework
> to provide handler functions for each register group.
> 
> Currently we limit the emulation to a model enforcing a single
> security state, with SRE==1 (forcing system register access) and
> ARE==1 (allowing more than 8 VCPUs).
> 
> We share some of the functions provided for GICv2 emulation, but take
> the different ways of addressing (v)CPUs into account.
> Save and restore is currently not implemented.
> 
> Similar to the split-off of the GICv2 specific code, the new emulation
> code goes into a new file (vgic-v3-emul.c).
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
> Changelog v3...v4:
> - remove ICC_SGI1R_EL1 register handling (moved into later patch)
> - add definitions for single security state
> - document emulation limitations in vgic-v3-emul.c header
> - move CTLR, TYPER and IIDR handling into separate functions
> - add RAO/WI handling for IGROUPRn registers
> - remove unneeded offset masking on calling vgic_reg_access()
> - rework handle_mmio_route_reg() to only handle SPIs
> - refine IROUTERn register range
> - use non-atomic bitops functions (__clear_bit() and __set_bit())
> - rename vgic_dist_ranges[] to vgic_v3_dist_ranges[]
> - add (RAZ/WI) implementation of GICD_STATUSR
> - add (RAZ/WI) implementations of MBI registers
> - adapt to new private passing (in struct kvm_exit_mmio instead of a parameter)
> - fix vcpu_id calculation bug in the CFG register handling
> - always use hexadecimal numbers for .len member
> - simplify vgic_v3_handle_mmio()
> - add vgic_v3_create() and vgic_v3_destroy()
> - swap vgic_v3_[sg]et_attr() code location
> - add and improve comments
> - (adaptations to changes from earlier patches)
> 
>  arch/arm64/kvm/Makefile            |    1 +
>  include/kvm/arm_vgic.h             |    9 +-
>  include/linux/irqchip/arm-gic-v3.h |   32 ++
>  include/linux/kvm_host.h           |    1 +
>  include/uapi/linux/kvm.h           |    2 +
>  virt/kvm/arm/vgic-v3-emul.c        |  904 ++++++++++++++++++++++++++++++++++++
>  virt/kvm/arm/vgic.c                |   11 +-
>  virt/kvm/arm/vgic.h                |    3 +
>  8 files changed, 960 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/arm/vgic-v3-emul.c
> 
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index d957353..4e6e09e 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -24,5 +24,6 @@ kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
> +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
>  kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index 421833f..c1ef5a9 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -160,7 +160,11 @@ struct vgic_dist {
>  
>  	/* Distributor and vcpu interface mapping in the guest */
>  	phys_addr_t		vgic_dist_base;
> -	phys_addr_t		vgic_cpu_base;
> +	/* GICv2 and GICv3 use different mapped register blocks */
> +	union {
> +		phys_addr_t		vgic_cpu_base;
> +		phys_addr_t		vgic_redist_base;
> +	};
>  
>  	/* Distributor enabled */
>  	u32			enabled;
> @@ -222,6 +226,9 @@ struct vgic_dist {
>  	 */
>  	struct vgic_bitmap	*irq_spi_target;
>  
> +	/* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
> +	u32			*irq_spi_mpidr;
> +
>  	/* Bitmap indicating which CPU has something pending */
>  	unsigned long		*irq_pending_on_cpu;
>  
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index 03a4ea3..726d898 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -33,6 +33,7 @@
>  #define GICD_SETSPI_SR			0x0050
>  #define GICD_CLRSPI_SR			0x0058
>  #define GICD_SEIR			0x0068
> +#define GICD_IGROUPR			0x0080
>  #define GICD_ISENABLER			0x0100
>  #define GICD_ICENABLER			0x0180
>  #define GICD_ISPENDR			0x0200
> @@ -41,14 +42,37 @@
>  #define GICD_ICACTIVER			0x0380
>  #define GICD_IPRIORITYR			0x0400
>  #define GICD_ICFGR			0x0C00
> +#define GICD_IGRPMODR			0x0D00
> +#define GICD_NSACR			0x0E00
>  #define GICD_IROUTER			0x6000
> +#define GICD_IDREGS			0xFFD0
>  #define GICD_PIDR2			0xFFE8
>  
> +/*
> + * Those registers are actually from GICv2, but the spec demands that they
> + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3).
> + */
> +#define GICD_ITARGETSR			0x0800
> +#define GICD_SGIR			0x0F00
> +#define GICD_CPENDSGIR			0x0F10
> +#define GICD_SPENDSGIR			0x0F20
> +
>  #define GICD_CTLR_RWP			(1U << 31)
> +#define GICD_CTLR_DS			(1U << 6)
>  #define GICD_CTLR_ARE_NS		(1U << 4)
>  #define GICD_CTLR_ENABLE_G1A		(1U << 1)
>  #define GICD_CTLR_ENABLE_G1		(1U << 0)
>  
> +/*
> + * In systems with a single security state (what we emulate in KVM)
> + * the meaning of the interrupt group enable bits is slightly different
> + */
> +#define GICD_CTLR_ENABLE_SS_G1		(1U << 1)
> +#define GICD_CTLR_ENABLE_SS_G0		(1U << 0)
> +
> +#define GICD_TYPER_LPIS			(1U << 17)
> +#define GICD_TYPER_MBIS			(1U << 16)
> +
>  #define GICD_IROUTER_SPI_MODE_ONE	(0U << 31)
>  #define GICD_IROUTER_SPI_MODE_ANY	(1U << 31)
>  
> @@ -56,6 +80,8 @@
>  #define GIC_PIDR2_ARCH_GICv3		0x30
>  #define GIC_PIDR2_ARCH_GICv4		0x40
>  
> +#define GIC_V3_DIST_SIZE		0x10000
> +
>  /*
>   * Re-Distributor registers, offsets from RD_base
>   */
> @@ -74,6 +100,7 @@
>  #define GICR_SYNCR			0x00C0
>  #define GICR_MOVLPIR			0x0100
>  #define GICR_MOVALLR			0x0110
> +#define GICR_IDREGS			GICD_IDREGS
>  #define GICR_PIDR2			GICD_PIDR2
>  
>  #define GICR_WAKER_ProcessorSleep	(1U << 1)
> @@ -82,6 +109,7 @@
>  /*
>   * Re-Distributor registers, offsets from SGI_base
>   */
> +#define GICR_IGROUPR0			GICD_IGROUPR
>  #define GICR_ISENABLER0			GICD_ISENABLER
>  #define GICR_ICENABLER0			GICD_ICENABLER
>  #define GICR_ISPENDR0			GICD_ISPENDR
> @@ -90,10 +118,14 @@
>  #define GICR_ICACTIVER0			GICD_ICACTIVER
>  #define GICR_IPRIORITYR0		GICD_IPRIORITYR
>  #define GICR_ICFGR0			GICD_ICFGR
> +#define GICR_IGRPMODR0			GICD_IGRPMODR
> +#define GICR_NSACR			GICD_NSACR
>  
>  #define GICR_TYPER_VLPIS		(1U << 1)
>  #define GICR_TYPER_LAST			(1U << 4)
>  
> +#define GIC_V3_REDIST_SIZE		0x20000
> +
>  /*
>   * CPU interface registers
>   */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 326ba7a..4a7798e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1085,6 +1085,7 @@ void kvm_unregister_device_ops(u32 type);
>  extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> +extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>  
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6076882..24cb129 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -960,6 +960,8 @@ enum kvm_device_type {
>  #define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
>  	KVM_DEV_TYPE_FLIC,
>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
> +	KVM_DEV_TYPE_ARM_VGIC_V3,
> +#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
>  	KVM_DEV_TYPE_MAX,
>  };
Hi Andre,

Documentation/virtual/kvm/devices/arm-vgic.txt needs to be updated too.

I need some more time to study the rest and digest ;-)

--Eric

>  
> diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
> new file mode 100644
> index 0000000..97b5801
> --- /dev/null
> +++ b/virt/kvm/arm/vgic-v3-emul.c
> @@ -0,0 +1,904 @@
> +/*
> + * GICv3 distributor and redistributor emulation
> + *
> + * GICv3 emulation is currently only supported on a GICv3 host (because
> + * we rely on the hardware's CPU interface virtualization support), but
> + * supports both hardware with or without the optional GICv2 backwards
> + * compatibility features.
> + *
> + * Limitations of the emulation:
> + * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
> + * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
> + * - We do not support the message based interrupts (MBIs) triggered by
> + *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
> + * - We do not support the (optional) backwards compatibility feature.
> + *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
> + *   the compatibility feature, you can use a GICv2 in the guest, though.
> + * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
> + * - Priorities are not emulated (same as the GICv2 emulation). Linux
> + *   as a guest is fine with this, because it does not use priorities.
> + * - We only support Group1 interrupts. Again Linux uses only those.
> + *
> + * Copyright (C) 2014 ARM Ltd.
> + * Author: Andre Przywara <andre.przywara@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/interrupt.h>
> +
> +#include <linux/irqchip/arm-gic-v3.h>
> +#include <kvm/arm_vgic.h>
> +
> +#include <asm/kvm_emulate.h>
> +#include <asm/kvm_arm.h>
> +#include <asm/kvm_mmu.h>
> +
> +#include "vgic.h"
> +
> +static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
> +			       struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg = 0xffffffff;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
> +			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg = 0;
> +
> +	/*
> +	 * Force ARE and DS to 1, the guest cannot change this.
> +	 * For the time being we only support Group1 interrupts.
> +	 */
> +	if (vcpu->kvm->arch.vgic.enabled)
> +		reg = GICD_CTLR_ENABLE_SS_G1;
> +	reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	if (mmio->is_write) {
> +		if (reg & GICD_CTLR_ENABLE_SS_G0)
> +			kvm_info("guest tried to enable unsupported Group0 interrupts\n");
> +		vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
> +		vgic_update_state(vcpu->kvm);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +/*
> + * As this implementation does not provide compatibility
> + * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
> + * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
> + * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
> + */
> +#define INTERRUPT_ID_BITS 10
> +static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
> +			      struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg;
> +
> +	/* we report at most 1024 IRQs via this interface */
> +	reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
> +
> +	reg |= (INTERRUPT_ID_BITS - 1) << 19;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
> +			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg;
> +
> +	reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
> +					    struct kvm_exit_mmio *mmio,
> +					    phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +					      vcpu->vcpu_id,
> +					      ACCESS_WRITE_SETBIT);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
> +					      struct kvm_exit_mmio *mmio,
> +					      phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +					      vcpu->vcpu_id,
> +					      ACCESS_WRITE_CLEARBIT);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
> +					     struct kvm_exit_mmio *mmio,
> +					     phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
> +						   vcpu->vcpu_id);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
> +					       struct kvm_exit_mmio *mmio,
> +					       phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
> +						     vcpu->vcpu_id);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
> +					  struct kvm_exit_mmio *mmio,
> +					  phys_addr_t offset)
> +{
> +	u32 *reg;
> +
> +	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
> +				   vcpu->vcpu_id, offset);
> +	vgic_reg_access(mmio, reg, offset,
> +		ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	return false;
> +}
> +
> +static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
> +				     struct kvm_exit_mmio *mmio,
> +				     phys_addr_t offset)
> +{
> +	u32 *reg;
> +
> +	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
> +				  vcpu->vcpu_id, offset >> 1);
> +
> +	return vgic_handle_cfg_reg(reg, mmio, offset);
> +}
> +
> +/*
> + * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
> + * when we store the target MPIDR written by the guest.
> + */
> +static u32 compress_mpidr(unsigned long mpidr)
> +{
> +	u32 ret;
> +
> +	ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
> +
> +	return ret;
> +}
> +
> +static unsigned long uncompress_mpidr(u32 value)
> +{
> +	unsigned long mpidr;
> +
> +	mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
> +	mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
> +	mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
> +	mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
> +
> +	return mpidr;
> +}
> +
> +/*
> + * Lookup the given MPIDR value to get the vcpu_id (if there is one)
> + * and store that in the irq_spi_cpu[] array.
> + * This limits the number of VCPUs to 255 for now; extending the data
> + * type (or storing kvm_vcpu pointers) would lift the limit.
> + * Store the original MPIDR value in an extra array to support read-as-written.
> + * Unallocated MPIDRs are translated to a special value and caught
> + * before any array accesses.
> + */
> +static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
> +				  struct kvm_exit_mmio *mmio,
> +				  phys_addr_t offset)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	int spi;
> +	u32 reg;
> +	int vcpu_id;
> +	unsigned long *bmap, mpidr;
> +
> +	/*
> +	 * The upper 32 bits of each 64 bit register are zero,
> +	 * as we don't support Aff3.
> +	 */
> +	if ((offset & 4)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	/* This region only covers SPIs, so no handling of private IRQs here. */
> +	spi = offset / 8;
> +
> +	/* get the stored MPIDR for this IRQ */
> +	mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
> +	mpidr &= MPIDR_HWID_BITMASK;
> +	reg = mpidr;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +
> +	if (!mmio->is_write)
> +		return false;
> +
> +	/*
> +	 * Now clear the currently assigned vCPU from the map, making room
> +	 * for the new one to be written below
> +	 */
> +	vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
> +	if (likely(vcpu)) {
> +		vcpu_id = vcpu->vcpu_id;
> +		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
> +		__clear_bit(spi, bmap);
> +	}
> +
> +	dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
> +	vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
> +
> +	/*
> +	 * The spec says that non-existent MPIDR values should not be
> +	 * forwarded to any existent (v)CPU, but should be able to become
> +	 * pending anyway. We simply keep the irq_spi_target[] array empty, so
> +	 * the interrupt will never be injected.
> +	 * irq_spi_cpu[irq] gets a magic value in this case.
> +	 */
> +	if (likely(vcpu)) {
> +		vcpu_id = vcpu->vcpu_id;
> +		dist->irq_spi_cpu[spi] = vcpu_id;
> +		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
> +		__set_bit(spi, bmap);
> +	} else {
> +		dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
> +	}
> +
> +	vgic_update_state(kvm);
> +
> +	return true;
> +}
> +
> +/*
> + * We should be careful about promising too much when a guest reads
> + * this register. Don't claim to be like any hardware implementation,
> + * but just report the GIC as version 3 - which is what a Linux guest
> + * would check.
> + */
> +static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
> +			       struct kvm_exit_mmio *mmio,
> +			       phys_addr_t offset)
> +{
> +	u32 reg = 0;
> +
> +	switch (offset + GICD_IDREGS) {
> +	case GICD_PIDR2:
> +		reg = 0x3b;
> +		break;
> +	}
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
> +	{
> +		.base           = GICD_CTLR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_ctlr,
> +	},
> +	{
> +		.base           = GICD_TYPER,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_typer,
> +	},
> +	{
> +		.base           = GICD_IIDR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_iidr,
> +	},
> +	{
> +		/* this register is optional, it is RAZ/WI if not implemented */
> +		.base           = GICD_STATUSR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this write only register is WI when TYPER.MBIS=0 */
> +		.base		= GICD_SETSPI_NSR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this write only register is WI when TYPER.MBIS=0 */
> +		.base		= GICD_CLRSPI_NSR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_SETSPI_SR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_CLRSPI_SR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IGROUPR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_rao_wi,
> +	},
> +	{
> +		.base		= GICD_ISENABLER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_enable_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ICENABLER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_enable_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ISPENDR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_pending_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ICPENDR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_pending_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ISACTIVER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_ICACTIVER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IPRIORITYR,
> +		.len		= 0x400,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_priority_reg_dist,
> +	},
> +	{
> +		/* TARGETSRn is RES0 when ARE=1 */
> +		.base		= GICD_ITARGETSR,
> +		.len		= 0x400,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_ICFGR,
> +		.len		= 0x100,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_cfg_reg_dist,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_IGRPMODR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_NSACR,
> +		.len		= 0x100,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base		= GICD_SGIR,
> +		.len		= 0x04,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base		= GICD_CPENDSGIR,
> +		.len		= 0x10,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base           = GICD_SPENDSGIR,
> +		.len            = 0x10,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IROUTER + 0x100,
> +		.len		= 0x1edc,
> +		.bits_per_irq	= 64,
> +		.handle_mmio	= handle_mmio_route_reg,
> +	},
> +	{
> +		.base           = GICD_IDREGS,
> +		.len            = 0x30,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_idregs,
> +	},
> +	{},
> +};
> +
> +static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
> +					      struct kvm_exit_mmio *mmio,
> +					      phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +				      redist_vcpu->vcpu_id,
> +				      ACCESS_WRITE_SETBIT);
> +}
> +
> +static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
> +						struct kvm_exit_mmio *mmio,
> +						phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +				      redist_vcpu->vcpu_id,
> +				      ACCESS_WRITE_CLEARBIT);
> +}
> +
> +static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
> +					       struct kvm_exit_mmio *mmio,
> +					       phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
> +					   redist_vcpu->vcpu_id);
> +}
> +
> +static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
> +						 struct kvm_exit_mmio *mmio,
> +						 phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
> +					     redist_vcpu->vcpu_id);
> +}
> +
> +static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
> +					    struct kvm_exit_mmio *mmio,
> +					    phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +	u32 *reg;
> +
> +	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
> +				   redist_vcpu->vcpu_id, offset);
> +	vgic_reg_access(mmio, reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	return false;
> +}
> +
> +static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
> +				       struct kvm_exit_mmio *mmio,
> +				       phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
> +				       redist_vcpu->vcpu_id, offset >> 1);
> +
> +	return vgic_handle_cfg_reg(reg, mmio, offset);
> +}
> +
> +static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
> +	{
> +		.base		= GICR_IGROUPR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_ISENABLER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_enable_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICENABLER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_enable_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ISPENDR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_pending_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICPENDR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_pending_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ISACTIVER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_ICACTIVER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_IPRIORITYR0,
> +		.len		= 0x20,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_priority_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICFGR0,
> +		.len		= 0x08,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_cfg_reg_redist,
> +	},
> +	{
> +		.base		= GICR_IGRPMODR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_NSACR,
> +		.len		= 0x04,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{},
> +};
> +
> +static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
> +				    struct kvm_exit_mmio *mmio,
> +				    phys_addr_t offset)
> +{
> +	/* since we don't support LPIs, this register is zero for now */
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
> +				     struct kvm_exit_mmio *mmio,
> +				     phys_addr_t offset)
> +{
> +	u32 reg;
> +	u64 mpidr;
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +	int target_vcpu_id = redist_vcpu->vcpu_id;
> +
> +	/* the upper 32 bits contain the affinity value */
> +	if ((offset & ~3) == 4) {
> +		mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
> +		reg = compress_mpidr(mpidr);
> +
> +		vgic_reg_access(mmio, &reg, offset,
> +				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = redist_vcpu->vcpu_id << 8;
> +	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
> +		reg |= GICR_TYPER_LAST;
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static const struct kvm_mmio_range vgic_redist_ranges[] = {
> +	{
> +		.base           = GICR_CTLR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_ctlr_redist,
> +	},
> +	{
> +		.base           = GICR_TYPER,
> +		.len            = 0x08,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_typer_redist,
> +	},
> +	{
> +		.base           = GICR_IIDR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_iidr,
> +	},
> +	{
> +		.base           = GICR_WAKER,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		.base           = GICR_IDREGS,
> +		.len            = 0x30,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_idregs,
> +	},
> +	{},
> +};
> +
> +/*
> + * This function splits accesses between the distributor and the two
> + * redistributor parts (private/SPI). As each redistributor is accessible
> + * from any CPU, we have to determine the affected VCPU by taking the faulting
> + * address into account. We then pass this VCPU to the handler function via
> + * the private parameter.
> + */
> +#define SGI_BASE_OFFSET SZ_64K
> +static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +				struct kvm_exit_mmio *mmio)
> +{
> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +	unsigned long dbase = dist->vgic_dist_base;
> +	unsigned long rdbase = dist->vgic_redist_base;
> +	int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
> +	int vcpu_id;
> +	const struct kvm_mmio_range *mmio_range;
> +
> +	if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
> +		return vgic_handle_mmio_range(vcpu, run, mmio,
> +					      vgic_v3_dist_ranges, dbase);
> +	}
> +
> +	if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> +	    GIC_V3_REDIST_SIZE * nrcpus))
> +		return false;
> +
> +	vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
> +	rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
> +	mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
> +
> +	if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
> +		rdbase += SGI_BASE_OFFSET;
> +		mmio_range = vgic_redist_sgi_ranges;
> +	} else {
> +		mmio_range = vgic_redist_ranges;
> +	}
> +	return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
> +}
> +
> +static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
> +{
> +	if (vgic_queue_irq(vcpu, 0, irq)) {
> +		vgic_dist_irq_clear_pending(vcpu, irq);
> +		vgic_cpu_irq_clear(vcpu, irq);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static int vgic_v3_init_maps(struct vgic_dist *dist)
> +{
> +	int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
> +
> +	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
> +				      GFP_KERNEL);
> +
> +	if (!dist->irq_spi_mpidr)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	int ret, i;
> +	u32 mpidr;
> +
> +	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
> +	    IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
> +		kvm_err("Need to set vgic distributor addresses first\n");
> +		return -ENXIO;
> +	}
> +
> +	/*
> +	 * FIXME: this should be moved to init_maps time, and may bite
> +	 * us when adding save/restore. Add a per-emulation hook?
> +	 */
> +	ret = vgic_v3_init_maps(dist);
> +	if (ret) {
> +		kvm_err("Unable to allocate maps\n");
> +		return ret;
> +	}
> +
> +	/* Initialize the target VCPUs for each IRQ to VCPU 0 */
> +	mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
> +	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
> +		dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
> +		dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
> +		vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
> +	}
> +
> +	return 0;
> +}
> +
> +/* GICv3 does not keep track of SGI sources anymore. */
> +static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
> +{
> +}
> +
> +int vgic_v3_init_emulation(struct kvm *kvm)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +
> +	dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
> +	dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
> +	dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
> +	dist->vm_ops.vgic_init = vgic_v3_init;
> +
> +	kvm->arch.max_vcpus = KVM_MAX_VCPUS;
> +
> +	return 0;
> +}
> +
> +static int vgic_v3_create(struct kvm_device *dev, u32 type)
> +{
> +	return kvm_vgic_create(dev->kvm, type);
> +}
> +
> +static void vgic_v3_destroy(struct kvm_device *dev)
> +{
> +	kfree(dev);
> +}
> +
> +static int vgic_v3_set_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	ret = vgic_set_common_attr(dev, attr);
> +	if (ret != -ENXIO)
> +		return ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int vgic_v3_get_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	ret = vgic_get_common_attr(dev, attr);
> +	if (ret != -ENXIO)
> +		return ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int vgic_v3_has_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_ADDR:
> +		switch (attr->attr) {
> +		case KVM_VGIC_V2_ADDR_TYPE_DIST:
> +		case KVM_VGIC_V2_ADDR_TYPE_CPU:
> +			return -ENXIO;
> +		}
> +		break;
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
> +		return 0;
> +	}
> +	return -ENXIO;
> +}
> +
> +struct kvm_device_ops kvm_arm_vgic_v3_ops = {
> +	.name = "kvm-arm-vgic-v3",
> +	.create = vgic_v3_create,
> +	.destroy = vgic_v3_destroy,
> +	.set_attr = vgic_v3_set_attr,
> +	.get_attr = vgic_v3_get_attr,
> +	.has_attr = vgic_v3_has_attr,
> +};
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 335ffe0..b7de0f8 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  	struct kvm_vcpu *vcpu;
>  	int edge_triggered, level_triggered;
>  	int enabled;
> -	bool ret = true;
> +	bool ret = true, can_inject = true;
>  
>  	spin_lock(&dist->lock);
>  
> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
>  		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
> +		if (cpuid == VCPU_NOT_ALLOCATED) {
> +			/* Pretend we use CPU0, and prevent injection */
> +			cpuid = 0;
> +			can_inject = false;
> +		}
>  		vcpu = kvm_get_vcpu(kvm, cpuid);
>  	}
>  
> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	enabled = vgic_irq_is_enabled(vcpu, irq_num);
>  
> -	if (!enabled) {
> +	if (!enabled || !can_inject) {
>  		ret = false;
>  		goto out;
>  	}
> @@ -1438,6 +1443,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
>  	}
>  	kfree(dist->irq_sgi_sources);
>  	kfree(dist->irq_spi_cpu);
> +	kfree(dist->irq_spi_mpidr);
>  	kfree(dist->irq_spi_target);
>  	kfree(dist->irq_pending_on_cpu);
>  	dist->irq_sgi_sources = NULL;
> @@ -1628,6 +1634,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
>  	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
>  	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
>  	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> +	kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
>  
>  out_unlock:
>  	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
> diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
> index ff3171a..b0c6b2f 100644
> --- a/virt/kvm/arm/vgic.h
> +++ b/virt/kvm/arm/vgic.h
> @@ -35,6 +35,8 @@
>  #define ACCESS_WRITE_VALUE	(3 << 1)
>  #define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))
>  
> +#define VCPU_NOT_ALLOCATED	((u8)-1)
> +
>  unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
>  
>  void vgic_update_state(struct kvm *kvm);
> @@ -115,5 +117,6 @@ int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>  int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>  
>  int vgic_v2_init_emulation(struct kvm *kvm);
> +int vgic_v3_init_emulation(struct kvm *kvm);
>  
>  #endif
>
Christoffer Dall Nov. 23, 2014, 2:38 p.m. UTC | #5
On Fri, Nov 14, 2014 at 10:07:59AM +0000, Andre Przywara wrote:
> With everything separated and prepared, we implement a model of a
> GICv3 distributor and redistributors by using the existing framework
> to provide handler functions for each register group.
> 
> Currently we limit the emulation to a model enforcing a single
> security state, with SRE==1 (forcing system register access) and
> ARE==1 (allowing more than 8 VCPUs).
> 
> We share some of the functions provided for GICv2 emulation, but take
> the different ways of addressing (v)CPUs into account.
> Save and restore is currently not implemented.
> 
> Similar to the split-off of the GICv2 specific code, the new emulation
> code goes into a new file (vgic-v3-emul.c).
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
> Changelog v3...v4:
> - remove ICC_SGI1R_EL1 register handling (moved into later patch)
> - add definitions for single security state

what exactly does this mean?

> - document emulation limitations in vgic-v3-emul.c header
> - move CTLR, TYPER and IIDR handling into separate functions
> - add RAO/WI handling for IGROUPRn registers
> - remove unneeded offset masking on calling vgic_reg_access()
> - rework handle_mmio_route_reg() to only handle SPIs
> - refine IROUTERn register range
> - use non-atomic bitops functions (__clear_bit() and __set_bit())
> - rename vgic_dist_ranges[] to vgic_v3_dist_ranges[]
> - add (RAZ/WI) implementation of GICD_STATUSR
> - add (RAZ/WI) implementations of MBI registers
> - adapt to new private passing (in struct kvm_exit_mmio instead of a parameter)
> - fix vcpu_id calculation bug in the CFG register handling

which bug was that?  I can't see the difference between v3 and v4 on
this one.

> - always use hexadecimal numbers for .len member
> - simplify vgic_v3_handle_mmio()
> - add vgic_v3_create() and vgic_v3_destroy()
> - swap vgic_v3_[sg]et_attr() code location
> - add and improve comments
> - (adaptations to changes from earlier patches)
> 
>  arch/arm64/kvm/Makefile            |    1 +
>  include/kvm/arm_vgic.h             |    9 +-
>  include/linux/irqchip/arm-gic-v3.h |   32 ++
>  include/linux/kvm_host.h           |    1 +
>  include/uapi/linux/kvm.h           |    2 +
>  virt/kvm/arm/vgic-v3-emul.c        |  904 ++++++++++++++++++++++++++++++++++++
>  virt/kvm/arm/vgic.c                |   11 +-
>  virt/kvm/arm/vgic.h                |    3 +
>  8 files changed, 960 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/arm/vgic-v3-emul.c
> 
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index d957353..4e6e09e 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -24,5 +24,6 @@ kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
> +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
>  kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index 421833f..c1ef5a9 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -160,7 +160,11 @@ struct vgic_dist {
>  
>  	/* Distributor and vcpu interface mapping in the guest */
>  	phys_addr_t		vgic_dist_base;
> -	phys_addr_t		vgic_cpu_base;
> +	/* GICv2 and GICv3 use different mapped register blocks */
> +	union {
> +		phys_addr_t		vgic_cpu_base;
> +		phys_addr_t		vgic_redist_base;
> +	};
>  
>  	/* Distributor enabled */
>  	u32			enabled;
> @@ -222,6 +226,9 @@ struct vgic_dist {
>  	 */
>  	struct vgic_bitmap	*irq_spi_target;
>  
> +	/* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
> +	u32			*irq_spi_mpidr;
> +
>  	/* Bitmap indicating which CPU has something pending */
>  	unsigned long		*irq_pending_on_cpu;
>  
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index 03a4ea3..726d898 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -33,6 +33,7 @@
>  #define GICD_SETSPI_SR			0x0050
>  #define GICD_CLRSPI_SR			0x0058
>  #define GICD_SEIR			0x0068
> +#define GICD_IGROUPR			0x0080
>  #define GICD_ISENABLER			0x0100
>  #define GICD_ICENABLER			0x0180
>  #define GICD_ISPENDR			0x0200
> @@ -41,14 +42,37 @@
>  #define GICD_ICACTIVER			0x0380
>  #define GICD_IPRIORITYR			0x0400
>  #define GICD_ICFGR			0x0C00
> +#define GICD_IGRPMODR			0x0D00
> +#define GICD_NSACR			0x0E00
>  #define GICD_IROUTER			0x6000
> +#define GICD_IDREGS			0xFFD0
>  #define GICD_PIDR2			0xFFE8
>  
> +/*
> + * Those registers are actually from GICv2, but the spec demands that they
> + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3).
> + */
> +#define GICD_ITARGETSR			0x0800
> +#define GICD_SGIR			0x0F00
> +#define GICD_CPENDSGIR			0x0F10
> +#define GICD_SPENDSGIR			0x0F20
> +
>  #define GICD_CTLR_RWP			(1U << 31)
> +#define GICD_CTLR_DS			(1U << 6)
>  #define GICD_CTLR_ARE_NS		(1U << 4)
>  #define GICD_CTLR_ENABLE_G1A		(1U << 1)
>  #define GICD_CTLR_ENABLE_G1		(1U << 0)
>  
> +/*
> + * In systems with a single security state (what we emulate in KVM)
> + * the meaning of the interrupt group enable bits is slightly different
> + */
> +#define GICD_CTLR_ENABLE_SS_G1		(1U << 1)
> +#define GICD_CTLR_ENABLE_SS_G0		(1U << 0)
> +
> +#define GICD_TYPER_LPIS			(1U << 17)
> +#define GICD_TYPER_MBIS			(1U << 16)
> +
>  #define GICD_IROUTER_SPI_MODE_ONE	(0U << 31)
>  #define GICD_IROUTER_SPI_MODE_ANY	(1U << 31)
>  
> @@ -56,6 +80,8 @@
>  #define GIC_PIDR2_ARCH_GICv3		0x30
>  #define GIC_PIDR2_ARCH_GICv4		0x40
>  
> +#define GIC_V3_DIST_SIZE		0x10000
> +
>  /*
>   * Re-Distributor registers, offsets from RD_base
>   */
> @@ -74,6 +100,7 @@
>  #define GICR_SYNCR			0x00C0
>  #define GICR_MOVLPIR			0x0100
>  #define GICR_MOVALLR			0x0110
> +#define GICR_IDREGS			GICD_IDREGS
>  #define GICR_PIDR2			GICD_PIDR2
>  
>  #define GICR_WAKER_ProcessorSleep	(1U << 1)
> @@ -82,6 +109,7 @@
>  /*
>   * Re-Distributor registers, offsets from SGI_base
>   */
> +#define GICR_IGROUPR0			GICD_IGROUPR
>  #define GICR_ISENABLER0			GICD_ISENABLER
>  #define GICR_ICENABLER0			GICD_ICENABLER
>  #define GICR_ISPENDR0			GICD_ISPENDR
> @@ -90,10 +118,14 @@
>  #define GICR_ICACTIVER0			GICD_ICACTIVER
>  #define GICR_IPRIORITYR0		GICD_IPRIORITYR
>  #define GICR_ICFGR0			GICD_ICFGR
> +#define GICR_IGRPMODR0			GICD_IGRPMODR
> +#define GICR_NSACR			GICD_NSACR
>  
>  #define GICR_TYPER_VLPIS		(1U << 1)
>  #define GICR_TYPER_LAST			(1U << 4)
>  
> +#define GIC_V3_REDIST_SIZE		0x20000
> +
>  /*
>   * CPU interface registers
>   */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 326ba7a..4a7798e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1085,6 +1085,7 @@ void kvm_unregister_device_ops(u32 type);
>  extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> +extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>  
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6076882..24cb129 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -960,6 +960,8 @@ enum kvm_device_type {
>  #define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
>  	KVM_DEV_TYPE_FLIC,
>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
> +	KVM_DEV_TYPE_ARM_VGIC_V3,
> +#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
>  	KVM_DEV_TYPE_MAX,
>  };
>  
> diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
> new file mode 100644
> index 0000000..97b5801
> --- /dev/null
> +++ b/virt/kvm/arm/vgic-v3-emul.c
> @@ -0,0 +1,904 @@
> +/*
> + * GICv3 distributor and redistributor emulation
> + *
> + * GICv3 emulation is currently only supported on a GICv3 host (because
> + * we rely on the hardware's CPU interface virtualization support), but
> + * supports both hardware with or without the optional GICv2 backwards
> + * compatibility features.
> + *
> + * Limitations of the emulation:
> + * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
> + * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
> + * - We do not support the message based interrupts (MBIs) triggered by
> + *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
> + * - We do not support the (optional) backwards compatibility feature.
> + *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
> > + *   the compatibility feature, you can use a GICv2 in the guest, though.
> + * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
> + * - Priorities are not emulated (same as the GICv2 emulation). Linux
> + *   as a guest is fine with this, because it does not use priorities.
> + * - We only support Group1 interrupts. Again Linux uses only those.
> + *
> + * Copyright (C) 2014 ARM Ltd.
> + * Author: Andre Przywara <andre.przywara@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/interrupt.h>
> +
> +#include <linux/irqchip/arm-gic-v3.h>
> +#include <kvm/arm_vgic.h>
> +
> +#include <asm/kvm_emulate.h>
> +#include <asm/kvm_arm.h>
> +#include <asm/kvm_mmu.h>
> +
> +#include "vgic.h"
> +
> +static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
> +			       struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg = 0xffffffff;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
> +			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg = 0;
> +
> +	/*
> +	 * Force ARE and DS to 1, the guest cannot change this.
> +	 * For the time being we only support Group1 interrupts.
> +	 */
> +	if (vcpu->kvm->arch.vgic.enabled)
> +		reg = GICD_CTLR_ENABLE_SS_G1;
> +	reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	if (mmio->is_write) {
> +		if (reg & GICD_CTLR_ENABLE_SS_G0)
> +			kvm_info("guest tried to enable unsupported Group0 interrupts\n");
> +		vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
> +		vgic_update_state(vcpu->kvm);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +/*
> + * As this implementation does not provide compatibility
> + * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
> + * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
> + * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
> + */
> +#define INTERRUPT_ID_BITS 10
> +static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
> +			      struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg;
> +
> +	/* we report at most 1024 IRQs via this interface */

hmmm, do we need to repeat ourselves here?

I get a bit confused by both the comment above and here as to *why* we
are reporting this value.  And what is the bit about 'this interface'?
Is there another interface?

Perhaps what you're trying to get at here are the semantic differences
between ITLinesNumber and IDbits and how that helps a reader understand
the code.
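
For the record, this is how I read the two fields (sketch only, bit
positions taken from the GICv3 spec):

	/* given the TYPER value composed below: */
	int nr_lines = 32 * ((typer & 0x1f) + 1);   /* ITLinesNumber, bits [4:0] */
	int id_bits  = ((typer >> 19) & 0x1f) + 1;  /* IDbits, bits [23:19] */

If that's right, a comment distinguishing the two would go a long way.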

> +	reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
> +
> +	reg |= (INTERRUPT_ID_BITS - 1) << 19;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
> +			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
> +{
> +	u32 reg;
> +
> +	reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
> +					    struct kvm_exit_mmio *mmio,
> +					    phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +					      vcpu->vcpu_id,
> +					      ACCESS_WRITE_SETBIT);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
> +					      struct kvm_exit_mmio *mmio,
> +					      phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +					      vcpu->vcpu_id,
> +					      ACCESS_WRITE_CLEARBIT);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
> +					     struct kvm_exit_mmio *mmio,
> +					     phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
> +						   vcpu->vcpu_id);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
> +					       struct kvm_exit_mmio *mmio,
> +					       phys_addr_t offset)
> +{
> +	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
> +		return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
> +						     vcpu->vcpu_id);
> +
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
> +					  struct kvm_exit_mmio *mmio,
> +					  phys_addr_t offset)
> +{
> +	u32 *reg;
> +
> +	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
> +				   vcpu->vcpu_id, offset);
> +	vgic_reg_access(mmio, reg, offset,
> +		ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	return false;
> +}
> +
> +static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
> +				     struct kvm_exit_mmio *mmio,
> +				     phys_addr_t offset)
> +{
> +	u32 *reg;
> +
> +	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
> +				  vcpu->vcpu_id, offset >> 1);
> +
> +	return vgic_handle_cfg_reg(reg, mmio, offset);
> +}
> +
> +/*
> + * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
> + * when we store the target MPIDR written by the guest.
> + */
> +static u32 compress_mpidr(unsigned long mpidr)
> +{
> +	u32 ret;
> +
> +	ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
> +	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
> +
> +	return ret;
> +}
> +
> +static unsigned long uncompress_mpidr(u32 value)
> +{
> +	unsigned long mpidr;
> +
> +	mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
> +	mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
> +	mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
> +	mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
> +
> +	return mpidr;
> +}
> +
> +/*
> + * Lookup the given MPIDR value to get the vcpu_id (if there is one)
> + * and store that in the irq_spi_cpu[] array.
> + * This limits the number of VCPUs to 255 for now, extending the data
> + * type (or storing kvm_vcpu poiners) should lift the limit.

s/poiners/pointers

> + * Store the original MPIDR value in an extra array to support read-as-written.
> + * Unallocated MPIDRs are translated to a special value and caught
> + * before any array accesses.

We may have covered this already, but why can't we restore the original
MPIDR based on the irq_spi_cpu array?

Is that because we lose information about 'which' unallocated MPIDR was
written?  If that's the case, it seems weird that we go through the
trouble but then throw away the aff3 field anyway...?

> + */
> +static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
> +				  struct kvm_exit_mmio *mmio,
> +				  phys_addr_t offset)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	int spi;
> +	u32 reg;
> +	int vcpu_id;
> +	unsigned long *bmap, mpidr;
> +
> +	/*
> +	 * The upper 32 bits of each 64 bit register are zero,
> +	 * as we don't support Aff3.
> +	 */
> +	if ((offset & 4)) {
> +		vgic_reg_access(mmio, NULL, offset,
> +				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	/* This region only covers SPIs, so no handling of private IRQs here. */
> +	spi = offset / 8;

that's not how I read the spec, it says that GICD_IROUTER0 to
GICD_IROUTER31 are not implemented (because they are SGIs and PPIs), and
I read the 'SPI ID m' as the lowest numbered SPI ID being 32, thus you
should do:

spi = offset / 8 - VGIC_NR_PRIVATE_IRQS;

> +
> +	/* get the stored MPIDR for this IRQ */
> +	mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
> +	mpidr &= MPIDR_HWID_BITMASK;

is this mask needed after calling uncompress above?

> +	reg = mpidr;
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +
> +	if (!mmio->is_write)
> +		return false;
> +
> +	/*
> +	 * Now clear the currently assigned vCPU from the map, making room
> +	 * for the new one to be written below
> +	 */
> +	vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
> +	if (likely(vcpu)) {
> +		vcpu_id = vcpu->vcpu_id;
> +		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
> +		__clear_bit(spi, bmap);
> +	}
> +
> +	dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
> +	vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
> +
> +	/*
> +	 * The spec says that non-existent MPIDR values should not be
> +	 * forwarded to any existent (v)CPU, but should be able to become
> +	 * pending anyway. We simply keep the irq_spi_target[] array empty, so
> +	 * the interrupt will never be injected.
> +	 * irq_spi_cpu[irq] gets a magic value in this case.
> +	 */
> +	if (likely(vcpu)) {
> +		vcpu_id = vcpu->vcpu_id;
> +		dist->irq_spi_cpu[spi] = vcpu_id;
> +		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
> +		__set_bit(spi, bmap);
> +	} else {
> +		dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
> +	}
> +
> +	vgic_update_state(kvm);
> +
> +	return true;
> +}
> +
> +/*
> + * We should be careful about promising too much when a guest reads
> + * this register. Don't claim to be like any hardware implementation,
> + * but just report the GIC as version 3 - which is what a Linux guest
> + * would check.
> + */
> +static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
> +			       struct kvm_exit_mmio *mmio,
> +			       phys_addr_t offset)
> +{
> +	u32 reg = 0;
> +
> +	switch (offset + GICD_IDREGS) {
> +	case GICD_PIDR2:
> +		reg = 0x3b;
> +		break;
> +	}
> +
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +
> +	return false;
> +}
> +
> +static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
> +	{
> +		.base           = GICD_CTLR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_ctlr,
> +	},
> +	{
> +		.base           = GICD_TYPER,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_typer,
> +	},
> +	{
> +		.base           = GICD_IIDR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_iidr,
> +	},
> +	{
> +		/* this register is optional, it is RAZ/WI if not implemented */
> +		.base           = GICD_STATUSR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this write only register is WI when TYPER.MBIS=0 */
> +		.base		= GICD_SETSPI_NSR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this write only register is WI when TYPER.MBIS=0 */
> +		.base		= GICD_CLRSPI_NSR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_SETSPI_SR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_CLRSPI_SR,
> +		.len		= 0x04,
> +		.bits_per_irq	= 0,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IGROUPR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_rao_wi,
> +	},
> +	{
> +		.base		= GICD_ISENABLER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_enable_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ICENABLER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_enable_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ISPENDR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_pending_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ICPENDR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_pending_reg_dist,
> +	},
> +	{
> +		.base		= GICD_ISACTIVER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_ICACTIVER,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IPRIORITYR,
> +		.len		= 0x400,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_priority_reg_dist,
> +	},
> +	{
> +		/* TARGETSRn is RES0 when ARE=1 */
> +		.base		= GICD_ITARGETSR,
> +		.len		= 0x400,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_ICFGR,
> +		.len		= 0x100,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_cfg_reg_dist,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_IGRPMODR,
> +		.len		= 0x80,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when DS=1 */
> +		.base		= GICD_NSACR,
> +		.len		= 0x100,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base		= GICD_SGIR,
> +		.len		= 0x04,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base		= GICD_CPENDSGIR,
> +		.len		= 0x10,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		/* this is RAZ/WI when ARE=1 */
> +		.base           = GICD_SPENDSGIR,
> +		.len            = 0x10,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICD_IROUTER + 0x100,
> +		.len		= 0x1edc,
> +		.bits_per_irq	= 64,
> +		.handle_mmio	= handle_mmio_route_reg,
> +	},
> +	{
> +		.base           = GICD_IDREGS,
> +		.len            = 0x30,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_idregs,
> +	},
> +	{},
> +};
> +
> +static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
> +					      struct kvm_exit_mmio *mmio,
> +					      phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +				      redist_vcpu->vcpu_id,
> +				      ACCESS_WRITE_SETBIT);
> +}
> +
> +static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
> +						struct kvm_exit_mmio *mmio,
> +						phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
> +				      redist_vcpu->vcpu_id,
> +				      ACCESS_WRITE_CLEARBIT);
> +}
> +
> +static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
> +					       struct kvm_exit_mmio *mmio,
> +					       phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
> +					   redist_vcpu->vcpu_id);
> +}
> +
> +static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
> +						 struct kvm_exit_mmio *mmio,
> +						 phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
> +					     redist_vcpu->vcpu_id);
> +}
> +
> +static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
> +					    struct kvm_exit_mmio *mmio,
> +					    phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +	u32 *reg;
> +
> +	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
> +				   redist_vcpu->vcpu_id, offset);
> +	vgic_reg_access(mmio, reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> +	return false;
> +}
> +
> +static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
> +				       struct kvm_exit_mmio *mmio,
> +				       phys_addr_t offset)
> +{
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +
> +	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
> +				       redist_vcpu->vcpu_id, offset >> 1);
> +
> +	return vgic_handle_cfg_reg(reg, mmio, offset);
> +}
> +
> +static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
> +	{
> +		.base		= GICR_IGROUPR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,

I think you were going to change this to handle_mmio_rao_wi() instead?

> +	},
> +	{
> +		.base		= GICR_ISENABLER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_enable_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICENABLER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_enable_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ISPENDR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_set_pending_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICPENDR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_clear_pending_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ISACTIVER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_ICACTIVER0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_IPRIORITYR0,
> +		.len		= 0x20,
> +		.bits_per_irq	= 8,
> +		.handle_mmio	= handle_mmio_priority_reg_redist,
> +	},
> +	{
> +		.base		= GICR_ICFGR0,
> +		.len		= 0x08,
> +		.bits_per_irq	= 2,
> +		.handle_mmio	= handle_mmio_cfg_reg_redist,
> +	},
> +	{
> +		.base		= GICR_IGRPMODR0,
> +		.len		= 0x04,
> +		.bits_per_irq	= 1,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{
> +		.base		= GICR_NSACR,
> +		.len		= 0x04,
> +		.handle_mmio	= handle_mmio_raz_wi,
> +	},
> +	{},
> +};
> +
> +static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
> +				    struct kvm_exit_mmio *mmio,
> +				    phys_addr_t offset)
> +{
> +	/* since we don't support LPIs, this register is zero for now */
> +	vgic_reg_access(mmio, NULL, offset,
> +			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
> +				     struct kvm_exit_mmio *mmio,
> +				     phys_addr_t offset)
> +{
> +	u32 reg;
> +	u64 mpidr;
> +	struct kvm_vcpu *redist_vcpu = mmio->private;
> +	int target_vcpu_id = redist_vcpu->vcpu_id;
> +
> +	/* the upper 32 bits contain the affinity value */
> +	if ((offset & ~3) == 4) {
> +		mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
> +		reg = compress_mpidr(mpidr);
> +
> +		vgic_reg_access(mmio, &reg, offset,
> +				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +		return false;
> +	}
> +
> +	reg = redist_vcpu->vcpu_id << 8;
> +	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
> +		reg |= GICR_TYPER_LAST;
> +	vgic_reg_access(mmio, &reg, offset,
> +			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> +	return false;
> +}
> +
> +static const struct kvm_mmio_range vgic_redist_ranges[] = {
> +	{
> +		.base           = GICR_CTLR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_ctlr_redist,
> +	},
> +	{
> +		.base           = GICR_TYPER,
> +		.len            = 0x08,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_typer_redist,
> +	},
> +	{
> +		.base           = GICR_IIDR,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_iidr,
> +	},
> +	{
> +		.base           = GICR_WAKER,
> +		.len            = 0x04,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_raz_wi,
> +	},
> +	{
> +		.base           = GICR_IDREGS,
> +		.len            = 0x30,
> +		.bits_per_irq   = 0,
> +		.handle_mmio    = handle_mmio_idregs,
> +	},
> +	{},
> +};
> +
> +/*
> + * This function splits accesses between the distributor and the two
> + * redistributor parts (private/SPI). As each redistributor is accessible
> + * from any CPU, we have to determine the affected VCPU by taking the faulting
> + * address into account. We then pass this VCPU to the handler function via
> + * the private parameter.
> + */
> +#define SGI_BASE_OFFSET SZ_64K
> +static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +				struct kvm_exit_mmio *mmio)
> +{
> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +	unsigned long dbase = dist->vgic_dist_base;
> +	unsigned long rdbase = dist->vgic_redist_base;
> +	int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
> +	int vcpu_id;
> +	const struct kvm_mmio_range *mmio_range;
> +
> +	if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
> +		return vgic_handle_mmio_range(vcpu, run, mmio,
> +					      vgic_v3_dist_ranges, dbase);
> +	}
> +
> +	if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> +	    GIC_V3_REDIST_SIZE * nrcpus))
> +		return false;

Did you think more about the contiguous allocation issue here or can you
give me a pointer to the requirement in the spec?
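
FWIW, the per-VCPU layout the code below assumes (derived from
GIC_V3_REDIST_SIZE and SGI_BASE_OFFSET, sketch only):

	/* two contiguous 64K frames per VCPU, VCPUs back-to-back: */
	phys_addr_t rd_frame  = rdbase + vcpu_id * GIC_V3_REDIST_SIZE;
	phys_addr_t sgi_frame = rd_frame + SGI_BASE_OFFSET;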

> +
> +	vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
> +	rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
> +	mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
> +
> +	if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
> +		rdbase += SGI_BASE_OFFSET;
> +		mmio_range = vgic_redist_sgi_ranges;
> +	} else {
> +		mmio_range = vgic_redist_ranges;
> +	}
> +	return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
> +}
> +
> +static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
> +{
> +	if (vgic_queue_irq(vcpu, 0, irq)) {
> +		vgic_dist_irq_clear_pending(vcpu, irq);
> +		vgic_cpu_irq_clear(vcpu, irq);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static int vgic_v3_init_maps(struct vgic_dist *dist)
> +{
> +	int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
> +
> +	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
> +				      GFP_KERNEL);
> +
> +	if (!dist->irq_spi_mpidr)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	int ret, i;
> +	u32 mpidr;
> +
> +	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
> +	    IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
> +		kvm_err("Need to set vgic distributor addresses first\n");
> +		return -ENXIO;
> +	}
> +
> +	/*
> +	 * FIXME: this should be moved to init_maps time, and may bite
> +	 * us when adding save/restore. Add a per-emulation hook?
> +	 */

progress on this fixme?

> +	ret = vgic_v3_init_maps(dist);
> +	if (ret) {
> +		kvm_err("Unable to allocate maps\n");
> +		return ret;
> +	}
> +
> +	/* Initialize the target VCPUs for each IRQ to VCPU 0 */
> +	mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
> +	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
> +		dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
> +		dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
> +		vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
> +	}
> +
> +	return 0;
> +}
> +
> +/* GICv3 does not keep track of SGI sources anymore. */
> +static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
> +{
> +}
> +
> +int vgic_v3_init_emulation(struct kvm *kvm)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +
> +	dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
> +	dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
> +	dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
> +	dist->vm_ops.vgic_init = vgic_v3_init;
> +
> +	kvm->arch.max_vcpus = KVM_MAX_VCPUS;
> +
> +	return 0;
> +}
> +
> +static int vgic_v3_create(struct kvm_device *dev, u32 type)
> +{
> +	return kvm_vgic_create(dev->kvm, type);
> +}
> +
> +static void vgic_v3_destroy(struct kvm_device *dev)
> +{
> +	kfree(dev);
> +}
> +
> +static int vgic_v3_set_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	ret = vgic_set_common_attr(dev, attr);
> +	if (ret != -ENXIO)
> +		return ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int vgic_v3_get_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	ret = vgic_get_common_attr(dev, attr);
> +	if (ret != -ENXIO)
> +		return ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int vgic_v3_has_attr(struct kvm_device *dev,
> +			    struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_VGIC_GRP_ADDR:
> +		switch (attr->attr) {
> +		case KVM_VGIC_V2_ADDR_TYPE_DIST:
> +		case KVM_VGIC_V2_ADDR_TYPE_CPU:
> +			return -ENXIO;
> +		}
> +		break;
> +	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
> +	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
> +		return -ENXIO;
> +	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
> +		return 0;
> +	}
> +	return -ENXIO;
> +}
> +
> +struct kvm_device_ops kvm_arm_vgic_v3_ops = {
> +	.name = "kvm-arm-vgic-v3",
> +	.create = vgic_v3_create,
> +	.destroy = vgic_v3_destroy,
> +	.set_attr = vgic_v3_set_attr,
> +	.get_attr = vgic_v3_get_attr,
> +	.has_attr = vgic_v3_has_attr,
> +};
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 335ffe0..b7de0f8 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  	struct kvm_vcpu *vcpu;
>  	int edge_triggered, level_triggered;
>  	int enabled;
> -	bool ret = true;
> +	bool ret = true, can_inject = true;
>  
>  	spin_lock(&dist->lock);
>  
> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
>  		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
> +		if (cpuid == VCPU_NOT_ALLOCATED) {
> +			/* Pretend we use CPU0, and prevent injection */
> +			cpuid = 0;
> +			can_inject = false;
> +		}
>  		vcpu = kvm_get_vcpu(kvm, cpuid);
>  	}
>  
> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>  
>  	enabled = vgic_irq_is_enabled(vcpu, irq_num);
>  
> -	if (!enabled) {
> +	if (!enabled || !can_inject) {
>  		ret = false;
>  		goto out;
>  	}
> @@ -1438,6 +1443,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
>  	}
>  	kfree(dist->irq_sgi_sources);
>  	kfree(dist->irq_spi_cpu);
> +	kfree(dist->irq_spi_mpidr);
>  	kfree(dist->irq_spi_target);
>  	kfree(dist->irq_pending_on_cpu);
>  	dist->irq_sgi_sources = NULL;
> @@ -1628,6 +1634,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
>  	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
>  	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
>  	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> +	kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
>  
>  out_unlock:
>  	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
> diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
> index ff3171a..b0c6b2f 100644
> --- a/virt/kvm/arm/vgic.h
> +++ b/virt/kvm/arm/vgic.h
> @@ -35,6 +35,8 @@
>  #define ACCESS_WRITE_VALUE	(3 << 1)
>  #define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))
>  
> +#define VCPU_NOT_ALLOCATED	((u8)-1)
> +
>  unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
>  
>  void vgic_update_state(struct kvm *kvm);
> @@ -115,5 +117,6 @@ int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>  int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>  
>  int vgic_v2_init_emulation(struct kvm *kvm);
> +int vgic_v3_init_emulation(struct kvm *kvm);
>  
>  #endif
> -- 
> 1.7.9.5
> 

Thanks,
-Christoffer
Andre Przywara Nov. 24, 2014, 4 p.m. UTC | #6
Hi,

On 23/11/14 14:38, Christoffer Dall wrote:
> On Fri, Nov 14, 2014 at 10:07:59AM +0000, Andre Przywara wrote:
>> With everything separated and prepared, we implement a model of a
>> GICv3 distributor and redistributors by using the existing framework
>> to provide handler functions for each register group.
>>
>> Currently we limit the emulation to a model enforcing a single
>> security state, with SRE==1 (forcing system register access) and
>> ARE==1 (allowing more than 8 VCPUs).
>>
>> We share some of the functions provided for GICv2 emulation, but take
>> the different ways of addressing (v)CPUs into account.
>> Save and restore is currently not implemented.
>>
>> Similar to the split-off of the GICv2 specific code, the new emulation
>> code goes into a new file (vgic-v3-emul.c).
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> ---
>> Changelog v3...v4:
>> - remove ICC_SGI1R_EL1 register handling (moved into later patch)
>> - add definitions for single security state
> 
> what exactly does this mean?

Sorry, forgot to add that I was talking about GICD_CTLR bit values here.
Unfortunately the meaning of bits 0..2 differs depending on the security
setup. I added the names for the single security case (the third group
in the spec).
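
To spell out the clash (sketch only, as I read the spec's GICD_CTLR
description):

	/* bit    two security states (NS view)    single security state */
	/* [0]    EnableGrp1                       EnableGrp0            */
	/* [1]    EnableGrp1A                      EnableGrp1            */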

>> - document emulation limitations in vgic-v3-emul.c header
>> - move CTLR, TYPER and IIDR handling into separate functions
>> - add RAO/WI handling for IGROUPRn registers
>> - remove unneeded offset masking on calling vgic_reg_access()
>> - rework handle_mmio_route_reg() to only handle SPIs
>> - refine IROUTERn register range
>> - use non-atomic bitops functions (__clear_bit() and __set_bit())
>> - rename vgic_dist_ranges[] to vgic_v3_dist_ranges[]
>> - add (RAZ/WI) implementation of GICD_STATUSR
>> - add (RAZ/WI) implementations of MBI registers
>> - adapt to new private passing (in struct kvm_exit_mmio instead of a paramter)
>> - fix vcpu_id calculation bug in handle CFG registers
> 
> which bug was that?  I can't see the difference between v3 and v4 on
> this one?

http://www.linux-arm.org/git?p=linux-ap.git;a=commitdiff;h=dae47cd0b1c233241112037cea717f8c364a34e9
That was obviously introduced during some rework, where I didn't catch
all the cases.

>> - always use hexadecimal numbers for .len member
>> - simplify vgic_v3_handle_mmio()
>> - add vgic_v3_create() and vgic_v3_destroy()
>> - swap vgic_v3_[sg]et_attr() code location
>> - add and improve comments
>> - (adaptions to changes from earlier patches)
>>
>>  arch/arm64/kvm/Makefile            |    1 +
>>  include/kvm/arm_vgic.h             |    9 +-
>>  include/linux/irqchip/arm-gic-v3.h |   32 ++
>>  include/linux/kvm_host.h           |    1 +
>>  include/uapi/linux/kvm.h           |    2 +
>>  virt/kvm/arm/vgic-v3-emul.c        |  904 ++++++++++++++++++++++++++++++++++++
>>  virt/kvm/arm/vgic.c                |   11 +-
>>  virt/kvm/arm/vgic.h                |    3 +
>>  8 files changed, 960 insertions(+), 3 deletions(-)
>>  create mode 100644 virt/kvm/arm/vgic-v3-emul.c
>>
>> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
>> index d957353..4e6e09e 100644
>> --- a/arch/arm64/kvm/Makefile
>> +++ b/arch/arm64/kvm/Makefile
>> @@ -24,5 +24,6 @@ kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
>>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
>>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
>>  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
>> +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
>>  kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
>>  kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
>> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
>> index 421833f..c1ef5a9 100644
>> --- a/include/kvm/arm_vgic.h
>> +++ b/include/kvm/arm_vgic.h
>> @@ -160,7 +160,11 @@ struct vgic_dist {
>>
>>       /* Distributor and vcpu interface mapping in the guest */
>>       phys_addr_t             vgic_dist_base;
>> -     phys_addr_t             vgic_cpu_base;
>> +     /* GICv2 and GICv3 use different mapped register blocks */
>> +     union {
>> +             phys_addr_t             vgic_cpu_base;
>> +             phys_addr_t             vgic_redist_base;
>> +     };
>>
>>       /* Distributor enabled */
>>       u32                     enabled;
>> @@ -222,6 +226,9 @@ struct vgic_dist {
>>        */
>>       struct vgic_bitmap      *irq_spi_target;
>>
>> +     /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
>> +     u32                     *irq_spi_mpidr;
>> +
>>       /* Bitmap indicating which CPU has something pending */
>>       unsigned long           *irq_pending_on_cpu;
>>
>> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
>> index 03a4ea3..726d898 100644
>> --- a/include/linux/irqchip/arm-gic-v3.h
>> +++ b/include/linux/irqchip/arm-gic-v3.h
>> @@ -33,6 +33,7 @@
>>  #define GICD_SETSPI_SR                       0x0050
>>  #define GICD_CLRSPI_SR                       0x0058
>>  #define GICD_SEIR                    0x0068
>> +#define GICD_IGROUPR                 0x0080
>>  #define GICD_ISENABLER                       0x0100
>>  #define GICD_ICENABLER                       0x0180
>>  #define GICD_ISPENDR                 0x0200
>> @@ -41,14 +42,37 @@
>>  #define GICD_ICACTIVER                       0x0380
>>  #define GICD_IPRIORITYR                      0x0400
>>  #define GICD_ICFGR                   0x0C00
>> +#define GICD_IGRPMODR                        0x0D00
>> +#define GICD_NSACR                   0x0E00
>>  #define GICD_IROUTER                 0x6000
>> +#define GICD_IDREGS                  0xFFD0
>>  #define GICD_PIDR2                   0xFFE8
>>
>> +/*
>> + * Those registers are actually from GICv2, but the spec demands that they
>> + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3).
>> + */
>> +#define GICD_ITARGETSR                       0x0800
>> +#define GICD_SGIR                    0x0F00
>> +#define GICD_CPENDSGIR                       0x0F10
>> +#define GICD_SPENDSGIR                       0x0F20
>> +
>>  #define GICD_CTLR_RWP                        (1U << 31)
>> +#define GICD_CTLR_DS                 (1U << 6)
>>  #define GICD_CTLR_ARE_NS             (1U << 4)
>>  #define GICD_CTLR_ENABLE_G1A         (1U << 1)
>>  #define GICD_CTLR_ENABLE_G1          (1U << 0)
>>
>> +/*
>> + * In systems with a single security state (what we emulate in KVM)
>> + * the meaning of the interrupt group enable bits is slightly different
>> + */
>> +#define GICD_CTLR_ENABLE_SS_G1               (1U << 1)
>> +#define GICD_CTLR_ENABLE_SS_G0               (1U << 0)
>> +
>> +#define GICD_TYPER_LPIS                      (1U << 17)
>> +#define GICD_TYPER_MBIS                      (1U << 16)
>> +
>>  #define GICD_IROUTER_SPI_MODE_ONE    (0U << 31)
>>  #define GICD_IROUTER_SPI_MODE_ANY    (1U << 31)
>>
>> @@ -56,6 +80,8 @@
>>  #define GIC_PIDR2_ARCH_GICv3         0x30
>>  #define GIC_PIDR2_ARCH_GICv4         0x40
>>
>> +#define GIC_V3_DIST_SIZE             0x10000
>> +
>>  /*
>>   * Re-Distributor registers, offsets from RD_base
>>   */
>> @@ -74,6 +100,7 @@
>>  #define GICR_SYNCR                   0x00C0
>>  #define GICR_MOVLPIR                 0x0100
>>  #define GICR_MOVALLR                 0x0110
>> +#define GICR_IDREGS                  GICD_IDREGS
>>  #define GICR_PIDR2                   GICD_PIDR2
>>
>>  #define GICR_WAKER_ProcessorSleep    (1U << 1)
>> @@ -82,6 +109,7 @@
>>  /*
>>   * Re-Distributor registers, offsets from SGI_base
>>   */
>> +#define GICR_IGROUPR0                        GICD_IGROUPR
>>  #define GICR_ISENABLER0                      GICD_ISENABLER
>>  #define GICR_ICENABLER0                      GICD_ICENABLER
>>  #define GICR_ISPENDR0                        GICD_ISPENDR
>> @@ -90,10 +118,14 @@
>>  #define GICR_ICACTIVER0                      GICD_ICACTIVER
>>  #define GICR_IPRIORITYR0             GICD_IPRIORITYR
>>  #define GICR_ICFGR0                  GICD_ICFGR
>> +#define GICR_IGRPMODR0                       GICD_IGRPMODR
>> +#define GICR_NSACR                   GICD_NSACR
>>
>>  #define GICR_TYPER_VLPIS             (1U << 1)
>>  #define GICR_TYPER_LAST                      (1U << 4)
>>
>> +#define GIC_V3_REDIST_SIZE           0x20000
>> +
>>  /*
>>   * CPU interface registers
>>   */
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 326ba7a..4a7798e 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1085,6 +1085,7 @@ void kvm_unregister_device_ops(u32 type);
>>  extern struct kvm_device_ops kvm_mpic_ops;
>>  extern struct kvm_device_ops kvm_xics_ops;
>>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>> +extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>>
>>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6076882..24cb129 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -960,6 +960,8 @@ enum kvm_device_type {
>>  #define KVM_DEV_TYPE_ARM_VGIC_V2     KVM_DEV_TYPE_ARM_VGIC_V2
>>       KVM_DEV_TYPE_FLIC,
>>  #define KVM_DEV_TYPE_FLIC            KVM_DEV_TYPE_FLIC
>> +     KVM_DEV_TYPE_ARM_VGIC_V3,
>> +#define KVM_DEV_TYPE_ARM_VGIC_V3     KVM_DEV_TYPE_ARM_VGIC_V3
>>       KVM_DEV_TYPE_MAX,
>>  };
>>
>> diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
>> new file mode 100644
>> index 0000000..97b5801
>> --- /dev/null
>> +++ b/virt/kvm/arm/vgic-v3-emul.c
>> @@ -0,0 +1,904 @@
>> +/*
>> + * GICv3 distributor and redistributor emulation
>> + *
>> + * GICv3 emulation is currently only supported on a GICv3 host (because
>> + * we rely on the hardware's CPU interface virtualization support), but
>> + * supports both hardware with or without the optional GICv2 backwards
>> + * compatibility features.
>> + *
>> + * Limitations of the emulation:
>> + * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
>> + * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
>> + * - We do not support the message based interrupts (MBIs) triggered by
>> + *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
>> + * - We do not support the (optional) backwards compatibility feature.
>> + *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
>> + *   the compatibility feature, you can use a GICv2 in the guest, though.
>> + * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
>> + * - Priorities are not emulated (same as the GICv2 emulation). Linux
>> + *   as a guest is fine with this, because it does not use priorities.
>> + * - We only support Group1 interrupts. Again Linux uses only those.
>> + *
>> + * Copyright (C) 2014 ARM Ltd.
>> + * Author: Andre Przywara <andre.przywara@arm.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <linux/cpu.h>
>> +#include <linux/kvm.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/interrupt.h>
>> +
>> +#include <linux/irqchip/arm-gic-v3.h>
>> +#include <kvm/arm_vgic.h>
>> +
>> +#include <asm/kvm_emulate.h>
>> +#include <asm/kvm_arm.h>
>> +#include <asm/kvm_mmu.h>
>> +
>> +#include "vgic.h"
>> +
>> +static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
>> +                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
>> +{
>> +     u32 reg = 0xffffffff;
>> +
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
>> +                          struct kvm_exit_mmio *mmio, phys_addr_t offset)
>> +{
>> +     u32 reg = 0;
>> +
>> +     /*
>> +      * Force ARE and DS to 1, the guest cannot change this.
>> +      * For the time being we only support Group1 interrupts.
>> +      */
>> +     if (vcpu->kvm->arch.vgic.enabled)
>> +             reg = GICD_CTLR_ENABLE_SS_G1;
>> +     reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
>> +
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +     if (mmio->is_write) {
>> +             if (reg & GICD_CTLR_ENABLE_SS_G0)
>> +                     kvm_info("guest tried to enable unsupported Group0 interrupts\n");
>> +             vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
>> +             vgic_update_state(vcpu->kvm);
>> +             return true;
>> +     }
>> +     return false;
>> +}
>> +
>> +/*
>> + * As this implementation does not provide compatibility
>> + * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
>> + * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
>> + * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
>> + */
>> +#define INTERRUPT_ID_BITS 10
>> +static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
>> +                           struct kvm_exit_mmio *mmio, phys_addr_t offset)
>> +{
>> +     u32 reg;
>> +
>> +     /* we report at most 1024 IRQs via this interface */
> 
> hmmm, do we need to repeat ourselves here?

Actually ... not.
To avoid confusion, I will probably just drop this comment.

> I get a bit confused by both the comment above and here as to *why* we
> are reporting this value.  And what is the bit about 'this interface'?

By 'this interface' I meant the number of SPIs, which is communicated
here in a GICv2-compatible way (ITLinesNumber). Looking forward to LPI
support, I didn't want to use the term IRQ without some qualification.

> Is there another interface?

IDbits, but admittedly this isn't clear from the comment.
Not sure if that justifies more comments before we add ITS support, though.

> Perhaps what you're trying to get at here are the semantic differences
> between ITLinesNumber and IDbits and how that helps a reader understand
> the code.

I can add a better comment.
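Maybe something like this (just a draft):

	/*
	 * The GICD_TYPER value advertises two related, but different
	 * things:
	 * - ITLinesNumber tells the guest how many SPIs it can use, in
	 *   a GICv2-compatible unit of 32 interrupt lines, capped at
	 *   1024 interrupt IDs here.
	 * - IDbits tells it how wide an interrupt ID can be, which
	 *   becomes relevant once we support LPIs.
	 */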

>> +     reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
>> +
>> +     reg |= (INTERRUPT_ID_BITS - 1) << 19;
>> +
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
>> +                          struct kvm_exit_mmio *mmio, phys_addr_t offset)
>> +{
>> +     u32 reg;
>> +
>> +     reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
>> +                                         struct kvm_exit_mmio *mmio,
>> +                                         phys_addr_t offset)
>> +{
>> +     if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
>> +             return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
>> +                                           vcpu->vcpu_id,
>> +                                           ACCESS_WRITE_SETBIT);
>> +
>> +     vgic_reg_access(mmio, NULL, offset,
>> +                     ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
>> +                                           struct kvm_exit_mmio *mmio,
>> +                                           phys_addr_t offset)
>> +{
>> +     if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
>> +             return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
>> +                                           vcpu->vcpu_id,
>> +                                           ACCESS_WRITE_CLEARBIT);
>> +
>> +     vgic_reg_access(mmio, NULL, offset,
>> +                     ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
>> +                                          struct kvm_exit_mmio *mmio,
>> +                                          phys_addr_t offset)
>> +{
>> +     if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
>> +             return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
>> +                                                vcpu->vcpu_id);
>> +
>> +     vgic_reg_access(mmio, NULL, offset,
>> +                     ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
>> +                                            struct kvm_exit_mmio *mmio,
>> +                                            phys_addr_t offset)
>> +{
>> +     if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
>> +             return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
>> +                                                  vcpu->vcpu_id);
>> +
>> +     vgic_reg_access(mmio, NULL, offset,
>> +                     ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
>> +                                       struct kvm_exit_mmio *mmio,
>> +                                       phys_addr_t offset)
>> +{
>> +     u32 *reg;
>> +
>> +     if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
>> +             vgic_reg_access(mmio, NULL, offset,
>> +                             ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +             return false;
>> +     }
>> +
>> +     reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
>> +                                vcpu->vcpu_id, offset);
>> +     vgic_reg_access(mmio, reg, offset,
>> +             ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
>> +                                  struct kvm_exit_mmio *mmio,
>> +                                  phys_addr_t offset)
>> +{
>> +     u32 *reg;
>> +
>> +     if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
>> +             vgic_reg_access(mmio, NULL, offset,
>> +                             ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +             return false;
>> +     }
>> +
>> +     reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
>> +                               vcpu->vcpu_id, offset >> 1);
>> +
>> +     return vgic_handle_cfg_reg(reg, mmio, offset);
>> +}
>> +
>> +/*
>> + * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
>> + * when we store the target MPIDR written by the guest.
>> + */
>> +static u32 compress_mpidr(unsigned long mpidr)
>> +{
>> +     u32 ret;
>> +
>> +     ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
>> +     ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
>> +     ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
>> +     ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
>> +
>> +     return ret;
>> +}
>> +
>> +static unsigned long uncompress_mpidr(u32 value)
>> +{
>> +     unsigned long mpidr;
>> +
>> +     mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
>> +     mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
>> +     mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
>> +     mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
>> +
>> +     return mpidr;
>> +}
>> +
>> +/*
>> + * Lookup the given MPIDR value to get the vcpu_id (if there is one)
>> + * and store that in the irq_spi_cpu[] array.
>> + * This limits the number of VCPUs to 255 for now, extending the data
>> + * type (or storing kvm_vcpu poiners) should lift the limit.
> 
> s/poiners/pointers

OK.

>> + * Store the original MPIDR value in an extra array to support read-as-written.
>> + * Unallocated MPIDRs are translated to a special value and caught
>> + * before any array accesses.
> 
> We may have covered this already, but why can't we restore the original
> MPIDR based on the irq_spi_cpu array?
> 
> Is that because we lose information about 'which' unallocated MPIDR was
> written?

Yes.

> If that's the case, it seems weird that we go through the
> trouble but then throw away the aff3 field anyway...?

Not supporting the Aff3 field saves us from caring about atomicity on
GICD_IROUTER accesses (where Aff3 is in the upper word of this 64-bit
register).
Not supporting Aff3 is an architectural option in GICv3, so this
seems like a viable solution.
I had some code to support "real" 64-bit accesses, which would allow
Aff3 support, but I will have to fight this through with Marc again
sometime in the future ;-)

>> + */
>> +static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
>> +                               struct kvm_exit_mmio *mmio,
>> +                               phys_addr_t offset)
>> +{
>> +     struct kvm *kvm = vcpu->kvm;
>> +     struct vgic_dist *dist = &kvm->arch.vgic;
>> +     int spi;
>> +     u32 reg;
>> +     int vcpu_id;
>> +     unsigned long *bmap, mpidr;
>> +
>> +     /*
>> +      * The upper 32 bits of each 64 bit register are zero,
>> +      * as we don't support Aff3.
>> +      */
>> +     if ((offset & 4)) {
>> +             vgic_reg_access(mmio, NULL, offset,
>> +                             ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +             return false;
>> +     }
>> +
>> +     /* This region only covers SPIs, so no handling of private IRQs here. */
>> +     spi = offset / 8;
> 
> that's not how I read the spec, it says that GICD_IROUTER0 to
> GICD_IROUTER31 are not implemented (because they are SGIs and PPIs), and
> I read the 'SPI ID m' as the lowest numbered SPI ID being 32, thus you
> should do:
> 
> spi = offset / 8 - VGIC_NR_PRIVATE_IRQS;

Well, below I changed the description of the IROUTER range to:
+     {
+             .base           = GICD_IROUTER + 0x100,
+             .len            = 0x1edc,
+             .bits_per_irq   = 64,
+             .handle_mmio    = handle_mmio_route_reg,
+     },

This was due to a comment on v3 by you, where you correctly stated the
difference in the spec's description between IROUTER and the other
registers regarding the private IRQ handling (not implemented/reserved
vs. RAZ/WI).

So the offset in this function is relative to 0x6100 and thus directly
yields the SPI index (the number used for the irq_spi_*[] arrays).
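
Spelling out the arithmetic (sketch, with m being the SPI's INTID):

	/* GICD_IROUTER<m> lives at 0x6000 + 8 * m, so with the range
	 * starting at GICD_IROUTER + 0x100 we arrive here with:
	 */
	offset = 8 * m - 0x100;		/* == (m - 32) * 8 */
	spi    = offset / 8;		/* == m - 32, the irq_spi_*[] index */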

>> +
>> +     /* get the stored MPIDR for this IRQ */
>> +     mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
>> +     mpidr &= MPIDR_HWID_BITMASK;
> 
> is this mask needed after calling uncompress above?

Indeed not.

>> +     reg = mpidr;
>> +
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +
>> +     if (!mmio->is_write)
>> +             return false;
>> +
>> +     /*
>> +      * Now clear the currently assigned vCPU from the map, making room
>> +      * for the new one to be written below
>> +      */
>> +     vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
>> +     if (likely(vcpu)) {
>> +             vcpu_id = vcpu->vcpu_id;
>> +             bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
>> +             __clear_bit(spi, bmap);
>> +     }
>> +
>> +     dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
>> +     vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
>> +
>> +     /*
>> +      * The spec says that non-existent MPIDR values should not be
>> +      * forwarded to any existent (v)CPU, but should be able to become
>> +      * pending anyway. We simply keep the irq_spi_target[] array empty, so
>> +      * the interrupt will never be injected.
>> +      * irq_spi_cpu[irq] gets a magic value in this case.
>> +      */
>> +     if (likely(vcpu)) {
>> +             vcpu_id = vcpu->vcpu_id;
>> +             dist->irq_spi_cpu[spi] = vcpu_id;
>> +             bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
>> +             __set_bit(spi, bmap);
>> +     } else {
>> +             dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
>> +     }
>> +
>> +     vgic_update_state(kvm);
>> +
>> +     return true;
>> +}
>> +
>> +/*
>> + * We should be careful about promising too much when a guest reads
>> + * this register. Don't claim to be like any hardware implementation,
>> + * but just report the GIC as version 3 - which is what a Linux guest
>> + * would check.
>> + */
>> +static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
>> +                            struct kvm_exit_mmio *mmio,
>> +                            phys_addr_t offset)
>> +{
>> +     u32 reg = 0;
>> +
>> +     switch (offset + GICD_IDREGS) {
>> +     case GICD_PIDR2:
>> +             reg = 0x3b;
>> +             break;
>> +     }
>> +
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +
>> +     return false;
>> +}
>> +
>> +static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
>> +     {
>> +             .base           = GICD_CTLR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_ctlr,
>> +     },
>> +     {
>> +             .base           = GICD_TYPER,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_typer,
>> +     },
>> +     {
>> +             .base           = GICD_IIDR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_iidr,
>> +     },
>> +     {
>> +             /* this register is optional, it is RAZ/WI if not implemented */
>> +             .base           = GICD_STATUSR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this write only register is WI when TYPER.MBIS=0 */
>> +             .base           = GICD_SETSPI_NSR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this write only register is WI when TYPER.MBIS=0 */
>> +             .base           = GICD_CLRSPI_NSR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when DS=1 */
>> +             .base           = GICD_SETSPI_SR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when DS=1 */
>> +             .base           = GICD_CLRSPI_SR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICD_IGROUPR,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_rao_wi,
>> +     },
>> +     {
>> +             .base           = GICD_ISENABLER,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_set_enable_reg_dist,
>> +     },
>> +     {
>> +             .base           = GICD_ICENABLER,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_clear_enable_reg_dist,
>> +     },
>> +     {
>> +             .base           = GICD_ISPENDR,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_set_pending_reg_dist,
>> +     },
>> +     {
>> +             .base           = GICD_ICPENDR,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_clear_pending_reg_dist,
>> +     },
>> +     {
>> +             .base           = GICD_ISACTIVER,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICD_ICACTIVER,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICD_IPRIORITYR,
>> +             .len            = 0x400,
>> +             .bits_per_irq   = 8,
>> +             .handle_mmio    = handle_mmio_priority_reg_dist,
>> +     },
>> +     {
>> +             /* TARGETSRn is RES0 when ARE=1 */
>> +             .base           = GICD_ITARGETSR,
>> +             .len            = 0x400,
>> +             .bits_per_irq   = 8,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICD_ICFGR,
>> +             .len            = 0x100,
>> +             .bits_per_irq   = 2,
>> +             .handle_mmio    = handle_mmio_cfg_reg_dist,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when DS=1 */
>> +             .base           = GICD_IGRPMODR,
>> +             .len            = 0x80,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when DS=1 */
>> +             .base           = GICD_NSACR,
>> +             .len            = 0x100,
>> +             .bits_per_irq   = 2,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when ARE=1 */
>> +             .base           = GICD_SGIR,
>> +             .len            = 0x04,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when ARE=1 */
>> +             .base           = GICD_CPENDSGIR,
>> +             .len            = 0x10,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             /* this is RAZ/WI when ARE=1 */
>> +             .base           = GICD_SPENDSGIR,
>> +             .len            = 0x10,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICD_IROUTER + 0x100,
>> +             .len            = 0x1edc,
>> +             .bits_per_irq   = 64,
>> +             .handle_mmio    = handle_mmio_route_reg,
>> +     },
>> +     {
>> +             .base           = GICD_IDREGS,
>> +             .len            = 0x30,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_idregs,
>> +     },
>> +     {},
>> +};
>> +
>> +static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
>> +                                           struct kvm_exit_mmio *mmio,
>> +                                           phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +
>> +     return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
>> +                                   redist_vcpu->vcpu_id,
>> +                                   ACCESS_WRITE_SETBIT);
>> +}
>> +
>> +static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
>> +                                             struct kvm_exit_mmio *mmio,
>> +                                             phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +
>> +     return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
>> +                                   redist_vcpu->vcpu_id,
>> +                                   ACCESS_WRITE_CLEARBIT);
>> +}
>> +
>> +static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
>> +                                            struct kvm_exit_mmio *mmio,
>> +                                            phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +
>> +     return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
>> +                                        redist_vcpu->vcpu_id);
>> +}
>> +
>> +static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
>> +                                              struct kvm_exit_mmio *mmio,
>> +                                              phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +
>> +     return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
>> +                                          redist_vcpu->vcpu_id);
>> +}
>> +
>> +static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
>> +                                         struct kvm_exit_mmio *mmio,
>> +                                         phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +     u32 *reg;
>> +
>> +     reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
>> +                                redist_vcpu->vcpu_id, offset);
>> +     vgic_reg_access(mmio, reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
>> +                                    struct kvm_exit_mmio *mmio,
>> +                                    phys_addr_t offset)
>> +{
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +
>> +     u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
>> +                                    redist_vcpu->vcpu_id, offset >> 1);
>> +
>> +     return vgic_handle_cfg_reg(reg, mmio, offset);
>> +}
>> +
>> +static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
>> +     {
>> +             .base           = GICR_IGROUPR0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
> 
> I think you were going to change this to handle_mmio_rao_wi() instead?

Indeed, thanks for spotting!

>> +     },
>> +     {
>> +             .base           = GICR_ISENABLER0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_set_enable_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_ICENABLER0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_clear_enable_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_ISPENDR0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_set_pending_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_ICPENDR0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_clear_pending_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_ISACTIVER0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICR_ICACTIVER0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICR_IPRIORITYR0,
>> +             .len            = 0x20,
>> +             .bits_per_irq   = 8,
>> +             .handle_mmio    = handle_mmio_priority_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_ICFGR0,
>> +             .len            = 0x08,
>> +             .bits_per_irq   = 2,
>> +             .handle_mmio    = handle_mmio_cfg_reg_redist,
>> +     },
>> +     {
>> +             .base           = GICR_IGRPMODR0,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 1,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICR_NSACR,
>> +             .len            = 0x04,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {},
>> +};
>> +
>> +static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
>> +                                 struct kvm_exit_mmio *mmio,
>> +                                 phys_addr_t offset)
>> +{
>> +     /* since we don't support LPIs, this register is zero for now */
>> +     vgic_reg_access(mmio, NULL, offset,
>> +                     ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
>> +                                  struct kvm_exit_mmio *mmio,
>> +                                  phys_addr_t offset)
>> +{
>> +     u32 reg;
>> +     u64 mpidr;
>> +     struct kvm_vcpu *redist_vcpu = mmio->private;
>> +     int target_vcpu_id = redist_vcpu->vcpu_id;
>> +
>> +     /* the upper 32 bits contain the affinity value */
>> +     if ((offset & ~3) == 4) {
>> +             mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
>> +             reg = compress_mpidr(mpidr);
>> +
>> +             vgic_reg_access(mmio, &reg, offset,
>> +                             ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +             return false;
>> +     }
>> +
>> +     reg = redist_vcpu->vcpu_id << 8;
>> +     if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
>> +             reg |= GICR_TYPER_LAST;
>> +     vgic_reg_access(mmio, &reg, offset,
>> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +     return false;
>> +}
>> +
>> +static const struct kvm_mmio_range vgic_redist_ranges[] = {
>> +     {
>> +             .base           = GICR_CTLR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_ctlr_redist,
>> +     },
>> +     {
>> +             .base           = GICR_TYPER,
>> +             .len            = 0x08,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_typer_redist,
>> +     },
>> +     {
>> +             .base           = GICR_IIDR,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_iidr,
>> +     },
>> +     {
>> +             .base           = GICR_WAKER,
>> +             .len            = 0x04,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_raz_wi,
>> +     },
>> +     {
>> +             .base           = GICR_IDREGS,
>> +             .len            = 0x30,
>> +             .bits_per_irq   = 0,
>> +             .handle_mmio    = handle_mmio_idregs,
>> +     },
>> +     {},
>> +};
>> +
>> +/*
>> + * This function splits accesses between the distributor and the two
>> + * redistributor parts (private/SPI). As each redistributor is accessible
>> + * from any CPU, we have to determine the affected VCPU by taking the faulting
>> + * address into account. We then pass this VCPU to the handler function via
>> + * the private parameter.
>> + */
>> +#define SGI_BASE_OFFSET SZ_64K
>> +static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
>> +                             struct kvm_exit_mmio *mmio)
>> +{
>> +     struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +     unsigned long dbase = dist->vgic_dist_base;
>> +     unsigned long rdbase = dist->vgic_redist_base;
>> +     int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
>> +     int vcpu_id;
>> +     const struct kvm_mmio_range *mmio_range;
>> +
>> +     if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
>> +             return vgic_handle_mmio_range(vcpu, run, mmio,
>> +                                           vgic_v3_dist_ranges, dbase);
>> +     }
>> +
>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
>> +         GIC_V3_REDIST_SIZE * nrcpus))
>> +             return false;
> 
> Did you think more about the contiguous allocation issue here or can you
> give me a pointer to the requirement in the spec?

5.4.1 Re-Distributor Addressing

>> +
>> +     vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
>> +     rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
>> +     mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
>> +
>> +     if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
>> +             rdbase += SGI_BASE_OFFSET;
>> +             mmio_range = vgic_redist_sgi_ranges;
>> +     } else {
>> +             mmio_range = vgic_redist_ranges;
>> +     }
>> +     return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
>> +}
>> +
>> +static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
>> +{
>> +     if (vgic_queue_irq(vcpu, 0, irq)) {
>> +             vgic_dist_irq_clear_pending(vcpu, irq);
>> +             vgic_cpu_irq_clear(vcpu, irq);
>> +             return true;
>> +     }
>> +
>> +     return false;
>> +}
>> +
>> +static int vgic_v3_init_maps(struct vgic_dist *dist)
>> +{
>> +     int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
>> +
>> +     dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
>> +                                   GFP_KERNEL);
>> +
>> +     if (!dist->irq_spi_mpidr)
>> +             return -ENOMEM;
>> +
>> +     return 0;
>> +}
>> +
>> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
>> +{
>> +     struct vgic_dist *dist = &kvm->arch.vgic;
>> +     int ret, i;
>> +     u32 mpidr;
>> +
>> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
>> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
>> +             kvm_err("Need to set vgic distributor addresses first\n");
>> +             return -ENXIO;
>> +     }
>> +
>> +     /*
>> +      * FIXME: this should be moved to init_maps time, and may bite
>> +      * us when adding save/restore. Add a per-emulation hook?
>> +      */
> 
> progress on this fixme?

Progress supplies the ISS, but not this piece of code (read: none) ;-)
I am more in favour of a follow-up patch on this one ...

Many thanks for reading through all this mess^Wcode ... again.

Cheers,
Andre.

>> +     ret = vgic_v3_init_maps(dist);
>> +     if (ret) {
>> +             kvm_err("Unable to allocate maps\n");
>> +             return ret;
>> +     }
>> +
>> +     /* Initialize the target VCPUs for each IRQ to VCPU 0 */
>> +     mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
>> +     for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
>> +             dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
>> +             dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
>> +             vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +/* GICv3 does not keep track of SGI sources anymore. */
>> +static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
>> +{
>> +}
>> +
>> +int vgic_v3_init_emulation(struct kvm *kvm)
>> +{
>> +     struct vgic_dist *dist = &kvm->arch.vgic;
>> +
>> +     dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
>> +     dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
>> +     dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
>> +     dist->vm_ops.vgic_init = vgic_v3_init;
>> +
>> +     kvm->arch.max_vcpus = KVM_MAX_VCPUS;
>> +
>> +     return 0;
>> +}
>> +
>> +static int vgic_v3_create(struct kvm_device *dev, u32 type)
>> +{
>> +     return kvm_vgic_create(dev->kvm, type);
>> +}
>> +
>> +static void vgic_v3_destroy(struct kvm_device *dev)
>> +{
>> +     kfree(dev);
>> +}
>> +
>> +static int vgic_v3_set_attr(struct kvm_device *dev,
>> +                         struct kvm_device_attr *attr)
>> +{
>> +     int ret;
>> +
>> +     ret = vgic_set_common_attr(dev, attr);
>> +     if (ret != -ENXIO)
>> +             return ret;
>> +
>> +     switch (attr->group) {
>> +     case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
>> +     case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
>> +             return -ENXIO;
>> +     }
>> +
>> +     return -ENXIO;
>> +}
>> +
>> +static int vgic_v3_get_attr(struct kvm_device *dev,
>> +                         struct kvm_device_attr *attr)
>> +{
>> +     int ret;
>> +
>> +     ret = vgic_get_common_attr(dev, attr);
>> +     if (ret != -ENXIO)
>> +             return ret;
>> +
>> +     switch (attr->group) {
>> +     case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
>> +     case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
>> +             return -ENXIO;
>> +     }
>> +
>> +     return -ENXIO;
>> +}
>> +
>> +static int vgic_v3_has_attr(struct kvm_device *dev,
>> +                         struct kvm_device_attr *attr)
>> +{
>> +     switch (attr->group) {
>> +     case KVM_DEV_ARM_VGIC_GRP_ADDR:
>> +             switch (attr->attr) {
>> +             case KVM_VGIC_V2_ADDR_TYPE_DIST:
>> +             case KVM_VGIC_V2_ADDR_TYPE_CPU:
>> +                     return -ENXIO;
>> +             }
>> +             break;
>> +     case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
>> +     case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
>> +             return -ENXIO;
>> +     case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
>> +             return 0;
>> +     }
>> +     return -ENXIO;
>> +}
>> +
>> +struct kvm_device_ops kvm_arm_vgic_v3_ops = {
>> +     .name = "kvm-arm-vgic-v3",
>> +     .create = vgic_v3_create,
>> +     .destroy = vgic_v3_destroy,
>> +     .set_attr = vgic_v3_set_attr,
>> +     .get_attr = vgic_v3_get_attr,
>> +     .has_attr = vgic_v3_has_attr,
>> +};
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index 335ffe0..b7de0f8 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1249,7 +1249,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>       struct kvm_vcpu *vcpu;
>>       int edge_triggered, level_triggered;
>>       int enabled;
>> -     bool ret = true;
>> +     bool ret = true, can_inject = true;
>>
>>       spin_lock(&dist->lock);
>>
>> @@ -1264,6 +1264,11 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>
>>       if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
>>               cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
>> +             if (cpuid == VCPU_NOT_ALLOCATED) {
>> +                     /* Pretend we use CPU0, and prevent injection */
>> +                     cpuid = 0;
>> +                     can_inject = false;
>> +             }
>>               vcpu = kvm_get_vcpu(kvm, cpuid);
>>       }
>>
>> @@ -1285,7 +1290,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
>>
>>       enabled = vgic_irq_is_enabled(vcpu, irq_num);
>>
>> -     if (!enabled) {
>> +     if (!enabled || !can_inject) {
>>               ret = false;
>>               goto out;
>>       }
>> @@ -1438,6 +1443,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
>>       }
>>       kfree(dist->irq_sgi_sources);
>>       kfree(dist->irq_spi_cpu);
>> +     kfree(dist->irq_spi_mpidr);
>>       kfree(dist->irq_spi_target);
>>       kfree(dist->irq_pending_on_cpu);
>>       dist->irq_sgi_sources = NULL;
>> @@ -1628,6 +1634,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
>>       kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
>>       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
>>       kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
>> +     kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
>>
>>  out_unlock:
>>       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
>> diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
>> index ff3171a..b0c6b2f 100644
>> --- a/virt/kvm/arm/vgic.h
>> +++ b/virt/kvm/arm/vgic.h
>> @@ -35,6 +35,8 @@
>>  #define ACCESS_WRITE_VALUE   (3 << 1)
>>  #define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
>>
>> +#define VCPU_NOT_ALLOCATED   ((u8)-1)
>> +
>>  unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
>>
>>  void vgic_update_state(struct kvm *kvm);
>> @@ -115,5 +117,6 @@ int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>>  int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
>>
>>  int vgic_v2_init_emulation(struct kvm *kvm);
>> +int vgic_v3_init_emulation(struct kvm *kvm);
>>
>>  #endif
>> --
>> 1.7.9.5
>>
> 
> Thanks,
> -Christoffer
>
Christoffer Dall Nov. 25, 2014, 10:41 a.m. UTC | #7
Hi Andre,

On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:

[...]

> >> +
> >> +/*
> >> + * As this implementation does not provide compatibility
> >> + * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
> >> + * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
> >> + * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
> >> + */
> >> +#define INTERRUPT_ID_BITS 10
> >> +static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
> >> +                           struct kvm_exit_mmio *mmio, phys_addr_t offset)
> >> +{
> >> +     u32 reg;
> >> +
> >> +     /* we report at most 1024 IRQs via this interface */
> > 
> > hmmm, do we need to repeat ourselves here?
> 
> Actually ... not.
> To avoid confusion, I will probably just drop this comment.
> 
> > I get a bit confused by both the comment above and here, as to *why* we
> > are reporting this value?  And what is the bit about 'this interface'?
> 
> With this interface I meant the number of SPIs which is communicated
> here in a GICv2 compatible way (ITLinesNumber). Looking forward to LPI
> support I didn't want to use the term IRQ without some confinement.
> 
> > Is there another interface?
> 
> IDbits, but admittedly this isn't clear from the comment.
> Not sure if that justifies more comments before we add ITS support, though.
> 
> > Perhaps what you're trying to get at here are the semantic differences
> > between ITLinesNumber and IDbits and how that helps a reader understand
> > the code.
> 
> I can add a better comment.
> 

I think you just need to clarify the comment above the function or let
the code speak for itself.

> >> +     reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
> >> +
> >> +     reg |= (INTERRUPT_ID_BITS - 1) << 19;
> >> +
> >> +     vgic_reg_access(mmio, &reg, offset,
> >> +                     ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
> >> +
> >> +     return false;
> >> +}
> >> +


[...]

> >> + * Store the original MPIDR value in an extra array to support read-as-written.
> >> + * Unallocated MPIDRs are translated to a special value and caught
> >> + * before any array accesses.
> > 
> > We may have covered this already, but why can't we restore the original
> > MPIDR based on the irq_spi_cpu array?
> > 
> > Is that because we lose information about 'which' unallocated MPIDR was
> > written?
> 
> Yes.
> 
> > If that's the case, it seems weird that we go through the
> > trouble but we anyway throw away the aff3 field...?
> 
> Not supporting the aff3 field saves us from caring about atomicity on
> GICD_IROUTER accesses (where aff3 is in the upper word of this 64-bit
> register).
> Not supporting Aff3 is an architectural option in the GICv3, so this
> seems like a viable solution.
> I had some code to support "real" 64-bit accesses, which would allow
> Aff3 support, but have to fight this through Marc first sometime in the
> future again ;-)
> 

didn't realize it was an architecturally allowed option to not support
Aff3, in that case it's not worth the bother at this point.

> >> + */
> >> +static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
> >> +                               struct kvm_exit_mmio *mmio,
> >> +                               phys_addr_t offset)
> >> +{
> >> +     struct kvm *kvm = vcpu->kvm;
> >> +     struct vgic_dist *dist = &kvm->arch.vgic;
> >> +     int spi;
> >> +     u32 reg;
> >> +     int vcpu_id;
> >> +     unsigned long *bmap, mpidr;
> >> +
> >> +     /*
> >> +      * The upper 32 bits of each 64 bit register are zero,
> >> +      * as we don't support Aff3.
> >> +      */
> >> +     if ((offset & 4)) {
> >> +             vgic_reg_access(mmio, NULL, offset,
> >> +                             ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
> >> +             return false;
> >> +     }
> >> +
> >> +     /* This region only covers SPIs, so no handling of private IRQs here. */
> >> +     spi = offset / 8;
> > 
> > that's not how I read the spec, it says that GICD_IROUTER0 to
> > GICD_IROUTER31 are not implemented (because they are SGIs and PPIs), and
> > I read the 'SPI ID m' as the lowest numbered SPI ID being 32, thus you
> > should do:
> > 
> > spi = offset / 8 - VGIC_NR_PRIVATE_IRQS;
> 
> Well, below I changed the description of the IROUTER range to:

oh, now it's finally coming back together for me, I think I
misunderstood your point from the last round of review because I didn't
realize that GICD_IROUTER was defined as 0x6000 (which I actually think
is a bit backwards, but this is not the place or time).

> +     {
> +             .base           = GICD_IROUTER + 0x100,
> +             .len            = 0x1edc,

in that case, len should be 0x1ee0:

 $ printf '0x%x\n' $(( (0x7fd8 + 8) - 0x6100 ))

> +             .bits_per_irq   = 64,
> +             .handle_mmio    = handle_mmio_route_reg,
> +     },
> 
> This was due to a comment on v3 by you, where you correctly stated the
> difference in the spec's description between IROUTER and the other
> registers regarding the private IRQ handling (not implemented/reserved
> vs. RAZ/WI).
> 
> So the offset in this function is relative to 0x6100 and thus directly
> gives the SPI number.
> 

got it now, yes, the code is correct.
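(For instance, GICD_IROUTER<33> sits at 0x6108, which is offset 0x8 relative
to 0x6100, so spi = 1, i.e. the second SPI.)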

[...]

> >> +
> >> +/*
> >> + * This function splits accesses between the distributor and the two
> >> + * redistributor parts (private/SPI). As each redistributor is accessible
> >> + * from any CPU, we have to determine the affected VCPU by taking the faulting
> >> + * address into account. We then pass this VCPU to the handler function via
> >> + * the private parameter.
> >> + */
> >> +#define SGI_BASE_OFFSET SZ_64K
> >> +static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >> +                             struct kvm_exit_mmio *mmio)
> >> +{
> >> +     struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> >> +     unsigned long dbase = dist->vgic_dist_base;
> >> +     unsigned long rdbase = dist->vgic_redist_base;
> >> +     int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
> >> +     int vcpu_id;
> >> +     const struct kvm_mmio_range *mmio_range;
> >> +
> >> +     if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
> >> +             return vgic_handle_mmio_range(vcpu, run, mmio,
> >> +                                           vgic_v3_dist_ranges, dbase);
> >> +     }
> >> +
> >> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> >> +         GIC_V3_REDIST_SIZE * nrcpus))
> >> +             return false;
> > 
> > Did you think more about the contiguous allocation issue here or can you
> > give me a pointer to the requirement in the spec?
> 
> 5.4.1 Re-Distributor Addressing
> 

Section 5.4.1 talks about the pages within a single re-distributor having
to be contiguous, not all the re-distributor regions having to be
contiguous, right?

> >> +
> >> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
> >> +{
> >> +     struct vgic_dist *dist = &kvm->arch.vgic;
> >> +     int ret, i;
> >> +     u32 mpidr;
> >> +
> >> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
> >> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
> >> +             kvm_err("Need to set vgic distributor addresses first\n");
> >> +             return -ENXIO;
> >> +     }
> >> +
> >> +     /*
> >> +      * FIXME: this should be moved to init_maps time, and may bite
> >> +      * us when adding save/restore. Add a per-emulation hook?
> >> +      */
> > 
> > progress on this fixme?
> 
> Progress supplies the ISS, but not this piece of code (read: none) ;-)
> I am more in favour of a follow-up patch on this one ...

hmmm, I'm not a fan of merging code with this kind of a comment in it,
because it looks scary, and I don't really understand the problem from
just reading the comment, so something needs to be done here.

Thanks,

-Christoffer
Andre Przywara Nov. 28, 2014, 3:24 p.m. UTC | #8
Hej Christoffer,

On 25/11/14 10:41, Christoffer Dall wrote:
> Hi Andre,
> 
> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> 

[...]

>>>> +
>>>> +     /* This region only covers SPIs, so no handling of private IRQs here. */
>>>> +     spi = offset / 8;
>>>
>>> that's not how I read the spec, it says that GICD_IROUTER0 to
>>> GICD_IROUTER31 are not implemented (because they are SGIs and PPIs), and
>>> I read the 'SPI ID m' as the lowest numbered SPI ID being 32, thus you
>>> should do:
>>>
>>> spi = offset / 8 - VGIC_NR_PRIVATE_IRQS;
>>
>> Well, below I changed the description of the IROUTER range to:
> 
> oh, now it's finally coming back together for me, I think I
> misunderstood your point from the last round of review because I didn't
> realize that GICD_IROUTER was defined as 0x6000 (which I actually think
> is a bit backwards, but this is not the place or time).
> 
>> +     {
>> +             .base           = GICD_IROUTER + 0x100,
>> +             .len            = 0x1edc,
> 
> in that case, len should be 0x1ee0:
> 
>  $ printf '0x%x\n' $(( (0x7fd8 + 8) - 0x6100 ))

Ah yes, the spec gives the beginning of the last register, not the end
of the region. Thanks for spotting this!

[...]

>>>> +
>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
>>>> +             return false;
>>>
>>> Did you think more about the contiguous allocation issue here or can you
>>> give me a pointer to the requirement in the spec?
>>
>> 5.4.1 Re-Distributor Addressing
>>
> 
> Section 5.4.1 talks about the pages within a single re-distributor having
>> to be contiguous, not all the re-distributor regions having to be
> contiguous, right?

Ah yes, you are right. But I still think it does not matter:
1) We are "implementing" the GICv3. So as the spec does not forbid this,
we just state that the redistributor register maps for each VCPU are
contiguous. Also we create the FDT accordingly. I will add a comment in
the documentation to state this.

2) The kernel's GICv3 DT bindings assume this allocation is the default.
Although Marc added bindings to work around this (stride), it seems much
more logical to me to not use it.

>>>> +
>>>> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
>>>> +{
>>>> +     struct vgic_dist *dist = &kvm->arch.vgic;
>>>> +     int ret, i;
>>>> +     u32 mpidr;
>>>> +
>>>> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
>>>> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
>>>> +             kvm_err("Need to set vgic distributor addresses first\n");
>>>> +             return -ENXIO;
>>>> +     }
>>>> +
>>>> +     /*
>>>> +      * FIXME: this should be moved to init_maps time, and may bite
>>>> +      * us when adding save/restore. Add a per-emulation hook?
>>>> +      */
>>>
>>> progress on this fixme?
>>
>> Progress supplies the ISS, but not this piece of code (read: none) ;-)
>> I am more in favour of a follow-up patch on this one ...
> 
> hmmm, I'm not a fan of merging code with this kind of a comment in it,
> because it looks scary, and I don't really understand the problem from
> just reading the comment, so something needs to be done here.

I see. What if we move this unconditionally into vgic_init_maps
and allocate it for both v2 and v3 guests and get rid of the whole
function? It allocates only memory for the irq_spi_mpidr, which is 4
bytes per configured SPI (so at most less than 4 KB, but usually just
128 Bytes per guest). This would be a pretty quick solution. Does that
sound too hackish?
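
As a rough sketch (untested, just reusing the names from this series), the
part that would move is:

	/* in vgic_init_maps(), next to the other per-IRQ allocations: */
	dist->irq_spi_mpidr = kcalloc(dist->nr_irqs - VGIC_NR_PRIVATE_IRQS,
				      sizeof(*dist->irq_spi_mpidr),
				      GFP_KERNEL);
	if (!dist->irq_spi_mpidr)
		return -ENOMEM;	/* freed in kvm_vgic_destroy() */

and vgic_v3_init_maps() would go away entirely.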

After your comments about the per-VM ops function pointers I am a bit
reluctant to introduce another one (which would be the obvious way
following the comment) for just this simple kcalloc().
On the other hand the ITS emulation may later make better use of a GICv3
specific allocation function.

What's your opinion on this?

Cheers,
Andre.
Christoffer Dall Nov. 30, 2014, 8:30 a.m. UTC | #9
On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> Hej Christoffer,
> 
> On 25/11/14 10:41, Christoffer Dall wrote:
> > Hi Andre,
> > 
> > On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> > 
> 
> [...]
> 
> >>>> +
> >>>> +     /* This region only covers SPIs, so no handling of private IRQs here. */
> >>>> +     spi = offset / 8;
> >>>
> >>> that's not how I read the spec, it says that GICD_IROUTER0 to
> >>> GICD_IROUTER31 are not implemented (because they are SGIs and PPIs), and
> >>> I read the 'SPI ID m' as the lowest numbered SPI ID being 32, thus you
> >>> should do:
> >>>
> >>> spi = offset / 8 - VGIC_NR_PRIVATE_IRQS;
> >>
> >> Well, below I changed the description of the IROUTER range to:
> > 
> > oh, now it's finally coming back together for me, I think I
> > misunderstood your point from the last round of review because I didn't
> > realize that GICD_IROUTER was defined as 0x6000 (which I actually think
> > is a bit backwards, but this is not the place or time).
> > 
> >> +     {
> >> +             .base           = GICD_IROUTER + 0x100,
> >> +             .len            = 0x1edc,
> > 
> > in that case, len should be 0x1ee0:
> > 
> >  $ printf '0x%x\n' $(( (0x7fd8 + 8) - 0x6100 ))
> 
> Ah yes, the spec gives the beginning of the last register, not the end
> of the region. Thanks for spotting this!
> 
> [...]
> 
> >>>> +
> >>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> >>>> +         GIC_V3_REDIST_SIZE * nrcpus))
> >>>> +             return false;
> >>>
> >>> Did you think more about the contiguous allocation issue here or can you
> >>> give me a pointer to the requirement in the spec?
> >>
> >> 5.4.1 Re-Distributor Addressing
> >>
> > 
> > Section 5.4.1 talks about the pages within a single re-distributor having
> > to be contiguous, not all the re-distributor regions having to be
> > contiguous, right?
> 
> Ah yes, you are right. But I still think it does not matter:
> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
> we just state that the redistributor register maps for each VCPU are
> contiguous. Also we create the FDT accordingly. I will add a comment in
> the documentation to state this.
> 
> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
> Although Marc added bindings to work around this (stride), it seems much
> more logical to me to not use it.

I don't disagree (and never have) with the fact that it is up to us to
decide.

My original question, which we haven't talked about yet, is if it is
*reasonable* to assume that all re-distributor regions will always be
contiguous?

How will you handle VCPU hotplug for example?  Where in the guest
physical memory map of our various virt machines should these regions
sit so that we can allocate enough re-distributors for VCPUs etc.?

I just want to make sure we're not limiting ourselves by some amount of
functionality or ABI (redistributor base addresses) that will be hard to
expand in the future.

> 
> >>>> +
> >>>> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
> >>>> +{
> >>>> +     struct vgic_dist *dist = &kvm->arch.vgic;
> >>>> +     int ret, i;
> >>>> +     u32 mpidr;
> >>>> +
> >>>> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
> >>>> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
> >>>> +             kvm_err("Need to set vgic distributor addresses first\n");
> >>>> +             return -ENXIO;
> >>>> +     }
> >>>> +
> >>>> +     /*
> >>>> +      * FIXME: this should be moved to init_maps time, and may bite
> >>>> +      * us when adding save/restore. Add a per-emulation hook?
> >>>> +      */
> >>>
> >>> progress on this fixme?
> >>
> >> Progress supplies the ISS, but not this piece of code (read: none) ;-)
> >> I am more in favour of a follow-up patch on this one ...
> > 
> > hmmm, I'm not a fan of merging code with this kind of a comment in it,
> > because it looks scary, and I don't really understand the problem from
> > just reading the comment, so something needs to be done here.
> 
> >> I see. What if we move this unconditionally into vgic_init_maps
> and allocate it for both v2 and v3 guests and get rid of the whole
> function? It allocates only memory for the irq_spi_mpidr, which is 4
> bytes per configured SPI (so at most less than 4 KB, but usually just
> 128 Bytes per guest). This would be a pretty quick solution. Does that
> sound too hackish?
> 
> After your comments about the per-VM ops function pointers I am a bit
> reluctant to introduce another one (which would be the obvious way
> >> following the comment) for just this simple kcalloc().
> On the other hand the ITS emulation may later make better use of a GICv3
> specific allocation function.

What I really disliked was the configuration of a function pointer,
which, when invoked, configured other function pointers.  That just made
my head spin.  So adding another per-gic-model init_maps method is not
that bad, but on the other hand, the only problem with keeping this here
is that when we restore the vgic state, then user space wants to be able
to populate all the data before running any VCPUs, and we don't create
the data structures before the first VCPU is run.

However, Eric has a problem with this "init-when-we-run-the-first-VCPU"
approach as well, so one argument is that we need to add a method to
both the gicv2 and gicv3 device API to say "VGIC_INIT" which userspace
can call after having created all the VCPUs.  And, in fact, we may want
to enforce this for the gicv3 right now and only maintain the existing
behavior for gicv2.
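
From the userspace side I'd imagine something like the following (the group
and attribute names are made up at this point, only KVM_SET_DEVICE_ATTR
itself exists today):

	/* after creating the vgic device and all the VCPUs: */
	struct kvm_device_attr attr = {
		.group	= KVM_DEV_ARM_VGIC_GRP_CTRL,	/* made-up name */
		.attr	= KVM_DEV_ARM_VGIC_CTRL_INIT,	/* made-up name */
	};

	ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);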

(Eric's use case is configuring IRQFD, which must logically be done
before running the machine, but also needs to be done after the vgic is
fully ready.)

Does this make sense?

We could consider scheduling a call for this if you think that would be
helpful.

-Christoffer
Andre Przywara Dec. 2, 2014, 4:24 p.m. UTC | #10
Hej Christoffer,

On 30/11/14 08:30, Christoffer Dall wrote:
> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>> Hej Christoffer,
>>
>> On 25/11/14 10:41, Christoffer Dall wrote:
>>> Hi Andre,
>>>
>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>
>>

[...]

>>>>>> +
>>>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
>>>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
>>>>>> +             return false;
>>>>>
>>>>> Did you think more about the contiguous allocation issue here or can you
>>>>> give me a pointer to the requirement in the spec?
>>>>
>>>> 5.4.1 Re-Distributor Addressing
>>>>
>>>
>>> Section 5.4.1 talks about the pages within a single re-distributor having
>>> to be contiguous, not all the re-distributor regions having to be
>>> contiguous, right?
>>
>> Ah yes, you are right. But I still think it does not matter:
>> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
>> we just state that the redistributor register maps for each VCPU are
>> contiguous. Also we create the FDT accordingly. I will add a comment in
>> the documentation to state this.
>>
>> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
>> Although Marc added bindings to work around this (stride), it seems much
>> more logical to me to not use it.
> 
> I don't disagree (and never have) with the fact that it is up to us to
> decide.
> 
> My original question, which we haven't talked about yet, is if it is
> *reasonable* to assume that all re-distributor regions will always be
> contiguous?
> 
> How will you handle VCPU hotplug for example?

As kvmtool does not support hotplug, I haven't thought about this yet.
To me it looks like userland should just use maxcpus for the allocation.
If I get the current QEMU code right, there is room for 127 GICv3 VCPUs
(2*64K per VCPU + 64K for the distributor in 16M space) at the moment.
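(That is, (16M - 64K) / (2 * 64K) = 127.5, so 127 complete two-page
redistributors fit.)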
Kvmtool uses a different mapping, which allows sharing 1G with virtio,
so the limit is around 8000ish VCPUs here.
Are there any issues with changing the QEMU virt mapping later?
Migration, maybe?
If the UART, the RTC and the virtio regions are moved more towards the
beginning of the 256MB PCI mapping, then there should be space for a bit
less than 1024 VCPUs, if I get this right.

> Where in the guest
> physical memory map of our various virt machines should these regions
> sit so that we can allocate enough re-distributors for VCPUs etc.?

Various? Are there other mappings than those described in hw/arm/virt.c?

> I just want to make sure we're not limiting ourselves by some amount of
> functionality or ABI (redistributor base addresses) that will be hard to
> expand in the future.

If we are flexible with the mapping at VM creation time, QEMU could just
use a mapping depending on max_cpus:
< 128 VCPUs: use the current mapping
128 <= x < 1020: use a more compressed mapping
>= 1020: map the redistributor somewhere above 4 GB

As the device tree binding for GICv3 just supports a stride value, we
don't have any other real options besides this, right? So as I see it,
a contiguous mapping (with possible holes) is the only way.

>>>>>> +
>>>>>> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
>>>>>> +{
>>>>>> +     struct vgic_dist *dist = &kvm->arch.vgic;
>>>>>> +     int ret, i;
>>>>>> +     u32 mpidr;
>>>>>> +
>>>>>> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
>>>>>> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
>>>>>> +             kvm_err("Need to set vgic distributor addresses first\n");
>>>>>> +             return -ENXIO;
>>>>>> +     }
>>>>>> +
>>>>>> +     /*
>>>>>> +      * FIXME: this should be moved to init_maps time, and may bite
>>>>>> +      * us when adding save/restore. Add a per-emulation hook?
>>>>>> +      */
>>>>>
>>>>> progress on this fixme?
>>>>
>>>> Progress supplies the ISS, but not this piece of code (read: none) ;-)
>>>> I am more in favour of a follow-up patch on this one ...
>>>
>>> hmmm, I'm not a fan of merging code with this kind of a comment in it,
>>> because it looks scary, and I don't really understand the problem from
>>> just reading the comment, so something needs to be done here.
>>
>> I see. What if we move this unconditionally into vgic_init_maps
>> and allocate it for both v2 and v3 guests and get rid of the whole
>> function? It allocates only memory for the irq_spi_mpidr, which is 4
>> bytes per configured SPI (so at most less than 4 KB, but usually just
>> 128 Bytes per guest). This would be a pretty quick solution. Does that
>> sound too hackish?
>>
>> After your comments about the per-VM ops function pointers I am a bit
>> reluctant to introduce another one (which would be the obvious way
>> following the comment) for just this simple kcalloc().
>> On the other hand the ITS emulation may later make better use of a GICv3
>> specific allocation function.
> 
> What I really disliked was the configuration of a function pointer,
> which, when invoked, configured other function pointers.  That just made
> my head spin.  So adding another per-gic-model init_maps method is not
> that bad, but on the other hand, the only problem with keeping this here
> is that when we restore the vgic state, then user space wants to be able
> to populate all the date before running any VCPUs, and we don't create
> the data structures before the first VCPU is run.
> 
> However, Eric has a problem with this "init-when-we-run-the-first-VCPU"
> approach as well, so one argument is that we need to add a method to
> both the gicv2 and gicv3 device API to say "VGIC_INIT" which userspace
> can call after having created all the VCPUs.  And, in fact, we may want
> to enforce this for the gicv3 right now and only maintain the existing
> behavior for gicv2.
> 
> (Eric's use case is configuring IRQFD, which must logically be done
> before running the machine, but also needs to be done after the vgic is
> fully ready.)
> 
> Does this make sense?

So if we avoid that spooky "detect-if-a-VCPU-has-run" code and
rely on an explicit ioctl, I am in favor of this. We would need to keep
the current approach for compatibility, though, right?

So how about we either keep the current GICv3 allocation as it stands
in my patches right now (or move the GICv3 specific part into the
general vgic_init_maps()) and adapt that to the VGIC_INIT call once that
has appeared (or even handle this in that series then).

Does that make sense? What is the time frame for that VGIC_INIT call?

> We could consider scheduling a call for this if you think that would be
> helpful.

Depends on your answer to the above ;-)

Cheers,
Andre.
Marc Zyngier Dec. 2, 2014, 5:06 p.m. UTC | #11
On 02/12/14 16:24, Andre Przywara wrote:
> Hej Christoffer,
> 
> On 30/11/14 08:30, Christoffer Dall wrote:
>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>>> Hej Christoffer,
>>>
>>> On 25/11/14 10:41, Christoffer Dall wrote:
>>>> Hi Andre,
>>>>
>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>>
>>>
> 
> [...]
> 
>>>>>>> +
>>>>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
>>>>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
>>>>>>> +             return false;
>>>>>>
>>>>>> Did you think more about the contiguous allocation issue here or can you
>>>>>> give me a pointer to the requirement in the spec?
>>>>>
>>>>> 5.4.1 Re-Distributor Addressing
>>>>>
>>>>
>>>> Section 5.4.1 talks about the pages within a single re-distributor having
>>>> to be contiguous, not all the re-distributor regions having to be
>>>> contiguous, right?
>>>
>>> Ah yes, you are right. But I still think it does not matter:
>>> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
>>> we just state that the redistributor register maps for each VCPU are
>>> contiguous. Also we create the FDT accordingly. I will add a comment in
>>> the documentation to state this.
>>>
>>> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
>>> Although Marc added bindings to work around this (stride), it seems much
>>> more logical to me to not use it.
>>
>> I don't disagree (and never have) with the fact that it is up to us to
>> decide.
>>
>> My original question, which we haven't talked about yet, is if it is
>> *reasonable* to assume that all re-distributor regions will always be
>> contiguous?
>>
>> How will you handle VCPU hotplug for example?
> 
> As kvmtool does not support hotplug, I haven't thought about this yet.
> To me it looks like userland should just use maxcpus for the allocation.
> If I get the current QEMU code right, there is room for 127 GICv3 VCPUs
> (2*64K per VCPU + 64K for the distributor in 16M space) at the moment.
> Kvmtool uses a different mapping, which allows sharing 1G with virtio,
> so the limit is around 8000ish VCPUs here.
> Are there any issues with changing the QEMU virt mapping later?
> Migration, maybe?
> If the UART, the RTC and the virtio regions are moved more towards the
> beginning of the 256MB PCI mapping, then there should be space for a bit
> less than 1024 VCPUs, if I get this right.
> 
>> Where in the guest
>> physical memory map of our various virt machines should these regions
>> sit so that we can allocate enough re-distributors for VCPUs etc.?
> 
> Various? Are there other mappings than those described in hw/arm/virt.c?
> 
>> I just want to make sure we're not limiting ourselves by some amount of
>> functionality or ABI (redistributor base addresses) that will be hard to
>> expand in the future.
> 
> If we are flexible with the mapping at VM creation time, QEMU could just
> use a mapping depending on max_cpus:
> < 128 VCPUs: use the current mapping
> 128 <= x < 1020: use a more compressed mapping
> >= 1020: map the redistributor somewhere above 4 GB
> 
> As the device tree binding for GICv3 just supports a stride value, we
> don't have any other real options besides this, right? So as I see it,
> a contiguous mapping (with possible holes) is the only way.

Not really. The GICv3 binding definitely supports having several regions
for the redistributors (see the binding documentation). This allows for
the pathological case where you have N regions for N CPUs. Not that we
ever want to go there, really.

Thanks,

	M.
Andre Przywara Dec. 2, 2014, 5:32 p.m. UTC | #12
On 02/12/14 17:06, Marc Zyngier wrote:
> On 02/12/14 16:24, Andre Przywara wrote:
>> Hej Christoffer,
>>
>> On 30/11/14 08:30, Christoffer Dall wrote:
>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>>>> Hej Christoffer,
>>>>
>>>> On 25/11/14 10:41, Christoffer Dall wrote:
>>>>> Hi Andre,
>>>>>
>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>>>
>>>>
>>
>> [...]
>>
>>>>>>>> +
>>>>>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
>>>>>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
>>>>>>>> +             return false;
>>>>>>>
>>>>>>> Did you think more about the contiguous allocation issue here or can you
>>>>>>> give me a pointer to the requirement in the spec?
>>>>>>
>>>>>> 5.4.1 Re-Distributor Addressing
>>>>>>
>>>>>
>>>>> Section 5.4.1 talks about the pages within a single re-distributor having
>>>>> to be contiguous, not all the re-distributor regions having to be
>>>>> contiguous, right?
>>>>
>>>> Ah yes, you are right. But I still think it does not matter:
>>>> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
>>>> we just state that the redistributor register maps for each VCPU are
>>>> contiguous. Also we create the FDT accordingly. I will add a comment in
>>>> the documentation to state this.
>>>>
>>>> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
>>>> Although Marc added bindings to work around this (stride), it seems much
>>>> more logical to me to not use it.
>>>
>>> I don't disagree (and never have) with the fact that it is up to us to
>>> decide.
>>>
>>> My original question, which we haven't talked about yet, is if it is
>>> *reasonable* to assume that all re-distributor regions will always be
>>> contiguous?
>>>
>>> How will you handle VCPU hotplug for example?
>>
>> As kvmtool does not support hotplug, I haven't thought about this yet.
>> To me it looks like userland should just use maxcpus for the allocation.
>> If I get the current QEMU code right, there is room for 127 GICv3 VCPUs
>> (2*64K per VCPU + 64K for the distributor in 16M space) at the moment.
>> Kvmtool uses a different mapping, which allows sharing 1G with virtio,
>> so the limit is around 8000ish VCPUs here.
>> Are there any issues with changing the QEMU virt mapping later?
>> Migration, maybe?
>> If the UART, the RTC and the virtio regions are moved more towards the
>> beginning of the 256MB PCI mapping, then there should be space for a bit
>> less than 1024 VCPUs, if I get this right.
>>
>>> Where in the guest
>>> physical memory map of our various virt machines should these regions
>>> sit so that we can allocate enough re-distributors for VCPUs etc.?
>>
>> Various? Are there other mappings than those described in hw/arm/virt.c?
>>
>>> I just want to make sure we're not limiting ourselves by some amount of
>>> functionality or ABI (redistributor base addresses) that will be hard to
>>> expand in the future.
>>
>> If we are flexible with the mapping at VM creation time, QEMU could just
>> use a mapping depending on max_cpus:
>> < 128 VCPUs: use the current mapping
>> 128 <= x < 1020: use a more compressed mapping
>> >= 1020: map the redistributor somewhere above 4 GB
>>
>> As the device tree binding for GICv3 just supports a stride value, we
>> don't have any other real options besides this, right? So as I see it,
>> a contiguous mapping (with possible holes) is the only way.
> 
> Not really. The GICv3 binding definitely supports having several regions
> for the redistributors (see the binding documentation). This allows for
> the pathological case where you have N regions for N CPUs. Not that we
> ever want to go there, really.

Ah yes, thanks for pointing that out. I was mixing this up with the
stride parameter, which is independent of this. Sorry for that.

So from a userland point of view we probably would like to have the
first n VCPUs' redistributors mapped at their current places and allow
for more VCPUs to use memory above 4 GB.
Which would require quite some changes to the code to support this in a
very flexible way. I think this could be much easier if we confine
ourselves to two regions (one contiguous lower (< 4 GB) and one
contiguous upper region (>4 GB)), so we don't need to support arbitrary
per VCPU addresses, but could just use the 1st or 2nd map depending on
the VCPU number.
Is this too hackish?
If not, I would add another vgic_addr type (like
KVM_VGIC_V3_ADDR_TYPE_REDIST_UPPER or so) to be used from userland and
use that in the handle_mmio region detection.
Let me know if that sounds reasonable.
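
The detection in vgic_v3_handle_mmio() could then look roughly like this
(just a sketch; rdbase_upper would come from the new address attribute, and
nr_lower_vcpus from the size userland assigns to the first region):

	if (is_in_range(mmio->phys_addr, mmio->len, rdbase,
			GIC_V3_REDIST_SIZE * nr_lower_vcpus)) {
		vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
		rdbase += vcpu_id * GIC_V3_REDIST_SIZE;
	} else if (is_in_range(mmio->phys_addr, mmio->len, rdbase_upper,
			GIC_V3_REDIST_SIZE * (nrcpus - nr_lower_vcpus))) {
		vcpu_id = (mmio->phys_addr - rdbase_upper) / GIC_V3_REDIST_SIZE;
		rdbase = rdbase_upper + vcpu_id * GIC_V3_REDIST_SIZE;
		vcpu_id += nr_lower_vcpus;
	} else {
		return false;
	}
	mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);

with the SGI page selection below that staying as it is.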

Cheers,
Andre.
Christoffer Dall Dec. 3, 2014, 10:28 a.m. UTC | #13
On Tue, Dec 02, 2014 at 04:24:53PM +0000, Andre Przywara wrote:
> Hej Christoffer,
> 
> On 30/11/14 08:30, Christoffer Dall wrote:
> > On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> >> Hej Christoffer,
> >>
> >> On 25/11/14 10:41, Christoffer Dall wrote:
> >>> Hi Andre,
> >>>
> >>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> >>>
> >>
> 
> [...]
> 
> >>>>>> +
> >>>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> >>>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
> >>>>>> +             return false;
> >>>>>
> >>>>> Did you think more about the contiguous allocation issue here or can you
> >>>>> give me a pointer to the requirement in the spec?
> >>>>
> >>>> 5.4.1 Re-Distributor Addressing
> >>>>
> >>>
> >>> Section 5.4.1 talks about the pages within a single re-distributor having
> >>> to be contiguous, not all the re-distributor regions having to be
> >>> contiguous, right?
> >>
> >> Ah yes, you are right. But I still think it does not matter:
> >> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
> >> we just state that the redistributor register maps for each VCPU are
> >> contiguous. Also we create the FDT accordingly. I will add a comment in
> >> the documentation to state this.
> >>
> >> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
> >> Although Marc added bindings to work around this (stride), it seems much
> >> more logical to me to not use it.
> > 
> > I don't disagree (and never have) with the fact that it is up to us to
> > decide.
> > 
> > My original question, which we haven't talked about yet, is if it is
> > *reasonable* to assume that all re-distributor regions will always be
> > contiguous?
> > 
> > How will you handle VCPU hotplug for example?
> 
> As kvmtool does not support hotplug, I haven't thought about this yet.
> To me it looks like userland should just use maxcpus for the allocation.
> If I get the current QEMU code right, there is room for 127 GICv3 VCPUs
> (2*64K per VCPU + 64K for the distributor in 16M space) at the moment.
> Kvmtool uses a different mapping, which allows sharing 1G with virtio,
> so the limit is around 8000ish VCPUs here.
> Are there any issues with changing the QEMU virt mapping later?

Not issues as such, but we try to keep it as stable as possible.  At
least, soon you'll have to worry about UEFI working with such changes, for
example.

> Migration, maybe?
> If the UART, the RTC and the virtio regions are moved more towards the
> beginning of the 256MB PCI mapping, then there should be space for a bit
> less than 1024 VCPUs, if I get this right.
> 
> > Where in the guest
> > physical memory map of our various virt machines should these regions
> > sit so that we can allocate enough re-distributors for VCPUs etc.?
> 
> Various? Are there other mappings than those described in hw/arm/virt.c?
> 

QEMU's and kvmtool's, for starters.

> > I just want to make sure we're not limiting ourselves by some amount of
> > functionality or ABI (redistributor base addresses) that will be hard to
> > expand in the future.
> 
> If we are flexible with the mapping at VM creation time, QEMU could just
> use a mapping depending on max_cpus:
> < 128 VCPUs: use the current mapping
> 128 <= x < 1020: use a more compressed mapping
> >= 1020: map the redistributor somewhere above 4 GB

I don't understand what the compressed mapping is.  Is the current code
easy to expand such that you can add a secondary separate redistributor
region above the 4GB map?

> 
> As the device tree binding for GICv3 just supports a stride value, we
> don't have any other real options besides this, right? So as I see it,
> a contiguous mapping (with possible holes) is the only way.
> 
> >>>>>> +
> >>>>>> +static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
> >>>>>> +{
> >>>>>> +     struct vgic_dist *dist = &kvm->arch.vgic;
> >>>>>> +     int ret, i;
> >>>>>> +     u32 mpidr;
> >>>>>> +
> >>>>>> +     if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
> >>>>>> +         IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
> >>>>>> +             kvm_err("Need to set vgic distributor addresses first\n");
> >>>>>> +             return -ENXIO;
> >>>>>> +     }
> >>>>>> +
> >>>>>> +     /*
> >>>>>> +      * FIXME: this should be moved to init_maps time, and may bite
> >>>>>> +      * us when adding save/restore. Add a per-emulation hook?
> >>>>>> +      */
> >>>>>
> >>>>> progress on this fixme?
> >>>>
> >>>> Progress supplies the ISS, but not this piece of code (read: none) ;-)
> >>>> I am more in favour of a follow-up patch on this one ...
> >>>
> >>> hmmm, I'm not a fan of merging code with this kind of a comment in it,
> >>> because it looks scary, and I don't really understand the problem from
> >>> just reading the comment, so something needs to be done here.
> >>
> >> I see. What if we move this unconditionally into vgic_init_maps
> >> and allocate it for both v2 and v3 guests and get rid of the whole
> >> function? It allocates only memory for the irq_spi_mpidr, which is 4
> >> bytes per configured SPI (so at most less than 4 KB, but usually just
> >> 128 Bytes per guest). This would be a pretty quick solution. Does that
> >> sound too hackish?
> >>
> >> After your comments about the per-VM ops function pointers I am a bit
> >> reluctant to introduce another one (which would be the obvious way
> >> following the comment) for just this simple kalloc().
> >> On the other hand the ITS emulation may later make better use of a GICv3
> >> specific allocation function.
> > 
> > What I really disliked was the configuration of a function pointer,
> > which, when invoked, configured other function pointers.  That just made
> > my head spin.  So adding another per-gic-model init_maps method is not
> > that bad, but on the other hand, the only problem with keeping this here
> > is that when we restore the vgic state, then user space wants to be able
> > to populate all the data before running any VCPUs, and we don't create
> > the data structures before the first VCPU is run.
> > 
> > However, Eric has a problem with this "init-when-we-run-the-first-VCPU"
> > approach as well, so one argument is that we need to add a method to
> > both the gicv2 and gicv3 device API to say "VGIC_INIT" which userspace
> > can call after having created all the VCPUs.  And, in fact, we may want
> > to enforce this for the gicv3 right now and only maintain the existing
> > behavior for gicv2.
> > 
> > (Eric's use case is configuring IRQFD, which must logically be done
> > before running the machine, but also needs to be done after the vgic is
> > fully ready.)
> > 
> > Does this make sense?
> 
> So if we can avoid that spooky "detect-if-a-VCPU-has-run" code and
> rely on an explicit ioctl, I am in favor of this. We would need to keep
> the current approach for compatibility, though, right?

not for gicv3, only for gicv2, which is exactly why I'm bringing it up
here.

> 
> So how about we either keep the current GICv3 allocation as it stands
> in my patches right now (or move the GICv3 specific part into the
> general vgic_init_maps()) and adapt that to the VGIC_INIT call once that
> has appeared (or even handle this in that series then).
> 
> Does that make sense? What is the time frame for that VGIC_INIT call?
> 

Eric posted some patches in private yesterday, I think he's planning on
sending them out any day now.

-Christoffer
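
As an aside on the quick fix floated above, moving the allocation into
the common init path would amount to something like this sketch. The
irq_spi_mpidr field is from the patch; the function name and the nr_spis
parameter are made up here for illustration:

static int vgic_alloc_spi_mpidr(struct vgic_dist *dist, int nr_spis)
{
	/* 4 bytes per configured SPI: at most ~4 KB, usually 128 bytes */
	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(u32), GFP_KERNEL);
	if (!dist->irq_spi_mpidr)
		return -ENOMEM;
	return 0;
}

Freeing the pointer unconditionally on teardown would then be safe for
both v2 and v3 guests, since kfree(NULL) is a no-op.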
Christoffer Dall Dec. 3, 2014, 10:29 a.m. UTC | #14
On Tue, Dec 02, 2014 at 05:06:09PM +0000, Marc Zyngier wrote:
> On 02/12/14 16:24, Andre Przywara wrote:
> > Hej Christoffer,
> > 
> > On 30/11/14 08:30, Christoffer Dall wrote:
> >> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> >>> Hej Christoffer,
> >>>
> >>> On 25/11/14 10:41, Christoffer Dall wrote:
> >>>> Hi Andre,
> >>>>
> >>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> >>>>
> >>>
> > 
> > [...]
> > 
> >>>>>>> +
> >>>>>>> +     if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
> >>>>>>> +         GIC_V3_REDIST_SIZE * nrcpus))
> >>>>>>> +             return false;
> >>>>>>
> >>>>>> Did you think more about the contiguous allocation issue here or can you
> >>>>>> give me a pointer to the requirement in the spec?
> >>>>>
> >>>>> 5.4.1 Re-Distributor Addressing
> >>>>>
> >>>>
> >>>> Section 5.4.1 talks about the pages within a single re-distributor having
> >>>> to be contiguous, not all the re-distributor regions having to be
> >>>> contiguous, right?
> >>>
> >>> Ah yes, you are right. But I still think it does not matter:
> >>> 1) We are "implementing" the GICv3. So as the spec does not forbid this,
> >>> we just state that the redistributor register maps for each VCPU are
> >>> contiguous. Also we create the FDT accordingly. I will add a comment in
> >>> the documentation to state this.
> >>>
> >>> 2) The kernel's GICv3 DT bindings assume this allocation is the default.
> >>> Although Marc added bindings to work around this (stride), it seems much
> >>> more logical to me to not use it.
> >>
> >> I don't disagree (and never have) with the fact that it is up to us to
> >> decide.
> >>
> >> My original question, which we haven't talked about yet, is if it is
> >> *reasonable* to assume that all re-distributor regions will always be
> >> contiguous?
> >>
> >> How will you handle VCPU hotplug for example?
> > 
> > [...]
> > 
> > As the device tree binding for GICv3 just supports a stride value, we
> > don't have any other real options beside this, right? So how I see this,
> > a contiguous mapping (with possible holes) is the only way.
> 
> Not really. The GICv3 binding definitely supports having several regions
> for the redistributors (see the binding documentation). This allows for
> the pathological case where you have N regions for N CPUs. Not that we
> ever want to go there, really.
> 
What are your thoughts on mapping all of the redistributor regions in
one consecutive guest phys address space chunk?  Am I making an issue
out of nothing?

-Christoffer
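
For reference, the contiguity assumption being debated reduces to one
per-VCPU base computation over a single region, roughly as in this
sketch (GIC_V3_REDIST_SIZE, the 2 * 64K window per VCPU, follows the
patch; the helper name and exact index math are assumptions):

static phys_addr_t vgic_v3_rdist_base(struct vgic_dist *dist, int vcpu_id)
{
	/* redistributor frame pairs laid out back to back, one per VCPU */
	return dist->vgic_redist_base +
	       (phys_addr_t)vcpu_id * GIC_V3_REDIST_SIZE;
}

This layout is what lets the is_in_range() check quoted above cover all
VCPUs with a single window of GIC_V3_REDIST_SIZE * nrcpus bytes.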
Christoffer Dall Dec. 3, 2014, 10:30 a.m. UTC | #15
On Tue, Dec 02, 2014 at 05:32:45PM +0000, Andre Przywara wrote:
> On 02/12/14 17:06, Marc Zyngier wrote:
> > On 02/12/14 16:24, Andre Przywara wrote:
> >> Hej Christoffer,
> >>
> >> On 30/11/14 08:30, Christoffer Dall wrote:
> >>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> >>>> Hej Christoffer,
> >>>>
> >>>> On 25/11/14 10:41, Christoffer Dall wrote:
> >>>>> Hi Andre,
> >>>>>
> >>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> >>>>>
> >>>>
> >>
> >> [...]
> >>
> > 
> > Not really. The GICv3 binding definitely supports having several regions
> > for the redistributors (see the binding documentation). This allows for
> > the pathological case where you have N regions for N CPUs. Not that we
> > ever want to go there, really.
> 
> Ah yes, thanks for pointing that out. I was mixing this up with the
> stride parameter, which is independent of this. Sorry for that.
> 
> So from a userland point of view we probably would like to have the
> first n VCPUs' redistributors mapped at their current places and allow
> further VCPUs to use memory above 4 GB.
> Supporting this in a very flexible way would require quite some changes
> to the code. I think this could be much easier if we confine
> ourselves to two regions (one contiguous lower (< 4 GB) and one
> contiguous upper region (>4 GB)), so we don't need to support arbitrary
> per VCPU addresses, but could just use the 1st or 2nd map depending on
> the VCPU number.
> Is this too hackish?
> If not, I would add another vgic_addr type (like
> KVM_VGIC_V3_ADDR_TYPE_REDIST_UPPER or so) to be used from userland and
> use that in the handle_mmio region detection.
> Let me know if that sounds reasonable.
> 
The point that I've been trying to make sure we think about is if we'll
regret not being able to fragment the redistributor regions a bit.  Even
if it's technically possible, we may regret requiring a huge contiguous
allocation in the guest physical address space.  But maybe we don't care
when we have 40 bits to play with?

-Christoffer
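
The two-region idea quoted above would turn that into a choice between
two windows, along these lines (purely a sketch: the *_UPPER attribute,
the nr_lower_vcpus field and the helper name are hypothetical and not
part of this series):

static phys_addr_t vgic_v3_rdist_base_split(struct vgic_dist *dist,
					    int vcpu_id)
{
	if (vcpu_id < dist->nr_lower_vcpus)	/* region below 4 GB */
		return dist->vgic_redist_base +
		       (phys_addr_t)vcpu_id * GIC_V3_REDIST_SIZE;

	/* remaining VCPUs live in the second region, above 4 GB */
	return dist->vgic_redist_upper_base +
	       (phys_addr_t)(vcpu_id - dist->nr_lower_vcpus) *
	       GIC_V3_REDIST_SIZE;
}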
Marc Zyngier Dec. 3, 2014, 10:44 a.m. UTC | #16
On 03/12/14 10:29, Christoffer Dall wrote:
> On Tue, Dec 02, 2014 at 05:06:09PM +0000, Marc Zyngier wrote:
>> On 02/12/14 16:24, Andre Przywara wrote:
>>> Hej Christoffer,
>>>
>>> On 30/11/14 08:30, Christoffer Dall wrote:
>>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>>>>> Hej Christoffer,
>>>>>
>>>>> On 25/11/14 10:41, Christoffer Dall wrote:
>>>>>> Hi Andre,
>>>>>>
>>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>>>>
>>>>>
>>>
>>> [...]
>>>
>>
>> Not really. The GICv3 binding definitely supports having several regions
>> for the redistributors (see the binding documentation). This allows for
>> the pathological case where you have N regions for N CPUs. Not that we
>> ever want to go there, really.
>>
> What are your thoughts on mapping all of the redistributor regions in
> one consecutive guest phys address space chunk?  Am I making an issue
> out of nothing?

I don't think this is too bad. It puts constraints on the physical
memory map, but we do have a massive IPA space anyway (at least on
arm64). Of course, the issue is slightly more acute on 32bit guests,
where IPA space is at a premium. But this is fairly accurately modelling
a monolithic GICv3 (as opposed to distributed).

I imagine that, over time, we'll have to introduce support for "split"
redistributor ranges, but that probably only becomes an issue when you
want to support guests with several hundred vcpus.

Another interesting point you raise is vcpu hotplug. I'm not completely
sure how that would work. Do we pre-allocate redistributors, do we have
a more coarse-grained "socket hot-plug"? I think we need to give it
some thought, as this probably requires a slightly different model for
GICv3.

Thanks,

	M.
Andre Przywara Dec. 3, 2014, 10:47 a.m. UTC | #17
On 03/12/14 10:30, Christoffer Dall wrote:
> On Tue, Dec 02, 2014 at 05:32:45PM +0000, Andre Przywara wrote:
>> On 02/12/14 17:06, Marc Zyngier wrote:
>>> On 02/12/14 16:24, Andre Przywara wrote:
>>>> Hej Christoffer,
>>>>
>>>> On 30/11/14 08:30, Christoffer Dall wrote:
>>>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>>>>>> Hej Christoffer,
>>>>>>
>>>>>> On 25/11/14 10:41, Christoffer Dall wrote:
>>>>>>> Hi Andre,
>>>>>>>
>>>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>>>>>
>>>>>>
>>>>
>>>> [...]
>>>>
>>>
>>> Not really. The GICv3 binding definitely supports having several regions
>>> for the redistributors (see the binding documentation). This allows for
>>> the pathological case where you have N regions for N CPUs. Not that we
>>> ever want to go there, really.
>>
>> Ah yes, thanks for pointing that out. I was mixing this up with the
>> stride parameter, which is independent of this. Sorry for that.
>>
>> So from a userland point of view we probably would like to have the
>> first n VCPUs' redistributors mapped at their current places and allow
>> further VCPUs to use memory above 4 GB.
>> Supporting this in a very flexible way would require quite some changes
>> to the code. I think this could be much easier if we confine
>> ourselves to two regions (one contiguous lower (< 4 GB) and one
>> contiguous upper region (>4 GB)), so we don't need to support arbitrary
>> per VCPU addresses, but could just use the 1st or 2nd map depending on
>> the VCPU number.
>> Is this too hackish?
>> If not, I would add another vgic_addr type (like
>> KVM_VGIC_V3_ADDR_TYPE_REDIST_UPPER or so) to be used from userland and
>> use that in the handle_mmio region detection.
>> Let me know if that sounds reasonable.
>>
> The point that I've been trying to make sure we think about is if we'll
> regret not being able to fragment the redistributor regions a bit.  Even
> if it's technically possible, we may regret requiring a huge contiguous
> allocation in the guest physical address space.  But maybe we don't care
> when we have 40 bits to play with?

40 bits are more than enough. But are we OK with using only memory above
4GB? Is there some code before the Linux kernel that is limited to 4GB?
I am thinking about 32bit guests in particular, which may have some
firmware blob executed beforehand that does not use the MMU.

If this is not an issue, I'd rather stay with one contiguous region - at
least for the time being. The current GICv3 code has a limit of 255
VCPUs anyway, so this requires at most 32MB, which should be easily
fitted anywhere.

Should we later need to extend the number of VCPUs, we can in the worst
case adjust the code to support split regions if the 4GB limit issue
persists. This would be done via a new KVM capability and some new
register groups in the KVM device ioctl to set a second (or following)
region, so in a backwards compatible way.

Cheers,
Andre.
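
For context, userspace programs the existing redistributor base through
the KVM device API roughly as below (a sketch with the fd plumbing and
most error handling elided; the base address is an example value, and a
hypothetical REDIST_UPPER attribute - or an explicit VGIC_INIT-style
control - would be set the same way):

	uint64_t redist_base = 0x08100000ULL;	/* example address only */
	struct kvm_device_attr attr = {
		.group	= KVM_DEV_ARM_VGIC_GRP_ADDR,
		.attr	= KVM_VGIC_V3_ADDR_TYPE_REDIST,
		.addr	= (uint64_t)(unsigned long)&redist_base,
	};

	if (ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &attr))
		perror("KVM_SET_DEVICE_ATTR");

Kernels that do not know an attribute reject it with -ENXIO, which is
what would make such an extension backwards compatible.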
Christoffer Dall Dec. 3, 2014, 11:06 a.m. UTC | #18
On Wed, Dec 03, 2014 at 10:47:34AM +0000, Andre Przywara wrote:
> 
> 
> On 03/12/14 10:30, Christoffer Dall wrote:
> > On Tue, Dec 02, 2014 at 05:32:45PM +0000, Andre Przywara wrote:
> >> On 02/12/14 17:06, Marc Zyngier wrote:
> >>> On 02/12/14 16:24, Andre Przywara wrote:
> >>>> Hej Christoffer,
> >>>>
> >>>> On 30/11/14 08:30, Christoffer Dall wrote:
> >>>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> >>>>>> Hej Christoffer,
> >>>>>>
> >>>>>> On 25/11/14 10:41, Christoffer Dall wrote:
> >>>>>>> Hi Andre,
> >>>>>>>
> >>>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> >>>>>>>
> >>>>>>
> >>>>
> >>>> [...]
> >>>>
> >>
> > The point that I've been trying to make sure we think about is if we'll
> > regret not being able to fragment the redistributor regions a bit.  Even
> > if it's technically possible, we may regret requiring a huge contiguous
> > allocation in the guest physical address space.  But maybe we don't care
> > when we have 40 bits to play with?
> 
> 40 bits are more than enough. But are we OK with using only memory above
> 4GB? Is there some code before the Linux kernel that is limited to 4GB?
> I am thinking about 32bit guests in particular, which may have some
> firmware blob executed beforehand that does not use the MMU.
> 
> If this is not an issue, I'd rather stay with one contiguous region - at
> least for the time being. The current GICv3 code has a limit of 255
> VCPUs anyway, so this requires at most 32MB, which should be easily
> fitted anywhere.
> 
> Should we later need to extend the number of VCPUs, we can in the worst
> case adjust the code to support split regions if the 4GB limit issue
> persists. This would be done via a new KVM capability and some new
> register groups in the KVM device ioctl to set a second (or following)
> region, so in a backwards compatible way.
> 
ok, sounds reasonable.  I'll shut up then.

Thanks,
-Christoffer
Christoffer Dall Dec. 3, 2014, 11:07 a.m. UTC | #19
On Wed, Dec 03, 2014 at 10:44:32AM +0000, Marc Zyngier wrote:
> On 03/12/14 10:29, Christoffer Dall wrote:
> > On Tue, Dec 02, 2014 at 05:06:09PM +0000, Marc Zyngier wrote:
> >> On 02/12/14 16:24, Andre Przywara wrote:
> >>> Hej Christoffer,
> >>>
> >>> On 30/11/14 08:30, Christoffer Dall wrote:
> >>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
> >>>>> Hej Christoffer,
> >>>>>
> >>>>> On 25/11/14 10:41, Christoffer Dall wrote:
> >>>>>> Hi Andre,
> >>>>>>
> >>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
> >>>>>>
> >>>>>
> >>>
> >>> [...]
> >>>
> >>
> >> Not really. The GICv3 binding definitely supports having several regions
> >> for the redistributors (see the binding documentation). This allows for
> >> the pathological case where you have N regions for N CPUs. Not that we
> >> ever want to go there, really.
> >>
> > What are your thoughts on mapping all of the redistributor regions in
> > one consecutive guest phys address space chunk?  Am I making an issue
> > out of nothing?
> 
> I don't think this is too bad. It puts constraints on the physical
> memory map, but we do have a massive IPA space anyway (at least on
> arm64). Of course, the issue is slightly more acute on 32bit guests,
> where IPA space is at a premium. But this is fairly accurately modelling
> a monolithic GICv3 (as opposed to distributed).
> 
> I imagine that, over time, we'll have to introduce support for "split"
> redistributor ranges, but that probably only becomes an issue when you
> want to support guests with several hundred vcpus.
> 
> Another interesting point you raise is vcpu hotplug. I'm not completely
> sure how that would work. Do we pre-allocate redistributors, do we have
> a more coarse-grained "socket hot-plug"? I think we need to give it
> some thought, as this probably requires a slightly different model for
> GICv3.
> 
hotplug is indeed probably a larger can of worms.  Let's move forward
with these patches for now and patch up things later then.

-Christoffer
Andre Przywara Dec. 3, 2014, 11:10 a.m. UTC | #20
Hej Christoffer,

On 03/12/14 10:28, Christoffer Dall wrote:
> On Tue, Dec 02, 2014 at 04:24:53PM +0000, Andre Przywara wrote:
>> Hej Christoffer,
>>
>> On 30/11/14 08:30, Christoffer Dall wrote:
>>> On Fri, Nov 28, 2014 at 03:24:11PM +0000, Andre Przywara wrote:
>>>> Hej Christoffer,
>>>>
>>>> On 25/11/14 10:41, Christoffer Dall wrote:
>>>>> Hi Andre,
>>>>>
>>>>> On Mon, Nov 24, 2014 at 04:00:46PM +0000, Andre Przywara wrote:
>>>>>
>>>>
>>
>> [...]
>>
>>> How will you handle VCPU hotplug for example?
>>
>> As kvmtool does not support hotplug, I haven't thought about this yet.
>> To me it looks like userland should just use maxcpus for the allocation.
>> If I get the current QEMU code right, there is room for 127 GICv3 VCPUs
>> (2*64K per VCPU + 64K for the distributor in 16M space) at the moment.
>> Kvmtool uses a different mapping, which allows sharing 1G with virtio,
>> so the limit is around 8000ish VCPUs here.
>> Are there any issues with changing the QEMU virt mapping later?
> 
> Not issues as such, but we try to keep it as stable as possible.  Soon
> you will also have to worry about UEFI working with such changes, for
> example.
> 
>> Migration, maybe?
>> If the UART, the RTC and the virtio regions are moved more towards the
>> beginning of the 256MB PCI mapping, then there should be space for a bit
>> less than 1024 VCPUs, if I get this right.
>>
>>> Where in the guest
>>> physical memory map of our various virt machines should these regions
>>> sit so that we can allocate enough re-distributors for VCPUs etc.?
>>
>> Various? Are there other mappings than those described in hw/arm/virt.c?
>>
> 
> QEMU's and kvmtool's, for starters.
> 
>>> I just want to make sure we're not limiting ourselves by some amount of
>>> functionality or ABI (redistributor base addresses) that will be hard to
>>> expand in the future.
>>
>> If we are flexible with the mapping at VM creation time, QEMU could just
>> use a mapping depending on max_cpus:
>> < 128 VCPUs: use the current mapping
>> 128 <= x < 1020: use a more compressed mapping
>> >= 1020: map the redistributor somewhere above 4 GB
> 
> I don't understand what the compressed mapping is. 

Currently we have in QEMU:
   0 MB -  128 MB: Flash
 128 MB -  144 MB: CPU peripherals (GIC for now only)
 144 MB -  144 MB: UART (64k)
 144 MB -  144 MB: RTC (64k)
 160 MB -  256 MB: virtio (32 * 512 Bytes used)
 256 MB - 1024 MB: PCI
1024 MB - ???? MB: DRAM

So we waste quite some space between 144 MB and 256 MB, where we
currently use only 3 64K pages. If we would move those three pages
closer to the 256 MB region, we could extend the GIC mapping space from
16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).

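(Double-checking that arithmetic, with 64K for the distributor and
2 * 64K of redistributor space per VCPU:
    current map:    (16 MB - 64K) / 128K            = 127 VCPUs, rounded down
    compressed map: (128 MB - 64K - 3 * 64K) / 128K = 1022 VCPUs
where the 3 * 64K are the relocated UART, RTC and virtio pages.)
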
> Is the current code
> easy to expand such that you can add a secondary separate redistributor
> region above the 4GB map?

Yes, reasonably easy. I sketched this, we just need to have two more
attribute registers (KVM_VGIC_V3_ADDR_TYPE_REDIST_UPPER and
KVM_VGIC_V3_NR_LOWER_VCPUS or so) in the KVM device. But I would refrain
from this for now, as the limit in the rest of the GICv3 code is 255
VCPUs anyway (x86 has the same limit).

>> [...]
>>
>> So if we can avoid that spooky "detect-if-a-VCPU-has-run" code and
>> rely on an explicit ioctl, I am in favor of this. We would need to keep
>> the current approach for compatibility, though, right?
> 
> not for gicv3, only for gicv2, which is exactly why I'm bringing it up
> here.

Ah, I see.

>>
>> So how about we either keep the current GICv3 allocation as it stands
>> in my patches right now (or move the GICv3 specific part into the
>> general vgic_init_maps()) and adapt that to the VGIC_INIT call once that
>> has appeared (or even handle this in that series then).
>>
>> Does that make sense? What is the time frame for that VGIC_INIT call?
>>
> 
> Eric posted some patches in private yesterday, I think he's planning on
> sending them out any day now.

I just found them on the list. Let me take a look...

Cheers,
Andre.
Arnd Bergmann Dec. 3, 2014, 11:28 a.m. UTC | #21
On Wednesday 03 December 2014 11:10:18 Andre Przywara wrote:
> On 03/12/14 10:28, Christoffer Dall wrote:
> > On Tue, Dec 02, 2014 at 04:24:53PM +0000, Andre Przywara wrote:
> >> On 30/11/14 08:30, Christoffer Dall wrote:
> >>
> >> If we are flexible with the mapping at VM creation time, QEMU could just
> >> use a mapping depending on max_cpus:
> >> < 128 VCPUs: use the current mapping
> >> 128 <= x < 1020: use a more compressed mapping
> >> >= 1020: map the redistributor somewhere above 4 GB
> > 
> > I don't understand what the compressed mapping is. 
> 
> Currently we have in QEMU:
>    0 MB -  128 MB: Flash
>  128 MB -  144 MB: CPU peripherals (GIC for now only)
>  144 MB -  144 MB: UART (64k)
>  144 MB -  144 MB: RTC (64k)
>  160 MB -  256 MB: virtio (32 * 512 Bytes used)
>  256 MB - 1024 MB: PCI
> 1024 MB - ???? MB: DRAM
> 
> So we waste quite some space between 144 MB and 256 MB, where we
> currently use only 3 64K pages. If we would move those three pages
> closer to the 256 MB region, we could extend the GIC mapping space from
> 16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).

Having a more compressed mapping sounds useful, but you could also consider
carving the GIC registers out of the PCI range and making PCI memory space
smaller if there are lots of CPUs.

Is there also a 64-bit PCI memory space behind the DRAM?

	Arnd
Peter Maydell Dec. 3, 2014, 11:39 a.m. UTC | #22
On 3 December 2014 at 11:28, Arnd Bergmann <arnd@arndb.de> wrote:
> On Wednesday 03 December 2014 11:10:18 Andre Przywara wrote:
>> Currently we have in QEMU:
>>    0 MB -  128 MB: Flash
>>  128 MB -  144 MB: CPU peripherals (GIC for now only)
>>  144 MB -  144 MB: UART (64k)
>>  144 MB -  144 MB: RTC (64k)
>>  160 MB -  256 MB: virtio (32 * 512 Bytes used)
>>  256 MB - 1024 MB: PCI
>> 1024 MB - ???? MB: DRAM
>>
>> So we waste quite some space between 144 MB and 256 MB, where we
>> currently use only 3 64K pages. If we would move those three pages
>> closer to the 256 MB region, we could extend the GIC mapping space from
>> 16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).

Note that part of the reason for that gap is to give us space
to add more memory-mapped peripherals as they are needed.
For instance the fw_cfg conduit for talking to UEFI firmware
is going to need another 64K page in that section.

> Having a more compressed mapping sounds useful, but you could also consider
> carving the GIC registers out of the PCI range and making PCI memory space
> smaller if there are lots of CPUs.
>
> Is there also a 64-bit PCI memory space behind the DRAM?

The "PCI" section at the moment is purely hypothetical -- it's
a lump of reserved space in the memory map that I put in so
we had a place to put PCI later... The DRAM is just the last
thing in the memory map and goes up for as much DRAM as you
asked QEMU to provide, so in that sense there's no "behind"
there.

We could (at least at the moment) shuffle things around a bit,
since guests should all be looking at the DTB to figure where
things are. About the only thing we can't do is move the base
address of RAM or put the flash somewhere other than 0. If
we do need to make changes I'd rather we figured out the new
layout and switched to it rather than doing a set of piecemeal
tweaks, though.

-- PMM
Andre Przywara Dec. 3, 2014, 12:03 p.m. UTC | #23
On 03/12/14 11:39, Peter Maydell wrote:
> On 3 December 2014 at 11:28, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Wednesday 03 December 2014 11:10:18 Andre Przywara wrote:
>>> Currently we have in QEMU:
>>>    0 MB -  128 MB: Flash
>>>  128 MB -  144 MB: CPU peripherals (GIC for now only)
>>>  144 MB -  144 MB: UART (64k)
>>>  144 MB -  144 MB: RTC (64k)
>>>  160 MB -  256 MB: virtio (32 * 512 Bytes used)
>>>  256 MB - 1024 MB: PCI
>>> 1024 MB - ???? MB: DRAM
>>>
>>> So we waste quite some space between 144 MB and 256 MB, where we
>>> currently use only 3 64K pages. If we would move those three pages
>>> closer to the 256 MB region, we could extend the GIC mapping space from
>>> 16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).
> 
> Note that part of the reason for that gap is to give us space
> to add more memory-mapped peripherals as they are needed.
> For instance the fw_cfg conduit for talking to UEFI firmware
> is going to need another 64K page in that section.

Sure, that was just to show the situation in general. We could do
something like kvmtool and grow the CPU peripheral space and the device
MMIO pages from different directions to not waste space needlessly.
So UART, RTC, virtio and future devices use space from 128MB upwards,
whereas the GIC uses as much space below 256 MB as needed.

Btw.: Is there an issue with not aligning each virtio device to a 64K
page? Will we never need to separate access to virtio devices in a guest
with MMU help?

>> Having a more compressed mapping sounds useful, but you could also consider
>> carving the GIC registers out of the PCI range and making PCI memory space
>> smaller if there are lots of CPUs.
>>
>> Is there also a 64-bit PCI memory space behind the DRAM?
> 
> The "PCI" section at the moment is purely hypothetical -- it's
> a lump of reserved space in the memory map that I put in so
> we had a place to put PCI later... The DRAM is just the last
> thing in the memory map and goes up for as much DRAM as you
> asked QEMU to provide, so in that sense there's no "behind"
> there.
> 
> We could (at least at the moment) shuffle things around a bit,
> since guests should all be looking at the DTB to figure where
> things are. About the only thing we can't do is move the base
> address of RAM or put the flash somewhere other than 0. If
> we do need to make changes I'd rather we figured out the new
> layout and switched to it rather than doing a set of piecemeal
> tweaks, though.

Agreed. As QEMU does not support GICv3 anyway at the moment, there is no
need to rush things.

Is there any sign of a PCI host controller emulation for QEMU on the
horizon?

Cheers,
Andre.
Arnd Bergmann Dec. 3, 2014, 1:14 p.m. UTC | #24
On Wednesday 03 December 2014 12:03:59 Andre Przywara wrote:
> On 03/12/14 11:39, Peter Maydell wrote:
> > On 3 December 2014 at 11:28, Arnd Bergmann <arnd@arndb.de> wrote:
> >> On Wednesday 03 December 2014 11:10:18 Andre Przywara wrote:
> >>> Currently we have in QEMU:
> >>>    0 MB -  128 MB: Flash
> >>>  128 MB -  144 MB: CPU peripherals (GIC for now only)
> >>>  144 MB -  144 MB: UART (64k)
> >>>  144 MB -  144 MB: RTC (64k)
> >>>  160 MB -  256 MB: virtio (32 * 512 Bytes used)
> >>>  256 MB - 1024 MB: PCI
> >>> 1024 MB - ???? MB: DRAM
> >>>
> >>> So we waste quite some space between 144 MB and 256 MB, where we
> >>> currently use only 3 64K pages. If we would move those three pages
> >>> closer to the 256 MB region, we could extend the GIC mapping space from
> >>> 16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).
> > 
> > Note that part of the reason for that gap is to give us space
> > to add more memory-mapped peripherals as they are needed.
> > For instance the fw_cfg conduit for talking to UEFI firmware
> > is going to need another 64K page in that section.
>
> Sure, that was just to show the situation in general. We could do
> something like kvmtool and grow the CPU peripheral space and the device
> MMIO pages from different directions to not waste space needlessly.
> So UART, RTC, virtio and future devices use space from 128MB upwards,
> whereas the GIC uses as much space below 256 MB as needed.

I also think that would be good. I assume 128MB of flash is required for
running UEFI, or could that be optional or variable-sized too?

A very easy method for doing this would be to just allocate any devices with
a 64k spacing from the bottom (right after the end of flash) and use
whatever remains below 1GB for 32-bit PCI memory space, then have all
RAM after that, and finally do a 64-bit PCI memory space at the end
of RAM, probably at the next power-of-two aligned address.

Within the PCI area, it could look something like this:

<variable start>
* 64KB I/O space
* 1MB aligned ECAM config space, 1MB per emulated bus, possibly
  some spare to allow PCI hotplug
* prefetchable memory space, multiples of 1MB, only needed when
  doing device passthrough
* non-prefetchable memory space, all the rest up to 1GB

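In illustrative C constants, that split might look like the following
(example values only, not a map any tool actually uses; the 1MB-per-bus
figure is just the ECAM geometry, 32 devices * 8 functions * 4 KB):

#define PCI_IO_SIZE		0x10000UL	/* one 64 KB I/O window */
#define PCI_ECAM_NR_BUSES	16UL		/* includes hotplug spare */
#define PCI_ECAM_SIZE		(PCI_ECAM_NR_BUSES * 0x100000UL)
/* prefetchable and non-prefetchable memory space fill the rest up to 1 GB */
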
> Btw.: Is there an issue with not aligning each virtio device to a 64K
> page? Will we never need to separate access to virtio devices in a guest
> with MMU help?

As far as I'm concerned, none of the integrated devices need to be aligned
to 64K. The only requirement for 64K alignment is if you have a qemu
emulated machine that runs a hypervisor in it and does a passthrough of
its devices to an unprivileged guest running under the hypervisor.

I can't think of why anybody would ever do that for a uart or rtc, but
maybe my imagination is too limited.

Note that some of the older SoCs had all their peripherals within
a single 4kb page!
 

	Arnd
Christoffer Dall Dec. 4, 2014, 9:34 a.m. UTC | #25
On Wed, Dec 03, 2014 at 12:03:59PM +0000, Andre Przywara wrote:
> [...]
> Agreed. As QEMU does not support GICv3 anyway at the moment, there is no
> need to rush things.
> 
> Is there any sign of a PCI host controller emulation for QEMU on the
> horizon?
> 
Yes, we need to have this in Q1 2015 at the latest.  Linaro has
scheduled this work, but patches are a little while out, I'm afraid.

-Christoffer
Eric Auger Dec. 4, 2014, 10:02 a.m. UTC | #26
On 12/04/2014 10:34 AM, Christoffer Dall wrote:
> On Wed, Dec 03, 2014 at 12:03:59PM +0000, Andre Przywara wrote:
>> On 03/12/14 11:39, Peter Maydell wrote:
>>> On 3 December 2014 at 11:28, Arnd Bergmann <arnd@arndb.de> wrote:
>>>> On Wednesday 03 December 2014 11:10:18 Andre Przywara wrote:
>>>>> Currently we have in QEMU:
>>>>>    0 MB -  128 MB: Flash
>>>>>  128 MB -  144 MB: CPU peripherals (GIC for now only)
>>>>>  144 MB -  144 MB: UART (64k)
>>>>>  144 MB -  144 MB: RTC (64k)
>>>>>  160 MB -  256 MB: virtio (32 * 512 Bytes used)
>>>>>  256 MB - 1024 MB: PCI
>>>>> 1024 MB - ???? MB: DRAM
>>>>>
>>>>> So we waste quite some space between 144 MB and 256 MB, where we
>>>>> currently use only 3 64K pages. If we would move those three pages
>>>>> closer to the 256 MB region, we could extend the GIC mapping space from
>>>>> 16 MB (127 VCPUs) to almost 128 MB (good for 1022 VCPUs).
>>>
>>> Note that part of the reason for that gap is to give us space
>>> to add more memory-mapped peripherals as they are needed.
>>> For instance the fw_cfg conduit for talking to UEFI firmware
>>> is going to need another 64K page in that section.

Hi,

For info, I also plan to use 4MB between the RTC and virtio regions for
the platform bus (all dynamically instantiated sysbus devices, including
VFIO platform devices), currently plugged in at 148MB
(http://lists.gnu.org/archive/html/qemu-devel/2014-11/msg04346.html)

Best Regards

Eric

>>
>> Sure, that was just to show the situation in general. We could do
>> something like kvmtool and grow the CPU peripheral space and the device
>> MMIO pages from different directions to not waste space needlessly.
>> So UART, RTC, virtio and future devices use space from 128MB upwards,
>> whereas the GIC uses as much space below 256 MB as needed.
>>
>> Btw.: is there an issue with not aligning each virtio device to its own
>> 64K page? Will we never need to use the MMU to separate guest access to
>> individual virtio devices?
>>
>>>> Having a more compressed mapping sounds useful, but you could also consider
>>>> carving the GIC registers out of the PCI range and make PCI memory space
>>>> smaller if there are lots of CPUs.
>>>>
>>>> Is there also a 64-bit PCI memory space behind the DRAM?
>>>
>>> The "PCI" section at the moment is purely hypothetical -- it's
>>> a lump of reserved space in the memory map that I put in so
>>> we had a place to put PCI later... The DRAM is just the last
>>> thing in the memory map and goes up for as much DRAM as you
>>> asked QEMU to provide, so in that sense there's no "behind"
>>> there.
>>>
>>> We could (at least at the moment) shuffle things around a bit,
>>> since guests should all be looking at the DTB to figure where
>>> things are. About the only thing we can't do is move the base
>>> address of RAM or put the flash somewhere other than 0. If
>>> we do need to make changes I'd rather we figured out the new
>>> layout and switched to it rather than doing a set of piecemeal
>>> tweaks, though.
>>
>> Agreed. As QEMU does not support GICv3 anyway at the moment, there is no
>> need to rush things.
>>
>> Is there any sign of a PCI host controller emulation for QEMU on the
>> horizon?
>>
> Yes, we need to have this by Q1 2015 at the latest.  Linaro has scheduled this
> work, but patches are a little while out I'm afraid.
> 
> -Christoffer
diff mbox

Patch

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d957353..4e6e09e 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -24,5 +24,6 @@  kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
 kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
 kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 421833f..c1ef5a9 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -160,7 +160,11 @@  struct vgic_dist {
 
 	/* Distributor and vcpu interface mapping in the guest */
 	phys_addr_t		vgic_dist_base;
-	phys_addr_t		vgic_cpu_base;
+	/* GICv2 and GICv3 use different mapped register blocks */
+	union {
+		phys_addr_t		vgic_cpu_base;
+		phys_addr_t		vgic_redist_base;
+	};
 
 	/* Distributor enabled */
 	u32			enabled;
@@ -222,6 +226,9 @@  struct vgic_dist {
 	 */
 	struct vgic_bitmap	*irq_spi_target;
 
+	/* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
+	u32			*irq_spi_mpidr;
+
 	/* Bitmap indicating which CPU has something pending */
 	unsigned long		*irq_pending_on_cpu;
 
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 03a4ea3..726d898 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -33,6 +33,7 @@ 
 #define GICD_SETSPI_SR			0x0050
 #define GICD_CLRSPI_SR			0x0058
 #define GICD_SEIR			0x0068
+#define GICD_IGROUPR			0x0080
 #define GICD_ISENABLER			0x0100
 #define GICD_ICENABLER			0x0180
 #define GICD_ISPENDR			0x0200
@@ -41,14 +42,37 @@ 
 #define GICD_ICACTIVER			0x0380
 #define GICD_IPRIORITYR			0x0400
 #define GICD_ICFGR			0x0C00
+#define GICD_IGRPMODR			0x0D00
+#define GICD_NSACR			0x0E00
 #define GICD_IROUTER			0x6000
+#define GICD_IDREGS			0xFFD0
 #define GICD_PIDR2			0xFFE8
 
+/*
+ * These registers actually come from GICv2, but the spec demands that
+ * they be implemented as RES0 if ARE is 1 (which is what we do in KVM's
+ * emulated GICv3).
+ */
+#define GICD_ITARGETSR			0x0800
+#define GICD_SGIR			0x0F00
+#define GICD_CPENDSGIR			0x0F10
+#define GICD_SPENDSGIR			0x0F20
+
 #define GICD_CTLR_RWP			(1U << 31)
+#define GICD_CTLR_DS			(1U << 6)
 #define GICD_CTLR_ARE_NS		(1U << 4)
 #define GICD_CTLR_ENABLE_G1A		(1U << 1)
 #define GICD_CTLR_ENABLE_G1		(1U << 0)
 
+/*
+ * In systems with a single security state (what we emulate in KVM)
+ * the meaning of the interrupt group enable bits is slightly different
+ */
+#define GICD_CTLR_ENABLE_SS_G1		(1U << 1)
+#define GICD_CTLR_ENABLE_SS_G0		(1U << 0)
+
+#define GICD_TYPER_LPIS			(1U << 17)
+#define GICD_TYPER_MBIS			(1U << 16)
+
 #define GICD_IROUTER_SPI_MODE_ONE	(0U << 31)
 #define GICD_IROUTER_SPI_MODE_ANY	(1U << 31)
 
@@ -56,6 +80,8 @@ 
 #define GIC_PIDR2_ARCH_GICv3		0x30
 #define GIC_PIDR2_ARCH_GICv4		0x40
 
+#define GIC_V3_DIST_SIZE		0x10000
+
 /*
  * Re-Distributor registers, offsets from RD_base
  */
@@ -74,6 +100,7 @@ 
 #define GICR_SYNCR			0x00C0
 #define GICR_MOVLPIR			0x0100
 #define GICR_MOVALLR			0x0110
+#define GICR_IDREGS			GICD_IDREGS
 #define GICR_PIDR2			GICD_PIDR2
 
 #define GICR_WAKER_ProcessorSleep	(1U << 1)
@@ -82,6 +109,7 @@ 
 /*
  * Re-Distributor registers, offsets from SGI_base
  */
+#define GICR_IGROUPR0			GICD_IGROUPR
 #define GICR_ISENABLER0			GICD_ISENABLER
 #define GICR_ICENABLER0			GICD_ICENABLER
 #define GICR_ISPENDR0			GICD_ISPENDR
@@ -90,10 +118,14 @@ 
 #define GICR_ICACTIVER0			GICD_ICACTIVER
 #define GICR_IPRIORITYR0		GICD_IPRIORITYR
 #define GICR_ICFGR0			GICD_ICFGR
+#define GICR_IGRPMODR0			GICD_IGRPMODR
+#define GICR_NSACR			GICD_NSACR
 
 #define GICR_TYPER_VLPIS		(1U << 1)
 #define GICR_TYPER_LAST			(1U << 4)
 
+#define GIC_V3_REDIST_SIZE		0x20000
+
 /*
  * CPU interface registers
  */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 326ba7a..4a7798e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1085,6 +1085,7 @@  void kvm_unregister_device_ops(u32 type);
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
+extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6076882..24cb129 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -960,6 +960,8 @@  enum kvm_device_type {
 #define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
 	KVM_DEV_TYPE_FLIC,
 #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
+	KVM_DEV_TYPE_ARM_VGIC_V3,
+#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
 	KVM_DEV_TYPE_MAX,
 };
 
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
new file mode 100644
index 0000000..97b5801
--- /dev/null
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -0,0 +1,904 @@ 
+/*
+ * GICv3 distributor and redistributor emulation
+ *
+ * GICv3 emulation is currently only supported on a GICv3 host (because
+ * we rely on the hardware's CPU interface virtualization support), but
+ * it works on hosts both with and without the optional GICv2 backwards
+ * compatibility features.
+ *
+ * Limitations of the emulation:
+ * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
+ * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
+ * - We do not support the message based interrupts (MBIs) triggered by
+ *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
+ * - We do not support the (optional) backwards compatibility feature.
+ *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
+ *   the compatibility feature, you can use a GICv2 in the guest, though.
+ * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
+ * - Priorities are not emulated (same as the GICv2 emulation). Linux
+ *   as a guest is fine with this, because it does not use priorities.
+ * - We only support Group1 interrupts. Again Linux uses only those.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
+			       struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg = 0xffffffff;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
+			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg = 0;
+
+	/*
+	 * Force ARE and DS to 1, the guest cannot change this.
+	 * For the time being we only support Group1 interrupts.
+	 */
+	if (vcpu->kvm->arch.vgic.enabled)
+		reg = GICD_CTLR_ENABLE_SS_G1;
+	reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	if (mmio->is_write) {
+		if (reg & GICD_CTLR_ENABLE_SS_G0)
+			kvm_info("guest tried to enable unsupported Group0 interrupts\n");
+		vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
+		vgic_update_state(vcpu->kvm);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * As this implementation does not provide compatibility
+ * with GICv2 (ARE==1), we report zero CPUs in bits [7:5].
+ * LPIs and MBIs are not supported either, so we set the respective bits to 0.
+ * Finally, we report at most 2**10 = 1024 interrupt IDs (to match 1024 SPIs).
+ */
+#define INTERRUPT_ID_BITS 10
+static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
+			      struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg;
+
+	/* we report at most 1024 IRQs via this interface */
+	reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
+
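+	/* the IDbits field [23:19] is encoded as (number of ID bits - 1) */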
+	reg |= (INTERRUPT_ID_BITS - 1) << 19;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
+			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
+{
+	u32 reg;
+
+	reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
+					    struct kvm_exit_mmio *mmio,
+					    phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+					      vcpu->vcpu_id,
+					      ACCESS_WRITE_SETBIT);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
+					      struct kvm_exit_mmio *mmio,
+					      phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+					      vcpu->vcpu_id,
+					      ACCESS_WRITE_CLEARBIT);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
+					     struct kvm_exit_mmio *mmio,
+					     phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
+						   vcpu->vcpu_id);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
+					       struct kvm_exit_mmio *mmio,
+					       phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
+						     vcpu->vcpu_id);
+
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
+					  struct kvm_exit_mmio *mmio,
+					  phys_addr_t offset)
+{
+	u32 *reg;
+
+	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
+				   vcpu->vcpu_id, offset);
+	vgic_reg_access(mmio, reg, offset,
+		ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	return false;
+}
+
+static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
+				     struct kvm_exit_mmio *mmio,
+				     phys_addr_t offset)
+{
+	u32 *reg;
+
+	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				  vcpu->vcpu_id, offset >> 1);
+
+	return vgic_handle_cfg_reg(reg, mmio, offset);
+}
+
+/*
+ * We use a compressed version of the MPIDR (the four 8-bit affinity
+ * levels packed into one 32-bit word) when we store the target MPIDR
+ * written by the guest.
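+ * E.g. Aff3.Aff2.Aff1.Aff0 == 1.2.3.4 is stored as 0x01020304.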
+ */
+static u32 compress_mpidr(unsigned long mpidr)
+{
+	u32 ret;
+
+	ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
+	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
+
+	return ret;
+}
+
+static unsigned long uncompress_mpidr(u32 value)
+{
+	unsigned long mpidr;
+
+	mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
+	mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
+	mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
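+	/* Aff3 sits above bit 31, so widen to 64 bits before shifting */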
+	mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
+
+	return mpidr;
+}
+
+/*
+ * Look up the given MPIDR value to get the vcpu_id (if there is one)
+ * and store that in the irq_spi_cpu[] array.
+ * This limits the number of VCPUs to 255 for now; extending the data
+ * type (or storing kvm_vcpu pointers) would lift the limit.
+ * Store the original MPIDR value in an extra array to support read-as-written.
+ * Unallocated MPIDRs are translated to a special value and caught
+ * before any array accesses.
+ */
+static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
+				  struct kvm_exit_mmio *mmio,
+				  phys_addr_t offset)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int spi;
+	u32 reg;
+	int vcpu_id;
+	unsigned long *bmap, mpidr;
+
+	/*
+	 * The upper 32 bits of each 64-bit register are zero,
+	 * as we don't support Aff3.
+	 */
+	if ((offset & 4)) {
+		vgic_reg_access(mmio, NULL, offset,
+				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
+	/* This region only covers SPIs, so no handling of private IRQs here. */
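+	/* each 64-bit IROUTERn register covers exactly one SPI */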
+	spi = offset / 8;
+
+	/* get the stored MPIDR for this IRQ */
+	mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
+	mpidr &= MPIDR_HWID_BITMASK;
+	reg = mpidr;
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+
+	if (!mmio->is_write)
+		return false;
+
+	/*
+	 * Now clear the currently assigned vCPU from the map, making room
+	 * for the new one to be written below
+	 */
+	vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
+	if (likely(vcpu)) {
+		vcpu_id = vcpu->vcpu_id;
+		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
+		__clear_bit(spi, bmap);
+	}
+
+	dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
+	vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
+
+	/*
+	 * The spec says that non-existent MPIDR values should not be
+	 * forwarded to any existing (v)CPU, but should be able to become
+	 * pending anyway. We simply keep the irq_spi_target[] array empty, so
+	 * the interrupt will never be injected.
+	 * irq_spi_cpu[irq] gets a magic value in this case.
+	 */
+	if (likely(vcpu)) {
+		vcpu_id = vcpu->vcpu_id;
+		dist->irq_spi_cpu[spi] = vcpu_id;
+		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
+		__set_bit(spi, bmap);
+	} else {
+		dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
+	}
+
+	vgic_update_state(kvm);
+
+	return true;
+}
+
+/*
+ * We should be careful about promising too much when a guest reads
+ * these ID registers. Don't claim to be like any particular hardware
+ * implementation, but just report the GIC as version 3 - which is what
+ * a Linux guest would check.
+ */
+static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
+			       struct kvm_exit_mmio *mmio,
+			       phys_addr_t offset)
+{
+	u32 reg = 0;
+
+	switch (offset + GICD_IDREGS) {
+	case GICD_PIDR2:
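+		/* report the GIC architecture revision (3) in bits [7:4] */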
+		reg = 0x3b;
+		break;
+	}
+
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+
+	return false;
+}
+
+static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
+	{
+		.base           = GICD_CTLR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_ctlr,
+	},
+	{
+		.base           = GICD_TYPER,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_typer,
+	},
+	{
+		.base           = GICD_IIDR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_iidr,
+	},
+	{
+		/* this register is optional, it is RAZ/WI if not implemented */
+		.base           = GICD_STATUSR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		/* this write-only register is WI when TYPER.MBIS=0 */
+		.base		= GICD_SETSPI_NSR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this write-only register is WI when TYPER.MBIS=0 */
+		.base		= GICD_CLRSPI_NSR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_SETSPI_SR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_CLRSPI_SR,
+		.len		= 0x04,
+		.bits_per_irq	= 0,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IGROUPR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_rao_wi,
+	},
+	{
+		.base		= GICD_ISENABLER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_enable_reg_dist,
+	},
+	{
+		.base		= GICD_ICENABLER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_enable_reg_dist,
+	},
+	{
+		.base		= GICD_ISPENDR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_pending_reg_dist,
+	},
+	{
+		.base		= GICD_ICPENDR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_pending_reg_dist,
+	},
+	{
+		.base		= GICD_ISACTIVER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_ICACTIVER,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IPRIORITYR,
+		.len		= 0x400,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_priority_reg_dist,
+	},
+	{
+		/* TARGETSRn is RES0 when ARE=1 */
+		.base		= GICD_ITARGETSR,
+		.len		= 0x400,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_ICFGR,
+		.len		= 0x100,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_cfg_reg_dist,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_IGRPMODR,
+		.len		= 0x80,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when DS=1 */
+		.base		= GICD_NSACR,
+		.len		= 0x100,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base		= GICD_SGIR,
+		.len		= 0x04,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base		= GICD_CPENDSGIR,
+		.len		= 0x10,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		/* this is RAZ/WI when ARE=1 */
+		.base           = GICD_SPENDSGIR,
+		.len            = 0x10,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICD_IROUTER + 0x100,
+		.len		= 0x1edc,
+		.bits_per_irq	= 64,
+		.handle_mmio	= handle_mmio_route_reg,
+	},
+	{
+		.base           = GICD_IDREGS,
+		.len            = 0x30,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_idregs,
+	},
+	{},
+};
+
+static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
+					      struct kvm_exit_mmio *mmio,
+					      phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+				      redist_vcpu->vcpu_id,
+				      ACCESS_WRITE_SETBIT);
+}
+
+static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
+						struct kvm_exit_mmio *mmio,
+						phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
+				      redist_vcpu->vcpu_id,
+				      ACCESS_WRITE_CLEARBIT);
+}
+
+static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
+					       struct kvm_exit_mmio *mmio,
+					       phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
+					   redist_vcpu->vcpu_id);
+}
+
+static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
+						 struct kvm_exit_mmio *mmio,
+						 phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
+					     redist_vcpu->vcpu_id);
+}
+
+static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
+					    struct kvm_exit_mmio *mmio,
+					    phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+	u32 *reg;
+
+	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
+				   redist_vcpu->vcpu_id, offset);
+	vgic_reg_access(mmio, reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
+	return false;
+}
+
+static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
+				       struct kvm_exit_mmio *mmio,
+				       phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				       redist_vcpu->vcpu_id, offset >> 1);
+
+	return vgic_handle_cfg_reg(reg, mmio, offset);
+}
+
+static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
+	{
+		.base		= GICR_IGROUPR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_ISENABLER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_enable_reg_redist,
+	},
+	{
+		.base		= GICR_ICENABLER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_enable_reg_redist,
+	},
+	{
+		.base		= GICR_ISPENDR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_set_pending_reg_redist,
+	},
+	{
+		.base		= GICR_ICPENDR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_clear_pending_reg_redist,
+	},
+	{
+		.base		= GICR_ISACTIVER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_ICACTIVER0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_IPRIORITYR0,
+		.len		= 0x20,
+		.bits_per_irq	= 8,
+		.handle_mmio	= handle_mmio_priority_reg_redist,
+	},
+	{
+		.base		= GICR_ICFGR0,
+		.len		= 0x08,
+		.bits_per_irq	= 2,
+		.handle_mmio	= handle_mmio_cfg_reg_redist,
+	},
+	{
+		.base		= GICR_IGRPMODR0,
+		.len		= 0x04,
+		.bits_per_irq	= 1,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{
+		.base		= GICR_NSACR,
+		.len		= 0x04,
+		.handle_mmio	= handle_mmio_raz_wi,
+	},
+	{},
+};
+
+static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
+				    struct kvm_exit_mmio *mmio,
+				    phys_addr_t offset)
+{
+	/* since we don't support LPIs, this register is zero for now */
+	vgic_reg_access(mmio, NULL, offset,
+			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
+				     struct kvm_exit_mmio *mmio,
+				     phys_addr_t offset)
+{
+	u32 reg;
+	u64 mpidr;
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+	int target_vcpu_id = redist_vcpu->vcpu_id;
+
+	/* the upper 32 bits contain the affinity value */
+	if ((offset & ~3) == 4) {
+		mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
+		reg = compress_mpidr(mpidr);
+
+		vgic_reg_access(mmio, &reg, offset,
+				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+		return false;
+	}
+
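+	/* the lower word holds the processor number in bits [23:8] */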
+	reg = redist_vcpu->vcpu_id << 8;
+	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
+		reg |= GICR_TYPER_LAST;
+	vgic_reg_access(mmio, &reg, offset,
+			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+	return false;
+}
+
+static const struct kvm_mmio_range vgic_redist_ranges[] = {
+	{
+		.base           = GICR_CTLR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_ctlr_redist,
+	},
+	{
+		.base           = GICR_TYPER,
+		.len            = 0x08,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_typer_redist,
+	},
+	{
+		.base           = GICR_IIDR,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_iidr,
+	},
+	{
+		.base           = GICR_WAKER,
+		.len            = 0x04,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_raz_wi,
+	},
+	{
+		.base           = GICR_IDREGS,
+		.len            = 0x30,
+		.bits_per_irq   = 0,
+		.handle_mmio    = handle_mmio_idregs,
+	},
+	{},
+};
+
+/*
+ * This function splits accesses between the distributor and the two
+ * redistributor parts (private/SPI). As each redistributor is accessible
+ * from any CPU, we have to determine the affected VCPU by taking the faulting
+ * address into account. We then pass this VCPU to the handler function via
+ * the private parameter.
+ */
+#define SGI_BASE_OFFSET SZ_64K
+static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
+				struct kvm_exit_mmio *mmio)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	unsigned long dbase = dist->vgic_dist_base;
+	unsigned long rdbase = dist->vgic_redist_base;
+	int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
+	int vcpu_id;
+	const struct kvm_mmio_range *mmio_range;
+
+	if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
+		return vgic_handle_mmio_range(vcpu, run, mmio,
+					      vgic_v3_dist_ranges, dbase);
+	}
+
+	if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
+	    GIC_V3_REDIST_SIZE * nrcpus))
+		return false;
+
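+	/* each VCPU's redistributor takes a GIC_V3_REDIST_SIZE (2 * 64K) slice */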
+	vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
+	rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
+	mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+
+	if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
+		rdbase += SGI_BASE_OFFSET;
+		mmio_range = vgic_redist_sgi_ranges;
+	} else {
+		mmio_range = vgic_redist_ranges;
+	}
+	return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
+}
+
+static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
+{
+	if (vgic_queue_irq(vcpu, 0, irq)) {
+		vgic_dist_irq_clear_pending(vcpu, irq);
+		vgic_cpu_irq_clear(vcpu, irq);
+		return true;
+	}
+
+	return false;
+}
+
+static int vgic_v3_init_maps(struct vgic_dist *dist)
+{
+	int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
+
+	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
+				      GFP_KERNEL);
+
+	if (!dist->irq_spi_mpidr)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int vgic_v3_init(struct kvm *kvm, const struct vgic_params *params)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int ret, i;
+	u32 mpidr;
+
+	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+	    IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
+		kvm_err("Need to set vgic distributor addresses first\n");
+		return -ENXIO;
+	}
+
+	/*
+	 * FIXME: this should be moved to init_maps time, and may bite
+	 * us when adding save/restore. Add a per-emulation hook?
+	 */
+	ret = vgic_v3_init_maps(dist);
+	if (ret) {
+		kvm_err("Unable to allocate maps\n");
+		return ret;
+	}
+
+	/* Initialize the target VCPUs for each IRQ to VCPU 0 */
+	mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
+	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
+		dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
+		dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
+		vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
+	}
+
+	return 0;
+}
+
+/* GICv3 does not keep track of SGI sources anymore. */
+static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
+{
+}
+
+int vgic_v3_init_emulation(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
+	dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
+	dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
+	dist->vm_ops.vgic_init = vgic_v3_init;
+
+	kvm->arch.max_vcpus = KVM_MAX_VCPUS;
+
+	return 0;
+}
+
+static int vgic_v3_create(struct kvm_device *dev, u32 type)
+{
+	return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_v3_destroy(struct kvm_device *dev)
+{
+	kfree(dev);
+}
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_set_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_get_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		case KVM_VGIC_V2_ADDR_TYPE_CPU:
+			return -ENXIO;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return -ENXIO;
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return 0;
+	}
+	return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+	.name = "kvm-arm-vgic-v3",
+	.create = vgic_v3_create,
+	.destroy = vgic_v3_destroy,
+	.set_attr = vgic_v3_set_attr,
+	.get_attr = vgic_v3_get_attr,
+	.has_attr = vgic_v3_has_attr,
+};
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 335ffe0..b7de0f8 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1249,7 +1249,7 @@  static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	struct kvm_vcpu *vcpu;
 	int edge_triggered, level_triggered;
 	int enabled;
-	bool ret = true;
+	bool ret = true, can_inject = true;
 
 	spin_lock(&dist->lock);
 
@@ -1264,6 +1264,11 @@  static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 
 	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
 		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
+		if (cpuid == VCPU_NOT_ALLOCATED) {
+			/* Pretend we use CPU0, and prevent injection */
+			cpuid = 0;
+			can_inject = false;
+		}
 		vcpu = kvm_get_vcpu(kvm, cpuid);
 	}
 
@@ -1285,7 +1290,7 @@  static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 
 	enabled = vgic_irq_is_enabled(vcpu, irq_num);
 
-	if (!enabled) {
+	if (!enabled || !can_inject) {
 		ret = false;
 		goto out;
 	}
@@ -1438,6 +1443,7 @@  void kvm_vgic_destroy(struct kvm *kvm)
 	}
 	kfree(dist->irq_sgi_sources);
 	kfree(dist->irq_spi_cpu);
+	kfree(dist->irq_spi_mpidr);
 	kfree(dist->irq_spi_target);
 	kfree(dist->irq_pending_on_cpu);
 	dist->irq_sgi_sources = NULL;
@@ -1628,6 +1634,7 @@  int kvm_vgic_create(struct kvm *kvm, u32 type)
 	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
 	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
 	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+	kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
 
 out_unlock:
 	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
index ff3171a..b0c6b2f 100644
--- a/virt/kvm/arm/vgic.h
+++ b/virt/kvm/arm/vgic.h
@@ -35,6 +35,8 @@ 
 #define ACCESS_WRITE_VALUE	(3 << 1)
 #define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))
 
+#define VCPU_NOT_ALLOCATED	((u8)-1)
+
 unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
 
 void vgic_update_state(struct kvm *kvm);
@@ -115,5 +117,6 @@  int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
 
 int vgic_v2_init_emulation(struct kvm *kvm);
+int vgic_v3_init_emulation(struct kvm *kvm);
 
 #endif
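
As an illustration (not part of the patch): a minimal sketch of how userspace
might instantiate this emulated GICv3 through the KVM device API. The
KVM_VGIC_V3_ADDR_TYPE_DIST/KVM_VGIC_V3_ADDR_TYPE_REDIST address attributes are
assumed to come from a later patch in this series (this patch's
vgic_v3_has_attr() does not accept them yet); error handling is omitted for
brevity.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the vGICv3 device and set the base of its two register regions. */
static int create_vgic_v3(int vm_fd, uint64_t dist_base, uint64_t redist_base)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
	struct kvm_device_attr attr = { .group = KVM_DEV_ARM_VGIC_GRP_ADDR };
	uint64_t addr;

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;

	attr.addr = (uint64_t)(unsigned long)&addr;

	/* one 64K page for the distributor (GIC_V3_DIST_SIZE) */
	addr = dist_base;
	attr.attr = KVM_VGIC_V3_ADDR_TYPE_DIST;		/* assumed, later patch */
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);

	/* two consecutive 64K pages (GIC_V3_REDIST_SIZE) per VCPU */
	addr = redist_base;
	attr.attr = KVM_VGIC_V3_ADDR_TYPE_REDIST;	/* assumed, later patch */
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);

	return cd.fd;
}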