[v4,06/12] KVM: arm64: implement basic ITS register handlers

Message ID 1458958450-19662-7-git-send-email-andre.przywara@arm.com (mailing list archive)
State New, archived

Commit Message

Andre Przywara March 26, 2016, 2:14 a.m. UTC
Add emulation for some basic MMIO registers used in the ITS emulation.
This includes:
- GITS_{CTLR,TYPER,IIDR}
- ID registers
- GITS_{CBASER,CREADR,CWRITER}
  those implement the ITS command buffer handling

Most of the handlers are pretty straightforward, but CWRITER goes
the extra mile to allow fine-grained locking. The idea here
is to let only the first instance iterate through the command ring
buffer; CWRITER accesses on other VCPUs in the meantime will be picked up
by that first instance and handled as well. The ITS lock is thus only
held for very small periods of time and is dropped before the actual
command handler is called.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 include/kvm/vgic/vgic.h            |   3 +
 include/linux/irqchip/arm-gic-v3.h |   8 ++
 virt/kvm/arm/vgic/its-emul.c       | 272 ++++++++++++++++++++++++++++++++++++-
 virt/kvm/arm/vgic/vgic.h           |   6 +
 virt/kvm/arm/vgic/vgic_init.c      |   2 +
 5 files changed, 284 insertions(+), 7 deletions(-)
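
In outline, the CWRITER scheme boils down to this (a condensed sketch of
vgic_mmio_write_its_cwriter() from the patch below, with the error path
omitted):

	spin_lock(&its->lock);
	/* A non-empty queue means another VCPU is already processing. */
	finished = (its->cwriter != its->creadr);
	its->cwriter = reg;		/* publish the new write pointer */
	spin_unlock(&its->lock);

	while (!finished) {
		/* Read one 32-byte command from guest memory, lock dropped. */
		kvm_read_guest(vcpu->kvm, cbaser + its->creadr, cmd_buf, 32);
		vits_handle_command(vcpu, cmd_buf);

		spin_lock(&its->lock);
		its->creadr += 32;	/* advance, wrapping at the buffer end */
		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
			its->creadr = 0;
		finished = (its->creadr == its->cwriter);
		spin_unlock(&its->lock);
	}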

Comments

Christoffer Dall April 3, 2016, 10:08 a.m. UTC | #1
On Sat, Mar 26, 2016 at 02:14:04AM +0000, Andre Przywara wrote:
> Add emulation for some basic MMIO registers used in the ITS emulation.
> This includes:
> - GITS_{CTLR,TYPER,IIDR}
> - ID registers
> - GITS_{CBASER,CREADR,CWRITER}
>   those implement the ITS command buffer handling
> 
> Most of the handlers are pretty straight forward, but CWRITER goes
> some extra miles to allow fine grained locking. The idea here
> is to let only the first instance iterate through the command ring
> buffer, CWRITER accesses on other VCPUs meanwhile will be picked up
> by that first instance and handled as well. The ITS lock is thus only
> hold for very small periods of time and is dropped before the actual

s/hold/held/

> command handler is called.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  include/kvm/vgic/vgic.h            |   3 +
>  include/linux/irqchip/arm-gic-v3.h |   8 ++
>  virt/kvm/arm/vgic/its-emul.c       | 272 ++++++++++++++++++++++++++++++++++++-
>  virt/kvm/arm/vgic/vgic.h           |   6 +
>  virt/kvm/arm/vgic/vgic_init.c      |   2 +
>  5 files changed, 284 insertions(+), 7 deletions(-)
> 
> diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
> index c79bed5..bafea11 100644
> --- a/include/kvm/vgic/vgic.h
> +++ b/include/kvm/vgic/vgic.h
> @@ -115,6 +115,9 @@ struct vgic_io_device {
>  struct vgic_its {
>  	bool			enabled;
>  	spinlock_t		lock;

exactly what does this lock protect?

> +	u64			cbaser;
> +	int			creadr;
> +	int			cwriter;
>  };
>  
>  struct vgic_dist {
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index a813c3e..7011b98 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -179,15 +179,23 @@
>  #define GITS_BASER			0x0100
>  #define GITS_IDREGS_BASE		0xffd0
>  #define GITS_PIDR2			GICR_PIDR2
> +#define GITS_PIDR4			0xffd0
> +#define GITS_CIDR0			0xfff0
> +#define GITS_CIDR1			0xfff4
> +#define GITS_CIDR2			0xfff8
> +#define GITS_CIDR3			0xfffc
>  
>  #define GITS_TRANSLATER			0x10040
>  
>  #define GITS_CTLR_ENABLE		(1U << 0)
>  #define GITS_CTLR_QUIESCENT		(1U << 31)
>  
> +#define GITS_TYPER_PLPIS		(1UL << 0)
> +#define GITS_TYPER_IDBITS_SHIFT		8
>  #define GITS_TYPER_DEVBITS_SHIFT	13
>  #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
>  #define GITS_TYPER_PTA			(1UL << 19)
> +#define GITS_TYPER_HWCOLLCNT_SHIFT	24
>  
>  #define GITS_CBASER_VALID		(1UL << 63)
>  #define GITS_CBASER_nCnB		(0UL << 59)
> diff --git a/virt/kvm/arm/vgic/its-emul.c b/virt/kvm/arm/vgic/its-emul.c
> index 49dd5e4..de8d360 100644
> --- a/virt/kvm/arm/vgic/its-emul.c
> +++ b/virt/kvm/arm/vgic/its-emul.c
> @@ -31,23 +31,263 @@
>  #include "vgic.h"
>  #include "vgic_mmio.h"
>  
> +#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
> +
> +static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u32 reg;
> +
> +	reg = GITS_CTLR_QUIESCENT;
> +	if (its->enabled)
> +		reg |= GITS_CTLR_ENABLE;
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +
> +        if (addr - iodev->base_addr == 0)
> +		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	u64 reg = GITS_TYPER_PLPIS;
> +
> +	/*
> +	 * We use linear CPU numbers for redistributor addressing,
> +	 * so GITS_TYPER.PTA is 0.
> +	 * To avoid memory waste on the guest side, we keep the
> +	 * number of IDBits and DevBits low for the time being.
> +	 * This could later be made configurable by userland.
> +	 * Since we have all collections in linked list, we claim

s/linked list/linked lists/

is the point here not more specifically that we hold the data in kernel
data structures that are not visible via guest memory, rather than
which specific data structure we use in the kernel?

> +	 * that we can hold all of the collection tables in our
> +	 * own memory and that the ITT entry size is 1 byte (the
> +	 * smallest possible one).
> +	 */
> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +	u32 reg = 0;
> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
> +
> +	switch (idreg) {
> +	case GITS_PIDR2:
> +		reg = GIC_PIDR2_ARCH_GICv3;
> +		break;
> +	case GITS_PIDR4:
> +		/* This is a 64K software visible page */
> +		reg = 0x40;
> +		break;
> +	/* Those are the ID registers for (any) GIC. */
> +	case GITS_CIDR0:
> +		reg = 0x0d;
> +		break;
> +	case GITS_CIDR1:
> +		reg = 0xf0;
> +		break;
> +	case GITS_CIDR2:
> +		reg = 0x05;
> +		break;
> +	case GITS_CIDR3:
> +		reg = 0xb1;
> +		break;
> +	}
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +/*
> + * This function is called with both the ITS and the distributor lock dropped,
> + * so the actual command handlers must take the respective locks when needed.
> + */

I don't understand this comment.  Are these requirements on the locking
situation when entering this function, or is it explaining when this
function is called?

> +static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
> +{
> +	return -ENODEV;
> +}
> +
> +static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	write_mask64(its->cbaser, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	if (its->enabled)
> +		return 0;
> +
> +	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
> +	its->creadr = 0;
> +
> +	return 0;
> +}
> +
> +static int its_cmd_buffer_size(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return ((its->cbaser & 0xff) + 1) << 12;
> +}
> +
> +static gpa_t its_cmd_buffer_base(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return BASER_BASE_ADDRESS(its->cbaser);
> +}
> +
> +/*
> + * By writing to CWRITER the guest announces new commands to be processed.
> + * Since we cannot read from guest memory inside the ITS spinlock, we
> + * iterate over the command buffer (with the lock dropped) until the read
> + * pointer matches the write pointer. Other VCPUs writing this register in the
> + * meantime will just update the write pointer, leaving the command
> + * processing to the first instance of the function.
> + */
> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
> +				       struct kvm_io_device *this,
> +				       gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
> +	u64 cmd_buf[4];
> +	u32 reg;
> +	bool finished;
> +
> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
> +	reg &= 0xfffe0;
> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
> +		return 0;
> +
> +	spin_lock(&its->lock);
> +
> +	/*
> +	 * If there is still another VCPU handling commands, let this
> +	 * one pick up the new CWRITER and process "our" new commands as well.
> +	 */
> +	finished = (its->cwriter != its->creadr);
> +	its->cwriter = reg;
> +
> +	spin_unlock(&its->lock);
> +
> +	while (!finished) {
> +		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
> +					 cmd_buf, 32);
> +		if (ret) {
> +			/*
> +			 * Gah, we are screwed. Reset CWRITER to that command

s/that/the/

> +			 * that we have finished processing and return.
> +			 */

what does this situation mean? That the guest programmed bogus values
into GITS_BASERn, or something else?  (Is that allowed?)

> +			spin_lock(&its->lock);
> +			its->cwriter = its->creadr;
> +			spin_unlock(&its->lock);
> +			break;
> +		}
> +		vits_handle_command(vcpu, cmd_buf);
> +
> +		spin_lock(&its->lock);
> +		its->creadr += 32;
> +		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
> +			its->creadr = 0;
> +		finished = (its->creadr == its->cwriter);
> +		spin_unlock(&its->lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_cwriter(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->cwriter & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_creadr(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->creadr & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
>  struct vgic_register_region its_registers[] = {
>  	REGISTER_DESC_WITH_LENGTH(GITS_CTLR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IIDR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_iidr, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_TYPER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_typer, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CBASER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CWRITER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CREADR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_creadr, vgic_mmio_write_wi, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_BASER,
>  		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x40),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IDREGS_BASE,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x30),
> +		vgic_mmio_read_its_idregs, vgic_mmio_write_wi, 0x30),
>  };
>  
>  /* This is called on setting the LPI enable bit in the redistributor. */
> @@ -59,9 +299,14 @@ int vits_init(struct kvm *kvm)
>  {
>  	struct vgic_dist *dist = &kvm->arch.vgic;
>  	struct vgic_its *its = &dist->its;
> +	int nr_vcpus = atomic_read(&kvm->online_vcpus);
>  	struct vgic_io_device *regions;
>  	int ret, i;
>  
> +	dist->pendbaser = kcalloc(nr_vcpus, sizeof(u64), GFP_KERNEL);
> +	if (!dist->pendbaser)
> +		return -ENOMEM;
> +
>  	spin_lock_init(&its->lock);
>  
>  	regions = kmalloc_array(ARRAY_SIZE(its_registers),
> @@ -82,3 +327,16 @@ int vits_init(struct kvm *kvm)
>  
>  	return -ENXIO;
>  }
> +
> +void vits_destroy(struct kvm *kvm)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +
> +	if (!vgic_has_its(kvm))
> +		return;
> +
> +	kfree(dist->pendbaser);
> +
> +	its->enabled = false;
> +}
> diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
> index 4e7dcb8..08f97d1 100644
> --- a/virt/kvm/arm/vgic/vgic.h
> +++ b/virt/kvm/arm/vgic/vgic.h
> @@ -63,6 +63,7 @@ int vgic_register_redist_regions(struct kvm *kvm, gpa_t dist_base_address);
>  
>  int vits_init(struct kvm *kvm);
>  void vgic_enable_lpis(struct kvm_vcpu *vcpu);
> +void vits_destroy(struct kvm *kvm);
>  #else
>  static inline void vgic_v3_irq_change_affinity(struct kvm *kvm, u32 intid,
>  					       u64 mpidr)
> @@ -137,6 +138,11 @@ static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
>  {
>  	return;
>  }
> +
> +static inline void vits_destroy(struct kvm *kvm)
> +{
> +	return;
> +}
>  #endif
>  
>  void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
> diff --git a/virt/kvm/arm/vgic/vgic_init.c b/virt/kvm/arm/vgic/vgic_init.c
> index dcfb93d..e4459e3 100644
> --- a/virt/kvm/arm/vgic/vgic_init.c
> +++ b/virt/kvm/arm/vgic/vgic_init.c
> @@ -298,6 +298,8 @@ void kvm_vgic_destroy(struct kvm *kvm)
>  
>  	kvm_vgic_dist_destroy(kvm);
>  
> +	vits_destroy(kvm);
> +
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		kvm_vgic_vcpu_destroy(vcpu);
>  }
> -- 
> 2.7.3
>
Eric Auger April 6, 2016, 9:36 a.m. UTC | #2
Hi Andre,
On 03/26/2016 03:14 AM, Andre Przywara wrote:
> Add emulation for some basic MMIO registers used in the ITS emulation.
> This includes:
> - GITS_{CTLR,TYPER,IIDR}
> - ID registers
> - GITS_{CBASER,CREADR,CWRITER}
>   those implement the ITS command buffer handling
> 
> Most of the handlers are pretty straight forward, but CWRITER goes
> some extra miles to allow fine grained locking. The idea here
> is to let only the first instance iterate through the command ring
> buffer, CWRITER accesses on other VCPUs meanwhile will be picked up
> by that first instance and handled as well. The ITS lock is thus only
> hold for very small periods of time and is dropped before the actual
> command handler is called.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  include/kvm/vgic/vgic.h            |   3 +
>  include/linux/irqchip/arm-gic-v3.h |   8 ++
>  virt/kvm/arm/vgic/its-emul.c       | 272 ++++++++++++++++++++++++++++++++++++-
>  virt/kvm/arm/vgic/vgic.h           |   6 +
>  virt/kvm/arm/vgic/vgic_init.c      |   2 +
>  5 files changed, 284 insertions(+), 7 deletions(-)
> 
> diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
> index c79bed5..bafea11 100644
> --- a/include/kvm/vgic/vgic.h
> +++ b/include/kvm/vgic/vgic.h
> @@ -115,6 +115,9 @@ struct vgic_io_device {
>  struct vgic_its {
>  	bool			enabled;
>  	spinlock_t		lock;
> +	u64			cbaser;
> +	int			creadr;
> +	int			cwriter;
>  };
>  
>  struct vgic_dist {
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index a813c3e..7011b98 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -179,15 +179,23 @@
>  #define GITS_BASER			0x0100
>  #define GITS_IDREGS_BASE		0xffd0
>  #define GITS_PIDR2			GICR_PIDR2
> +#define GITS_PIDR4			0xffd0
> +#define GITS_CIDR0			0xfff0
> +#define GITS_CIDR1			0xfff4
> +#define GITS_CIDR2			0xfff8
> +#define GITS_CIDR3			0xfffc
>  
>  #define GITS_TRANSLATER			0x10040
>  
>  #define GITS_CTLR_ENABLE		(1U << 0)
>  #define GITS_CTLR_QUIESCENT		(1U << 31)
>  
> +#define GITS_TYPER_PLPIS		(1UL << 0)
> +#define GITS_TYPER_IDBITS_SHIFT		8
>  #define GITS_TYPER_DEVBITS_SHIFT	13
>  #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
>  #define GITS_TYPER_PTA			(1UL << 19)
> +#define GITS_TYPER_HWCOLLCNT_SHIFT	24
>  
>  #define GITS_CBASER_VALID		(1UL << 63)
>  #define GITS_CBASER_nCnB		(0UL << 59)
> diff --git a/virt/kvm/arm/vgic/its-emul.c b/virt/kvm/arm/vgic/its-emul.c
> index 49dd5e4..de8d360 100644
> --- a/virt/kvm/arm/vgic/its-emul.c
> +++ b/virt/kvm/arm/vgic/its-emul.c
> @@ -31,23 +31,263 @@
>  #include "vgic.h"
>  #include "vgic_mmio.h"
>  
> +#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
> +
> +static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u32 reg;
> +
> +	reg = GITS_CTLR_QUIESCENT;
> +	if (its->enabled)
> +		reg |= GITS_CTLR_ENABLE;
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +
> +        if (addr - iodev->base_addr == 0)
> +		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	u64 reg = GITS_TYPER_PLPIS;
> +
> +	/*
> +	 * We use linear CPU numbers for redistributor addressing,
> +	 * so GITS_TYPER.PTA is 0.
> +	 * To avoid memory waste on the guest side, we keep the
> +	 * number of IDBits and DevBits low for the time being.
> +	 * This could later be made configurable by userland.
> +	 * Since we have all collections in linked list, we claim
> +	 * that we can hold all of the collection tables in our
> +	 * own memory and that the ITT entry size is 1 byte (the
> +	 * smallest possible one).
> +	 */
> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +	u32 reg = 0;
> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
> +
> +	switch (idreg) {
> +	case GITS_PIDR2:
> +		reg = GIC_PIDR2_ARCH_GICv3;
> +		break;
> +	case GITS_PIDR4:
> +		/* This is a 64K software visible page */
> +		reg = 0x40;
> +		break;
> +	/* Those are the ID registers for (any) GIC. */
> +	case GITS_CIDR0:
> +		reg = 0x0d;
> +		break;
> +	case GITS_CIDR1:
> +		reg = 0xf0;
> +		break;
> +	case GITS_CIDR2:
> +		reg = 0x05;
> +		break;
> +	case GITS_CIDR3:
> +		reg = 0xb1;
> +		break;
> +	}
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +/*
> + * This function is called with both the ITS and the distributor lock dropped,
dist lock does not exist anymore ;-)
> + * so the actual command handlers must take the respective locks when needed.
> + */
> +static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
> +{
> +	return -ENODEV;
> +}
> +
> +static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	write_mask64(its->cbaser, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	if (its->enabled)
> +		return 0;
> +
> +	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
> +	its->creadr = 0;
shouldn't this be protected by the ITS lock to guarantee consistency of
cbaser and creadr?
> +
> +	return 0;
> +}
> +
> +static int its_cmd_buffer_size(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return ((its->cbaser & 0xff) + 1) << 12;
> +}
> +
> +static gpa_t its_cmd_buffer_base(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return BASER_BASE_ADDRESS(its->cbaser);
> +}
> +
> +/*
> + * By writing to CWRITER the guest announces new commands to be processed.
> + * Since we cannot read from guest memory inside the ITS spinlock, we
> + * iterate over the command buffer (with the lock dropped) until the read
> + * pointer matches the write pointer. Other VCPUs writing this register in the
> + * meantime will just update the write pointer, leaving the command
> + * processing to the first instance of the function.
> + */
> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
> +				       struct kvm_io_device *this,
> +				       gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
> +	u64 cmd_buf[4];
> +	u32 reg;
> +	bool finished;
> +
> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
> +	reg &= 0xfffe0;
> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
> +		return 0;
> +
> +	spin_lock(&its->lock);
> +
> +	/*
> +	 * If there is still another VCPU handling commands, let this
> +	 * one pick up the new CWRITER and process "our" new commands as well.
> +	 */
> +	finished = (its->cwriter != its->creadr);
empty?
> +	its->cwriter = reg;
> +
> +	spin_unlock(&its->lock);
Assuming we have 2 VCPUs attempting a CWRITER write at the same moment, I
don't understand what guarantees they are not going to execute the
following loop concurrently and possibly execute vits_handle_command
twice on the same command, starting from the same its->creadr?
> +
> +	while (!finished) {
> +		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
> +					 cmd_buf, 32);
> +		if (ret) {
> +			/*
> +			 * Gah, we are screwed. Reset CWRITER to that command
> +			 * that we have finished processing and return.
> +			 */
> +			spin_lock(&its->lock);
> +			its->cwriter = its->creadr;
I don't get why you set the queue to empty here?
> +			spin_unlock(&its->lock);
> +			break;
> +		}
> +		vits_handle_command(vcpu, cmd_buf);
> +
> +		spin_lock(&its->lock);
> +		its->creadr += 32;
> +		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
> +			its->creadr = 0;
> +		finished = (its->creadr == its->cwriter);
> +		spin_unlock(&its->lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_cwriter(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->cwriter & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_creadr(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->creadr & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
>  struct vgic_register_region its_registers[] = {
>  	REGISTER_DESC_WITH_LENGTH(GITS_CTLR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IIDR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_iidr, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_TYPER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_typer, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CBASER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CWRITER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CREADR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_creadr, vgic_mmio_write_wi, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_BASER,
>  		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x40),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IDREGS_BASE,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x30),
> +		vgic_mmio_read_its_idregs, vgic_mmio_write_wi, 0x30),
>  };
>  
>  /* This is called on setting the LPI enable bit in the redistributor. */
> @@ -59,9 +299,14 @@ int vits_init(struct kvm *kvm)
>  {
>  	struct vgic_dist *dist = &kvm->arch.vgic;
>  	struct vgic_its *its = &dist->its;
> +	int nr_vcpus = atomic_read(&kvm->online_vcpus);
>  	struct vgic_io_device *regions;
>  	int ret, i;
>  
> +	dist->pendbaser = kcalloc(nr_vcpus, sizeof(u64), GFP_KERNEL);
> +	if (!dist->pendbaser)
> +		return -ENOMEM;
> +
>  	spin_lock_init(&its->lock);
>  
>  	regions = kmalloc_array(ARRAY_SIZE(its_registers),
> @@ -82,3 +327,16 @@ int vits_init(struct kvm *kvm)
>  
>  	return -ENXIO;
>  }
> +
> +void vits_destroy(struct kvm *kvm)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +
> +	if (!vgic_has_its(kvm))
> +		return;
> +
> +	kfree(dist->pendbaser);
regions is leaking. By the way, its->iodev is never set on init, so it
cannot be freed in the destructor. Another way to free it is to use the
io_dev_ops destructor.

Cheers

Eric
> +
> +	its->enabled = false;
> +}
> diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
> index 4e7dcb8..08f97d1 100644
> --- a/virt/kvm/arm/vgic/vgic.h
> +++ b/virt/kvm/arm/vgic/vgic.h
> @@ -63,6 +63,7 @@ int vgic_register_redist_regions(struct kvm *kvm, gpa_t dist_base_address);
>  
>  int vits_init(struct kvm *kvm);
>  void vgic_enable_lpis(struct kvm_vcpu *vcpu);
> +void vits_destroy(struct kvm *kvm);
>  #else
>  static inline void vgic_v3_irq_change_affinity(struct kvm *kvm, u32 intid,
>  					       u64 mpidr)
> @@ -137,6 +138,11 @@ static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
>  {
>  	return;
>  }
> +
> +static inline void vits_destroy(struct kvm *kvm)
> +{
> +	return;
> +}
>  #endif
>  
>  void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
> diff --git a/virt/kvm/arm/vgic/vgic_init.c b/virt/kvm/arm/vgic/vgic_init.c
> index dcfb93d..e4459e3 100644
> --- a/virt/kvm/arm/vgic/vgic_init.c
> +++ b/virt/kvm/arm/vgic/vgic_init.c
> @@ -298,6 +298,8 @@ void kvm_vgic_destroy(struct kvm *kvm)
>  
>  	kvm_vgic_dist_destroy(kvm);
>  
> +	vits_destroy(kvm);
> +
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		kvm_vgic_vcpu_destroy(vcpu);
>  }
>
Marc Zyngier April 7, 2016, 2:35 p.m. UTC | #3
On 26/03/16 02:14, Andre Przywara wrote:
> Add emulation for some basic MMIO registers used in the ITS emulation.
> This includes:
> - GITS_{CTLR,TYPER,IIDR}
> - ID registers
> - GITS_{CBASER,CREADR,CWRITER}
>   those implement the ITS command buffer handling
> 
> Most of the handlers are pretty straight forward, but CWRITER goes
> some extra miles to allow fine grained locking. The idea here
> is to let only the first instance iterate through the command ring
> buffer, CWRITER accesses on other VCPUs meanwhile will be picked up
> by that first instance and handled as well. The ITS lock is thus only
> hold for very small periods of time and is dropped before the actual

s/hold/held/

> command handler is called.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  include/kvm/vgic/vgic.h            |   3 +
>  include/linux/irqchip/arm-gic-v3.h |   8 ++
>  virt/kvm/arm/vgic/its-emul.c       | 272 ++++++++++++++++++++++++++++++++++++-
>  virt/kvm/arm/vgic/vgic.h           |   6 +
>  virt/kvm/arm/vgic/vgic_init.c      |   2 +
>  5 files changed, 284 insertions(+), 7 deletions(-)
> 
> diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
> index c79bed5..bafea11 100644
> --- a/include/kvm/vgic/vgic.h
> +++ b/include/kvm/vgic/vgic.h
> @@ -115,6 +115,9 @@ struct vgic_io_device {
>  struct vgic_its {
>  	bool			enabled;
>  	spinlock_t		lock;
> +	u64			cbaser;
> +	int			creadr;
> +	int			cwriter;

Irk. Please use explicitly sized types.

>  };
>  
>  struct vgic_dist {
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index a813c3e..7011b98 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -179,15 +179,23 @@
>  #define GITS_BASER			0x0100
>  #define GITS_IDREGS_BASE		0xffd0
>  #define GITS_PIDR2			GICR_PIDR2
> +#define GITS_PIDR4			0xffd0
> +#define GITS_CIDR0			0xfff0
> +#define GITS_CIDR1			0xfff4
> +#define GITS_CIDR2			0xfff8
> +#define GITS_CIDR3			0xfffc
>  
>  #define GITS_TRANSLATER			0x10040
>  
>  #define GITS_CTLR_ENABLE		(1U << 0)
>  #define GITS_CTLR_QUIESCENT		(1U << 31)
>  
> +#define GITS_TYPER_PLPIS		(1UL << 0)
> +#define GITS_TYPER_IDBITS_SHIFT		8
>  #define GITS_TYPER_DEVBITS_SHIFT	13
>  #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
>  #define GITS_TYPER_PTA			(1UL << 19)
> +#define GITS_TYPER_HWCOLLCNT_SHIFT	24
>  
>  #define GITS_CBASER_VALID		(1UL << 63)
>  #define GITS_CBASER_nCnB		(0UL << 59)
> diff --git a/virt/kvm/arm/vgic/its-emul.c b/virt/kvm/arm/vgic/its-emul.c
> index 49dd5e4..de8d360 100644
> --- a/virt/kvm/arm/vgic/its-emul.c
> +++ b/virt/kvm/arm/vgic/its-emul.c
> @@ -31,23 +31,263 @@
>  #include "vgic.h"
>  #include "vgic_mmio.h"
>  
> +#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
> +
> +static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u32 reg;
> +
> +	reg = GITS_CTLR_QUIESCENT;

So your ITS is always in a quiescent state? Even when you're processing
the command queue? You'll have to convince me...

> +	if (its->enabled)
> +		reg |= GITS_CTLR_ENABLE;
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +
> +        if (addr - iodev->base_addr == 0)

whitespace issue.

> +		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	u64 reg = GITS_TYPER_PLPIS;
> +
> +	/*
> +	 * We use linear CPU numbers for redistributor addressing,
> +	 * so GITS_TYPER.PTA is 0.
> +	 * To avoid memory waste on the guest side, we keep the
> +	 * number of IDBits and DevBits low for the time being.
> +	 * This could later be made configurable by userland.
> +	 * Since we have all collections in linked list, we claim
> +	 * that we can hold all of the collection tables in our
> +	 * own memory and that the ITT entry size is 1 byte (the
> +	 * smallest possible one).

All of this is going to bite us when we want to implement migration,
especially the HW collection bit.

> +	 */
> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
> +				   struct kvm_io_device *this,
> +				   gpa_t addr, int len, void *val)
> +{
> +	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_io_device *iodev = container_of(this,
> +						    struct vgic_io_device, dev);
> +	u32 reg = 0;
> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
> +
> +	switch (idreg) {
> +	case GITS_PIDR2:
> +		reg = GIC_PIDR2_ARCH_GICv3;

Are we leaving the lowest 4 bits at zero?

> +		break;
> +	case GITS_PIDR4:
> +		/* This is a 64K software visible page */
> +		reg = 0x40;

Same question.

Also, how about all the other PIDR registers?

> +		break;
> +	/* Those are the ID registers for (any) GIC. */
> +	case GITS_CIDR0:
> +		reg = 0x0d;
> +		break;
> +	case GITS_CIDR1:
> +		reg = 0xf0;
> +		break;
> +	case GITS_CIDR2:
> +		reg = 0x05;
> +		break;
> +	case GITS_CIDR3:
> +		reg = 0xb1;
> +		break;
> +	}

Given that these values are directly taken from the architecture, and
seem common to the whole GICv3 architecture when implemented by ARM, we
could have a common handler for the whole GICv3 implementation. Not a
big deal though.

> +
> +	write_mask32(reg, addr & 3, len, val);
> +
> +	return 0;
> +}
> +
> +/*
> + * This function is called with both the ITS and the distributor lock dropped,
> + * so the actual command handlers must take the respective locks when needed.
> + */
> +static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
> +{
> +	return -ENODEV;
> +}
> +
> +static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
> +				    struct kvm_io_device *this,
> +				    gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	write_mask64(its->cbaser, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +
> +	if (its->enabled)
> +		return 0;
> +
> +	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
> +	its->creadr = 0;

Don't you need to acquire the command queue lock here?

> +
> +	return 0;
> +}
> +
> +static int its_cmd_buffer_size(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return ((its->cbaser & 0xff) + 1) << 12;
> +}
> +
> +static gpa_t its_cmd_buffer_base(struct kvm *kvm)
> +{
> +	struct vgic_its *its = &kvm->arch.vgic.its;
> +
> +	return BASER_BASE_ADDRESS(its->cbaser);
> +}
> +
> +/*
> + * By writing to CWRITER the guest announces new commands to be processed.
> + * Since we cannot read from guest memory inside the ITS spinlock, we
> + * iterate over the command buffer (with the lock dropped) until the read
> + * pointer matches the write pointer. Other VCPUs writing this register in the
> + * meantime will just update the write pointer, leaving the command
> + * processing to the first instance of the function.
> + */
> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
> +				       struct kvm_io_device *this,
> +				       gpa_t addr, int len, const void *val)
> +{
> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
> +	u64 cmd_buf[4];
> +	u32 reg;
> +	bool finished;
> +
> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
> +	reg &= 0xfffe0;
> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
> +		return 0;
> +
> +	spin_lock(&its->lock);
> +
> +	/*
> +	 * If there is still another VCPU handling commands, let this
> +	 * one pick up the new CWRITER and process "our" new commands as well.
> +	 */

How do you detect that condition? All I see is a massive race here, with
two threads processing the queue in parallel, possibly corrupting each
other's data.

Please explain why you think this is safe.

> +	finished = (its->cwriter != its->creadr);
> +	its->cwriter = reg;
> +
> +	spin_unlock(&its->lock);
> +
> +	while (!finished) {
> +		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
> +					 cmd_buf, 32);
> +		if (ret) {
> +			/*
> +			 * Gah, we are screwed. Reset CWRITER to that command
> +			 * that we have finished processing and return.
> +			 */
> +			spin_lock(&its->lock);
> +			its->cwriter = its->creadr;
> +			spin_unlock(&its->lock);
> +			break;
> +		}
> +		vits_handle_command(vcpu, cmd_buf);
> +
> +		spin_lock(&its->lock);
> +		its->creadr += 32;
> +		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
> +			its->creadr = 0;
> +		finished = (its->creadr == its->cwriter);
> +		spin_unlock(&its->lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_cwriter(struct kvm_vcpu *vcpu,
> +				      struct kvm_io_device *this,
> +				      gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->cwriter & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
> +static int vgic_mmio_read_its_creadr(struct kvm_vcpu *vcpu,
> +				     struct kvm_io_device *this,
> +				     gpa_t addr, int len, void *val)
> +{
> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
> +	u64 reg = its->creadr & 0xfffe0;
> +
> +	write_mask64(reg, addr & 7, len, val);
> +
> +	return 0;
> +}
> +
>  struct vgic_register_region its_registers[] = {
>  	REGISTER_DESC_WITH_LENGTH(GITS_CTLR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IIDR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_iidr, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_TYPER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
> +		vgic_mmio_read_its_typer, vgic_mmio_write_wi, 4),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CBASER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CWRITER,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_CREADR,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
> +		vgic_mmio_read_its_creadr, vgic_mmio_write_wi, 8),
>  	REGISTER_DESC_WITH_LENGTH(GITS_BASER,
>  		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x40),
>  	REGISTER_DESC_WITH_LENGTH(GITS_IDREGS_BASE,
> -		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x30),
> +		vgic_mmio_read_its_idregs, vgic_mmio_write_wi, 0x30),
>  };
>  
>  /* This is called on setting the LPI enable bit in the redistributor. */
> @@ -59,9 +299,14 @@ int vits_init(struct kvm *kvm)
>  {
>  	struct vgic_dist *dist = &kvm->arch.vgic;
>  	struct vgic_its *its = &dist->its;
> +	int nr_vcpus = atomic_read(&kvm->online_vcpus);
>  	struct vgic_io_device *regions;
>  	int ret, i;
>  
> +	dist->pendbaser = kcalloc(nr_vcpus, sizeof(u64), GFP_KERNEL);
> +	if (!dist->pendbaser)
> +		return -ENOMEM;
> +
>  	spin_lock_init(&its->lock);
>  
>  	regions = kmalloc_array(ARRAY_SIZE(its_registers),
> @@ -82,3 +327,16 @@ int vits_init(struct kvm *kvm)
>  
>  	return -ENXIO;
>  }
> +
> +void vits_destroy(struct kvm *kvm)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	struct vgic_its *its = &dist->its;
> +
> +	if (!vgic_has_its(kvm))
> +		return;
> +
> +	kfree(dist->pendbaser);
> +
> +	its->enabled = false;
> +}
> diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
> index 4e7dcb8..08f97d1 100644
> --- a/virt/kvm/arm/vgic/vgic.h
> +++ b/virt/kvm/arm/vgic/vgic.h
> @@ -63,6 +63,7 @@ int vgic_register_redist_regions(struct kvm *kvm, gpa_t dist_base_address);
>  
>  int vits_init(struct kvm *kvm);
>  void vgic_enable_lpis(struct kvm_vcpu *vcpu);
> +void vits_destroy(struct kvm *kvm);
>  #else
>  static inline void vgic_v3_irq_change_affinity(struct kvm *kvm, u32 intid,
>  					       u64 mpidr)
> @@ -137,6 +138,11 @@ static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
>  {
>  	return;
>  }
> +
> +static inline void vits_destroy(struct kvm *kvm)
> +{
> +	return;
> +}
>  #endif
>  
>  void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
> diff --git a/virt/kvm/arm/vgic/vgic_init.c b/virt/kvm/arm/vgic/vgic_init.c
> index dcfb93d..e4459e3 100644
> --- a/virt/kvm/arm/vgic/vgic_init.c
> +++ b/virt/kvm/arm/vgic/vgic_init.c
> @@ -298,6 +298,8 @@ void kvm_vgic_destroy(struct kvm *kvm)
>  
>  	kvm_vgic_dist_destroy(kvm);
>  
> +	vits_destroy(kvm);
> +
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		kvm_vgic_vcpu_destroy(vcpu);
>  }
> 

Thanks,

	M.
Chalamarla, Tirumalesh May 5, 2016, 6:51 p.m. UTC | #4
On 3/25/16, 7:14 PM, "kvmarm-bounces@lists.cs.columbia.edu on behalf of Andre Przywara" <kvmarm-bounces@lists.cs.columbia.edu on behalf of andre.przywara@arm.com> wrote:

>Add emulation for some basic MMIO registers used in the ITS emulation.
>This includes:
>- GITS_{CTLR,TYPER,IIDR}
>- ID registers
>- GITS_{CBASER,CREADR,CWRITER}
>  those implement the ITS command buffer handling
>
>Most of the handlers are pretty straight forward, but CWRITER goes
>some extra miles to allow fine grained locking. The idea here
>is to let only the first instance iterate through the command ring
>buffer, CWRITER accesses on other VCPUs meanwhile will be picked up
>by that first instance and handled as well. The ITS lock is thus only
>hold for very small periods of time and is dropped before the actual
>command handler is called.
>
>Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>---
> include/kvm/vgic/vgic.h            |   3 +
> include/linux/irqchip/arm-gic-v3.h |   8 ++
> virt/kvm/arm/vgic/its-emul.c       | 272 ++++++++++++++++++++++++++++++++++++-
> virt/kvm/arm/vgic/vgic.h           |   6 +
> virt/kvm/arm/vgic/vgic_init.c      |   2 +
> 5 files changed, 284 insertions(+), 7 deletions(-)
>
>diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
>index c79bed5..bafea11 100644
>--- a/include/kvm/vgic/vgic.h
>+++ b/include/kvm/vgic/vgic.h
>@@ -115,6 +115,9 @@ struct vgic_io_device {
> struct vgic_its {
> 	bool			enabled;
> 	spinlock_t		lock;
>+	u64			cbaser;
>+	int			creadr;
>+	int			cwriter;
> };
> 
> struct vgic_dist {
>diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
>index a813c3e..7011b98 100644
>--- a/include/linux/irqchip/arm-gic-v3.h
>+++ b/include/linux/irqchip/arm-gic-v3.h
>@@ -179,15 +179,23 @@
> #define GITS_BASER			0x0100
> #define GITS_IDREGS_BASE		0xffd0
> #define GITS_PIDR2			GICR_PIDR2
>+#define GITS_PIDR4			0xffd0
>+#define GITS_CIDR0			0xfff0
>+#define GITS_CIDR1			0xfff4
>+#define GITS_CIDR2			0xfff8
>+#define GITS_CIDR3			0xfffc
> 
> #define GITS_TRANSLATER			0x10040
> 
> #define GITS_CTLR_ENABLE		(1U << 0)
> #define GITS_CTLR_QUIESCENT		(1U << 31)
> 
>+#define GITS_TYPER_PLPIS		(1UL << 0)
>+#define GITS_TYPER_IDBITS_SHIFT		8
> #define GITS_TYPER_DEVBITS_SHIFT	13
> #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
> #define GITS_TYPER_PTA			(1UL << 19)
>+#define GITS_TYPER_HWCOLLCNT_SHIFT	24
> 
> #define GITS_CBASER_VALID		(1UL << 63)
> #define GITS_CBASER_nCnB		(0UL << 59)
>diff --git a/virt/kvm/arm/vgic/its-emul.c b/virt/kvm/arm/vgic/its-emul.c
>index 49dd5e4..de8d360 100644
>--- a/virt/kvm/arm/vgic/its-emul.c
>+++ b/virt/kvm/arm/vgic/its-emul.c
>@@ -31,23 +31,263 @@
> #include "vgic.h"
> #include "vgic_mmio.h"
> 
>+#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
>+
>+static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
>+				   struct kvm_io_device *this,
>+				   gpa_t addr, int len, void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+	u32 reg;
>+
>+	reg = GITS_CTLR_QUIESCENT;
>+	if (its->enabled)
>+		reg |= GITS_CTLR_ENABLE;
>+
>+	write_mask32(reg, addr & 3, len, val);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
>+				    struct kvm_io_device *this,
>+				    gpa_t addr, int len, const void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+	struct vgic_io_device *iodev = container_of(this,
>+						    struct vgic_io_device, dev);
>+
>+        if (addr - iodev->base_addr == 0)
>+		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
>+				    struct kvm_io_device *this,
>+				    gpa_t addr, int len, void *val)
>+{
>+	u64 reg = GITS_TYPER_PLPIS;
>+
>+	/*
>+	 * We use linear CPU numbers for redistributor addressing,
>+	 * so GITS_TYPER.PTA is 0.
>+	 * To avoid memory waste on the guest side, we keep the
>+	 * number of IDBits and DevBits low for the time being.
>+	 * This could later be made configurable by userland.
>+	 * Since we have all collections in linked list, we claim
>+	 * that we can hold all of the collection tables in our
>+	 * own memory and that the ITT entry size is 1 byte (the
>+	 * smallest possible one).
>+	 */


How are you planning to handle device IDs with VFIO?
Will there be a mapping between virtual and physical?
Or will it be entirely virtual, with no plan to send
the commands to the corresponding physical ITS?
Maybe there is no need.

>+	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
>+	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
>+	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
>+
>+	write_mask64(reg, addr & 7, len, val);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
>+				   struct kvm_io_device *this,
>+				   gpa_t addr, int len, void *val)
>+{
>+	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
>+
Hmm, is no other implementer planning to support system emulation?
>+	write_mask32(reg, addr & 3, len, val);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
>+				     struct kvm_io_device *this,
>+				     gpa_t addr, int len, void *val)
>+{
>+	struct vgic_io_device *iodev = container_of(this,
>+						    struct vgic_io_device, dev);
>+	u32 reg = 0;
>+	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
>+
>+	switch (idreg) {
>+	case GITS_PIDR2:
>+		reg = GIC_PIDR2_ARCH_GICv3;
>+		break;
>+	case GITS_PIDR4:
>+		/* This is a 64K software visible page */
>+		reg = 0x40;
>+		break;
>+	/* Those are the ID registers for (any) GIC. */
>+	case GITS_CIDR0:
>+		reg = 0x0d;
>+		break;
>+	case GITS_CIDR1:
>+		reg = 0xf0;
>+		break;
>+	case GITS_CIDR2:
>+		reg = 0x05;
>+		break;
>+	case GITS_CIDR3:
>+		reg = 0xb1;
>+		break;
>+	}
>+
>+	write_mask32(reg, addr & 3, len, val);
>+
>+	return 0;
>+}
>+
>+/*
>+ * This function is called with both the ITS and the distributor lock dropped,
>+ * so the actual command handlers must take the respective locks when needed.
>+ */
>+static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
>+{
>+	return -ENODEV;
>+}
>+
>+static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
>+				    struct kvm_io_device *this,
>+				    gpa_t addr, int len, void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+
>+	write_mask64(its->cbaser, addr & 7, len, val);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
>+				      struct kvm_io_device *this,
>+				      gpa_t addr, int len, const void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+
>+	if (its->enabled)
>+		return 0;
>+
>+	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
>+	its->creadr = 0;
>+
>+	return 0;
>+}
>+
>+static int its_cmd_buffer_size(struct kvm *kvm)
>+{
>+	struct vgic_its *its = &kvm->arch.vgic.its;
>+
>+	return ((its->cbaser & 0xff) + 1) << 12;
>+}
>+
>+static gpa_t its_cmd_buffer_base(struct kvm *kvm)
>+{
>+	struct vgic_its *its = &kvm->arch.vgic.its;
>+
>+	return BASER_BASE_ADDRESS(its->cbaser);
>+}
>+
>+/*
>+ * By writing to CWRITER the guest announces new commands to be processed.
>+ * Since we cannot read from guest memory inside the ITS spinlock, we
>+ * iterate over the command buffer (with the lock dropped) until the read
>+ * pointer matches the write pointer. Other VCPUs writing this register in the
>+ * meantime will just update the write pointer, leaving the command
>+ * processing to the first instance of the function.
>+ */
>+static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>+				       struct kvm_io_device *this,
>+				       gpa_t addr, int len, const void *val)
>+{
>+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>+	struct vgic_its *its = &dist->its;
>+	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>+	u64 cmd_buf[4];
>+	u32 reg;
>+	bool finished;
>+
>+	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>+	reg &= 0xfffe0;
>+	if (reg > its_cmd_buffer_size(vcpu->kvm))
>+		return 0;
>+
>+	spin_lock(&its->lock);
>+
>+	/*
>+	 * If there is still another VCPU handling commands, let this
>+	 * one pick up the new CWRITER and process "our" new commands as well.
>+	 */
>+	finished = (its->cwriter != its->creadr);
>+	its->cwriter = reg;
>+
>+	spin_unlock(&its->lock);
>+
>+	while (!finished) {
>+		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
>+					 cmd_buf, 32);
>+		if (ret) {
>+			/*
>+			 * Gah, we are screwed. Reset CWRITER to that command
>+			 * that we have finished processing and return.
>+			 */
>+			spin_lock(&its->lock);
>+			its->cwriter = its->creadr;

Is this correct?
>+			spin_unlock(&its->lock);
>+			break;
>+		}
>+		vits_handle_command(vcpu, cmd_buf);
>+
>+		spin_lock(&its->lock);
>+		its->creadr += 32;
>+		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
>+			its->creadr = 0;
>+		finished = (its->creadr == its->cwriter);
>+		spin_unlock(&its->lock);
>+	}
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_read_its_cwriter(struct kvm_vcpu *vcpu,
>+				      struct kvm_io_device *this,
>+				      gpa_t addr, int len, void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+	u64 reg = its->cwriter & 0xfffe0;
>+
>+	write_mask64(reg, addr & 7, len, val);
>+
>+	return 0;
>+}
>+
>+static int vgic_mmio_read_its_creadr(struct kvm_vcpu *vcpu,
>+				     struct kvm_io_device *this,
>+				     gpa_t addr, int len, void *val)
>+{
>+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>+	u64 reg = its->creadr & 0xfffe0;
>+
>+	write_mask64(reg, addr & 7, len, val);
>+
>+	return 0;
>+}
>+
> struct vgic_register_region its_registers[] = {
> 	REGISTER_DESC_WITH_LENGTH(GITS_CTLR,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
>+		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4),
> 	REGISTER_DESC_WITH_LENGTH(GITS_IIDR,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
>+		vgic_mmio_read_its_iidr, vgic_mmio_write_wi, 4),
> 	REGISTER_DESC_WITH_LENGTH(GITS_TYPER,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
>+		vgic_mmio_read_its_typer, vgic_mmio_write_wi, 4),
> 	REGISTER_DESC_WITH_LENGTH(GITS_CBASER,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
>+		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8),
> 	REGISTER_DESC_WITH_LENGTH(GITS_CWRITER,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
>+		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8),
> 	REGISTER_DESC_WITH_LENGTH(GITS_CREADR,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
>+		vgic_mmio_read_its_creadr, vgic_mmio_write_wi, 8),
> 	REGISTER_DESC_WITH_LENGTH(GITS_BASER,
> 		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x40),
> 	REGISTER_DESC_WITH_LENGTH(GITS_IDREGS_BASE,
>-		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x30),
>+		vgic_mmio_read_its_idregs, vgic_mmio_write_wi, 0x30),
> };
> 
> /* This is called on setting the LPI enable bit in the redistributor. */
>@@ -59,9 +299,14 @@ int vits_init(struct kvm *kvm)
> {
> 	struct vgic_dist *dist = &kvm->arch.vgic;
> 	struct vgic_its *its = &dist->its;
>+	int nr_vcpus = atomic_read(&kvm->online_vcpus);
> 	struct vgic_io_device *regions;
> 	int ret, i;
> 
>+	dist->pendbaser = kcalloc(nr_vcpus, sizeof(u64), GFP_KERNEL);
>+	if (!dist->pendbaser)
>+		return -ENOMEM;
>+
> 	spin_lock_init(&its->lock);
> 
> 	regions = kmalloc_array(ARRAY_SIZE(its_registers),
>@@ -82,3 +327,16 @@ int vits_init(struct kvm *kvm)
> 
> 	return -ENXIO;
> }
>+
>+void vits_destroy(struct kvm *kvm)
>+{
>+	struct vgic_dist *dist = &kvm->arch.vgic;
>+	struct vgic_its *its = &dist->its;
>+
>+	if (!vgic_has_its(kvm))
>+		return;
>+
>+	kfree(dist->pendbaser);
>+
>+	its->enabled = false;
>+}
>diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
>index 4e7dcb8..08f97d1 100644
>--- a/virt/kvm/arm/vgic/vgic.h
>+++ b/virt/kvm/arm/vgic/vgic.h
>@@ -63,6 +63,7 @@ int vgic_register_redist_regions(struct kvm *kvm, gpa_t dist_base_address);
> 
> int vits_init(struct kvm *kvm);
> void vgic_enable_lpis(struct kvm_vcpu *vcpu);
>+void vits_destroy(struct kvm *kvm);
> #else
> static inline void vgic_v3_irq_change_affinity(struct kvm *kvm, u32 intid,
> 					       u64 mpidr)
>@@ -137,6 +138,11 @@ static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
> {
> 	return;
> }
>+
>+static inline void vits_destroy(struct kvm *kvm)
>+{
>+	return;
>+}
> #endif
> 
> void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
>diff --git a/virt/kvm/arm/vgic/vgic_init.c b/virt/kvm/arm/vgic/vgic_init.c
>index dcfb93d..e4459e3 100644
>--- a/virt/kvm/arm/vgic/vgic_init.c
>+++ b/virt/kvm/arm/vgic/vgic_init.c
>@@ -298,6 +298,8 @@ void kvm_vgic_destroy(struct kvm *kvm)
> 
> 	kvm_vgic_dist_destroy(kvm);
> 
>+	vits_destroy(kvm);
>+
> 	kvm_for_each_vcpu(i, vcpu, kvm)
> 		kvm_vgic_vcpu_destroy(vcpu);
> }
>-- 
>2.7.3
>
Andre Przywara May 25, 2016, 11:37 a.m. UTC | #5
Hi Marc,

this series is mostly obsolete now, but I wanted to elaborate on the
locking scheme, which hasn't changed.

On 07/04/16 15:35, Marc Zyngier wrote:
> On 26/03/16 02:14, Andre Przywara wrote:

....

>> --- a/virt/kvm/arm/vgic/its-emul.c
>> +++ b/virt/kvm/arm/vgic/its-emul.c
>> @@ -31,23 +31,263 @@
>>  #include "vgic.h"
>>  #include "vgic_mmio.h"
>>  
>> +#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
>> +
>> +static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
>> +				   struct kvm_io_device *this,
>> +				   gpa_t addr, int len, void *val)
>> +{
>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>> +	u32 reg;
>> +
>> +	reg = GITS_CTLR_QUIESCENT;
> 
> So your ITS is always in a quiescent state? Even when you're processing
> the command queue? You'll have to convince me...

Does QUIESCENT actually cover the command queue status? I was under the
impression that this is purely to cope with distributed implementations
and the possible delays caused by propagating commands.
If not, I can surely take the lock here and respect
(creadr == cwriter) upon setting the bit.
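
For reference, that variant could look something like this (a sketch
only, assuming the ITS lock is sufficient to stabilise creadr/cwriter):

	spin_lock(&its->lock);
	/* Only claim quiescence once the command queue is drained. */
	if (its->creadr == its->cwriter)
		reg |= GITS_CTLR_QUIESCENT;
	if (its->enabled)
		reg |= GITS_CTLR_ENABLE;
	spin_unlock(&its->lock);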

> 
>> +	if (its->enabled)
>> +		reg |= GITS_CTLR_ENABLE;
>> +
>> +	write_mask32(reg, addr & 3, len, val);
>> +
>> +	return 0;
>> +}
>> +
>> +static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
>> +				    struct kvm_io_device *this,
>> +				    gpa_t addr, int len, const void *val)
>> +{
>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>> +	struct vgic_io_device *iodev = container_of(this,
>> +						    struct vgic_io_device, dev);
>> +
>> +        if (addr - iodev->base_addr == 0)
> 
> whitespace issue.
> 
>> +		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
>> +
>> +	return 0;
>> +}
>> +
>> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
>> +				    struct kvm_io_device *this,
>> +				    gpa_t addr, int len, void *val)
>> +{
>> +	u64 reg = GITS_TYPER_PLPIS;
>> +
>> +	/*
>> +	 * We use linear CPU numbers for redistributor addressing,
>> +	 * so GITS_TYPER.PTA is 0.
>> +	 * To avoid memory waste on the guest side, we keep the
>> +	 * number of IDBits and DevBits low for the time being.
>> +	 * This could later be made configurable by userland.
>> +	 * Since we have all collections in linked list, we claim
>> +	 * that we can hold all of the collection tables in our
>> +	 * own memory and that the ITT entry size is 1 byte (the
>> +	 * smallest possible one).
> 
> All of this is going to bite us when we want to implement migration,
> specially the HW collection bit.

How so? Clearly we keep the number of VCPUs constant, so everything
guest visible stays the same even upon migrating to a much different
host? Or are we talking about different things here?

> 
>> +	 */
>> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
>> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
>> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
>> +
>> +	write_mask64(reg, addr & 7, len, val);
>> +
>> +	return 0;
>> +}
>> +
>> +static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
>> +				   struct kvm_io_device *this,
>> +				   gpa_t addr, int len, void *val)
>> +{
>> +	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
>> +
>> +	write_mask32(reg, addr & 3, len, val);
>> +
>> +	return 0;
>> +}
>> +
>> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
>> +				     struct kvm_io_device *this,
>> +				     gpa_t addr, int len, void *val)
>> +{
>> +	struct vgic_io_device *iodev = container_of(this,
>> +						    struct vgic_io_device, dev);
>> +	u32 reg = 0;
>> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
>> +
>> +	switch (idreg) {
>> +	case GITS_PIDR2:
>> +		reg = GIC_PIDR2_ARCH_GICv3;
> 
> Are we leaving the lowest 4 bits to zero?

I couldn't stuff "42" in 4 bits, so: yes ;-)

>> +		break;
>> +	case GITS_PIDR4:
>> +		/* This is a 64K software visible page */
>> +		reg = 0x40;
> 
> Same question.
> 
> Also, how about all the others PIDR registers?

I guess I was halfway undecided between going with the pure
architectural requirements (which only mandate bits 7:4 in PIDR2) and
complying with the recommendation later in the spec.
So I will just fill PIDR0, PIDR1 and the rest of PIDR2 as well.

>> +		break;
>> +	/* Those are the ID registers for (any) GIC. */
>> +	case GITS_CIDR0:
>> +		reg = 0x0d;
>> +		break;
>> +	case GITS_CIDR1:
>> +		reg = 0xf0;
>> +		break;
>> +	case GITS_CIDR2:
>> +		reg = 0x05;
>> +		break;
>> +	case GITS_CIDR3:
>> +		reg = 0xb1;
>> +		break;
>> +	}
> 
> Given that these values are directly taken from the architecture, and
> seem common to the whole GICv3 architecture when implemented by ARM, we
> could have a common handler for the whole GICv3 implementation. Not a
> big deal though.

Agreed.

>> +
>> +	write_mask32(reg, addr & 3, len, val);
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * This function is called with both the ITS and the distributor lock dropped,
>> + * so the actual command handlers must take the respective locks when needed.
>> + */
>> +static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
>> +{
>> +	return -ENODEV;
>> +}
>> +
>> +static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
>> +				    struct kvm_io_device *this,
>> +				    gpa_t addr, int len, void *val)
>> +{
>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>> +
>> +	write_mask64(its->cbaser, addr & 7, len, val);
>> +
>> +	return 0;
>> +}
>> +
>> +static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
>> +				      struct kvm_io_device *this,
>> +				      gpa_t addr, int len, const void *val)
>> +{
>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>> +
>> +	if (its->enabled)
>> +		return 0;
>> +
>> +	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
>> +	its->creadr = 0;
> 
> Don't you need to acquire the command queue lock here?

I don't think so, because we "return 0;" above if the ITS is enabled,
which means that nothing is going on at the moment.
But I guess I can just take the lock anyway to be sure.

>> +
>> +	return 0;
>> +}
>> +
>> +static int its_cmd_buffer_size(struct kvm *kvm)
>> +{
>> +	struct vgic_its *its = &kvm->arch.vgic.its;
>> +
>> +	return ((its->cbaser & 0xff) + 1) << 12;
>> +}
>> +
>> +static gpa_t its_cmd_buffer_base(struct kvm *kvm)
>> +{
>> +	struct vgic_its *its = &kvm->arch.vgic.its;
>> +
>> +	return BASER_BASE_ADDRESS(its->cbaser);
>> +}
>> +
>> +/*
>> + * By writing to CWRITER the guest announces new commands to be processed.
>> + * Since we cannot read from guest memory inside the ITS spinlock, we
>> + * iterate over the command buffer (with the lock dropped) until the read
>> + * pointer matches the write pointer. Other VCPUs writing this register in the
>> + * meantime will just update the write pointer, leaving the command
>> + * processing to the first instance of the function.
>> + */
>> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>> +				       struct kvm_io_device *this,
>> +				       gpa_t addr, int len, const void *val)
>> +{
>> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +	struct vgic_its *its = &dist->its;
>> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>> +	u64 cmd_buf[4];
>> +	u32 reg;
>> +	bool finished;
>> +
>> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>> +	reg &= 0xfffe0;
>> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
>> +		return 0;
>> +
>> +	spin_lock(&its->lock);
>> +
>> +	/*
>> +	 * If there is still another VCPU handling commands, let this
>> +	 * one pick up the new CWRITER and process "our" new commands as well.
>> +	 */
> 
> How do you detect that condition? All I see is a massive race here, with
> two threads processing the queue in parallel, possibly corrupting each
> other's data.
> 
> Please explain why you think this is safe.

OK, here you go: Actually we are handling the command queue
synchronously: The first writer to CWRITER will end up here and will
iterate over all commands below. From the guest's point of view it will
take some time to do the CWRITER MMIO write, but upon the writel
returning the queue is actually already empty again.
That means that any _sane_ guest will _never_ trigger the case where two
threads/VCPUs are handling the queue, because a concurrent write to
CWRITER without a lock in the guest would be broken by design.

However I am not sure we can rely on this, so I added this precaution
scheme:
The first writer takes our ITS (emulation) lock and examines cwriter and
creadr to see if they are equal. If they are, then the queue is
currently empty, which means we are the first one and need to take care
of the commands. We update our copy of cwriter (now marking the queue as
non-empty) and drop the lock. As we kept the queue-was-empty status in a
local variable, we now can start iterating through the queue. We only
take the lock briefly when really needed in this process (for instance
when one command has been processed and creadr is incremented).

If now a second VCPU writes CWRITER, we also take the lock and compare
creadr and cwriter. Now they are different, because the first thread is
still busy with handling the commands and hasn't finished yet.
So we set our local "finished" to true. Also we update cwriter to the
new value, then drop the lock. The while loop below will not be entered
in this thread and we return to the guest.
Now the first thread has handled another command, takes the lock to
increase creadr and compares it with cwriter. Without the second writer
it may have been finished already, but now cwriter has grown, so it will
just carry on with handling the commands.

A bit like someone washing the dishes while another person adds some
more dirty plates to the pile ;-)
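
Condensed to just the lock points, the scheme looks roughly like this
(a sketch of the idea, not the literal patch code; new_cwriter and
handle_one_command() are placeholders):

	bool worker;

	spin_lock(&its->lock);
	/* Only the writer that finds the queue empty becomes the worker. */
	worker = (its->cwriter == its->creadr);
	its->cwriter = new_cwriter;		/* publish the new commands */
	spin_unlock(&its->lock);

	while (worker) {
		/* read and handle one command with the lock dropped */
		handle_one_command(its->creadr);

		spin_lock(&its->lock);
		its->creadr += 32;		/* one command consumed */
		/* carry on if someone grew cwriter in the meantime */
		worker = (its->creadr != its->cwriter);
		spin_unlock(&its->lock);
	}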

But again: this is just insurance against broken guests; due to the
guest's own locking requirements this would never actually happen.

Does that make sense?

Btw: I have finished rebasing this upon the new VGIC and will post a new
version after -rc1.

Cheers,
Andre.
Andre Przywara May 25, 2016, 1:49 p.m. UTC | #6
Hi Eric,

On 06/04/16 10:36, Eric Auger wrote:
> Hi Andre,
> On 03/26/2016 03:14 AM, Andre Przywara wrote:
>> Add emulation for some basic MMIO registers used in the ITS emulation.

...

>> +
>> +/*
>> + * By writing to CWRITER the guest announces new commands to be processed.
>> + * Since we cannot read from guest memory inside the ITS spinlock, we
>> + * iterate over the command buffer (with the lock dropped) until the read
>> + * pointer matches the write pointer. Other VCPUs writing this register in the
>> + * meantime will just update the write pointer, leaving the command
>> + * processing to the first instance of the function.
>> + */
>> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>> +				       struct kvm_io_device *this,
>> +				       gpa_t addr, int len, const void *val)
>> +{
>> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +	struct vgic_its *its = &dist->its;
>> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>> +	u64 cmd_buf[4];
>> +	u32 reg;
>> +	bool finished;
>> +
>> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>> +	reg &= 0xfffe0;
>> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
>> +		return 0;
>> +
>> +	spin_lock(&its->lock);
>> +
>> +	/*
>> +	 * If there is still another VCPU handling commands, let this
>> +	 * one pick up the new CWRITER and process "our" new commands as well.
>> +	 */
>> +	finished = (its->cwriter != its->creadr);
> empty?

Maybe that is indeed less misleading ...

>> +	its->cwriter = reg;
>> +
>> +	spin_unlock(&its->lock);
> Assuming we have 2 vcpus attempting a cwriter write at the same moment, I
> don't understand what guarantees that they are not going to execute the
> following loop concurrently and possibly execute vits_handle_command
> twice on the same cmd from the same its->creadr start?

So does my explanation of the algorithm in that other mail today clear
this up? Or do you still see a hole in the locking scheme here?

>> +
>> +	while (!finished) {
>> +		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
>> +					 cmd_buf, 32);
>> +		if (ret) {
>> +			/*
>> +			 * Gah, we are screwed. Reset CWRITER to that command
>> +			 * that we have finished processing and return.
>> +			 */
>> +			spin_lock(&its->lock);
>> +			its->cwriter = its->creadr;
> I don't get why you set the queue to empty?

So from a guest's point of view this is something like an ITS internal
error which shouldn't happen. So I just bail out, but leave the queue in
a consistent (read: empty) state to allow possible future commands to be
processed on a best-effort basis.
I guess this is the best we can come up with, since there is no feasible
way of communicating errors back(?)

Cheers,
Andre.
Marc Zyngier May 26, 2016, 9:10 a.m. UTC | #7
On 25/05/16 12:37, Andre Przywara wrote:
> Hi Marc,
> 
> this series is mostly obsolete now, but I wanted to elaborate on the
> locking scheme, which hasn't changed.
> 
> On 07/04/16 15:35, Marc Zyngier wrote:
>> On 26/03/16 02:14, Andre Przywara wrote:
> 
> ....
> 
>>> --- a/virt/kvm/arm/vgic/its-emul.c
>>> +++ b/virt/kvm/arm/vgic/its-emul.c
>>> @@ -31,23 +31,263 @@
>>>  #include "vgic.h"
>>>  #include "vgic_mmio.h"
>>>  
>>> +#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
>>> +
>>> +static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
>>> +				   struct kvm_io_device *this,
>>> +				   gpa_t addr, int len, void *val)
>>> +{
>>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>>> +	u32 reg;
>>> +
>>> +	reg = GITS_CTLR_QUIESCENT;
>>
>> So your ITS is always in a quiescent state? Even when you're processing
>> the command queue? You'll have to convince me...
> 
> Does QUIESCENT actually cover the command queue status? I was under the
> impression that this is purely to cope with distributed implementations
> and the possible delays caused by propagating commands.

Here's what the spec says:

"For the ITS to be quiescent, there must be no transactions in progress.
In addition, all operations required to ensure that mapping data is
consistent with external memory must be complete."

How can you enforce consistency if you have commands in progress?

> If not, I can surely take the lock here and respect
> (creadr == cwriter) when setting the bit.

Please do.
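
Something like this should do (just a sketch, using the fields this
patch already has):

	reg = 0;

	spin_lock(&its->lock);
	/* Only report QUIESCENT once the command queue is drained. */
	if (its->creadr == its->cwriter)
		reg |= GITS_CTLR_QUIESCENT;
	if (its->enabled)
		reg |= GITS_CTLR_ENABLE;
	spin_unlock(&its->lock);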

> 
>>
>>> +	if (its->enabled)
>>> +		reg |= GITS_CTLR_ENABLE;
>>> +
>>> +	write_mask32(reg, addr & 3, len, val);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
>>> +				    struct kvm_io_device *this,
>>> +				    gpa_t addr, int len, const void *val)
>>> +{
>>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>>> +	struct vgic_io_device *iodev = container_of(this,
>>> +						    struct vgic_io_device, dev);
>>> +
>>> +        if (addr - iodev->base_addr == 0)
>>
>> whitespace issue.
>>
>>> +		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
>>> +				    struct kvm_io_device *this,
>>> +				    gpa_t addr, int len, void *val)
>>> +{
>>> +	u64 reg = GITS_TYPER_PLPIS;
>>> +
>>> +	/*
>>> +	 * We use linear CPU numbers for redistributor addressing,
>>> +	 * so GITS_TYPER.PTA is 0.
>>> +	 * To avoid memory waste on the guest side, we keep the
>>> +	 * number of IDBits and DevBits low for the time being.
>>> +	 * This could later be made configurable by userland.
>>> +	 * Since we have all collections in linked list, we claim
>>> +	 * that we can hold all of the collection tables in our
>>> +	 * own memory and that the ITT entry size is 1 byte (the
>>> +	 * smallest possible one).
>>
>> All of this is going to bite us when we want to implement migration,
>> specially the HW collection bit.
> 
> How so? Clearly we keep the number of VCPUs constant, so everything
> guest visible stays the same even upon migrating to a much different
> host? Or are we talking about different things here?

Let me rephrase this: How are you going to address HW collections from
userspace in order to dump them? Short of having memory tables, you cannot.

> 
>>
>>> +	 */
>>> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
>>> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
>>> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
>>> +
>>> +	write_mask64(reg, addr & 7, len, val);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
>>> +				   struct kvm_io_device *this,
>>> +				   gpa_t addr, int len, void *val)
>>> +{
>>> +	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
>>> +
>>> +	write_mask32(reg, addr & 3, len, val);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
>>> +				     struct kvm_io_device *this,
>>> +				     gpa_t addr, int len, void *val)
>>> +{
>>> +	struct vgic_io_device *iodev = container_of(this,
>>> +						    struct vgic_io_device, dev);
>>> +	u32 reg = 0;
>>> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
>>> +
>>> +	switch (idreg) {
>>> +	case GITS_PIDR2:
>>> +		reg = GIC_PIDR2_ARCH_GICv3;
>>
>> Are we leaving the lowest 4 bits to zero?
> 
> I couldn't stuff "42" in 4 bits, so: yes ;-)

The 4 bottom bits are "Implementation Defined", so you could fit
something in it if you wanted.

> 
>>> +		break;
>>> +	case GITS_PIDR4:
>>> +		/* This is a 64K software visible page */
>>> +		reg = 0x40;
>>
>> Same question.
>>
>> Also, how about all the others PIDR registers?
> 
> I guess I was halfway undecided between going with the pure
> architectural requirements (which only mandate bits 7:4 in PIDR2) and
> complying with the recommendation later in the spec.
> So I will just fill PIDR0, PIDR1 and the rest of PIDR2 as well.
> 
>>> +		break;
>>> +	/* Those are the ID registers for (any) GIC. */
>>> +	case GITS_CIDR0:
>>> +		reg = 0x0d;
>>> +		break;
>>> +	case GITS_CIDR1:
>>> +		reg = 0xf0;
>>> +		break;
>>> +	case GITS_CIDR2:
>>> +		reg = 0x05;
>>> +		break;
>>> +	case GITS_CIDR3:
>>> +		reg = 0xb1;
>>> +		break;
>>> +	}
>>
>> Given that these values are directly taken from the architecture, and
>> seem common to the whole GICv3 architecture when implemented by ARM, we
>> could have a common handler for the whole GICv3 implementation. Not a
>> big deal though.
> 
> Agreed.
> 
>>> +
>>> +	write_mask32(reg, addr & 3, len, val);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>> + * This function is called with both the ITS and the distributor lock dropped,
>>> + * so the actual command handlers must take the respective locks when needed.
>>> + */
>>> +static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
>>> +{
>>> +	return -ENODEV;
>>> +}
>>> +
>>> +static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
>>> +				    struct kvm_io_device *this,
>>> +				    gpa_t addr, int len, void *val)
>>> +{
>>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>>> +
>>> +	write_mask64(its->cbaser, addr & 7, len, val);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
>>> +				      struct kvm_io_device *this,
>>> +				      gpa_t addr, int len, const void *val)
>>> +{
>>> +	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
>>> +
>>> +	if (its->enabled)
>>> +		return 0;
>>> +
>>> +	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
>>> +	its->creadr = 0;
>>
>> Don't you need to acquire the command queue lock here?
> 
> I don't think so, because we "return 0;" above if the ITS is enabled,
> which means that nothing is going on at the moment.

What would prevent the ITS from becoming enabled between the two
assignments?

> But I guess I can just take the lock anyway to be sure.

I don't think this is optional.
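
In other words, something like this (sketch only, reusing the mask64()
helper from this patch):

	spin_lock(&its->lock);
	if (!its->enabled) {
		its->cbaser = mask64(its->cbaser, addr & 7, len, val);
		its->creadr = 0;
	}
	spin_unlock(&its->lock);

so that the enabled check and the two assignments cannot be torn apart.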

>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int its_cmd_buffer_size(struct kvm *kvm)
>>> +{
>>> +	struct vgic_its *its = &kvm->arch.vgic.its;
>>> +
>>> +	return ((its->cbaser & 0xff) + 1) << 12;
>>> +}
>>> +
>>> +static gpa_t its_cmd_buffer_base(struct kvm *kvm)
>>> +{
>>> +	struct vgic_its *its = &kvm->arch.vgic.its;
>>> +
>>> +	return BASER_BASE_ADDRESS(its->cbaser);
>>> +}
>>> +
>>> +/*
>>> + * By writing to CWRITER the guest announces new commands to be processed.
>>> + * Since we cannot read from guest memory inside the ITS spinlock, we
>>> + * iterate over the command buffer (with the lock dropped) until the read
>>> + * pointer matches the write pointer. Other VCPUs writing this register in the
>>> + * meantime will just update the write pointer, leaving the command
>>> + * processing to the first instance of the function.
>>> + */
>>> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>>> +				       struct kvm_io_device *this,
>>> +				       gpa_t addr, int len, const void *val)
>>> +{
>>> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>> +	struct vgic_its *its = &dist->its;
>>> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>>> +	u64 cmd_buf[4];
>>> +	u32 reg;
>>> +	bool finished;
>>> +
>>> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>>> +	reg &= 0xfffe0;
>>> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
>>> +		return 0;
>>> +
>>> +	spin_lock(&its->lock);
>>> +
>>> +	/*
>>> +	 * If there is still another VCPU handling commands, let this
>>> +	 * one pick up the new CWRITER and process "our" new commands as well.
>>> +	 */
>>
>> How do you detect that condition? All I see is a massive race here, with
>> two threads processing the queue in parallel, possibly corrupting each
>> other's data.
>>
>> Please explain why you think this is safe.
> 
> OK, here you go: Actually we are handling the command queue
> synchronously: The first writer to CWRITER will end up here and will
> iterate over all commands below. From the guest's point of view it will
> take some time to do the CWRITER MMIO write, but upon the writel
> returning the queue is actually already empty again.
> That means that any _sane_ guest will _never_ trigger the case where two
> threads/VCPUs are handling the queue, because a concurrent write to
> CWRITER without a lock in the guest would be broken by design.

That feels very fragile, and you're relying on a well behaved guest.

> 
> However I am not sure we can rely on this, so I added this precaution
> scheme:
> The first writer takes our ITS (emulation) lock and examines cwriter and
> creadr to see if they are equal. If they are, then the queue is
> currently empty, which means we are the first one and need to take care
> of the commands. We update our copy of cwriter (now marking the queue as
> non-empty) and drop the lock. As we kept the queue-was-empty status in a
> local variable, we now can start iterating through the queue. We only
> take the lock briefly when really needed in this process (for instance
> when one command has been processed and creadr is incremented).
> 
> If now a second VCPU writes CWRITER, we also take the lock and compare
> creadr and cwriter. Now they are different, because the first thread is
> still busy with handling the commands and hasn't finished yet.
> So we set our local "finished" to true. Also we update cwriter to the
> new value, then drop the lock. The while loop below will not be entered
> in this thread and we return to the guest.
> Now the first thread has handled another command, takes the lock to
> increase creadr and compares it with cwriter. Without the second writer
> it may have been finished already, but now cwriter has grown, so it will
> just carry on with handling the commands.
> 
> A bit like someone washing the dishes while another person adds some
> more dirty plates to the pile ;-)

I'm sorry, this feels incredibly complicated, and will make actual
debugging/tracing a nightmare. It also penalizes the first vcpu that gets
to submit a command, leading to scheduling imbalances.

Why can't you simply turn the ITS lock into a mutex and keep it held,
processing each batch in the context of the vcpu that is writing to
GITS_CWRITER?
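
Roughly like this (a sketch only; cmd_lock is a hypothetical mutex
replacing the spinlock around command handling, the helpers are the
ones from this patch):

	mutex_lock(&its->cmd_lock);

	its->cwriter = reg;
	while (its->creadr != its->cwriter) {
		/* a mutex, unlike a spinlock, may be held across this */
		ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
				     cmd_buf, 32);
		if (ret)
			break;
		vits_handle_command(vcpu, cmd_buf);

		its->creadr += 32;
		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
			its->creadr = 0;
	}

	mutex_unlock(&its->cmd_lock);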

Thanks,

	M.
Andre Przywara June 3, 2016, 3:42 p.m. UTC | #8
Hi,

I addressed some comments of yours in v5, just some rationale for the
others (sorry for the late reply on these!):

...

>>>> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
>>>> +				    struct kvm_io_device *this,
>>>> +				    gpa_t addr, int len, void *val)
>>>> +{
>>>> +	u64 reg = GITS_TYPER_PLPIS;
>>>> +
>>>> +	/*
>>>> +	 * We use linear CPU numbers for redistributor addressing,
>>>> +	 * so GITS_TYPER.PTA is 0.
>>>> +	 * To avoid memory waste on the guest side, we keep the
>>>> +	 * number of IDBits and DevBits low for the time being.
>>>> +	 * This could later be made configurable by userland.
>>>> +	 * Since we have all collections in linked list, we claim
>>>> +	 * that we can hold all of the collection tables in our
>>>> +	 * own memory and that the ITT entry size is 1 byte (the
>>>> +	 * smallest possible one).
>>>
>>> All of this is going to bite us when we want to implement migration,
>>> specially the HW collection bit.
>>
>> How so? Clearly we keep the number of VCPUs constant, so everything
>> guest visible stays the same even upon migrating to a much different
>> host? Or are we talking about different things here?
> 
> Let me rephrase this: How are you going to address HW collections from
> userspace in order to dump them? Short of having memory tables, you cannot.

I was just hoping to address first things first. Having the
collection table entirely in the kernel makes things easier for now.

How is that save/restore story covered by a hardware ITS? Is it just
ignored since it's not needed/used there?

That being said I would be happy to change this when we get migration
support for the ITS register state and we have agreed on an API for
userland ITS save/restore.

>>>
>>>> +	 */
>>>> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
>>>> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
>>>> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
>>>> +
>>>> +	write_mask64(reg, addr & 7, len, val);
>>>> +
>>>> +	return 0;
>>>> +}

.....

>>>> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
>>>> +				     struct kvm_io_device *this,
>>>> +				     gpa_t addr, int len, void *val)
>>>> +{
>>>> +	struct vgic_io_device *iodev = container_of(this,
>>>> +						    struct vgic_io_device, dev);
>>>> +	u32 reg = 0;
>>>> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
>>>> +
>>>> +	switch (idreg) {
>>>> +	case GITS_PIDR2:
>>>> +		reg = GIC_PIDR2_ARCH_GICv3;
>>>
>>> Are we leaving the lowest 4 bits to zero?
>>
>> I couldn't stuff "42" in 4 bits, so: yes ;-)
> 
> The 4 bottom bits are "Implementation Defined", so you could fit
> something in it if you wanted.
> 
>>
>>>> +		break;
>>>> +	case GITS_PIDR4:
>>>> +		/* This is a 64K software visible page */
>>>> +		reg = 0x40;
>>>
>>> Same question.
>>>
>>> Also, how about all the others PIDR registers?
>>
>> I guess I was halfway undecided between going with the pure
>> architectural requirements (which only mandate bits 7:4 in PIDR2) and
>> complying with the recommendation later in the spec.
>> So I will just fill PIDR0, PIDR1 and the rest of PIDR2 as well.
>>
>>>> +		break;
>>>> +	/* Those are the ID registers for (any) GIC. */
>>>> +	case GITS_CIDR0:
>>>> +		reg = 0x0d;
>>>> +		break;
>>>> +	case GITS_CIDR1:
>>>> +		reg = 0xf0;
>>>> +		break;
>>>> +	case GITS_CIDR2:
>>>> +		reg = 0x05;
>>>> +		break;
>>>> +	case GITS_CIDR3:
>>>> +		reg = 0xb1;
>>>> +		break;
>>>> +	}
>>>
>>> Given that these values are directly taken from the architecture, and
>>> seem common to the whole GICv3 architecture when implemented by ARM, we
>>> could have a common handler for the whole GICv3 implementation. Not a
>>> big deal though.

I tried it, but it turned out to be more involved than anticipated, so I
shelved it for now. I can make a patch on top later.

>>
>> Agreed.
>>
>>>> +
>>>> +	write_mask32(reg, addr & 3, len, val);
>>>> +
>>>> +	return 0;
>>>> +}

....

>>>> +/*
>>>> + * By writing to CWRITER the guest announces new commands to be processed.
>>>> + * Since we cannot read from guest memory inside the ITS spinlock, we
>>>> + * iterate over the command buffer (with the lock dropped) until the read
>>>> + * pointer matches the write pointer. Other VCPUs writing this register in the
>>>> + * meantime will just update the write pointer, leaving the command
>>>> + * processing to the first instance of the function.
>>>> + */
>>>> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>>>> +				       struct kvm_io_device *this,
>>>> +				       gpa_t addr, int len, const void *val)
>>>> +{
>>>> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>>> +	struct vgic_its *its = &dist->its;
>>>> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>>>> +	u64 cmd_buf[4];
>>>> +	u32 reg;
>>>> +	bool finished;
>>>> +
>>>> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>>>> +	reg &= 0xfffe0;
>>>> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
>>>> +		return 0;
>>>> +
>>>> +	spin_lock(&its->lock);
>>>> +
>>>> +	/*
>>>> +	 * If there is still another VCPU handling commands, let this
>>>> +	 * one pick up the new CWRITER and process "our" new commands as well.
>>>> +	 */
>>>
>>> How do you detect that condition? All I see is a massive race here, with
>>> two threads processing the queue in parallel, possibly corrupting each
>>> other's data.
>>>
>>> Please explain why you think this is safe.
>>
>> OK, here you go: Actually we are handling the command queue
>> synchronously: The first writer to CWRITER will end up here and will
>> iterate over all commands below. From the guest's point of view it will
>> take some time to do the CWRITER MMIO write, but upon the writel
>> returning the queue is actually already empty again.
>> That means that any _sane_ guest will _never_ trigger the case where two
>> threads/VCPUs are handling the queue, because a concurrent write to
>> CWRITER without a lock in the guest would be broken by design.
> 
> That feels very fragile, and you're relying on a well behaved guest.

No, I don't rely on it, as mentioned in the next sentence.
I just don't optimize for that case, because in reality it should never
happen - apart from a malicious guest. Let me think about the security
implications, though (whether a misbehaved guest could hog a physical CPU).

>> However I am not sure we can rely on this, so I added this precaution
>> scheme:

>> The first writer takes our ITS (emulation) lock and examines cwriter and
>> creadr to see if they are equal. If they are, then the queue is
>> currently empty, which means we are the first one and need to take care
>> of the commands. We update our copy of cwriter (now marking the queue as
>> non-empty) and drop the lock. As we kept the queue-was-empty status in a
>> local variable, we now can start iterating through the queue. We only
>> take the lock briefly when really needed in this process (for instance
>> when one command has been processed and creadr is incremented).
>>
>> If now a second VCPU writes CWRITER, we also take the lock and compare
>> creadr and cwriter. Now they are different, because the first thread is
>> still busy with handling the commands and hasn't finished yet.
>> So we set our local "finished" to true. Also we update cwriter to the
>> new value, then drop the lock. The while loop below will not be entered
>> in this thread and we return to the guest.
>> Now the first thread has handled another command, takes the lock to
>> increase creadr and compares it with cwriter. Without the second writer
>> it may have been finished already, but now cwriter has grown, so it will
>> just carry on with handling the commands.
>>
>> A bit like someone washing the dishes while another person adds some
>> more dirty plates to the pile ;-)
> 
> I'm sorry, this feels incredibly complicated, and will make actual
> debugging/tracing a nightmare. It also penalizes the first vcpu that gets
> to submit a command, leading to scheduling imbalances.

Well, as mentioned that would never happen apart from that
malicious/misbehaved guest case - so I don't care so much about
scheduling imbalances _for the guest_. As said above I have to think
about how this affects the host cores, though.

And frankly I don't see how this is overly complicated (in the context
of SMP locking schemes in general) - apart from me explaining it badly
above. It's just a bit tricky, but guarantees a strict order of command
processing.
As another approach, we could just qualify a concurrent access as illegal
and ignore it - on real hardware you would have no guarantee for this to
work either - issuing two CWRITER MMIO writes at the same time would
just go bonkers.
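
That variant would be trivial (sketch):

	bool busy;

	spin_lock(&its->lock);
	busy = (its->creadr != its->cwriter);
	if (!busy)
		its->cwriter = reg;
	spin_unlock(&its->lock);

	if (busy)
		return 0;	/* concurrent access: just ignore it */

and only the non-busy path would go on to drain the queue as before.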

> Why can't you simply turn the ITS lock into a mutex and keep it held,
> processing each batch in the context of the vcpu that is writing to
> GITS_CWRITER?

Frankly, I spent quite some time making this locking more fine-grained
after some comments on earlier revisions - and I think it's better now,
since we don't hog all of the ITS data structures during the command
processing. Pessimistically one core could issue a big number of
commands, which would block the whole ITS.
Instead MSI injections happening in parallel now have a good chance of
iterating our lists and tables in between. This should improve scalability.

Thanks,
Andre.
Marc Zyngier June 3, 2016, 4:54 p.m. UTC | #9
On 03/06/16 16:42, Andre Przywara wrote:
> Hi,
> 
> I addressed some comments of yours in v5, just some rationale for the
> others (sorry for the late reply on these!):
> 
> ...
> 
>>>>> +static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
>>>>> +				    struct kvm_io_device *this,
>>>>> +				    gpa_t addr, int len, void *val)
>>>>> +{
>>>>> +	u64 reg = GITS_TYPER_PLPIS;
>>>>> +
>>>>> +	/*
>>>>> +	 * We use linear CPU numbers for redistributor addressing,
>>>>> +	 * so GITS_TYPER.PTA is 0.
>>>>> +	 * To avoid memory waste on the guest side, we keep the
>>>>> +	 * number of IDBits and DevBits low for the time being.
>>>>> +	 * This could later be made configurable by userland.
>>>>> +	 * Since we have all collections in linked list, we claim
>>>>> +	 * that we can hold all of the collection tables in our
>>>>> +	 * own memory and that the ITT entry size is 1 byte (the
>>>>> +	 * smallest possible one).
>>>>
>>>> All of this is going to bite us when we want to implement migration,
>>>> specially the HW collection bit.
>>>
>>> How so? Clearly we keep the number of VCPUs constant, so everything
>>> guest visible stays the same even upon migrating to a much different
>>> host? Or are we talking about different things here?
>>
>> Let me rephrase this: How are you going to address HW collections from
>> userspace in order to dump them? Short of having memory tables, you cannot.
> 
> I was just hoping to address first things first. Having the
> collection table entirely in the kernel makes things easier for now.

I can see that. Yet this is not a sustainable approach.

> How is that save/restore story covered by a hardware ITS? Is it just
> ignored since it's not needed/used there?

The save restore story is covered by having tables in memory, and it is
not possible to save/restore a HW collection, full stop. There is no
register to dump it.

> That being said I would be happy to change this when we get migration
> support for the ITS register state and we have agreed on an API for
> userland ITS save/restore.
> 
>>>>
>>>>> +	 */
>>>>> +	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
>>>>> +	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
>>>>> +	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
>>>>> +
>>>>> +	write_mask64(reg, addr & 7, len, val);
>>>>> +
>>>>> +	return 0;
>>>>> +}
> 
> .....
> 
>>>>> +static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
>>>>> +				     struct kvm_io_device *this,
>>>>> +				     gpa_t addr, int len, void *val)
>>>>> +{
>>>>> +	struct vgic_io_device *iodev = container_of(this,
>>>>> +						    struct vgic_io_device, dev);
>>>>> +	u32 reg = 0;
>>>>> +	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
>>>>> +
>>>>> +	switch (idreg) {
>>>>> +	case GITS_PIDR2:
>>>>> +		reg = GIC_PIDR2_ARCH_GICv3;
>>>>
>>>> Are we leaving the lowest 4 bits to zero?
>>>
>>> I couldn't stuff "42" in 4 bits, so: yes ;-)
>>
>> The 4 bottom bits are "Implementation Defined", so you could fit
>> something in it if you wanted.
>>
>>>
>>>>> +		break;
>>>>> +	case GITS_PIDR4:
>>>>> +		/* This is a 64K software visible page */
>>>>> +		reg = 0x40;
>>>>
>>>> Same question.
>>>>
>>>> Also, how about all the others PIDR registers?
>>>
>>> I guess I was halfway undecided between going with the pure
>>> architectural requirements (which only mandate bits 7:4 in PIDR2) and
>>> complying with the recommendation later in the spec.
>>> So I will just fill PIDR0, PIDR1 and the rest of PIDR2 as well.
>>>
>>>>> +		break;
>>>>> +	/* Those are the ID registers for (any) GIC. */
>>>>> +	case GITS_CIDR0:
>>>>> +		reg = 0x0d;
>>>>> +		break;
>>>>> +	case GITS_CIDR1:
>>>>> +		reg = 0xf0;
>>>>> +		break;
>>>>> +	case GITS_CIDR2:
>>>>> +		reg = 0x05;
>>>>> +		break;
>>>>> +	case GITS_CIDR3:
>>>>> +		reg = 0xb1;
>>>>> +		break;
>>>>> +	}
>>>>
>>>> Given that these values are directly taken from the architecture, and
>>>> seem common to the whole GICv3 architecture when implemented by ARM, we
>>>> could have a common handler for the whole GICv3 implementation. Not a
>>>> big deal though.
> 
> I tried it, but it turned out to be more involved than anticipated, so I
> shelved it for now. I can make a patch on top later.
> 
>>>
>>> Agreed.
>>>
>>>>> +
>>>>> +	write_mask32(reg, addr & 3, len, val);
>>>>> +
>>>>> +	return 0;
>>>>> +}
> 
> ....
> 
>>>>> +/*
>>>>> + * By writing to CWRITER the guest announces new commands to be processed.
>>>>> + * Since we cannot read from guest memory inside the ITS spinlock, we
>>>>> + * iterate over the command buffer (with the lock dropped) until the read
>>>>> + * pointer matches the write pointer. Other VCPUs writing this register in the
>>>>> + * meantime will just update the write pointer, leaving the command
>>>>> + * processing to the first instance of the function.
>>>>> + */
>>>>> +static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
>>>>> +				       struct kvm_io_device *this,
>>>>> +				       gpa_t addr, int len, const void *val)
>>>>> +{
>>>>> +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>>>> +	struct vgic_its *its = &dist->its;
>>>>> +	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
>>>>> +	u64 cmd_buf[4];
>>>>> +	u32 reg;
>>>>> +	bool finished;
>>>>> +
>>>>> +	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
>>>>> +	reg &= 0xfffe0;
>>>>> +	if (reg > its_cmd_buffer_size(vcpu->kvm))
>>>>> +		return 0;
>>>>> +
>>>>> +	spin_lock(&its->lock);
>>>>> +
>>>>> +	/*
>>>>> +	 * If there is still another VCPU handling commands, let this
>>>>> +	 * one pick up the new CWRITER and process "our" new commands as well.
>>>>> +	 */
>>>>
>>>> How do you detect that condition? All I see is a massive race here, with
>>>> two threads processing the queue in parallel, possibly corrupting each
>>>> other's data.
>>>>
>>>> Please explain why you think this is safe.
>>>
>>> OK, here you go: Actually we are handling the command queue
>>> synchronously: The first writer to CWRITER will end up here and will
>>> iterate over all commands below. From the guest's point of view it will
>>> take some time to do the CWRITER MMIO write, but upon the writel
>>> returning the queue is actually already empty again.
>>> That means that any _sane_ guest will _never_ trigger the case where two
>>> threads/VCPUs are handling the queue, because a concurrent write to
>>> CWRITER without a lock in the guest would be broken by design.
>>
>> That feels very fragile, and you're relying on a well behaved guest.
> 
> No, I don't rely on it, as mentioned in the next sentence.
> I just don't optimize for that case, because in reality it should never
> happen - apart from a malicious guest. Let me think about the security
> implications, though (whether a misbehaved guest could hog a physical CPU).
> 
>>> However I am not sure we can rely on this, so I added this precaution
>>> scheme:
> 
>>> The first writer takes our ITS (emulation) lock and examines cwriter and
>>> creadr to see if they are equal. If they are, then the queue is
>>> currently empty, which means we are the first one and need to take care
>>> of the commands. We update our copy of cwriter (now marking the queue as
>>> non-empty) and drop the lock. As we kept the queue-was-empty status in a
>>> local variable, we now can start iterating through the queue. We only
>>> take the lock briefly when really needed in this process (for instance
>>> when one command has been processed and creadr is incremented).
>>>
>>> If now a second VCPU writes CWRITER, we also take the lock and compare
>>> creadr and cwriter. Now they are different, because the first thread is
>>> still busy with handling the commands and hasn't finished yet.
>>> So we set our local "finished" to true. Also we update cwriter to the
>>> new value, then drop the lock. The while loop below will not be entered
>>> in this thread and we return to the guest.
>>> Now the first thread has handled another command, takes the lock to
>>> increase creadr and compares it with cwriter. Without the second writer
>>> it may have been finished already, but now cwriter has grown, so it will
>>> just carry on with handling the commands.
>>>
>>> A bit like someone washing the dishes while another person adds some
>>> more dirty plates to the pile ;-)
>>
>> I'm sorry, this feels incredibly complicated, and will make actual
>> debugging/tracing a nightmare. It also penalizes the first vcpu that gets
>> to submit a command, leading to scheduling imbalances.
> 
> Well, as mentioned that would never happen apart from that
> malicious/misbehaved guest case - so I don't care so much about
> scheduling imbalances _for the guest_. As said above I have to think
> about how this affects the host cores, though.
> 
> And frankly I don't see how this is overly complicated (in the context
> of SMP locking schemes in general) - apart from me explaining it badly
> above. It's just a bit tricky, but guarantees a strict order of command
> processing.
> As another approach, we could just qualify a concurrent access as illegal
> and ignore it - on real hardware you would have no guarantee for this to
> work either - issuing two CWRITER MMIO writes at the same time would
> just go bonkers.

There is nothing in the spec that indicates that what you're proposing
is legal. And it is likely that the HW would simply serialize the two
write cycles (unless you have a multi-ported register). Bear in mind
that the ITS has been designed for multiprocessor systems (funnily enough).

> 
>> Why can't you simply turn the ITS lock into a mutex and keep it held,
>> processing each batch in the context of the vcpu that is writing to
>> GITS_CWRITER?
> 
> Frankly, I spent quite some time making this locking more fine-grained
> after some comments on earlier revisions - and I think it's better now,
> since we don't hog all of the ITS data structures during the command
> processing. Pessimistically one core could issue a big number of
> commands, which would block the whole ITS.
> Instead MSI injections happening in parallel now have a good chance of
> iterating our lists and tables in between. This should improve scalability.

I don't see how having a single "its->lock" makes it "fine-grained".
You're releasing the lock early in order not to cause other issues by
taking this spinlock twice. I call that papering over the issue.

	M.
diff mbox

Patch

diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
index c79bed5..bafea11 100644
--- a/include/kvm/vgic/vgic.h
+++ b/include/kvm/vgic/vgic.h
@@ -115,6 +115,9 @@  struct vgic_io_device {
 struct vgic_its {
 	bool			enabled;
 	spinlock_t		lock;
+	u64			cbaser;
+	int			creadr;
+	int			cwriter;
 };
 
 struct vgic_dist {
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index a813c3e..7011b98 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -179,15 +179,23 @@ 
 #define GITS_BASER			0x0100
 #define GITS_IDREGS_BASE		0xffd0
 #define GITS_PIDR2			GICR_PIDR2
+#define GITS_PIDR4			0xffd0
+#define GITS_CIDR0			0xfff0
+#define GITS_CIDR1			0xfff4
+#define GITS_CIDR2			0xfff8
+#define GITS_CIDR3			0xfffc
 
 #define GITS_TRANSLATER			0x10040
 
 #define GITS_CTLR_ENABLE		(1U << 0)
 #define GITS_CTLR_QUIESCENT		(1U << 31)
 
+#define GITS_TYPER_PLPIS		(1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT		8
 #define GITS_TYPER_DEVBITS_SHIFT	13
 #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
 #define GITS_TYPER_PTA			(1UL << 19)
+#define GITS_TYPER_HWCOLLCNT_SHIFT	24
 
 #define GITS_CBASER_VALID		(1UL << 63)
 #define GITS_CBASER_nCnB		(0UL << 59)
diff --git a/virt/kvm/arm/vgic/its-emul.c b/virt/kvm/arm/vgic/its-emul.c
index 49dd5e4..de8d360 100644
--- a/virt/kvm/arm/vgic/its-emul.c
+++ b/virt/kvm/arm/vgic/its-emul.c
@@ -31,23 +31,263 @@ 
 #include "vgic.h"
 #include "vgic_mmio.h"
 
+#define BASER_BASE_ADDRESS(x) ((x) & 0xfffffffff000ULL)
+
+static int vgic_mmio_read_its_ctlr(struct kvm_vcpu *vcpu,
+				   struct kvm_io_device *this,
+				   gpa_t addr, int len, void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+	u32 reg;
+
+	reg = GITS_CTLR_QUIESCENT;
+	if (its->enabled)
+		reg |= GITS_CTLR_ENABLE;
+
+	write_mask32(reg, addr & 3, len, val);
+
+	return 0;
+}
+
+static int vgic_mmio_write_its_ctlr(struct kvm_vcpu *vcpu,
+				    struct kvm_io_device *this,
+				    gpa_t addr, int len, const void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+	struct vgic_io_device *iodev = container_of(this,
+						    struct vgic_io_device, dev);
+
+        if (addr - iodev->base_addr == 0)
+		its->enabled = !!(*(u8*)val & GITS_CTLR_ENABLE);
+
+	return 0;
+}
+
+static int vgic_mmio_read_its_typer(struct kvm_vcpu *vcpu,
+				    struct kvm_io_device *this,
+				    gpa_t addr, int len, void *val)
+{
+	u64 reg = GITS_TYPER_PLPIS;
+
+	/*
+	 * We use linear CPU numbers for redistributor addressing,
+	 * so GITS_TYPER.PTA is 0.
+	 * To avoid memory waste on the guest side, we keep the
+	 * number of IDBits and DevBits low for the time being.
+	 * This could later be made configurable by userland.
+	 * Since we have all collections in linked list, we claim
+	 * that we can hold all of the collection tables in our
+	 * own memory and that the ITT entry size is 1 byte (the
+	 * smallest possible one).
+	 */
+	reg |= 0xff << GITS_TYPER_HWCOLLCNT_SHIFT;
+	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+
+	write_mask64(reg, addr & 7, len, val);
+
+	return 0;
+}
+
+static int vgic_mmio_read_its_iidr(struct kvm_vcpu *vcpu,
+				   struct kvm_io_device *this,
+				   gpa_t addr, int len, void *val)
+{
+	u32 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+
+	write_mask32(reg, addr & 3, len, val);
+
+	return 0;
+}
+
+static int vgic_mmio_read_its_idregs(struct kvm_vcpu *vcpu,
+				     struct kvm_io_device *this,
+				     gpa_t addr, int len, void *val)
+{
+	struct vgic_io_device *iodev = container_of(this,
+						    struct vgic_io_device, dev);
+	u32 reg = 0;
+	int idreg = (addr & ~3) - iodev->base_addr + GITS_IDREGS_BASE;
+
+	switch (idreg) {
+	case GITS_PIDR2:
+		reg = GIC_PIDR2_ARCH_GICv3;
+		break;
+	case GITS_PIDR4:
+		/* This is a 64K software visible page */
+		reg = 0x40;
+		break;
+	/* Those are the ID registers for (any) GIC. */
+	case GITS_CIDR0:
+		reg = 0x0d;
+		break;
+	case GITS_CIDR1:
+		reg = 0xf0;
+		break;
+	case GITS_CIDR2:
+		reg = 0x05;
+		break;
+	case GITS_CIDR3:
+		reg = 0xb1;
+		break;
+	}
+
+	write_mask32(reg, addr & 3, len, val);
+
+	return 0;
+}
+
+/*
+ * This function is called with both the ITS and the distributor lock dropped,
+ * so the actual command handlers must take the respective locks when needed.
+ */
+static int vits_handle_command(struct kvm_vcpu *vcpu, u64 *its_cmd)
+{
+	return -ENODEV;
+}
+
+static int vgic_mmio_read_its_cbaser(struct kvm_vcpu *vcpu,
+				    struct kvm_io_device *this,
+				    gpa_t addr, int len, void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+
+	write_mask64(its->cbaser, addr & 7, len, val);
+
+	return 0;
+}
+
+static int vgic_mmio_write_its_cbaser(struct kvm_vcpu *vcpu,
+				      struct kvm_io_device *this,
+				      gpa_t addr, int len, const void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+
+	if (its->enabled)
+		return 0;
+
+	its->cbaser = mask64(its->cbaser, addr & 7, len, val);
+	its->creadr = 0;
+
+	return 0;
+}
+
+static int its_cmd_buffer_size(struct kvm *kvm)
+{
+	struct vgic_its *its = &kvm->arch.vgic.its;
+
+	return ((its->cbaser & 0xff) + 1) << 12;
+}
+
+static gpa_t its_cmd_buffer_base(struct kvm *kvm)
+{
+	struct vgic_its *its = &kvm->arch.vgic.its;
+
+	return BASER_BASE_ADDRESS(its->cbaser);
+}
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * Since we cannot read from guest memory inside the ITS spinlock, we
+ * iterate over the command buffer (with the lock dropped) until the read
+ * pointer matches the write pointer. Other VCPUs writing this register in the
+ * meantime will just update the write pointer, leaving the command
+ * processing to the first instance of the function.
+ */
+static int vgic_mmio_write_its_cwriter(struct kvm_vcpu *vcpu,
+				       struct kvm_io_device *this,
+				       gpa_t addr, int len, const void *val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct vgic_its *its = &dist->its;
+	gpa_t cbaser = its_cmd_buffer_base(vcpu->kvm);
+	u64 cmd_buf[4];
+	u32 reg;
+	bool finished;
+
+	reg = mask64(its->cwriter & 0xfffe0, addr & 7, len, val);
+	reg &= 0xfffe0;
+	if (reg > its_cmd_buffer_size(vcpu->kvm))
+		return 0;
+
+	spin_lock(&its->lock);
+
+	/*
+	 * If there is still another VCPU handling commands, let this
+	 * one pick up the new CWRITER and process "our" new commands as well.
+	 */
+	finished = (its->cwriter != its->creadr);
+	its->cwriter = reg;
+
+	spin_unlock(&its->lock);
+
+	while (!finished) {
+		int ret = kvm_read_guest(vcpu->kvm, cbaser + its->creadr,
+					 cmd_buf, 32);
+		if (ret) {
+			/*
+			 * Gah, we are screwed. Reset CWRITER to that command
+			 * that we have finished processing and return.
+			 */
+			spin_lock(&its->lock);
+			its->cwriter = its->creadr;
+			spin_unlock(&its->lock);
+			break;
+		}
+		vits_handle_command(vcpu, cmd_buf);
+
+		spin_lock(&its->lock);
+		its->creadr += 32;
+		if (its->creadr == its_cmd_buffer_size(vcpu->kvm))
+			its->creadr = 0;
+		finished = (its->creadr == its->cwriter);
+		spin_unlock(&its->lock);
+	}
+
+	return 0;
+}
+
+static int vgic_mmio_read_its_cwriter(struct kvm_vcpu *vcpu,
+				      struct kvm_io_device *this,
+				      gpa_t addr, int len, void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+	u64 reg = its->cwriter & 0xfffe0;
+
+	write_mask64(reg, addr & 7, len, val);
+
+	return 0;
+}
+
+static int vgic_mmio_read_its_creadr(struct kvm_vcpu *vcpu,
+				     struct kvm_io_device *this,
+				     gpa_t addr, int len, void *val)
+{
+	struct vgic_its *its = &vcpu->kvm->arch.vgic.its;
+	u64 reg = its->creadr & 0xfffe0;
+
+	write_mask64(reg, addr & 7, len, val);
+
+	return 0;
+}
+
 struct vgic_register_region its_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GITS_CTLR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
+		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4),
 	REGISTER_DESC_WITH_LENGTH(GITS_IIDR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
+		vgic_mmio_read_its_iidr, vgic_mmio_write_wi, 4),
 	REGISTER_DESC_WITH_LENGTH(GITS_TYPER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4),
+		vgic_mmio_read_its_typer, vgic_mmio_write_wi, 4),
 	REGISTER_DESC_WITH_LENGTH(GITS_CBASER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
+		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8),
 	REGISTER_DESC_WITH_LENGTH(GITS_CWRITER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
+		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8),
 	REGISTER_DESC_WITH_LENGTH(GITS_CREADR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8),
+		vgic_mmio_read_its_creadr, vgic_mmio_write_wi, 8),
 	REGISTER_DESC_WITH_LENGTH(GITS_BASER,
 		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x40),
 	REGISTER_DESC_WITH_LENGTH(GITS_IDREGS_BASE,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 0x30),
+		vgic_mmio_read_its_idregs, vgic_mmio_write_wi, 0x30),
 };
 
 /* This is called on setting the LPI enable bit in the redistributor. */
@@ -59,9 +299,14 @@  int vits_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_its *its = &dist->its;
+	int nr_vcpus = atomic_read(&kvm->online_vcpus);
 	struct vgic_io_device *regions;
 	int ret, i;
 
+	dist->pendbaser = kcalloc(nr_vcpus, sizeof(u64), GFP_KERNEL);
+	if (!dist->pendbaser)
+		return -ENOMEM;
+
 	spin_lock_init(&its->lock);
 
 	regions = kmalloc_array(ARRAY_SIZE(its_registers),
@@ -82,3 +327,16 @@  int vits_init(struct kvm *kvm)
 
 	return -ENXIO;
 }
+
+void vits_destroy(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_its *its = &dist->its;
+
+	if (!vgic_has_its(kvm))
+		return;
+
+	kfree(dist->pendbaser);
+
+	its->enabled = false;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 4e7dcb8..08f97d1 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -63,6 +63,7 @@  int vgic_register_redist_regions(struct kvm *kvm, gpa_t dist_base_address);
 
 int vits_init(struct kvm *kvm);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vits_destroy(struct kvm *kvm);
 #else
 static inline void vgic_v3_irq_change_affinity(struct kvm *kvm, u32 intid,
 					       u64 mpidr)
@@ -137,6 +138,11 @@  static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
 {
 	return;
 }
+
+static inline void vits_destroy(struct kvm *kvm)
+{
+	return;
+}
 #endif
 
 void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
diff --git a/virt/kvm/arm/vgic/vgic_init.c b/virt/kvm/arm/vgic/vgic_init.c
index dcfb93d..e4459e3 100644
--- a/virt/kvm/arm/vgic/vgic_init.c
+++ b/virt/kvm/arm/vgic/vgic_init.c
@@ -298,6 +298,8 @@  void kvm_vgic_destroy(struct kvm *kvm)
 
 	kvm_vgic_dist_destroy(kvm);
 
+	vits_destroy(kvm);
+
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_vgic_vcpu_destroy(vcpu);
 }