diff mbox series

[v3,kvmtool,32/32] arm/arm64: Add PCI Express 1.1 support

Message ID 20200326152438.6218-33-alexandru.elisei@arm.com (mailing list archive)
State New, archived
Headers show
Series Add reassignable BARs and PCIE 1.1 support | expand

Commit Message

Alexandru Elisei March 26, 2020, 3:24 p.m. UTC
PCI Express comes with an extended addressing scheme, which directly
translated into a bigger device configuration space (256->4096 bytes)
and bigger PCI configuration space (16->256 MB), as well as mandatory
capabilities (power management [1] and PCI Express capability [2]).

However, our virtio PCI implementation implements version 0.9 of the
protocol and it still uses transitional PCI device ID's, so we have
opted to omit the mandatory PCI Express capabilities.For VFIO, the power
management and PCI Express capability are left for a subsequent patch.

[1] PCI Express Base Specification Revision 1.1, section 7.6
[2] PCI Express Base Specification Revision 1.1, section 7.8

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
---
 arm/include/arm-common/kvm-arch.h |  4 +-
 arm/pci.c                         |  2 +-
 builtin-run.c                     |  1 +
 include/kvm/kvm-config.h          |  2 +-
 include/kvm/pci.h                 | 76 ++++++++++++++++++++++++++++---
 pci.c                             |  5 +-
 vfio/pci.c                        | 26 +++++++----
 7 files changed, 96 insertions(+), 20 deletions(-)

Comments

Andre Przywara April 6, 2020, 2:06 p.m. UTC | #1
On 26/03/2020 15:24, Alexandru Elisei wrote:
> PCI Express comes with an extended addressing scheme, which directly
> translated into a bigger device configuration space (256->4096 bytes)
> and bigger PCI configuration space (16->256 MB), as well as mandatory
> capabilities (power management [1] and PCI Express capability [2]).
> 
> However, our virtio PCI implementation implements version 0.9 of the
> protocol and it still uses transitional PCI device ID's, so we have
> opted to omit the mandatory PCI Express capabilities.For VFIO, the power
> management and PCI Express capability are left for a subsequent patch.
> 
> [1] PCI Express Base Specification Revision 1.1, section 7.6
> [2] PCI Express Base Specification Revision 1.1, section 7.8
> 
> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
> ---
>  arm/include/arm-common/kvm-arch.h |  4 +-
>  arm/pci.c                         |  2 +-
>  builtin-run.c                     |  1 +
>  include/kvm/kvm-config.h          |  2 +-
>  include/kvm/pci.h                 | 76 ++++++++++++++++++++++++++++---
>  pci.c                             |  5 +-
>  vfio/pci.c                        | 26 +++++++----
>  7 files changed, 96 insertions(+), 20 deletions(-)
> 
> diff --git a/arm/include/arm-common/kvm-arch.h b/arm/include/arm-common/kvm-arch.h
> index b9d486d5eac2..13c55fa3dc29 100644
> --- a/arm/include/arm-common/kvm-arch.h
> +++ b/arm/include/arm-common/kvm-arch.h
> @@ -23,7 +23,7 @@
>  
>  #define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
>  #define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
> -#define ARM_PCI_CFG_SIZE	(1ULL << 24)
> +#define ARM_PCI_CFG_SIZE	(1ULL << 28)

The existence of this symbol seems somewhat odd, there should be no ARM
specific version of the config space size.
Do you know why we don't use the generic PCI_CFG_SIZE here?
At the very least I would expect something like:
#define ARM_PCI_CFG_SIZE	PCI_EXP_CFG_SIZE

>  #define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
>  				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
>  
> @@ -50,6 +50,8 @@
>  
>  #define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
>  
> +#define ARCH_HAS_PCI_EXP	1
> +
>  static inline bool arm_addr_in_ioport_region(u64 phys_addr)
>  {
>  	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
> diff --git a/arm/pci.c b/arm/pci.c
> index ed325fa4a811..2251f627d8b5 100644
> --- a/arm/pci.c
> +++ b/arm/pci.c
> @@ -62,7 +62,7 @@ void pci__generate_fdt_nodes(void *fdt)
>  	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
>  	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
>  	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
> -	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
> +	_FDT(fdt_property_string(fdt, "compatible", "pci-host-ecam-generic"));
>  	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
>  
>  	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
> diff --git a/builtin-run.c b/builtin-run.c
> index 9cb8c75300eb..def8a1f803ad 100644
> --- a/builtin-run.c
> +++ b/builtin-run.c
> @@ -27,6 +27,7 @@
>  #include "kvm/irq.h"
>  #include "kvm/kvm.h"
>  #include "kvm/pci.h"
> +#include "kvm/vfio.h"
>  #include "kvm/rtc.h"
>  #include "kvm/sdl.h"
>  #include "kvm/vnc.h"
> diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
> index a052b0bc7582..a1012c57b7a7 100644
> --- a/include/kvm/kvm-config.h
> +++ b/include/kvm/kvm-config.h
> @@ -2,7 +2,6 @@
>  #define KVM_CONFIG_H_
>  
>  #include "kvm/disk-image.h"
> -#include "kvm/vfio.h"
>  #include "kvm/kvm-config-arch.h"
>  
>  #define DEFAULT_KVM_DEV		"/dev/kvm"
> @@ -18,6 +17,7 @@
>  #define MIN_RAM_SIZE_MB		(64ULL)
>  #define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
>  
> +struct vfio_device_params;
>  struct kvm_config {
>  	struct kvm_config_arch arch;
>  	struct disk_image_params disk_image[MAX_DISK_IMAGES];
> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
> index be75f77fd2cb..71ee9d8cb01f 100644
> --- a/include/kvm/pci.h
> +++ b/include/kvm/pci.h
> @@ -10,6 +10,7 @@
>  #include "kvm/devices.h"
>  #include "kvm/msi.h"
>  #include "kvm/fdt.h"
> +#include "kvm.h"
>  
>  #define pci_dev_err(pci_hdr, fmt, ...) \
>  	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> @@ -32,9 +33,41 @@
>  #define PCI_CONFIG_BUS_FORWARD	0xcfa
>  #define PCI_IO_SIZE		0x100
>  #define PCI_IOPORT_START	0x6200
> -#define PCI_CFG_SIZE		(1ULL << 24)
>  
> -struct kvm;
> +#define PCIE_CAP_REG_VER	0x1
> +#define PCIE_CAP_REG_DEV_LEGACY	(1 << 4)
> +#define PM_CAP_VER		0x3
> +
> +#ifdef ARCH_HAS_PCI_EXP
> +#define PCI_CFG_SIZE		(1ULL << 28)
> +#define PCI_DEV_CFG_SIZE	4096

Maybe use PCI_CFG_SPACE_EXP_SIZE from pci_regs.h?

> +
> +union pci_config_address {
> +	struct {
> +#if __BYTE_ORDER == __LITTLE_ENDIAN
> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
> +		unsigned	register_number	: 10;		/* 11 .. 2  */
> +		unsigned	function_number	: 3;		/* 14 .. 12 */
> +		unsigned	device_number	: 5;		/* 19 .. 15 */
> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
> +		unsigned	reserved	: 3;		/* 30 .. 28 */
> +		unsigned	enable_bit	: 1;		/* 31       */
> +#else
> +		unsigned	enable_bit	: 1;		/* 31       */
> +		unsigned	reserved	: 3;		/* 30 .. 28 */
> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
> +		unsigned	device_number	: 5;		/* 19 .. 15 */
> +		unsigned	function_number	: 3;		/* 14 .. 12 */
> +		unsigned	register_number	: 10;		/* 11 .. 2  */
> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
> +#endif

Just for the records:
I think we agreed on this before, but using a C bitfield to model
hardware defined bits is broken, because the C standard doesn't
guarantee those bits to be consecutive and layed out like we hope it would.
But since we have this issue already with the legacy config space, and
it seems to work (TM), we can fix this later.

> +	};
> +	u32 w;
> +};
> +
> +#else
> +#define PCI_CFG_SIZE		(1ULL << 24)
> +#define PCI_DEV_CFG_SIZE	256

Shall we use PCI_CFG_SPACE_SIZE from the kernel headers here?

>  
>  union pci_config_address {
>  	struct {
> @@ -58,6 +91,8 @@ union pci_config_address {
>  	};
>  	u32 w;
>  };
> +#endif
> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>  
>  struct msix_table {
>  	struct msi_msg msg;
> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>  	u8	next;
>  };
>  
> +struct pcie_cap {

I guess this is meant to map to the PCI Express Capability Structure as
described in the PCIe spec?
We would need to add the "packed" attribute then. But actually I am not
a fan of using C language constructs to model specified register
arrangements, the kernel tries to avoid this as well.

Actually, looking closer: why do we need this in the first place? I
removed this and struct pm_cap, and it still compiles.
So can we lose those two structures at all? And move the discussion and
implementation (for VirtIO 1.0?) to a later series?

> +	u8 cap;
> +	u8 next;
> +	u16 cap_reg;
> +	u32 dev_cap;
> +	u16 dev_ctrl;
> +	u16 dev_status;
> +	u32 link_cap;
> +	u16 link_ctrl;
> +	u16 link_status;
> +	u32 slot_cap;
> +	u16 slot_ctrl;
> +	u16 slot_status;
> +	u16 root_ctrl;
> +	u16 root_cap;
> +	u32 root_status;
> +};
> +
> +struct pm_cap {
> +	u8 cap;
> +	u8 next;
> +	u16 pmc;
> +	u16 pmcsr;
> +	u8 pmcsr_bse;
> +	u8 data;
> +};
> +
>  struct pci_device_header;
>  
>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
> @@ -110,14 +172,12 @@ typedef int (*bar_deactivate_fn_t)(struct kvm *kvm,
>  				   int bar_num, void *data);
>  
>  #define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
> -#define PCI_DEV_CFG_SIZE	256
> -#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>  
>  struct pci_config_operations {
>  	void (*write)(struct kvm *kvm, struct pci_device_header *pci_hdr,
> -		      u8 offset, void *data, int sz);
> +		      u16 offset, void *data, int sz);
>  	void (*read)(struct kvm *kvm, struct pci_device_header *pci_hdr,
> -		     u8 offset, void *data, int sz);
> +		     u16 offset, void *data, int sz);
>  };
>  
>  struct pci_device_header {
> @@ -147,6 +207,10 @@ struct pci_device_header {
>  			u8		min_gnt;
>  			u8		max_lat;
>  			struct msix_cap msix;
> +#ifdef ARCH_HAS_PCI_EXP
> +			struct pm_cap pm;
> +			struct pcie_cap pcie;
> +#endif
>  		} __attribute__((packed));
>  		/* Pad to PCI config space size */
>  		u8	__pad[PCI_DEV_CFG_SIZE];
> diff --git a/pci.c b/pci.c
> index 68ece65441a6..b471209a6efc 100644
> --- a/pci.c
> +++ b/pci.c
> @@ -400,7 +400,8 @@ static void pci_config_bar_wr(struct kvm *kvm,
>  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>  {
>  	void *base;
> -	u8 bar, offset;
> +	u8 bar;
> +	u16 offset;
>  	struct pci_device_header *pci_hdr;
>  	u8 dev_num = addr.device_number;
>  	u32 value = 0;
> @@ -439,7 +440,7 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>  
>  void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>  {
> -	u8 offset;
> +	u16 offset;
>  	struct pci_device_header *pci_hdr;
>  	u8 dev_num = addr.device_number;
>  
> diff --git a/vfio/pci.c b/vfio/pci.c
> index 2b891496547d..6b8726227ea0 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c
> @@ -311,7 +311,7 @@ out_unlock:
>  }
>  
>  static void vfio_pci_msix_cap_write(struct kvm *kvm,
> -				    struct vfio_device *vdev, u8 off,
> +				    struct vfio_device *vdev, u16 off,
>  				    void *data, int sz)
>  {
>  	struct vfio_pci_device *pdev = &vdev->pci;
> @@ -343,7 +343,7 @@ static void vfio_pci_msix_cap_write(struct kvm *kvm,
>  }
>  
>  static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
> -				     u8 off, u8 *data, u32 sz)
> +				     u16 off, u8 *data, u32 sz)
>  {
>  	size_t i;
>  	u32 mask = 0;
> @@ -391,7 +391,7 @@ static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
>  }
>  
>  static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
> -				   u8 off, u8 *data, u32 sz)
> +				   u16 off, u8 *data, u32 sz)
>  {
>  	u8 ctrl;
>  	struct msi_msg msg;
> @@ -536,7 +536,7 @@ out:
>  }
>  
>  static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
> -			      u8 offset, void *data, int sz)
> +			      u16 offset, void *data, int sz)
>  {
>  	struct vfio_region_info *info;
>  	struct vfio_pci_device *pdev;
> @@ -554,7 +554,7 @@ static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr
>  }
>  
>  static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
> -			       u8 offset, void *data, int sz)
> +			       u16 offset, void *data, int sz)
>  {
>  	struct vfio_region_info *info;
>  	struct vfio_pci_device *pdev;
> @@ -638,15 +638,17 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  {
>  	int ret;
>  	size_t size;
> -	u8 pos, next;
> +	u16 pos, next;
>  	struct pci_cap_hdr *cap;
> -	u8 virt_hdr[PCI_DEV_CFG_SIZE];
> +	u8 *virt_hdr;
>  	struct vfio_pci_device *pdev = &vdev->pci;
>  
>  	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>  		return 0;
>  
> -	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
> +	virt_hdr = calloc(1, PCI_DEV_CFG_SIZE);
> +	if (!virt_hdr)
> +		return -errno;

There are two places where we return in this function, we don't seem to
free virt_hdr in those cases. Looks like a job for your beloved goto ;-)

>  
>  	pos = pdev->hdr.capabilities & ~3;
>  
> @@ -682,6 +684,8 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>  	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
>  	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
>  
> +	free(virt_hdr);
> +
>  	return 0;
>  }
>  
> @@ -792,7 +796,11 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>  
>  	/* Install our fake Configuration Space */
>  	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
> -	hdr_sz = PCI_DEV_CFG_SIZE;
> +	/*
> +	 * We don't touch the extended configuration space, let's be cautious
> +	 * and not overwrite it all with zeros, or bad things might happen.
> +	 */
> +	hdr_sz = PCI_CFG_SPACE_SIZE;

This breaks compilation on x86, do we miss to include a header file?
Works on arm64, though...

Cheers,
Andre.

>  	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
>  		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
>  			     hdr_sz);
>
Alexandru Elisei April 14, 2020, 8:56 a.m. UTC | #2
Hi,

On 4/6/20 3:06 PM, André Przywara wrote:
>
>> @@ -792,7 +796,11 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>>  
>>  	/* Install our fake Configuration Space */
>>  	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
>> -	hdr_sz = PCI_DEV_CFG_SIZE;
>> +	/*
>> +	 * We don't touch the extended configuration space, let's be cautious
>> +	 * and not overwrite it all with zeros, or bad things might happen.
>> +	 */
>> +	hdr_sz = PCI_CFG_SPACE_SIZE;
> This breaks compilation on x86, do we miss to include a header file?
> Works on arm64, though...

I've been able to reproduce it on an x86 laptop with Ubuntu 16.04. You've stumbled
upon the reason I switched to using numbers for PCI_DEV_CFG_SIZE instead of using
the defines PCI_CFG_SPACE_SIZE and PCI_CFG_SPACE_EXP_SIZE from linux/pci_regs.h:
those defines don't exist on some distros. When that happened to Sami on arm64, I
made the change to PCI_DEV_CFG_SIZE, but I forgot to change it here. I'll switch
to using 256.

For the record, x86 compiles fine on the x86 machine which I used for testing.

Thanks,
Alex
>
> Cheers,
> Andre.
Alexandru Elisei May 6, 2020, 1:51 p.m. UTC | #3
Hi,

On 4/6/20 3:06 PM, André Przywara wrote:
> On 26/03/2020 15:24, Alexandru Elisei wrote:
>> PCI Express comes with an extended addressing scheme, which directly
>> translated into a bigger device configuration space (256->4096 bytes)
>> and bigger PCI configuration space (16->256 MB), as well as mandatory
>> capabilities (power management [1] and PCI Express capability [2]).
>>
>> However, our virtio PCI implementation implements version 0.9 of the
>> protocol and it still uses transitional PCI device ID's, so we have
>> opted to omit the mandatory PCI Express capabilities.For VFIO, the power
>> management and PCI Express capability are left for a subsequent patch.
>>
>> [1] PCI Express Base Specification Revision 1.1, section 7.6
>> [2] PCI Express Base Specification Revision 1.1, section 7.8
>>
>> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
>> ---
>>  arm/include/arm-common/kvm-arch.h |  4 +-
>>  arm/pci.c                         |  2 +-
>>  builtin-run.c                     |  1 +
>>  include/kvm/kvm-config.h          |  2 +-
>>  include/kvm/pci.h                 | 76 ++++++++++++++++++++++++++++---
>>  pci.c                             |  5 +-
>>  vfio/pci.c                        | 26 +++++++----
>>  7 files changed, 96 insertions(+), 20 deletions(-)
>>
>> diff --git a/arm/include/arm-common/kvm-arch.h b/arm/include/arm-common/kvm-arch.h
>> index b9d486d5eac2..13c55fa3dc29 100644
>> --- a/arm/include/arm-common/kvm-arch.h
>> +++ b/arm/include/arm-common/kvm-arch.h
>> @@ -23,7 +23,7 @@
>>  
>>  #define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
>>  #define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
>> -#define ARM_PCI_CFG_SIZE	(1ULL << 24)
>> +#define ARM_PCI_CFG_SIZE	(1ULL << 28)
> The existence of this symbol seems somewhat odd, there should be no ARM
> specific version of the config space size.

The existence of the symbol is required because at the moment PCI Express support
is not available for all architecture, and this file needs to be included in
pci.h, not the other way around. I agree it's not ideal, but that's the way it is
right now.

> Do you know why we don't use the generic PCI_CFG_SIZE here?
> At the very least I would expect something like:
> #define ARM_PCI_CFG_SIZE	PCI_EXP_CFG_SIZE

We don't use PCI_EXP_CFG_SIZE because it might not be available for all
architectures. It is possible it's not there on x86, like the error you have
encountered showed, so instead of waiting for someone to report a compilation
failure, I would rather use a number from the start.

>
>>  #define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
>>  				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
>>  
>> @@ -50,6 +50,8 @@
>>  
>>  #define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
>>  
>> +#define ARCH_HAS_PCI_EXP	1
>> +
>>  static inline bool arm_addr_in_ioport_region(u64 phys_addr)
>>  {
>>  	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
>> diff --git a/arm/pci.c b/arm/pci.c
>> index ed325fa4a811..2251f627d8b5 100644
>> --- a/arm/pci.c
>> +++ b/arm/pci.c
>> @@ -62,7 +62,7 @@ void pci__generate_fdt_nodes(void *fdt)
>>  	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
>>  	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
>>  	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
>> -	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
>> +	_FDT(fdt_property_string(fdt, "compatible", "pci-host-ecam-generic"));
>>  	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
>>  
>>  	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
>> diff --git a/builtin-run.c b/builtin-run.c
>> index 9cb8c75300eb..def8a1f803ad 100644
>> --- a/builtin-run.c
>> +++ b/builtin-run.c
>> @@ -27,6 +27,7 @@
>>  #include "kvm/irq.h"
>>  #include "kvm/kvm.h"
>>  #include "kvm/pci.h"
>> +#include "kvm/vfio.h"
>>  #include "kvm/rtc.h"
>>  #include "kvm/sdl.h"
>>  #include "kvm/vnc.h"
>> diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
>> index a052b0bc7582..a1012c57b7a7 100644
>> --- a/include/kvm/kvm-config.h
>> +++ b/include/kvm/kvm-config.h
>> @@ -2,7 +2,6 @@
>>  #define KVM_CONFIG_H_
>>  
>>  #include "kvm/disk-image.h"
>> -#include "kvm/vfio.h"
>>  #include "kvm/kvm-config-arch.h"
>>  
>>  #define DEFAULT_KVM_DEV		"/dev/kvm"
>> @@ -18,6 +17,7 @@
>>  #define MIN_RAM_SIZE_MB		(64ULL)
>>  #define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
>>  
>> +struct vfio_device_params;
>>  struct kvm_config {
>>  	struct kvm_config_arch arch;
>>  	struct disk_image_params disk_image[MAX_DISK_IMAGES];
>> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
>> index be75f77fd2cb..71ee9d8cb01f 100644
>> --- a/include/kvm/pci.h
>> +++ b/include/kvm/pci.h
>> @@ -10,6 +10,7 @@
>>  #include "kvm/devices.h"
>>  #include "kvm/msi.h"
>>  #include "kvm/fdt.h"
>> +#include "kvm.h"
>>  
>>  #define pci_dev_err(pci_hdr, fmt, ...) \
>>  	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> @@ -32,9 +33,41 @@
>>  #define PCI_CONFIG_BUS_FORWARD	0xcfa
>>  #define PCI_IO_SIZE		0x100
>>  #define PCI_IOPORT_START	0x6200
>> -#define PCI_CFG_SIZE		(1ULL << 24)
>>  
>> -struct kvm;
>> +#define PCIE_CAP_REG_VER	0x1
>> +#define PCIE_CAP_REG_DEV_LEGACY	(1 << 4)
>> +#define PM_CAP_VER		0x3
>> +
>> +#ifdef ARCH_HAS_PCI_EXP
>> +#define PCI_CFG_SIZE		(1ULL << 28)
>> +#define PCI_DEV_CFG_SIZE	4096
> Maybe use PCI_CFG_SPACE_EXP_SIZE from pci_regs.h?

I cannot do that because it's not available on all distros. I used
PCI_CFG_SPACE_EXP_SIZE in the previous iteration, but I changed it when Sami
reported a compilation failure on his particular setup (it's mentioned in the
changelog).

>
>> +
>> +union pci_config_address {
>> +	struct {
>> +#if __BYTE_ORDER == __LITTLE_ENDIAN
>> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
>> +		unsigned	register_number	: 10;		/* 11 .. 2  */
>> +		unsigned	function_number	: 3;		/* 14 .. 12 */
>> +		unsigned	device_number	: 5;		/* 19 .. 15 */
>> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
>> +		unsigned	reserved	: 3;		/* 30 .. 28 */
>> +		unsigned	enable_bit	: 1;		/* 31       */
>> +#else
>> +		unsigned	enable_bit	: 1;		/* 31       */
>> +		unsigned	reserved	: 3;		/* 30 .. 28 */
>> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
>> +		unsigned	device_number	: 5;		/* 19 .. 15 */
>> +		unsigned	function_number	: 3;		/* 14 .. 12 */
>> +		unsigned	register_number	: 10;		/* 11 .. 2  */
>> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
>> +#endif
> Just for the records:
> I think we agreed on this before, but using a C bitfield to model
> hardware defined bits is broken, because the C standard doesn't
> guarantee those bits to be consecutive and layed out like we hope it would.
> But since we have this issue already with the legacy config space, and
> it seems to work (TM), we can fix this later.

Still on the record, it's broken for big endian because only the byte order is
different, not the individual bits. But there are other things that are broken for
big endian, so not a big deal.

>
>> +	};
>> +	u32 w;
>> +};
>> +
>> +#else
>> +#define PCI_CFG_SIZE		(1ULL << 24)
>> +#define PCI_DEV_CFG_SIZE	256
> Shall we use PCI_CFG_SPACE_SIZE from the kernel headers here?

I would rather not, for the reasons explained above.

>
>>  
>>  union pci_config_address {
>>  	struct {
>> @@ -58,6 +91,8 @@ union pci_config_address {
>>  	};
>>  	u32 w;
>>  };
>> +#endif
>> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>  
>>  struct msix_table {
>>  	struct msi_msg msg;
>> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>>  	u8	next;
>>  };
>>  
>> +struct pcie_cap {
> I guess this is meant to map to the PCI Express Capability Structure as
> described in the PCIe spec?
> We would need to add the "packed" attribute then. But actually I am not
> a fan of using C language constructs to model specified register
> arrangements, the kernel tries to avoid this as well.

I'm not sure what you are suggesting. Should we rewrite the entire PCI emulation
code in kvmtool then?

>
> Actually, looking closer: why do we need this in the first place? I
> removed this and struct pm_cap, and it still compiles.
> So can we lose those two structures at all? And move the discussion and
> implementation (for VirtIO 1.0?) to a later series?

I've answered both points in v2 of the series [1].

[1] https://www.spinics.net/lists/kvm/msg209601.html

>
>> +	u8 cap;
>> +	u8 next;
>> +	u16 cap_reg;
>> +	u32 dev_cap;
>> +	u16 dev_ctrl;
>> +	u16 dev_status;
>> +	u32 link_cap;
>> +	u16 link_ctrl;
>> +	u16 link_status;
>> +	u32 slot_cap;
>> +	u16 slot_ctrl;
>> +	u16 slot_status;
>> +	u16 root_ctrl;
>> +	u16 root_cap;
>> +	u32 root_status;
>> +};
>> +
>> +struct pm_cap {
>> +	u8 cap;
>> +	u8 next;
>> +	u16 pmc;
>> +	u16 pmcsr;
>> +	u8 pmcsr_bse;
>> +	u8 data;
>> +};
>> +
>>  struct pci_device_header;
>>  
>>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
>> @@ -110,14 +172,12 @@ typedef int (*bar_deactivate_fn_t)(struct kvm *kvm,
>>  				   int bar_num, void *data);
>>  
>>  #define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
>> -#define PCI_DEV_CFG_SIZE	256
>> -#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>  
>>  struct pci_config_operations {
>>  	void (*write)(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> -		      u8 offset, void *data, int sz);
>> +		      u16 offset, void *data, int sz);
>>  	void (*read)(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> -		     u8 offset, void *data, int sz);
>> +		     u16 offset, void *data, int sz);
>>  };
>>  
>>  struct pci_device_header {
>> @@ -147,6 +207,10 @@ struct pci_device_header {
>>  			u8		min_gnt;
>>  			u8		max_lat;
>>  			struct msix_cap msix;
>> +#ifdef ARCH_HAS_PCI_EXP
>> +			struct pm_cap pm;
>> +			struct pcie_cap pcie;
>> +#endif
>>  		} __attribute__((packed));
>>  		/* Pad to PCI config space size */
>>  		u8	__pad[PCI_DEV_CFG_SIZE];
>> diff --git a/pci.c b/pci.c
>> index 68ece65441a6..b471209a6efc 100644
>> --- a/pci.c
>> +++ b/pci.c
>> @@ -400,7 +400,8 @@ static void pci_config_bar_wr(struct kvm *kvm,
>>  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>>  {
>>  	void *base;
>> -	u8 bar, offset;
>> +	u8 bar;
>> +	u16 offset;
>>  	struct pci_device_header *pci_hdr;
>>  	u8 dev_num = addr.device_number;
>>  	u32 value = 0;
>> @@ -439,7 +440,7 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>>  
>>  void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>>  {
>> -	u8 offset;
>> +	u16 offset;
>>  	struct pci_device_header *pci_hdr;
>>  	u8 dev_num = addr.device_number;
>>  
>> diff --git a/vfio/pci.c b/vfio/pci.c
>> index 2b891496547d..6b8726227ea0 100644
>> --- a/vfio/pci.c
>> +++ b/vfio/pci.c
>> @@ -311,7 +311,7 @@ out_unlock:
>>  }
>>  
>>  static void vfio_pci_msix_cap_write(struct kvm *kvm,
>> -				    struct vfio_device *vdev, u8 off,
>> +				    struct vfio_device *vdev, u16 off,
>>  				    void *data, int sz)
>>  {
>>  	struct vfio_pci_device *pdev = &vdev->pci;
>> @@ -343,7 +343,7 @@ static void vfio_pci_msix_cap_write(struct kvm *kvm,
>>  }
>>  
>>  static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
>> -				     u8 off, u8 *data, u32 sz)
>> +				     u16 off, u8 *data, u32 sz)
>>  {
>>  	size_t i;
>>  	u32 mask = 0;
>> @@ -391,7 +391,7 @@ static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
>>  }
>>  
>>  static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
>> -				   u8 off, u8 *data, u32 sz)
>> +				   u16 off, u8 *data, u32 sz)
>>  {
>>  	u8 ctrl;
>>  	struct msi_msg msg;
>> @@ -536,7 +536,7 @@ out:
>>  }
>>  
>>  static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> -			      u8 offset, void *data, int sz)
>> +			      u16 offset, void *data, int sz)
>>  {
>>  	struct vfio_region_info *info;
>>  	struct vfio_pci_device *pdev;
>> @@ -554,7 +554,7 @@ static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr
>>  }
>>  
>>  static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> -			       u8 offset, void *data, int sz)
>> +			       u16 offset, void *data, int sz)
>>  {
>>  	struct vfio_region_info *info;
>>  	struct vfio_pci_device *pdev;
>> @@ -638,15 +638,17 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>  {
>>  	int ret;
>>  	size_t size;
>> -	u8 pos, next;
>> +	u16 pos, next;
>>  	struct pci_cap_hdr *cap;
>> -	u8 virt_hdr[PCI_DEV_CFG_SIZE];
>> +	u8 *virt_hdr;
>>  	struct vfio_pci_device *pdev = &vdev->pci;
>>  
>>  	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>>  		return 0;
>>  
>> -	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>> +	virt_hdr = calloc(1, PCI_DEV_CFG_SIZE);
>> +	if (!virt_hdr)
>> +		return -errno;
> There are two places where we return in this function, we don't seem to
> free virt_hdr in those cases. Looks like a job for your beloved goto ;-)
>
Indeed, I'll do that.

Thanks,
Alex
Andre Przywara May 12, 2020, 2:17 p.m. UTC | #4
On 06/05/2020 14:51, Alexandru Elisei wrote:

Hi,

> On 4/6/20 3:06 PM, André Przywara wrote:
>> On 26/03/2020 15:24, Alexandru Elisei wrote:
>>> PCI Express comes with an extended addressing scheme, which directly
>>> translated into a bigger device configuration space (256->4096 bytes)
>>> and bigger PCI configuration space (16->256 MB), as well as mandatory
>>> capabilities (power management [1] and PCI Express capability [2]).
>>>
>>> However, our virtio PCI implementation implements version 0.9 of the
>>> protocol and it still uses transitional PCI device ID's, so we have
>>> opted to omit the mandatory PCI Express capabilities.For VFIO, the power
>>> management and PCI Express capability are left for a subsequent patch.
>>>
>>> [1] PCI Express Base Specification Revision 1.1, section 7.6
>>> [2] PCI Express Base Specification Revision 1.1, section 7.8
>>>
>>> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
>>> ---
>>>  arm/include/arm-common/kvm-arch.h |  4 +-
>>>  arm/pci.c                         |  2 +-
>>>  builtin-run.c                     |  1 +
>>>  include/kvm/kvm-config.h          |  2 +-
>>>  include/kvm/pci.h                 | 76 ++++++++++++++++++++++++++++---
>>>  pci.c                             |  5 +-
>>>  vfio/pci.c                        | 26 +++++++----
>>>  7 files changed, 96 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/arm/include/arm-common/kvm-arch.h b/arm/include/arm-common/kvm-arch.h
>>> index b9d486d5eac2..13c55fa3dc29 100644
>>> --- a/arm/include/arm-common/kvm-arch.h
>>> +++ b/arm/include/arm-common/kvm-arch.h
>>> @@ -23,7 +23,7 @@
>>>  
>>>  #define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
>>>  #define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
>>> -#define ARM_PCI_CFG_SIZE	(1ULL << 24)
>>> +#define ARM_PCI_CFG_SIZE	(1ULL << 28)
>> The existence of this symbol seems somewhat odd, there should be no ARM
>> specific version of the config space size.
> 
> The existence of the symbol is required because at the moment PCI Express support
> is not available for all architecture, and this file needs to be included in
> pci.h, not the other way around. I agree it's not ideal, but that's the way it is
> right now.
> 
>> Do you know why we don't use the generic PCI_CFG_SIZE here?
>> At the very least I would expect something like:
>> #define ARM_PCI_CFG_SIZE	PCI_EXP_CFG_SIZE
> 
> We don't use PCI_EXP_CFG_SIZE because it might not be available for all
> architectures. It is possible it's not there on x86, like the error you have
> encountered showed, so instead of waiting for someone to report a compilation
> failure, I would rather use a number from the start.

Ah, OK, fair enough then.
I just wanted to avoid the impression that this is something
architecture specific.

>>
>>>  #define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
>>>  				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
>>>  
>>> @@ -50,6 +50,8 @@
>>>  
>>>  #define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
>>>  
>>> +#define ARCH_HAS_PCI_EXP	1
>>> +
>>>  static inline bool arm_addr_in_ioport_region(u64 phys_addr)
>>>  {
>>>  	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
>>> diff --git a/arm/pci.c b/arm/pci.c
>>> index ed325fa4a811..2251f627d8b5 100644
>>> --- a/arm/pci.c
>>> +++ b/arm/pci.c
>>> @@ -62,7 +62,7 @@ void pci__generate_fdt_nodes(void *fdt)
>>>  	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
>>>  	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
>>>  	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
>>> -	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
>>> +	_FDT(fdt_property_string(fdt, "compatible", "pci-host-ecam-generic"));
>>>  	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
>>>  
>>>  	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
>>> diff --git a/builtin-run.c b/builtin-run.c
>>> index 9cb8c75300eb..def8a1f803ad 100644
>>> --- a/builtin-run.c
>>> +++ b/builtin-run.c
>>> @@ -27,6 +27,7 @@
>>>  #include "kvm/irq.h"
>>>  #include "kvm/kvm.h"
>>>  #include "kvm/pci.h"
>>> +#include "kvm/vfio.h"
>>>  #include "kvm/rtc.h"
>>>  #include "kvm/sdl.h"
>>>  #include "kvm/vnc.h"
>>> diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
>>> index a052b0bc7582..a1012c57b7a7 100644
>>> --- a/include/kvm/kvm-config.h
>>> +++ b/include/kvm/kvm-config.h
>>> @@ -2,7 +2,6 @@
>>>  #define KVM_CONFIG_H_
>>>  
>>>  #include "kvm/disk-image.h"
>>> -#include "kvm/vfio.h"
>>>  #include "kvm/kvm-config-arch.h"
>>>  
>>>  #define DEFAULT_KVM_DEV		"/dev/kvm"
>>> @@ -18,6 +17,7 @@
>>>  #define MIN_RAM_SIZE_MB		(64ULL)
>>>  #define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
>>>  
>>> +struct vfio_device_params;
>>>  struct kvm_config {
>>>  	struct kvm_config_arch arch;
>>>  	struct disk_image_params disk_image[MAX_DISK_IMAGES];
>>> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
>>> index be75f77fd2cb..71ee9d8cb01f 100644
>>> --- a/include/kvm/pci.h
>>> +++ b/include/kvm/pci.h
>>> @@ -10,6 +10,7 @@
>>>  #include "kvm/devices.h"
>>>  #include "kvm/msi.h"
>>>  #include "kvm/fdt.h"
>>> +#include "kvm.h"
>>>  
>>>  #define pci_dev_err(pci_hdr, fmt, ...) \
>>>  	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>>> @@ -32,9 +33,41 @@
>>>  #define PCI_CONFIG_BUS_FORWARD	0xcfa
>>>  #define PCI_IO_SIZE		0x100
>>>  #define PCI_IOPORT_START	0x6200
>>> -#define PCI_CFG_SIZE		(1ULL << 24)
>>>  
>>> -struct kvm;
>>> +#define PCIE_CAP_REG_VER	0x1
>>> +#define PCIE_CAP_REG_DEV_LEGACY	(1 << 4)
>>> +#define PM_CAP_VER		0x3
>>> +
>>> +#ifdef ARCH_HAS_PCI_EXP
>>> +#define PCI_CFG_SIZE		(1ULL << 28)
>>> +#define PCI_DEV_CFG_SIZE	4096
>> Maybe use PCI_CFG_SPACE_EXP_SIZE from pci_regs.h?
> 
> I cannot do that because it's not available on all distros. I used
> PCI_CFG_SPACE_EXP_SIZE in the previous iteration, but I changed it when Sami
> reported a compilation failure on his particular setup (it's mentioned in the
> changelog).

OK.

>>
>>> +
>>> +union pci_config_address {
>>> +	struct {
>>> +#if __BYTE_ORDER == __LITTLE_ENDIAN
>>> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
>>> +		unsigned	register_number	: 10;		/* 11 .. 2  */
>>> +		unsigned	function_number	: 3;		/* 14 .. 12 */
>>> +		unsigned	device_number	: 5;		/* 19 .. 15 */
>>> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
>>> +		unsigned	reserved	: 3;		/* 30 .. 28 */
>>> +		unsigned	enable_bit	: 1;		/* 31       */
>>> +#else
>>> +		unsigned	enable_bit	: 1;		/* 31       */
>>> +		unsigned	reserved	: 3;		/* 30 .. 28 */
>>> +		unsigned	bus_number	: 8;		/* 27 .. 20 */
>>> +		unsigned	device_number	: 5;		/* 19 .. 15 */
>>> +		unsigned	function_number	: 3;		/* 14 .. 12 */
>>> +		unsigned	register_number	: 10;		/* 11 .. 2  */
>>> +		unsigned	reg_offset	: 2;		/* 1  .. 0  */
>>> +#endif
>> Just for the records:
>> I think we agreed on this before, but using a C bitfield to model
>> hardware defined bits is broken, because the C standard doesn't
>> guarantee those bits to be consecutive and layed out like we hope it would.
>> But since we have this issue already with the legacy config space, and
>> it seems to work (TM), we can fix this later.
> 
> Still on the record, it's broken for big endian because only the byte order is
> different, not the individual bits. But there are other things that are broken for
> big endian, so not a big deal.
> 
>>
>>> +	};
>>> +	u32 w;
>>> +};
>>> +
>>> +#else
>>> +#define PCI_CFG_SIZE		(1ULL << 24)
>>> +#define PCI_DEV_CFG_SIZE	256
>> Shall we use PCI_CFG_SPACE_SIZE from the kernel headers here?
> 
> I would rather not, for the reasons explained above.
> 
>>
>>>  
>>>  union pci_config_address {
>>>  	struct {
>>> @@ -58,6 +91,8 @@ union pci_config_address {
>>>  	};
>>>  	u32 w;
>>>  };
>>> +#endif
>>> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>>  
>>>  struct msix_table {
>>>  	struct msi_msg msg;
>>> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>>>  	u8	next;
>>>  };
>>>  
>>> +struct pcie_cap {
>> I guess this is meant to map to the PCI Express Capability Structure as
>> described in the PCIe spec?
>> We would need to add the "packed" attribute then. But actually I am not
>> a fan of using C language constructs to model specified register
>> arrangements, the kernel tries to avoid this as well.
> 
> I'm not sure what you are suggesting. Should we rewrite the entire PCI emulation
> code in kvmtool then?

At least not add more of that problematic code, especially if we don't
need it. Maybe there is a better solution for the operations we will
need (byte array?), that's hard to say without seeing the code.

>> Actually, looking closer: why do we need this in the first place? I
>> removed this and struct pm_cap, and it still compiles.
>> So can we lose those two structures at all? And move the discussion and
>> implementation (for VirtIO 1.0?) to a later series?
> 
> I've answered both points in v2 of the series [1].
> 
> [1] https://www.spinics.net/lists/kvm/msg209601.html:

From there:
>> But more importantly: Do we actually need those definitions? We
>> don't seem to use them, do we?
>> And the u8 __pad[PCI_DEV_CFG_SIZE] below should provide the extended
>> storage space a guest would expect?
>
> Yes, we don't use them for the reasons I explained in the commit
> message. I would rather keep them, because they are required by the
> PCIE spec.

I don't get the point of adding code / data structures that we don't
need, especially if it has issues. I understand it's mandatory as per
the spec, but just adding a struct here doesn't fix this or makes this
better.

Cheers,
Andre

> 
>>
>>> +	u8 cap;
>>> +	u8 next;
>>> +	u16 cap_reg;
>>> +	u32 dev_cap;
>>> +	u16 dev_ctrl;
>>> +	u16 dev_status;
>>> +	u32 link_cap;
>>> +	u16 link_ctrl;
>>> +	u16 link_status;
>>> +	u32 slot_cap;
>>> +	u16 slot_ctrl;
>>> +	u16 slot_status;
>>> +	u16 root_ctrl;
>>> +	u16 root_cap;
>>> +	u32 root_status;
>>> +};
>>> +
>>> +struct pm_cap {
>>> +	u8 cap;
>>> +	u8 next;
>>> +	u16 pmc;
>>> +	u16 pmcsr;
>>> +	u8 pmcsr_bse;
>>> +	u8 data;
>>> +};
>>> +
>>>  struct pci_device_header;
>>>  
>>>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
>>> @@ -110,14 +172,12 @@ typedef int (*bar_deactivate_fn_t)(struct kvm *kvm,
>>>  				   int bar_num, void *data);
>>>  
>>>  #define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
>>> -#define PCI_DEV_CFG_SIZE	256
>>> -#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>>  
>>>  struct pci_config_operations {
>>>  	void (*write)(struct kvm *kvm, struct pci_device_header *pci_hdr,
>>> -		      u8 offset, void *data, int sz);
>>> +		      u16 offset, void *data, int sz);
>>>  	void (*read)(struct kvm *kvm, struct pci_device_header *pci_hdr,
>>> -		     u8 offset, void *data, int sz);
>>> +		     u16 offset, void *data, int sz);
>>>  };
>>>  
>>>  struct pci_device_header {
>>> @@ -147,6 +207,10 @@ struct pci_device_header {
>>>  			u8		min_gnt;
>>>  			u8		max_lat;
>>>  			struct msix_cap msix;
>>> +#ifdef ARCH_HAS_PCI_EXP
>>> +			struct pm_cap pm;
>>> +			struct pcie_cap pcie;
>>> +#endif
>>>  		} __attribute__((packed));
>>>  		/* Pad to PCI config space size */
>>>  		u8	__pad[PCI_DEV_CFG_SIZE];
>>> diff --git a/pci.c b/pci.c
>>> index 68ece65441a6..b471209a6efc 100644
>>> --- a/pci.c
>>> +++ b/pci.c
>>> @@ -400,7 +400,8 @@ static void pci_config_bar_wr(struct kvm *kvm,
>>>  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>>>  {
>>>  	void *base;
>>> -	u8 bar, offset;
>>> +	u8 bar;
>>> +	u16 offset;
>>>  	struct pci_device_header *pci_hdr;
>>>  	u8 dev_num = addr.device_number;
>>>  	u32 value = 0;
>>> @@ -439,7 +440,7 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>>>  
>>>  void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>>>  {
>>> -	u8 offset;
>>> +	u16 offset;
>>>  	struct pci_device_header *pci_hdr;
>>>  	u8 dev_num = addr.device_number;
>>>  
>>> diff --git a/vfio/pci.c b/vfio/pci.c
>>> index 2b891496547d..6b8726227ea0 100644
>>> --- a/vfio/pci.c
>>> +++ b/vfio/pci.c
>>> @@ -311,7 +311,7 @@ out_unlock:
>>>  }
>>>  
>>>  static void vfio_pci_msix_cap_write(struct kvm *kvm,
>>> -				    struct vfio_device *vdev, u8 off,
>>> +				    struct vfio_device *vdev, u16 off,
>>>  				    void *data, int sz)
>>>  {
>>>  	struct vfio_pci_device *pdev = &vdev->pci;
>>> @@ -343,7 +343,7 @@ static void vfio_pci_msix_cap_write(struct kvm *kvm,
>>>  }
>>>  
>>>  static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
>>> -				     u8 off, u8 *data, u32 sz)
>>> +				     u16 off, u8 *data, u32 sz)
>>>  {
>>>  	size_t i;
>>>  	u32 mask = 0;
>>> @@ -391,7 +391,7 @@ static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
>>>  }
>>>  
>>>  static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
>>> -				   u8 off, u8 *data, u32 sz)
>>> +				   u16 off, u8 *data, u32 sz)
>>>  {
>>>  	u8 ctrl;
>>>  	struct msi_msg msg;
>>> @@ -536,7 +536,7 @@ out:
>>>  }
>>>  
>>>  static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
>>> -			      u8 offset, void *data, int sz)
>>> +			      u16 offset, void *data, int sz)
>>>  {
>>>  	struct vfio_region_info *info;
>>>  	struct vfio_pci_device *pdev;
>>> @@ -554,7 +554,7 @@ static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr
>>>  }
>>>  
>>>  static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
>>> -			       u8 offset, void *data, int sz)
>>> +			       u16 offset, void *data, int sz)
>>>  {
>>>  	struct vfio_region_info *info;
>>>  	struct vfio_pci_device *pdev;
>>> @@ -638,15 +638,17 @@ static int vfio_pci_parse_caps(struct vfio_device *vdev)
>>>  {
>>>  	int ret;
>>>  	size_t size;
>>> -	u8 pos, next;
>>> +	u16 pos, next;
>>>  	struct pci_cap_hdr *cap;
>>> -	u8 virt_hdr[PCI_DEV_CFG_SIZE];
>>> +	u8 *virt_hdr;
>>>  	struct vfio_pci_device *pdev = &vdev->pci;
>>>  
>>>  	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
>>>  		return 0;
>>>  
>>> -	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
>>> +	virt_hdr = calloc(1, PCI_DEV_CFG_SIZE);
>>> +	if (!virt_hdr)
>>> +		return -errno;
>> There are two places where we return in this function, we don't seem to
>> free virt_hdr in those cases. Looks like a job for your beloved goto ;-)
>>
> Indeed, I'll do that.
> 
> Thanks,
> Alex
>
Alexandru Elisei May 12, 2020, 3:44 p.m. UTC | #5
Hi,

On 5/12/20 3:17 PM, André Przywara wrote:
> On 06/05/2020 14:51, Alexandru Elisei wrote:
>
> Hi,
>
>> On 4/6/20 3:06 PM, André Przywara wrote:
>>> On 26/03/2020 15:24, Alexandru Elisei wrote:
>>>
>>>>  
>>>>  union pci_config_address {
>>>>  	struct {
>>>> @@ -58,6 +91,8 @@ union pci_config_address {
>>>>  	};
>>>>  	u32 w;
>>>>  };
>>>> +#endif
>>>> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>>>  
>>>>  struct msix_table {
>>>>  	struct msi_msg msg;
>>>> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>>>>  	u8	next;
>>>>  };
>>>>  
>>>> +struct pcie_cap {
>>> I guess this is meant to map to the PCI Express Capability Structure as
>>> described in the PCIe spec?
>>> We would need to add the "packed" attribute then. But actually I am not
>>> a fan of using C language constructs to model specified register
>>> arrangements, the kernel tries to avoid this as well.
>> I'm not sure what you are suggesting. Should we rewrite the entire PCI emulation
>> code in kvmtool then?
> At least not add more of that problematic code, especially if we don't

I don't see why how the code is problematic. Did I miss it in a previous comment?

> need it. Maybe there is a better solution for the operations we will
> need (byte array?), that's hard to say without seeing the code.
>
>>> Actually, looking closer: why do we need this in the first place? I
>>> removed this and struct pm_cap, and it still compiles.
>>> So can we lose those two structures at all? And move the discussion and
>>> implementation (for VirtIO 1.0?) to a later series?
>> I've answered both points in v2 of the series [1].
>>
>> [1] https://www.spinics.net/lists/kvm/msg209601.html:
> From there:
>>> But more importantly: Do we actually need those definitions? We
>>> don't seem to use them, do we?
>>> And the u8 __pad[PCI_DEV_CFG_SIZE] below should provide the extended
>>> storage space a guest would expect?
>> Yes, we don't use them for the reasons I explained in the commit
>> message. I would rather keep them, because they are required by the
>> PCIE spec.
> I don't get the point of adding code / data structures that we don't
> need, especially if it has issues. I understand it's mandatory as per
> the spec, but just adding a struct here doesn't fix this or makes this
> better.

Sure, I can remove the unused structs, especially if they have issues. But I don't
see what issues they have, would you mind expanding on that?

Thanks,
Alex
Andre Przywara May 13, 2020, 8:17 a.m. UTC | #6
On 12/05/2020 16:44, Alexandru Elisei wrote:

Hi,

> On 5/12/20 3:17 PM, André Przywara wrote:
>> On 06/05/2020 14:51, Alexandru Elisei wrote:
>>
>> Hi,
>>
>>> On 4/6/20 3:06 PM, André Przywara wrote:
>>>> On 26/03/2020 15:24, Alexandru Elisei wrote:
>>>>
>>>>>  
>>>>>  union pci_config_address {
>>>>>  	struct {
>>>>> @@ -58,6 +91,8 @@ union pci_config_address {
>>>>>  	};
>>>>>  	u32 w;
>>>>>  };
>>>>> +#endif
>>>>> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>>>>  
>>>>>  struct msix_table {
>>>>>  	struct msi_msg msg;
>>>>> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>>>>>  	u8	next;
>>>>>  };
>>>>>  
>>>>> +struct pcie_cap {
>>>> I guess this is meant to map to the PCI Express Capability Structure as
>>>> described in the PCIe spec?
>>>> We would need to add the "packed" attribute then. But actually I am not
>>>> a fan of using C language constructs to model specified register
>>>> arrangements, the kernel tries to avoid this as well.
>>> I'm not sure what you are suggesting. Should we rewrite the entire PCI emulation
>>> code in kvmtool then?
>> At least not add more of that problematic code, especially if we don't
> 
> I don't see why how the code is problematic. Did I miss it in a previous comment?

I was referring to that point that modelling hardware defined registers
using a C struct is somewhat dodgy. As the C standard says, in section
6.7.2.1, end of paragraph 15:
"There may be unnamed padding within a structure object, but not at its
beginning."
Yes, GCC and other implementations offer "packed" to somewhat overcome
this, but this is a compiler extension and has other issues, like if you
have non-aligned members and take pointers to it.

I think our case here is more sane, since we always seem to use it on
normal memory (do we?), and are not mapping it to some actual device memory.

So I leave this up to you, but I am still opposed to the idea of adding
code that is not used.

Cheers,
Andre.

>> need it. Maybe there is a better solution for the operations we will
>> need (byte array?), that's hard to say without seeing the code.
>>
>>>> Actually, looking closer: why do we need this in the first place? I
>>>> removed this and struct pm_cap, and it still compiles.
>>>> So can we lose those two structures at all? And move the discussion and
>>>> implementation (for VirtIO 1.0?) to a later series?
>>> I've answered both points in v2 of the series [1].
>>>
>>> [1] https://www.spinics.net/lists/kvm/msg209601.html:
>> From there:
>>>> But more importantly: Do we actually need those definitions? We
>>>> don't seem to use them, do we?
>>>> And the u8 __pad[PCI_DEV_CFG_SIZE] below should provide the extended
>>>> storage space a guest would expect?
>>> Yes, we don't use them for the reasons I explained in the commit
>>> message. I would rather keep them, because they are required by the
>>> PCIE spec.
>> I don't get the point of adding code / data structures that we don't
>> need, especially if it has issues. I understand it's mandatory as per
>> the spec, but just adding a struct here doesn't fix this or makes this
>> better.
> 
> Sure, I can remove the unused structs, especially if they have issues. But I don't
> see what issues they have, would you mind expanding on that?

The best code is the one not written. ;-)

Cheers,
Andre
Alexandru Elisei May 13, 2020, 2:42 p.m. UTC | #7
Hi,

On 5/13/20 9:17 AM, André Przywara wrote:
> On 12/05/2020 16:44, Alexandru Elisei wrote:
>
> Hi,
>
>> On 5/12/20 3:17 PM, André Przywara wrote:
>>> On 06/05/2020 14:51, Alexandru Elisei wrote:
>>>
>>> Hi,
>>>
>>>> On 4/6/20 3:06 PM, André Przywara wrote:
>>>>> On 26/03/2020 15:24, Alexandru Elisei wrote:
>>>>>
>>>>>>  
>>>>>>  union pci_config_address {
>>>>>>  	struct {
>>>>>> @@ -58,6 +91,8 @@ union pci_config_address {
>>>>>>  	};
>>>>>>  	u32 w;
>>>>>>  };
>>>>>> +#endif
>>>>>> +#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
>>>>>>  
>>>>>>  struct msix_table {
>>>>>>  	struct msi_msg msg;
>>>>>> @@ -100,6 +135,33 @@ struct pci_cap_hdr {
>>>>>>  	u8	next;
>>>>>>  };
>>>>>>  
>>>>>> +struct pcie_cap {
>>>>> I guess this is meant to map to the PCI Express Capability Structure as
>>>>> described in the PCIe spec?
>>>>> We would need to add the "packed" attribute then. But actually I am not
>>>>> a fan of using C language constructs to model specified register
>>>>> arrangements, the kernel tries to avoid this as well.
>>>> I'm not sure what you are suggesting. Should we rewrite the entire PCI emulation
>>>> code in kvmtool then?
>>> At least not add more of that problematic code, especially if we don't
>> I don't see why how the code is problematic. Did I miss it in a previous comment?
> I was referring to that point that modelling hardware defined registers
> using a C struct is somewhat dodgy. As the C standard says, in section
> 6.7.2.1, end of paragraph 15:
> "There may be unnamed padding within a structure object, but not at its
> beginning."
> Yes, GCC and other implementations offer "packed" to somewhat overcome
> this, but this is a compiler extension and has other issues, like if you
> have non-aligned members and take pointers to it.

The struct memory layout is dictated by aapcs64 (ARM IHI 0055B, page 12):

1. The alignment of an aggregate shall be the alignment of its most-aligned member.
2. The size of an aggregate shall be the smallest multiple of its alignment that
is sufficient to hold all of its members.

aapcs also specifies the alignment for each fundamental data type. The exact
wording is also used for armv7 (ARM IHI 0042F). Add to that the fact that the C
standard prohibits struct member reordering, and I don't think it's possible to
have two different layouts that respect the above rules (I'm happy to be proven
wrong, and I'm sure the people who wrote the specification would like to hear
about it). I would be surprised if other architectures didn't have similar rules.

As for the "packed" attribute, I'm also pretty sure it's a red herring. The PCI
spec specifies the registers in such a way that there's no gaps between the
registers and each register starts at a naturally aligned offset.

I do agree that taking pointers to struct members in this case can be problematic
if the register offset is not a multiple of 8.

> I think our case here is more sane, since we always seem to use it on
> normal memory (do we?), and are not mapping it to some actual device memory.

All the PCI configuration registers live in userspace memory, so normal memory, yes.

>
> So I leave this up to you, but I am still opposed to the idea of adding
> code that is not used.
>
> Cheers,
> Andre.
>
>>> need it. Maybe there is a better solution for the operations we will
>>> need (byte array?), that's hard to say without seeing the code.
>>>
>>>>> Actually, looking closer: why do we need this in the first place? I
>>>>> removed this and struct pm_cap, and it still compiles.
>>>>> So can we lose those two structures at all? And move the discussion and
>>>>> implementation (for VirtIO 1.0?) to a later series?
>>>> I've answered both points in v2 of the series [1].
>>>>
>>>> [1] https://www.spinics.net/lists/kvm/msg209601.html:
>>> From there:
>>>>> But more importantly: Do we actually need those definitions? We
>>>>> don't seem to use them, do we?
>>>>> And the u8 __pad[PCI_DEV_CFG_SIZE] below should provide the extended
>>>>> storage space a guest would expect?
>>>> Yes, we don't use them for the reasons I explained in the commit
>>>> message. I would rather keep them, because they are required by the
>>>> PCIE spec.
>>> I don't get the point of adding code / data structures that we don't
>>> need, especially if it has issues. I understand it's mandatory as per
>>> the spec, but just adding a struct here doesn't fix this or makes this
>>> better.
>> Sure, I can remove the unused structs, especially if they have issues. But I don't
>> see what issues they have, would you mind expanding on that?
> The best code is the one not written. ;-)

Truer words were never spoken.

That settles it then, I'll remove the unused structs.

Thanks,
Alex
Alexandru Elisei June 4, 2021, 4:46 p.m. UTC | #8
Hi Andre,

On 5/13/20 3:42 PM, Alexandru Elisei wrote:
> Hi,
>
> On 5/13/20 9:17 AM, André Przywara wrote:
>> On 12/05/2020 16:44, Alexandru Elisei wrote:
>>
>> Hi,
>>
>>> On 5/12/20 3:17 PM, André Przywara wrote:
>>>> On 06/05/2020 14:51, Alexandru Elisei wrote:
>>>>
>>>> Hi,
>>>>
>>>>> On 4/6/20 3:06 PM, André Przywara wrote:
>>>>> [..]
>>>>>> Actually, looking closer: why do we need this in the first place? I
>>>>>> removed this and struct pm_cap, and it still compiles.
>>>>>> So can we lose those two structures at all? And move the discussion and
>>>>>> implementation (for VirtIO 1.0?) to a later series?
>>>>> I've answered both points in v2 of the series [1].
>>>>>
>>>>> [1] https://www.spinics.net/lists/kvm/msg209601.html:
>>>> From there:
>>>>>> But more importantly: Do we actually need those definitions? We
>>>>>> don't seem to use them, do we?
>>>>>> And the u8 __pad[PCI_DEV_CFG_SIZE] below should provide the extended
>>>>>> storage space a guest would expect?
>>>>> Yes, we don't use them for the reasons I explained in the commit
>>>>> message. I would rather keep them, because they are required by the
>>>>> PCIE spec.
>>>> I don't get the point of adding code / data structures that we don't
>>>> need, especially if it has issues. I understand it's mandatory as per
>>>> the spec, but just adding a struct here doesn't fix this or makes this
>>>> better.
>>> Sure, I can remove the unused structs, especially if they have issues. But I don't
>>> see what issues they have, would you mind expanding on that?
>> The best code is the one not written. ;-)
> Truer words were never spoken.
>
> That settles it then, I'll remove the unused structs.

Coming back to this. Without the PCIE capability, Linux will incorrectly set the
size of the PCI configuration space for a device to the legacy PCI size (256 bytes
instead of 4096). This is done in drivers/pci/probe.c::pci_setup_device, where
dev->cfg_size = pci_cfg_size(dev). And pci_cfg_size() is:

int pci_cfg_space_size(struct pci_dev *dev)
{
    int pos;
    u32 status;
    u16 class;

#ifdef CONFIG_PCI_IOV
    /*
     * Per the SR-IOV specification (rev 1.1, sec 3.5), VFs are required to
     * implement a PCIe capability and therefore must implement extended
     * config space.  We can skip the NO_EXTCFG test below and the
     * reachability/aliasing test in pci_cfg_space_size_ext() by virtue of
     * the fact that the SR-IOV capability on the PF resides in extended
     * config space and must be accessible and non-aliased to have enabled
     * support for this VF.  This is a micro performance optimization for
     * systems supporting many VFs.
     */
    if (dev->is_virtfn)
        return PCI_CFG_SPACE_EXP_SIZE;
#endif

    if (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_EXTCFG)
        return PCI_CFG_SPACE_SIZE;

    class = dev->class >> 8;
    if (class == PCI_CLASS_BRIDGE_HOST)
        return pci_cfg_space_size_ext(dev);

    if (pci_is_pcie(dev))
        return pci_cfg_space_size_ext(dev);

    pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
    if (!pos)
        return PCI_CFG_SPACE_SIZE;

    pci_read_config_dword(dev, pos + PCI_X_STATUS, &status);
    if (status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ))
        return pci_cfg_space_size_ext(dev);

    return PCI_CFG_SPACE_SIZE;
}

And pci_is_pcie(dev) returns true if the device has a valid PCIE capability.

Why do we care? The RTL8168 driver checks if the NIC is PCIE capable, and if not
it prints the error message below and falls back to another (proprietary?) method
of device configuration (in
drivers/net/ethernet/realtek/r8169_main.c::rtl_csi_access_enable):

[    1.490530] r8169 0000:00:00.0 enp0s0: No native access to PCI extended config
space, falling back to CSI
[    1.500201] r8169 0000:00:00.0 enp0s0: Link is Down

I'll keep the PCIE cap and drop the power management cap.

Thanks,

Alex
diff mbox series

Patch

diff --git a/arm/include/arm-common/kvm-arch.h b/arm/include/arm-common/kvm-arch.h
index b9d486d5eac2..13c55fa3dc29 100644
--- a/arm/include/arm-common/kvm-arch.h
+++ b/arm/include/arm-common/kvm-arch.h
@@ -23,7 +23,7 @@ 
 
 #define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
 #define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
-#define ARM_PCI_CFG_SIZE	(1ULL << 24)
+#define ARM_PCI_CFG_SIZE	(1ULL << 28)
 #define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
 				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
 
@@ -50,6 +50,8 @@ 
 
 #define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
 
+#define ARCH_HAS_PCI_EXP	1
+
 static inline bool arm_addr_in_ioport_region(u64 phys_addr)
 {
 	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
diff --git a/arm/pci.c b/arm/pci.c
index ed325fa4a811..2251f627d8b5 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -62,7 +62,7 @@  void pci__generate_fdt_nodes(void *fdt)
 	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
 	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
 	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
-	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
+	_FDT(fdt_property_string(fdt, "compatible", "pci-host-ecam-generic"));
 	_FDT(fdt_property(fdt, "dma-coherent", NULL, 0));
 
 	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
diff --git a/builtin-run.c b/builtin-run.c
index 9cb8c75300eb..def8a1f803ad 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -27,6 +27,7 @@ 
 #include "kvm/irq.h"
 #include "kvm/kvm.h"
 #include "kvm/pci.h"
+#include "kvm/vfio.h"
 #include "kvm/rtc.h"
 #include "kvm/sdl.h"
 #include "kvm/vnc.h"
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index a052b0bc7582..a1012c57b7a7 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,7 +2,6 @@ 
 #define KVM_CONFIG_H_
 
 #include "kvm/disk-image.h"
-#include "kvm/vfio.h"
 #include "kvm/kvm-config-arch.h"
 
 #define DEFAULT_KVM_DEV		"/dev/kvm"
@@ -18,6 +17,7 @@ 
 #define MIN_RAM_SIZE_MB		(64ULL)
 #define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
 
+struct vfio_device_params;
 struct kvm_config {
 	struct kvm_config_arch arch;
 	struct disk_image_params disk_image[MAX_DISK_IMAGES];
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index be75f77fd2cb..71ee9d8cb01f 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -10,6 +10,7 @@ 
 #include "kvm/devices.h"
 #include "kvm/msi.h"
 #include "kvm/fdt.h"
+#include "kvm.h"
 
 #define pci_dev_err(pci_hdr, fmt, ...) \
 	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
@@ -32,9 +33,41 @@ 
 #define PCI_CONFIG_BUS_FORWARD	0xcfa
 #define PCI_IO_SIZE		0x100
 #define PCI_IOPORT_START	0x6200
-#define PCI_CFG_SIZE		(1ULL << 24)
 
-struct kvm;
+#define PCIE_CAP_REG_VER	0x1
+#define PCIE_CAP_REG_DEV_LEGACY	(1 << 4)
+#define PM_CAP_VER		0x3
+
+#ifdef ARCH_HAS_PCI_EXP
+#define PCI_CFG_SIZE		(1ULL << 28)
+#define PCI_DEV_CFG_SIZE	4096
+
+union pci_config_address {
+	struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+		unsigned	register_number	: 10;		/* 11 .. 2  */
+		unsigned	function_number	: 3;		/* 14 .. 12 */
+		unsigned	device_number	: 5;		/* 19 .. 15 */
+		unsigned	bus_number	: 8;		/* 27 .. 20 */
+		unsigned	reserved	: 3;		/* 30 .. 28 */
+		unsigned	enable_bit	: 1;		/* 31       */
+#else
+		unsigned	enable_bit	: 1;		/* 31       */
+		unsigned	reserved	: 3;		/* 30 .. 28 */
+		unsigned	bus_number	: 8;		/* 27 .. 20 */
+		unsigned	device_number	: 5;		/* 19 .. 15 */
+		unsigned	function_number	: 3;		/* 14 .. 12 */
+		unsigned	register_number	: 10;		/* 11 .. 2  */
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+#endif
+	};
+	u32 w;
+};
+
+#else
+#define PCI_CFG_SIZE		(1ULL << 24)
+#define PCI_DEV_CFG_SIZE	256
 
 union pci_config_address {
 	struct {
@@ -58,6 +91,8 @@  union pci_config_address {
 	};
 	u32 w;
 };
+#endif
+#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
 
 struct msix_table {
 	struct msi_msg msg;
@@ -100,6 +135,33 @@  struct pci_cap_hdr {
 	u8	next;
 };
 
+struct pcie_cap {
+	u8 cap;
+	u8 next;
+	u16 cap_reg;
+	u32 dev_cap;
+	u16 dev_ctrl;
+	u16 dev_status;
+	u32 link_cap;
+	u16 link_ctrl;
+	u16 link_status;
+	u32 slot_cap;
+	u16 slot_ctrl;
+	u16 slot_status;
+	u16 root_ctrl;
+	u16 root_cap;
+	u32 root_status;
+};
+
+struct pm_cap {
+	u8 cap;
+	u8 next;
+	u16 pmc;
+	u16 pmcsr;
+	u8 pmcsr_bse;
+	u8 data;
+};
+
 struct pci_device_header;
 
 typedef int (*bar_activate_fn_t)(struct kvm *kvm,
@@ -110,14 +172,12 @@  typedef int (*bar_deactivate_fn_t)(struct kvm *kvm,
 				   int bar_num, void *data);
 
 #define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
-#define PCI_DEV_CFG_SIZE	256
-#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
 
 struct pci_config_operations {
 	void (*write)(struct kvm *kvm, struct pci_device_header *pci_hdr,
-		      u8 offset, void *data, int sz);
+		      u16 offset, void *data, int sz);
 	void (*read)(struct kvm *kvm, struct pci_device_header *pci_hdr,
-		     u8 offset, void *data, int sz);
+		     u16 offset, void *data, int sz);
 };
 
 struct pci_device_header {
@@ -147,6 +207,10 @@  struct pci_device_header {
 			u8		min_gnt;
 			u8		max_lat;
 			struct msix_cap msix;
+#ifdef ARCH_HAS_PCI_EXP
+			struct pm_cap pm;
+			struct pcie_cap pcie;
+#endif
 		} __attribute__((packed));
 		/* Pad to PCI config space size */
 		u8	__pad[PCI_DEV_CFG_SIZE];
diff --git a/pci.c b/pci.c
index 68ece65441a6..b471209a6efc 100644
--- a/pci.c
+++ b/pci.c
@@ -400,7 +400,8 @@  static void pci_config_bar_wr(struct kvm *kvm,
 void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
 {
 	void *base;
-	u8 bar, offset;
+	u8 bar;
+	u16 offset;
 	struct pci_device_header *pci_hdr;
 	u8 dev_num = addr.device_number;
 	u32 value = 0;
@@ -439,7 +440,7 @@  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
 
 void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
 {
-	u8 offset;
+	u16 offset;
 	struct pci_device_header *pci_hdr;
 	u8 dev_num = addr.device_number;
 
diff --git a/vfio/pci.c b/vfio/pci.c
index 2b891496547d..6b8726227ea0 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -311,7 +311,7 @@  out_unlock:
 }
 
 static void vfio_pci_msix_cap_write(struct kvm *kvm,
-				    struct vfio_device *vdev, u8 off,
+				    struct vfio_device *vdev, u16 off,
 				    void *data, int sz)
 {
 	struct vfio_pci_device *pdev = &vdev->pci;
@@ -343,7 +343,7 @@  static void vfio_pci_msix_cap_write(struct kvm *kvm,
 }
 
 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
-				     u8 off, u8 *data, u32 sz)
+				     u16 off, u8 *data, u32 sz)
 {
 	size_t i;
 	u32 mask = 0;
@@ -391,7 +391,7 @@  static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
 }
 
 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
-				   u8 off, u8 *data, u32 sz)
+				   u16 off, u8 *data, u32 sz)
 {
 	u8 ctrl;
 	struct msi_msg msg;
@@ -536,7 +536,7 @@  out:
 }
 
 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
-			      u8 offset, void *data, int sz)
+			      u16 offset, void *data, int sz)
 {
 	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev;
@@ -554,7 +554,7 @@  static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr
 }
 
 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
-			       u8 offset, void *data, int sz)
+			       u16 offset, void *data, int sz)
 {
 	struct vfio_region_info *info;
 	struct vfio_pci_device *pdev;
@@ -638,15 +638,17 @@  static int vfio_pci_parse_caps(struct vfio_device *vdev)
 {
 	int ret;
 	size_t size;
-	u8 pos, next;
+	u16 pos, next;
 	struct pci_cap_hdr *cap;
-	u8 virt_hdr[PCI_DEV_CFG_SIZE];
+	u8 *virt_hdr;
 	struct vfio_pci_device *pdev = &vdev->pci;
 
 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
 		return 0;
 
-	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
+	virt_hdr = calloc(1, PCI_DEV_CFG_SIZE);
+	if (!virt_hdr)
+		return -errno;
 
 	pos = pdev->hdr.capabilities & ~3;
 
@@ -682,6 +684,8 @@  static int vfio_pci_parse_caps(struct vfio_device *vdev)
 	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
 
+	free(virt_hdr);
+
 	return 0;
 }
 
@@ -792,7 +796,11 @@  static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
 
 	/* Install our fake Configuration Space */
 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
-	hdr_sz = PCI_DEV_CFG_SIZE;
+	/*
+	 * We don't touch the extended configuration space, let's be cautious
+	 * and not overwrite it all with zeros, or bad things might happen.
+	 */
+	hdr_sz = PCI_CFG_SPACE_SIZE;
 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
 			     hdr_sz);