[v2,kvmtool,27/30] pci: Implement reassignable BARs

Message ID 20200123134805.1993-28-alexandru.elisei@arm.com (mailing list archive)
State New, archived
Series Add reassignable BARs and PCIE 1.1 support

Commit Message

Alexandru Elisei Jan. 23, 2020, 1:48 p.m. UTC
BARs are used by the guest to configure the access to the PCI device by
writing the address to which the device will respond. The basic idea for
adding support for reassignable BARs is straightforward: deactivate
emulation for the memory region described by the old BAR value, and
activate emulation for the new region.

BAR reassignment can be done while device access is enabled, and memory
regions for different devices can overlap as long as no access is made
to the overlapping memory regions. This means that it is legal for the
BARs of two distinct devices to point to an overlapping memory region,
and indeed, this is how Linux does resource assignment at boot. To
account for this situation, the simple algorithm described above is
enhanced to scan all devices and:

- Deactivate emulation for any BARs that might overlap with the new BAR
  value.

- Enable emulation for any BARs that were overlapping with the old value
  after the BAR has been updated.

Activating/deactivating emulation of a memory region has side effects.
To prevent the same callback from being executed twice, we now keep
track of the state of the region emulation. This can happen, for
example, if we program a BAR with an address that overlaps a second
BAR, thus deactivating emulation for the second BAR, and then disable
all region accesses to the second BAR by writing to the command
register.
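
As an illustration (not part of this patch), the scan boils down to a
half-open interval overlap test; the sketch below mirrors the condition
used by pci_deactivate_bar_regions() and pci_activate_bar_regions()
further down, with bar_overlaps_region() being a hypothetical name:

static bool bar_overlaps_region(u32 bar_addr, u32 bar_size,
				u32 start, u32 size)
{
	/* The BAR ends at or before the start of the region. */
	if (bar_addr + bar_size <= start)
		return false;
	/* The BAR starts at or after the end of the region. */
	if (bar_addr >= start + size)
		return false;
	return true;
}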

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
---
 hw/vesa.c           |   6 +-
 include/kvm/pci.h   |  23 +++-
 pci.c               | 274 +++++++++++++++++++++++++++++++++++---------
 powerpc/spapr_pci.c |   2 +-
 vfio/pci.c          |  15 ++-
 virtio/pci.c        |   8 +-
 6 files changed, 261 insertions(+), 67 deletions(-)

Comments

Andre Przywara Feb. 7, 2020, 4:50 p.m. UTC | #1
On Thu, 23 Jan 2020 13:48:02 +0000
Alexandru Elisei <alexandru.elisei@arm.com> wrote:

Hi,

> BARs are used by the guest to configure the access to the PCI device by
> writing the address to which the device will respond. The basic idea for
> adding support for reassignable BARs is straightforward: deactivate
> emulation for the memory region described by the old BAR value, and
> activate emulation for the new region.
> 
> BAR reassignment can be done while device access is enabled, and memory
> regions for different devices can overlap as long as no access is made
> to the overlapping memory regions. This means that it is legal for the
> BARs of two distinct devices to point to an overlapping memory region,
> and indeed, this is how Linux does resource assignment at boot. To
> account for this situation, the simple algorithm described above is
> enhanced to scan all devices and:
> 
> - Deactivate emulation for any BARs that might overlap with the new BAR
>   value.
> 
> - Enable emulation for any BARs that were overlapping with the old value
>   after the BAR has been updated.
> 
> Activating/deactivating emulation of a memory region has side effects.
> To prevent the same callback from being executed twice, we now keep
> track of the state of the region emulation. This can happen, for
> example, if we program a BAR with an address that overlaps a second
> BAR, thus deactivating emulation for the second BAR, and then disable
> all region accesses to the second BAR by writing to the command
> register.
> 
> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
> ---
>  hw/vesa.c           |   6 +-
>  include/kvm/pci.h   |  23 +++-
>  pci.c               | 274 +++++++++++++++++++++++++++++++++++---------
>  powerpc/spapr_pci.c |   2 +-
>  vfio/pci.c          |  15 ++-
>  virtio/pci.c        |   8 +-
>  6 files changed, 261 insertions(+), 67 deletions(-)
> 
> diff --git a/hw/vesa.c b/hw/vesa.c
> index 3044a86078fb..aca938f79c82 100644
> --- a/hw/vesa.c
> +++ b/hw/vesa.c
> @@ -49,7 +49,7 @@ static int vesa__bar_activate(struct kvm *kvm,
>  	int r;
>  
>  	bar_addr = pci__bar_address(pci_hdr, bar_num);
> -	bar_size = pci_hdr->bar_size[bar_num];
> +	bar_size = pci__bar_size(pci_hdr, bar_num);
>  
>  	switch (bar_num) {
>  	case 0:
> @@ -121,9 +121,9 @@ struct framebuffer *vesa__init(struct kvm *kvm)
>  		.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
>  		.subsys_id		= cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
>  		.bar[0]			= cpu_to_le32(port_addr | PCI_BASE_ADDRESS_SPACE_IO),
> -		.bar_size[0]		= PCI_IO_SIZE,
> +		.bar_info[0]		= (struct pci_bar_info) {.size = PCI_IO_SIZE},
>  		.bar[1]			= cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
> -		.bar_size[1]		= VESA_MEM_SIZE,
> +		.bar_info[1]		= (struct pci_bar_info) {.size = VESA_MEM_SIZE},
>  	};
>  
>  	vdev->pci_hdr.cfg_ops = (struct pci_config_operations) {
> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
> index bf42f497168f..ae71ef33237c 100644
> --- a/include/kvm/pci.h
> +++ b/include/kvm/pci.h
> @@ -11,6 +11,17 @@
>  #include "kvm/msi.h"
>  #include "kvm/fdt.h"
>  
> +#define pci_dev_err(pci_hdr, fmt, ...) \
> +	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> +#define pci_dev_warn(pci_hdr, fmt, ...) \
> +	pr_warning("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> +#define pci_dev_info(pci_hdr, fmt, ...) \
> +	pr_info("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> +#define pci_dev_dbg(pci_hdr, fmt, ...) \
> +	pr_debug("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> +#define pci_dev_die(pci_hdr, fmt, ...) \
> +	die("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
> +
>  /*
>   * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
>   * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
> @@ -89,6 +100,11 @@ struct pci_cap_hdr {
>  	u8	next;
>  };
>  
> +struct pci_bar_info {
> +	u32 size;
> +	bool active;
> +};

Do we really need this data structure above?
There is this "32-bit plus 1-bit" annoyance, and a lot of the changes in this patch revolve around it, making the code less pretty.
So what about introducing a bitmap, below in struct pci_device_header? I think we inherited the neat set_bit/test_bit functions from the kernel, so we could use those by just adding something like an "unsigned long bar_enabled;" below.
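
A minimal sketch of that idea, assuming the kernel-style set_bit/test_bit/clear_bit helpers really are available in kvmtool ("bar_enabled" being the tentative field name from above):

	/* In struct pci_device_header, next to bar_size[6]: */
	unsigned long	bar_enabled;	/* one bit per BAR */

	/* In pci_activate_bar(): */
	if (test_bit(bar_num, &pci_hdr->bar_enabled))
		return 0;		/* already active */
	...
	set_bit(bar_num, &pci_hdr->bar_enabled);

	/* In pci_deactivate_bar(): */
	if (!test_bit(bar_num, &pci_hdr->bar_enabled))
		return 0;		/* already inactive */
	...
	clear_bit(bar_num, &pci_hdr->bar_enabled);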

> +
>  struct pci_device_header;
>  
>  typedef int (*bar_activate_fn_t)(struct kvm *kvm,
> @@ -142,7 +158,7 @@ struct pci_device_header {
>  	};
>  
>  	/* Private to lkvm */
> -	u32		bar_size[6];
> +	struct pci_bar_info	bar_info[6];
>  	bar_activate_fn_t	bar_activate_fn;
>  	bar_deactivate_fn_t	bar_deactivate_fn;
>  	void *data;
> @@ -224,4 +240,9 @@ static inline u32 pci__bar_address(struct pci_device_header *pci_hdr, int bar_nu
>  	return __pci__bar_address(pci_hdr->bar[bar_num]);
>  }
>  
> +static inline u32 pci__bar_size(struct pci_device_header *pci_hdr, int bar_num)
> +{
> +	return pci_hdr->bar_info[bar_num].size;
> +}
> +
>  #endif /* KVM__PCI_H */
> diff --git a/pci.c b/pci.c
> index 98331a1fc205..1e9791250bc3 100644
> --- a/pci.c
> +++ b/pci.c
> @@ -68,7 +68,7 @@ void pci__assign_irq(struct device_header *dev_hdr)
>  
>  static bool pci_bar_is_implemented(struct pci_device_header *pci_hdr, int bar_num)
>  {
> -	return  bar_num < 6 && pci_hdr->bar_size[bar_num];
> +	return  bar_num < 6 && pci__bar_size(pci_hdr, bar_num);
>  }
>  
>  static void *pci_config_address_ptr(u16 port)
> @@ -157,6 +157,46 @@ static struct ioport_operations pci_config_data_ops = {
>  	.io_out	= pci_config_data_out,
>  };
>  
> +static int pci_activate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
> +			    int bar_num)
> +{
> +	int r = 0;
> +
> +	if (pci_hdr->bar_info[bar_num].active)
> +		goto out;
> +
> +	r = pci_hdr->bar_activate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
> +	if (r < 0) {
> +		pci_dev_err(pci_hdr, "Error activating emulation for BAR %d",
> +			    bar_num);
> +		goto out;
> +	}
> +	pci_hdr->bar_info[bar_num].active = true;
> +
> +out:
> +	return r;
> +}
> +
> +static int pci_deactivate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
> +			      int bar_num)
> +{
> +	int r = 0;
> +
> +	if (!pci_hdr->bar_info[bar_num].active)
> +		goto out;
> +
> +	r = pci_hdr->bar_deactivate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
> +	if (r < 0) {
> +		pci_dev_err(pci_hdr, "Error deactivating emulation for BAR %d",
> +			    bar_num);
> +		goto out;
> +	}
> +	pci_hdr->bar_info[bar_num].active = false;
> +
> +out:
> +	return r;
> +}
> +
>  static void pci_config_command_wr(struct kvm *kvm,
>  				  struct pci_device_header *pci_hdr,
>  				  u16 new_command)
> @@ -173,26 +213,179 @@ static void pci_config_command_wr(struct kvm *kvm,
>  
>  		if (toggle_io && pci__bar_is_io(pci_hdr, i)) {
>  			if (__pci__io_space_enabled(new_command))
> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
> -							 pci_hdr->data);
> -			else
> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
> -							   pci_hdr->data);
> +				pci_activate_bar(kvm, pci_hdr, i);
> +			if (!__pci__io_space_enabled(new_command))

Isn't that just "else", as before?

> +				pci_deactivate_bar(kvm, pci_hdr, i);
>  		}
>  
>  		if (toggle_mem && pci__bar_is_memory(pci_hdr, i)) {
>  			if (__pci__memory_space_enabled(new_command))
> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
> -							 pci_hdr->data);
> -			else
> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
> -							   pci_hdr->data);
> +				pci_activate_bar(kvm, pci_hdr, i);
> +			if (!__pci__memory_space_enabled(new_command))

Same here?

> +				pci_deactivate_bar(kvm, pci_hdr, i);
>  		}
>  	}
>  
>  	pci_hdr->command = new_command;
>  }
>  
> +static int pci_deactivate_bar_regions(struct kvm *kvm,
> +				      struct pci_device_header *pci_hdr,
> +				      u32 start, u32 size)
> +{
> +	struct device_header *dev_hdr;
> +	struct pci_device_header *tmp_hdr;
> +	u32 tmp_addr, tmp_size;
> +	int i, r;
> +
> +	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
> +	while (dev_hdr) {
> +		tmp_hdr = dev_hdr->data;
> +		for (i = 0; i < 6; i++) {
> +			if (!pci_bar_is_implemented(tmp_hdr, i))
> +				continue;
> +
> +			tmp_addr = pci__bar_address(tmp_hdr, i);
> +			tmp_size = pci__bar_size(tmp_hdr, i);
> +
> +			if (tmp_addr + tmp_size <= start ||
> +			    tmp_addr >= start + size)
> +				continue;
> +
> +			r = pci_deactivate_bar(kvm, tmp_hdr, i);
> +			if (r < 0)
> +				return r;
> +		}
> +		dev_hdr = device__next_dev(dev_hdr);
> +	}
> +
> +	return 0;
> +}
> +
> +static int pci_activate_bar_regions(struct kvm *kvm,
> +				    struct pci_device_header *pci_hdr,
> +				    u32 start, u32 size)
> +{
> +	struct device_header *dev_hdr;
> +	struct pci_device_header *tmp_hdr;
> +	u32 tmp_addr, tmp_size;
> +	int i, r;
> +
> +	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
> +	while (dev_hdr) {
> +		tmp_hdr = dev_hdr->data;
> +		for (i = 0; i < 6; i++) {
> +			if (!pci_bar_is_implemented(tmp_hdr, i))
> +				continue;
> +
> +			tmp_addr = pci__bar_address(tmp_hdr, i);
> +			tmp_size = pci__bar_size(tmp_hdr, i);
> +
> +			if (tmp_addr + tmp_size <= start ||
> +			    tmp_addr >= start + size)
> +				continue;
> +
> +			r = pci_activate_bar(kvm, tmp_hdr, i);
> +			if (r < 0)
> +				return r;
> +		}
> +		dev_hdr = device__next_dev(dev_hdr);
> +	}
> +
> +	return 0;
> +}
> +
> +static void pci_config_bar_wr(struct kvm *kvm,
> +			      struct pci_device_header *pci_hdr, int bar_num,
> +			      u32 value)
> +{
> +	u32 old_addr, new_addr, bar_size;
> +	u32 mask;
> +	int r;
> +
> +	if (pci__bar_is_io(pci_hdr, bar_num))
> +		mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
> +	else
> +		mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
> +
> +	/*
> +	 * If the kernel masks the BAR, it will expect to find the size of the
> +	 * BAR there next time it reads from it. After the kernel reads the
> +	 * size, it will write the address back.
> +	 *
> +	 * According to the PCI local bus specification REV 3.0: The number of
> +	 * upper bits that a device actually implements depends on how much of
> +	 * the address space the device will respond to. A device that wants a 1
> +	 * MB memory address space (using a 32-bit base address register) would
> +	 * build the top 12 bits of the address register, hardwiring the other
> +	 * bits to 0.
> +	 *
> +	 * Furthermore, software can determine how much address space the device
> +	 * requires by writing a value of all 1's to the register and then
> +	 * reading the value back. The device will return 0's in all don't-care
> +	 * address bits, effectively specifying the address space required.
> +	 *
> +	 * Software computes the size of the address space with the formula
> +	 * S =  ~B + 1, where S is the memory size and B is the value read from
> +	 * the BAR. This means that the BAR value that kvmtool should return is
> +	 * B = ~(S - 1).
> +	 */
> +	if (value == 0xffffffff) {
> +		value = ~(pci__bar_size(pci_hdr, bar_num) - 1);
> +		/* Preserve the special bits. */
> +		value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
> +		pci_hdr->bar[bar_num] = value;
> +		return;
> +	}
> +
> +	value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
> +
> +	/* Don't toggle emulation when region type access is disabled. */
> +	if (pci__bar_is_io(pci_hdr, bar_num) &&
> +	    !pci__io_space_enabled(pci_hdr)) {
> +		pci_hdr->bar[bar_num] = value;
> +		return;
> +	}
> +
> +	if (pci__bar_is_memory(pci_hdr, bar_num) &&
> +	    !pci__memory_space_enabled(pci_hdr)) {
> +		pci_hdr->bar[bar_num] = value;
> +		return;
> +	}
> +
> +	old_addr = pci__bar_address(pci_hdr, bar_num);
> +	new_addr = __pci__bar_address(value);
> +	bar_size = pci__bar_size(pci_hdr, bar_num);
> +
> +	r = pci_deactivate_bar(kvm, pci_hdr, bar_num);
> +	if (r < 0)
> +		return;
> +
> +	r = pci_deactivate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
> +	if (r < 0) {
> +		/*
> +		 * We cannot update the BAR because of an overlapping region
> +		 * that failed to deactivate emulation, so keep the old BAR
> +		 * value and re-activate emulation for it.
> +		 */
> +		pci_activate_bar(kvm, pci_hdr, bar_num);
> +		return;
> +	}
> +
> +	pci_hdr->bar[bar_num] = value;
> +	r = pci_activate_bar(kvm, pci_hdr, bar_num);
> +	if (r < 0) {
> +		/*
> +		 * New region cannot be emulated, re-enable the regions that
> +		 * were overlapping.
> +		 */
> +		pci_activate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
> +		return;
> +	}
> +
> +	pci_activate_bar_regions(kvm, pci_hdr, old_addr, bar_size);
> +}
> +
>  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>  {
>  	void *base;
> @@ -200,7 +393,6 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>  	struct pci_device_header *pci_hdr;
>  	u8 dev_num = addr.device_number;
>  	u32 value = 0;
> -	u32 mask;
>  
>  	if (!pci_device_exists(addr.bus_number, dev_num, 0))
>  		return;
> @@ -225,46 +417,13 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>  	}
>  
>  	bar = (offset - PCI_BAR_OFFSET(0)) / sizeof(u32);
> -
> -	/*
> -	 * If the kernel masks the BAR, it will expect to find the size of the
> -	 * BAR there next time it reads from it. After the kernel reads the
> -	 * size, it will write the address back.
> -	 */
>  	if (bar < 6) {
> -		if (pci__bar_is_io(pci_hdr, bar))
> -			mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
> -		else
> -			mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
> -		/*
> -		 * According to the PCI local bus specification REV 3.0:
> -		 * The number of upper bits that a device actually implements
> -		 * depends on how much of the address space the device will
> -		 * respond to. A device that wants a 1 MB memory address space
> -		 * (using a 32-bit base address register) would build the top
> -		 * 12 bits of the address register, hardwiring the other bits
> -		 * to 0.
> -		 *
> -		 * Furthermore, software can determine how much address space
> -		 * the device requires by writing a value of all 1's to the
> -		 * register and then reading the value back. The device will
> -		 * return 0's in all don't-care address bits, effectively
> -		 * specifying the address space required.
> -		 *
> -		 * Software computes the size of the address space with the
> -		 * formula S = ~B + 1, where S is the memory size and B is the
> -		 * value read from the BAR. This means that the BAR value that
> -		 * kvmtool should return is B = ~(S - 1).
> -		 */
>  		memcpy(&value, data, size);
> -		if (value == 0xffffffff)
> -			value = ~(pci_hdr->bar_size[bar] - 1);
> -		/* Preserve the special bits. */
> -		value = (value & mask) | (pci_hdr->bar[bar] & ~mask);
> -		memcpy(base + offset, &value, size);
> -	} else {
> -		memcpy(base + offset, data, size);
> +		pci_config_bar_wr(kvm, pci_hdr, bar, value);
> +		return;
>  	}
> +
> +	memcpy(base + offset, data, size);
>  }
>  
>  void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
> @@ -329,20 +488,21 @@ int pci__register_bar_regions(struct kvm *kvm, struct pci_device_header *pci_hdr
>  			continue;
>  
>  		has_bar_regions = true;
> +		assert(!pci_hdr->bar_info[i].active);
>  
>  		if (pci__bar_is_io(pci_hdr, i) &&
>  		    pci__io_space_enabled(pci_hdr)) {
> -				r = bar_activate_fn(kvm, pci_hdr, i, data);
> -				if (r < 0)
> -					return r;
> -			}
> +			r = pci_activate_bar(kvm, pci_hdr, i);
> +			if (r < 0)
> +				return r;
> +		}
>  
>  		if (pci__bar_is_memory(pci_hdr, i) &&
>  		    pci__memory_space_enabled(pci_hdr)) {
> -				r = bar_activate_fn(kvm, pci_hdr, i, data);
> -				if (r < 0)
> -					return r;
> -			}
> +			r = pci_activate_bar(kvm, pci_hdr, i);
> +			if (r < 0)
> +				return r;
> +		}
>  	}
>  
>  	assert(has_bar_regions);
> diff --git a/powerpc/spapr_pci.c b/powerpc/spapr_pci.c
> index a15f7d895a46..7be44d950acb 100644
> --- a/powerpc/spapr_pci.c
> +++ b/powerpc/spapr_pci.c
> @@ -369,7 +369,7 @@ int spapr_populate_pci_devices(struct kvm *kvm,
>  				of_pci_b_ddddd(devid) |
>  				of_pci_b_fff(fn) |
>  				of_pci_b_rrrrrrrr(bars[i]));
> -			reg[n+1].size = cpu_to_be64(hdr->bar_size[i]);
> +			reg[n+1].size = cpu_to_be64(pci__bar_size(hdr, i));
>  			reg[n+1].addr = 0;
>  
>  			assigned_addresses[n].phys_hi = cpu_to_be32(
> diff --git a/vfio/pci.c b/vfio/pci.c
> index 9e595562180b..3a641e72e574 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c
> @@ -455,6 +455,7 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>  	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
>  	struct vfio_pci_msix_table *table = &pdev->msix_table;
>  	struct vfio_region *region = &vdev->regions[bar_num];
> +	u32 bar_addr;
>  	int ret;
>  
>  	if (!region->info.size) {
> @@ -462,8 +463,11 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>  		goto out;
>  	}
>  
> +	bar_addr = pci__bar_address(pci_hdr, bar_num);
> +
>  	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
>  	    (u32)bar_num == table->bar) {
> +		table->guest_phys_addr = region->guest_phys_addr = bar_addr;

I think those double assignments are a bit frowned upon, at least in Linux coding style. It would probably be cleaner to assign the region member after the error check.
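
For instance, a hypothetical rework along those lines (the error handling is sketched, not taken verbatim from the patch):

	table->guest_phys_addr = bar_addr;
	ret = kvm__register_mmio(kvm, table->guest_phys_addr,
				 table->size, false,
				 vfio_pci_msix_table_access, pdev);
	if (ret < 0)
		goto out;
	/* Only record the address in the region on success. */
	region->guest_phys_addr = bar_addr;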

>  		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
>  					 table->size, false,
>  					 vfio_pci_msix_table_access, pdev);
> @@ -473,13 +477,22 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>  
>  	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
>  	    (u32)bar_num == pba->bar) {
> +		if (pba->bar == table->bar)
> +			pba->guest_phys_addr = table->guest_phys_addr + table->size;
> +		else
> +			pba->guest_phys_addr = region->guest_phys_addr = bar_addr;

same here with the double assignment

>  		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
>  					 pba->size, false,
>  					 vfio_pci_msix_pba_access, pdev);
>  		goto out;
>  	}
>  
> +	if (pci__bar_is_io(pci_hdr, bar_num))
> +		region->port_base = bar_addr;
> +	else
> +		region->guest_phys_addr = bar_addr;

Isn't that redundant with those double assignments above? Maybe you can get rid of those altogether?

Cheers,
Andre

Alexandru Elisei March 10, 2020, 2:17 p.m. UTC | #2
Hi,

On 2/7/20 4:50 PM, Andre Przywara wrote:
> On Thu, 23 Jan 2020 13:48:02 +0000
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
>
> Hi,
>
[...]
>> +struct pci_bar_info {
>> +	u32 size;
>> +	bool active;
>> +};
> Do we really need this data structure above?
> There is this "32-bit plus 1-bit" annoyance, and a lot of the changes in this patch revolve around it, making the code less pretty.
> So what about introducing a bitmap, below in struct pci_device_header? I think we inherited the neat set_bit/test_bit functions from the kernel, so we could use those by just adding something like an "unsigned long bar_enabled;" below.

I think I understand what you are saying. I don't want to use a bitmap, because I
think that's even uglier. I'll try and see how adding an array of bools to struct
pci_device_header and keeping the bar_size member would look.
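
For reference, a minimal sketch of that layout ("bar_active" being a tentative name, nothing settled yet):

	/* Private to lkvm */
	u32	bar_size[6];
	bool	bar_active[6];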

[...]
>>  		if (toggle_io && pci__bar_is_io(pci_hdr, i)) {
>>  			if (__pci__io_space_enabled(new_command))
>> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
>> -							 pci_hdr->data);
>> -			else
>> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
>> -							   pci_hdr->data);
>> +				pci_activate_bar(kvm, pci_hdr, i);
>> +			if (!__pci__io_space_enabled(new_command))
> Isn't that just "else", as before?
>
>> +				pci_deactivate_bar(kvm, pci_hdr, i);
>>  		}
>>  
>>  		if (toggle_mem && pci__bar_is_memory(pci_hdr, i)) {
>>  			if (__pci__memory_space_enabled(new_command))
>> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
>> -							 pci_hdr->data);
>> -			else
>> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
>> -							   pci_hdr->data);
>> +				pci_activate_bar(kvm, pci_hdr, i);
>> +			if (!__pci__memory_space_enabled(new_command))
> Same here?

You're right (same as above).

[...]
>> +	bar_addr = pci__bar_address(pci_hdr, bar_num);
>> +
>>  	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
>>  	    (u32)bar_num == table->bar) {
>> +		table->guest_phys_addr = region->guest_phys_addr = bar_addr;
> I think those double assignments are a bit frowned upon, at least in Linux coding style. It would probably be cleaner to assign the region member after the error check.
>
>>  		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
>>  					 table->size, false,
>>  					 vfio_pci_msix_table_access, pdev);
>> @@ -473,13 +477,22 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>>  
>>  	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
>>  	    (u32)bar_num == pba->bar) {
>> +		if (pba->bar == table->bar)
>> +			pba->guest_phys_addr = table->guest_phys_addr + table->size;
>> +		else
>> +			pba->guest_phys_addr = region->guest_phys_addr = bar_addr;
> same here with the double assignment

Ok, I'll split it.

>
>>  		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
>>  					 pba->size, false,
>>  					 vfio_pci_msix_pba_access, pdev);
>>  		goto out;
>>  	}
>>  
>> +	if (pci__bar_is_io(pci_hdr, bar_num))
>> +		region->port_base = bar_addr;
>> +	else
>> +		region->guest_phys_addr = bar_addr;
> Isn't that redundant with those double assignments above? Maybe you can get rid of those altogether?

I don't think it's redundant, because the double assignments above only happen
when specific conditions are met.

Thanks,
Alex

Patch

diff --git a/hw/vesa.c b/hw/vesa.c
index 3044a86078fb..aca938f79c82 100644
--- a/hw/vesa.c
+++ b/hw/vesa.c
@@ -49,7 +49,7 @@  static int vesa__bar_activate(struct kvm *kvm,
 	int r;
 
 	bar_addr = pci__bar_address(pci_hdr, bar_num);
-	bar_size = pci_hdr->bar_size[bar_num];
+	bar_size = pci__bar_size(pci_hdr, bar_num);
 
 	switch (bar_num) {
 	case 0:
@@ -121,9 +121,9 @@  struct framebuffer *vesa__init(struct kvm *kvm)
 		.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
 		.subsys_id		= cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
 		.bar[0]			= cpu_to_le32(port_addr | PCI_BASE_ADDRESS_SPACE_IO),
-		.bar_size[0]		= PCI_IO_SIZE,
+		.bar_info[0]		= (struct pci_bar_info) {.size = PCI_IO_SIZE},
 		.bar[1]			= cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
-		.bar_size[1]		= VESA_MEM_SIZE,
+		.bar_info[1]		= (struct pci_bar_info) {.size = VESA_MEM_SIZE},
 	};
 
 	vdev->pci_hdr.cfg_ops = (struct pci_config_operations) {
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index bf42f497168f..ae71ef33237c 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -11,6 +11,17 @@ 
 #include "kvm/msi.h"
 #include "kvm/fdt.h"
 
+#define pci_dev_err(pci_hdr, fmt, ...) \
+	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_warn(pci_hdr, fmt, ...) \
+	pr_warning("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_info(pci_hdr, fmt, ...) \
+	pr_info("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_dbg(pci_hdr, fmt, ...) \
+	pr_debug("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+#define pci_dev_die(pci_hdr, fmt, ...) \
+	die("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
+
 /*
  * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
  * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
@@ -89,6 +100,11 @@  struct pci_cap_hdr {
 	u8	next;
 };
 
+struct pci_bar_info {
+	u32 size;
+	bool active;
+};
+
 struct pci_device_header;
 
 typedef int (*bar_activate_fn_t)(struct kvm *kvm,
@@ -142,7 +158,7 @@  struct pci_device_header {
 	};
 
 	/* Private to lkvm */
-	u32		bar_size[6];
+	struct pci_bar_info	bar_info[6];
 	bar_activate_fn_t	bar_activate_fn;
 	bar_deactivate_fn_t	bar_deactivate_fn;
 	void *data;
@@ -224,4 +240,9 @@  static inline u32 pci__bar_address(struct pci_device_header *pci_hdr, int bar_nu
 	return __pci__bar_address(pci_hdr->bar[bar_num]);
 }
 
+static inline u32 pci__bar_size(struct pci_device_header *pci_hdr, int bar_num)
+{
+	return pci_hdr->bar_info[bar_num].size;
+}
+
 #endif /* KVM__PCI_H */
diff --git a/pci.c b/pci.c
index 98331a1fc205..1e9791250bc3 100644
--- a/pci.c
+++ b/pci.c
@@ -68,7 +68,7 @@  void pci__assign_irq(struct device_header *dev_hdr)
 
 static bool pci_bar_is_implemented(struct pci_device_header *pci_hdr, int bar_num)
 {
-	return  bar_num < 6 && pci_hdr->bar_size[bar_num];
+	return  bar_num < 6 && pci__bar_size(pci_hdr, bar_num);
 }
 
 static void *pci_config_address_ptr(u16 port)
@@ -157,6 +157,46 @@  static struct ioport_operations pci_config_data_ops = {
 	.io_out	= pci_config_data_out,
 };
 
+static int pci_activate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			    int bar_num)
+{
+	int r = 0;
+
+	if (pci_hdr->bar_info[bar_num].active)
+		goto out;
+
+	r = pci_hdr->bar_activate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
+	if (r < 0) {
+		pci_dev_err(pci_hdr, "Error activating emulation for BAR %d",
+			    bar_num);
+		goto out;
+	}
+	pci_hdr->bar_info[bar_num].active = true;
+
+out:
+	return r;
+}
+
+static int pci_deactivate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
+			      int bar_num)
+{
+	int r = 0;
+
+	if (!pci_hdr->bar_info[bar_num].active)
+		goto out;
+
+	r = pci_hdr->bar_deactivate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
+	if (r < 0) {
+		pci_dev_err(pci_hdr, "Error deactivating emulation for BAR %d",
+			    bar_num);
+		goto out;
+	}
+	pci_hdr->bar_info[bar_num].active = false;
+
+out:
+	return r;
+}
+
 static void pci_config_command_wr(struct kvm *kvm,
 				  struct pci_device_header *pci_hdr,
 				  u16 new_command)
@@ -173,26 +213,179 @@  static void pci_config_command_wr(struct kvm *kvm,
 
 		if (toggle_io && pci__bar_is_io(pci_hdr, i)) {
 			if (__pci__io_space_enabled(new_command))
-				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
-							 pci_hdr->data);
-			else
-				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
-							   pci_hdr->data);
+				pci_activate_bar(kvm, pci_hdr, i);
+			if (!__pci__io_space_enabled(new_command))
+				pci_deactivate_bar(kvm, pci_hdr, i);
 		}
 
 		if (toggle_mem && pci__bar_is_memory(pci_hdr, i)) {
 			if (__pci__memory_space_enabled(new_command))
-				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
-							 pci_hdr->data);
-			else
-				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
-							   pci_hdr->data);
+				pci_activate_bar(kvm, pci_hdr, i);
+			if (!__pci__memory_space_enabled(new_command))
+				pci_deactivate_bar(kvm, pci_hdr, i);
 		}
 	}
 
 	pci_hdr->command = new_command;
 }
 
+static int pci_deactivate_bar_regions(struct kvm *kvm,
+				      struct pci_device_header *pci_hdr,
+				      u32 start, u32 size)
+{
+	struct device_header *dev_hdr;
+	struct pci_device_header *tmp_hdr;
+	u32 tmp_addr, tmp_size;
+	int i, r;
+
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		tmp_hdr = dev_hdr->data;
+		for (i = 0; i < 6; i++) {
+			if (!pci_bar_is_implemented(tmp_hdr, i))
+				continue;
+
+			tmp_addr = pci__bar_address(tmp_hdr, i);
+			tmp_size = pci__bar_size(tmp_hdr, i);
+
+			if (tmp_addr + tmp_size <= start ||
+			    tmp_addr >= start + size)
+				continue;
+
+			r = pci_deactivate_bar(kvm, tmp_hdr, i);
+			if (r < 0)
+				return r;
+		}
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	return 0;
+}
+
+static int pci_activate_bar_regions(struct kvm *kvm,
+				    struct pci_device_header *pci_hdr,
+				    u32 start, u32 size)
+{
+	struct device_header *dev_hdr;
+	struct pci_device_header *tmp_hdr;
+	u32 tmp_addr, tmp_size;
+	int i, r;
+
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		tmp_hdr = dev_hdr->data;
+		for (i = 0; i < 6; i++) {
+			if (!pci_bar_is_implemented(tmp_hdr, i))
+				continue;
+
+			tmp_addr = pci__bar_address(tmp_hdr, i);
+			tmp_size = pci__bar_size(tmp_hdr, i);
+
+			if (tmp_addr + tmp_size <= start ||
+			    tmp_addr >= start + size)
+				continue;
+
+			r = pci_activate_bar(kvm, tmp_hdr, i);
+			if (r < 0)
+				return r;
+		}
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	return 0;
+}
+
+static void pci_config_bar_wr(struct kvm *kvm,
+			      struct pci_device_header *pci_hdr, int bar_num,
+			      u32 value)
+{
+	u32 old_addr, new_addr, bar_size;
+	u32 mask;
+	int r;
+
+	if (pci__bar_is_io(pci_hdr, bar_num))
+		mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
+	else
+		mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
+
+	/*
+	 * If the kernel masks the BAR, it will expect to find the size of the
+	 * BAR there next time it reads from it. After the kernel reads the
+	 * size, it will write the address back.
+	 *
+	 * According to the PCI local bus specification REV 3.0: The number of
+	 * upper bits that a device actually implements depends on how much of
+	 * the address space the device will respond to. A device that wants a 1
+	 * MB memory address space (using a 32-bit base address register) would
+	 * build the top 12 bits of the address register, hardwiring the other
+	 * bits to 0.
+	 *
+	 * Furthermore, software can determine how much address space the device
+	 * requires by writing a value of all 1's to the register and then
+	 * reading the value back. The device will return 0's in all don't-care
+	 * address bits, effectively specifying the address space required.
+	 *
+	 * Software computes the size of the address space with the formula
+	 * S =  ~B + 1, where S is the memory size and B is the value read from
+	 * the BAR. This means that the BAR value that kvmtool should return is
+	 * B = ~(S - 1).
+	 */
+	if (value == 0xffffffff) {
+		value = ~(pci__bar_size(pci_hdr, bar_num) - 1);
+		/* Preserve the special bits. */
+		value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
+
+	/* Don't toggle emulation when region type access is disabled. */
+	if (pci__bar_is_io(pci_hdr, bar_num) &&
+	    !pci__io_space_enabled(pci_hdr)) {
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	if (pci__bar_is_memory(pci_hdr, bar_num) &&
+	    !pci__memory_space_enabled(pci_hdr)) {
+		pci_hdr->bar[bar_num] = value;
+		return;
+	}
+
+	old_addr = pci__bar_address(pci_hdr, bar_num);
+	new_addr = __pci__bar_address(value);
+	bar_size = pci__bar_size(pci_hdr, bar_num);
+
+	r = pci_deactivate_bar(kvm, pci_hdr, bar_num);
+	if (r < 0)
+		return;
+
+	r = pci_deactivate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
+	if (r < 0) {
+		/*
+		 * We cannot update the BAR because of an overlapping region
+		 * that failed to deactivate emulation, so keep the old BAR
+		 * value and re-activate emulation for it.
+		 */
+		pci_activate_bar(kvm, pci_hdr, bar_num);
+		return;
+	}
+
+	pci_hdr->bar[bar_num] = value;
+	r = pci_activate_bar(kvm, pci_hdr, bar_num);
+	if (r < 0) {
+		/*
+		 * New region cannot be emulated, re-enable the regions that
+		 * were overlapping.
+		 */
+		pci_activate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
+		return;
+	}
+
+	pci_activate_bar_regions(kvm, pci_hdr, old_addr, bar_size);
+}
+
 void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
 {
 	void *base;
@@ -200,7 +393,6 @@  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
 	struct pci_device_header *pci_hdr;
 	u8 dev_num = addr.device_number;
 	u32 value = 0;
-	u32 mask;
 
 	if (!pci_device_exists(addr.bus_number, dev_num, 0))
 		return;
@@ -225,46 +417,13 @@  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
 	}
 
 	bar = (offset - PCI_BAR_OFFSET(0)) / sizeof(u32);
-
-	/*
-	 * If the kernel masks the BAR, it will expect to find the size of the
-	 * BAR there next time it reads from it. After the kernel reads the
-	 * size, it will write the address back.
-	 */
 	if (bar < 6) {
-		if (pci__bar_is_io(pci_hdr, bar))
-			mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
-		else
-			mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
-		/*
-		 * According to the PCI local bus specification REV 3.0:
-		 * The number of upper bits that a device actually implements
-		 * depends on how much of the address space the device will
-		 * respond to. A device that wants a 1 MB memory address space
-		 * (using a 32-bit base address register) would build the top
-		 * 12 bits of the address register, hardwiring the other bits
-		 * to 0.
-		 *
-		 * Furthermore, software can determine how much address space
-		 * the device requires by writing a value of all 1's to the
-		 * register and then reading the value back. The device will
-		 * return 0's in all don't-care address bits, effectively
-		 * specifying the address space required.
-		 *
-		 * Software computes the size of the address space with the
-		 * formula S = ~B + 1, where S is the memory size and B is the
-		 * value read from the BAR. This means that the BAR value that
-		 * kvmtool should return is B = ~(S - 1).
-		 */
 		memcpy(&value, data, size);
-		if (value == 0xffffffff)
-			value = ~(pci_hdr->bar_size[bar] - 1);
-		/* Preserve the special bits. */
-		value = (value & mask) | (pci_hdr->bar[bar] & ~mask);
-		memcpy(base + offset, &value, size);
-	} else {
-		memcpy(base + offset, data, size);
+		pci_config_bar_wr(kvm, pci_hdr, bar, value);
+		return;
 	}
+
+	memcpy(base + offset, data, size);
 }
 
 void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
@@ -329,20 +488,21 @@  int pci__register_bar_regions(struct kvm *kvm, struct pci_device_header *pci_hdr
 			continue;
 
 		has_bar_regions = true;
+		assert(!pci_hdr->bar_info[i].active);
 
 		if (pci__bar_is_io(pci_hdr, i) &&
 		    pci__io_space_enabled(pci_hdr)) {
-				r = bar_activate_fn(kvm, pci_hdr, i, data);
-				if (r < 0)
-					return r;
-			}
+			r = pci_activate_bar(kvm, pci_hdr, i);
+			if (r < 0)
+				return r;
+		}
 
 		if (pci__bar_is_memory(pci_hdr, i) &&
 		    pci__memory_space_enabled(pci_hdr)) {
-				r = bar_activate_fn(kvm, pci_hdr, i, data);
-				if (r < 0)
-					return r;
-			}
+			r = pci_activate_bar(kvm, pci_hdr, i);
+			if (r < 0)
+				return r;
+		}
 	}
 
 	assert(has_bar_regions);
diff --git a/powerpc/spapr_pci.c b/powerpc/spapr_pci.c
index a15f7d895a46..7be44d950acb 100644
--- a/powerpc/spapr_pci.c
+++ b/powerpc/spapr_pci.c
@@ -369,7 +369,7 @@  int spapr_populate_pci_devices(struct kvm *kvm,
 				of_pci_b_ddddd(devid) |
 				of_pci_b_fff(fn) |
 				of_pci_b_rrrrrrrr(bars[i]));
-			reg[n+1].size = cpu_to_be64(hdr->bar_size[i]);
+			reg[n+1].size = cpu_to_be64(pci__bar_size(hdr, i));
 			reg[n+1].addr = 0;
 
 			assigned_addresses[n].phys_hi = cpu_to_be32(
diff --git a/vfio/pci.c b/vfio/pci.c
index 9e595562180b..3a641e72e574 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -455,6 +455,7 @@  static int vfio_pci_bar_activate(struct kvm *kvm,
 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
 	struct vfio_pci_msix_table *table = &pdev->msix_table;
 	struct vfio_region *region = &vdev->regions[bar_num];
+	u32 bar_addr;
 	int ret;
 
 	if (!region->info.size) {
@@ -462,8 +463,11 @@  static int vfio_pci_bar_activate(struct kvm *kvm,
 		goto out;
 	}
 
+	bar_addr = pci__bar_address(pci_hdr, bar_num);
+
 	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
 	    (u32)bar_num == table->bar) {
+		table->guest_phys_addr = region->guest_phys_addr = bar_addr;
 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
 					 table->size, false,
 					 vfio_pci_msix_table_access, pdev);
@@ -473,13 +477,22 @@  static int vfio_pci_bar_activate(struct kvm *kvm,
 
 	if ((pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) &&
 	    (u32)bar_num == pba->bar) {
+		if (pba->bar == table->bar)
+			pba->guest_phys_addr = table->guest_phys_addr + table->size;
+		else
+			pba->guest_phys_addr = region->guest_phys_addr = bar_addr;
 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
 					 pba->size, false,
 					 vfio_pci_msix_pba_access, pdev);
 		goto out;
 	}
 
+	if (pci__bar_is_io(pci_hdr, bar_num))
+		region->port_base = bar_addr;
+	else
+		region->guest_phys_addr = bar_addr;
 	ret = vfio_map_region(kvm, vdev, region);
+
 out:
 	return ret;
 }
@@ -749,7 +762,7 @@  static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
 		if (!base)
 			continue;
 
-		pdev->hdr.bar_size[i] = region->info.size;
+		pdev->hdr.bar_info[i].size = region->info.size;
 	}
 
 	/* I really can't be bothered to support cardbus. */
diff --git a/virtio/pci.c b/virtio/pci.c
index 5a3cc6f1e943..e02430881394 100644
--- a/virtio/pci.c
+++ b/virtio/pci.c
@@ -483,7 +483,7 @@  static int virtio_pci__bar_activate(struct kvm *kvm,
 	int r;
 
 	bar_addr = pci__bar_address(pci_hdr, bar_num);
-	bar_size = pci_hdr->bar_size[bar_num];
+	bar_size = pci__bar_size(pci_hdr, bar_num);
 
 	switch (bar_num) {
 	case 0:
@@ -569,9 +569,9 @@  int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
 							| PCI_BASE_ADDRESS_SPACE_MEMORY),
 		.status			= cpu_to_le16(PCI_STATUS_CAP_LIST),
 		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
-		.bar_size[0]		= cpu_to_le32(PCI_IO_SIZE),
-		.bar_size[1]		= cpu_to_le32(PCI_IO_SIZE),
-		.bar_size[2]		= cpu_to_le32(PCI_IO_SIZE*2),
+		.bar_info[0]		= (struct pci_bar_info) {.size = cpu_to_le32(PCI_IO_SIZE)},
+		.bar_info[1]		= (struct pci_bar_info) {.size = cpu_to_le32(PCI_IO_SIZE)},
+		.bar_info[2]		= (struct pci_bar_info) {.size = cpu_to_le32(PCI_IO_SIZE*2)},
 	};
 
 	r = pci__register_bar_regions(kvm, &vpci->pci_hdr,