Message ID | 1584880394-11184-3-git-send-email-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio/pci: expose device's PASID capability to VMs | expand |
On Sun, 22 Mar 2020 05:33:14 -0700 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > From: Liu Yi L <yi.l.liu@intel.com> > > Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the > PF PASID configuration is shared by its VFs. VFs must not implement their > own PASID Capability. > > Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If > the PF implements PRI, it is shared by the VFs. > > On bare metal, it has been fixed by below efforts. > to PASID/PRI are > https://lkml.org/lkml/2019/9/5/996 > https://lkml.org/lkml/2019/9/5/995 > > This patch adds emulated PASID/PRI capabilities for VFs when assigned to > VMs via vfio-pci driver. This is required for enabling vSVA on pass-through > VFs as VFs have no PASID/PRI capability structure in its configure space. > > Cc: Kevin Tian <kevin.tian@intel.com> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Alex Williamson <alex.williamson@redhat.com> > Cc: Eric Auger <eric.auger@redhat.com> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > --- > drivers/vfio/pci/vfio_pci_config.c | 325 ++++++++++++++++++++++++++++++++++++- > 1 file changed, 323 insertions(+), 2 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > index 4b9af99..84b4ea0 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) > return 0; > } > > +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, > + int offset, uint8_t *data, int size) > +{ > + int ret = 0, data_offset = 0; > + > + while (size) { > + int filled; > + > + if (size >= 4 && !(offset % 4)) { > + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; > + u32 dword; > + > + memcpy(&dword, data + data_offset, 4); > + *dwordp = cpu_to_le32(dword); Why wouldn't we do: *dwordp = cpu_to_le32(*(u32 *)(data + data_offset)); or better 
yet, increment data on each iteration for: *dwordp = cpu_to_le32(*(u32 *)data); vfio_fill_vconfig_bytes() does almost this same thing, getting the data from config space rather than a buffer, so please figure out how to avoid duplicating the logic. > + filled = 4; > + } else if (size >= 2 && !(offset % 2)) { > + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; > + u16 word; > + > + memcpy(&word, data + data_offset, 2); > + *wordp = cpu_to_le16(word); > + filled = 2; > + } else { > + u8 *byte = &vdev->vconfig[offset]; > + > + memcpy(byte, data + data_offset, 1); This one is really silly. vdev->vconfig[offset] = *data; > + filled = 1; > + } > + > + offset += filled; > + data_offset += filled; > + size -= filled; > + } > + > + return ret; > +} > + > +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, > + int cap, int cap_len, u8 *content) > +{ > + int pos, offset, len = cap_len, ret = 0; > + > + pos = pci_find_ext_capability(pdev, cap); > + if (!pos) > + return -EINVAL; > + > + offset = 0; > + while (len) { > + int fetched; > + > + if (len >= 4 && !(pos % 4)) { > + u32 *dwordp = (u32 *) (content + offset); > + u32 dword; > + __le32 *dwptr = (__le32 *) &dword; > + > + ret = pci_read_config_dword(pdev, pos, &dword); > + if (ret) > + return ret; > + *dwordp = le32_to_cpu(*dwptr); WHAT??? pci_read_config_dword() returns cpu endian data! 
> + fetched = 4; > + } else if (len >= 2 && !(pos % 2)) { > + u16 *wordp = (u16 *) (content + offset); > + u16 word; > + __le16 *wptr = (__le16 *) &word; > + > + ret = pci_read_config_word(pdev, pos, &word); > + if (ret) > + return ret; > + *wordp = le16_to_cpu(*wptr); > + fetched = 2; > + } else { > + u8 *byte = (u8 *) (content + offset); > + > + ret = pci_read_config_byte(pdev, pos, byte); > + if (ret) > + return ret; > + fetched = 1; > + } > + > + pos += fetched; > + offset += fetched; > + len -= fetched; > + } > + > + return ret; > +} > + > +struct vfio_pci_pasid_cap_data { > + u32 id:16; > + u32 version:4; > + u32 next:12; > + union { > + u16 cap_reg_val; > + struct { > + u16 rsv1:1; > + u16 execs:1; > + u16 prvs:1; > + u16 rsv2:5; > + u16 pasid_bits:5; > + u16 rsv3:3; > + }; > + } cap_reg; > + union { > + u16 control_reg_val; > + struct { > + u16 paside:1; > + u16 exece:1; > + u16 prve:1; > + u16 rsv4:13; > + }; > + } control_reg; > +}; > + > +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, > + u16 epos, u16 *next, __le32 **prevp) > +{ > + u8 *map = vdev->pci_config_map; > + int ecap = PCI_EXT_CAP_ID_PASID; > + int len = pci_ext_cap_length[ecap]; > + struct vfio_pci_pasid_cap_data pasid_cap; > + struct vfio_pci_pasid_cap_data vpasid_cap; > + int ret; > + > + /* > + * If no cap filled in this function, should make sure the next > + * pointer points to current epos. > + */ > + *next = epos; > + > + if (!len) { > + pr_info("%s: VF %s hiding PASID capability\n", > + __func__, dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } No! Why? This is dead code. > + > + /* Add PASID capability*/ > + ret = vfio_pci_get_ecap_content(pdev, ecap, > + len, (u8 *)&pasid_cap); Why wouldn't we just use vfio_fill_config_bytes() rather than this ridiculous approach of coping it to a buffer, modifying it and copying it out? 
> + if (ret) > + goto out; > + > + if (!pasid_cap.control_reg.paside) { > + pr_debug("%s: its PF's PASID capability is not enabled\n", > + dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } What happens if the PF's PASID gets disabled while we're using it?? > + > + memcpy(&vpasid_cap, &pasid_cap, len); > + > + vpasid_cap.id = 0x18; What is going on here? #define PCI_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Reporting */ > + vpasid_cap.next = 0; > + /* clear the control reg for guest */ > + memset(&vpasid_cap.control_reg, 0x0, > + sizeof(vpasid_cap.control_reg)); So we zero a couple registers and that's why we need a structure to define the entire capability and this crazy copy to a cpu endian buffer. No. > + > + memset(map + epos, vpasid_cap.id, len); See below. > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > + (u8 *)&vpasid_cap, len); > + if (!ret) { > + /* > + * Successfully filled in PASID cap, update > + * the next offset in previous cap header, > + * and also update caller about the offset > + * of next cap if any. > + */ > + u32 val = epos; > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > + **prevp |= cpu_to_le32(val << 20); > + *prevp = (__le32 *)&vdev->vconfig[epos]; > + *next = epos + len; Could we make this any more complicated? 
> + } > + > +out: > + return ret; > +} > + > +struct vfio_pci_pri_cap_data { > + u32 id:16; > + u32 version:4; > + u32 next:12; > + union { > + u16 control_reg_val; > + struct { > + u16 enable:1; > + u16 reset:1; > + u16 rsv1:14; > + }; > + } control_reg; > + union { > + u16 status_reg_val; > + struct { > + u16 rf:1; > + u16 uprgi:1; > + u16 rsv2:6; > + u16 stop:1; > + u16 rsv3:6; > + u16 pasid_required:1; > + }; > + } status_reg; > + u32 prq_capacity; > + u32 prq_quota; > +}; > + > +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, > + u16 epos, u16 *next, __le32 **prevp) > +{ > + u8 *map = vdev->pci_config_map; > + int ecap = PCI_EXT_CAP_ID_PRI; > + int len = pci_ext_cap_length[ecap]; > + struct vfio_pci_pri_cap_data pri_cap; > + struct vfio_pci_pri_cap_data vpri_cap; > + int ret; > + > + /* > + * If no cap filled in this function, should make sure the next > + * pointer points to current epos. > + */ > + *next = epos; > + > + if (!len) { > + pr_info("%s: VF %s hiding PRI capability\n", > + __func__, dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } > + > + /* Add PASID capability*/ > + ret = vfio_pci_get_ecap_content(pdev, ecap, > + len, (u8 *)&pri_cap); > + if (ret) > + goto out; > + > + if (!pri_cap.control_reg.enable) { > + pr_debug("%s: its PF's PRI capability is not enabled\n", > + dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } > + > + memcpy(&vpri_cap, &pri_cap, len); > + > + vpri_cap.id = 0x19; #define PCI_EXT_CAP_ID_SECPCI 0x19 /* Secondary PCIe Capability */ ??? 
> + vpri_cap.next = 0; > + /* clear the control reg for guest */ > + memset(&vpri_cap.control_reg, 0x0, > + sizeof(vpri_cap.control_reg)); > + > + memset(map + epos, vpri_cap.id, len); > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > + (u8 *)&vpri_cap, len); > + if (!ret) { > + /* > + * Successfully filled in PASID cap, update > + * the next offset in previous cap header, > + * and also update caller about the offset > + * of next cap if any. > + */ > + u32 val = epos; > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > + **prevp |= cpu_to_le32(val << 20); > + *prevp = (__le32 *)&vdev->vconfig[epos]; > + *next = epos + len; > + } > + > +out: > + return ret; > +} > + > +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, u16 start_epos, __le32 *prev) > +{ > + __le32 *__prev = prev; > + u16 epos = start_epos, epos_next = start_epos; > + int ret = 0; > + > + /* Add PASID capability*/ > + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, > + &epos_next, &__prev); > + if (ret) > + return ret; > + > + /* Add PRI capability */ > + epos = epos_next; > + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, > + &epos_next, &__prev); > + > + return ret; > +} > + > static int vfio_ecap_init(struct vfio_pci_device *vdev) > { > struct pci_dev *pdev = vdev->pdev; > u8 *map = vdev->pci_config_map; > - u16 epos; > + u16 epos, epos_max; > __le32 *prev = NULL; > int loops, ret, ecaps = 0; > > @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > return 0; > > epos = PCI_CFG_SPACE_SIZE; > + epos_max = PCI_CFG_SPACE_SIZE; > > loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; > > @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > } > } > > + if (epos_max <= (epos + len)) > + epos_max = epos + len; > + > if (!len) { > pci_info(pdev, "%s: hiding ecap %#x@%#x\n", > __func__, ecap, epos); > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > if (!ecaps) > 
*(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > +#ifdef CONFIG_PCI_ATS > + if (pdev->is_virtfn) { > + struct pci_dev *physfn = pdev->physfn; > + > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > + physfn, epos_max, prev); > + if (ret) > + pr_info("%s, failed to add special caps for VF %s\n", > + __func__, dev_name(&vdev->pdev->dev)); > + } > +#endif I can only imagine that we should place the caps at the same location they exist on the PF, we don't know what hidden registers might be hiding in config space. > + > return 0; > } > > @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, > return i; > } > > +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) > +{ > +#ifdef CONFIG_PCI_ATS > + return (pdev->is_virtfn && > + (cap_id == PCI_EXT_CAP_ID_PASID || > + cap_id == PCI_EXT_CAP_ID_PRI)); > +#else > + return false; > +#endif > +} > + > static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > size_t count, loff_t *ppos, bool iswrite) > { > @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > if (cap_id == PCI_CAP_ID_INVALID) { > perm = &unassigned_perms; > cap_start = *ppos; > - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { > + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || > + vfio_pci_need_virt_perm(pdev, cap_id)) { Why not simply fill the map with PCI_CAP_ID_INVALID_VIRT? > perm = &virt_perms; > cap_start = *ppos; > } else { This is really not close to acceptable as is. Sorry. Thanks, Alex
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 7:00 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs > > On Sun, 22 Mar 2020 05:33:14 -0700 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the > > PF PASID configuration is shared by its VFs. VFs must not implement their > > own PASID Capability. > > > > Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If > > the PF implements PRI, it is shared by the VFs. > > > > On bare metal, it has been fixed by below efforts. > > to PASID/PRI are > > https://lkml.org/lkml/2019/9/5/996 > > https://lkml.org/lkml/2019/9/5/995 > > > > This patch adds emulated PASID/PRI capabilities for VFs when assigned to > > VMs via vfio-pci driver. This is required for enabling vSVA on pass-through > > VFs as VFs have no PASID/PRI capability structure in its configure space. 
> > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > --- > > drivers/vfio/pci/vfio_pci_config.c | 325 > ++++++++++++++++++++++++++++++++++++- > > 1 file changed, 323 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > > index 4b9af99..84b4ea0 100644 > > --- a/drivers/vfio/pci/vfio_pci_config.c > > +++ b/drivers/vfio/pci/vfio_pci_config.c > > @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) > > return 0; > > } > > > > +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, > > + int offset, uint8_t *data, int size) > > +{ > > + int ret = 0, data_offset = 0; > > + > > + while (size) { > > + int filled; > > + > > + if (size >= 4 && !(offset % 4)) { > > + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; > > + u32 dword; > > + > > + memcpy(&dword, data + data_offset, 4); > > + *dwordp = cpu_to_le32(dword); > > Why wouldn't we do: > > *dwordp = cpu_to_le32(*(u32 *)(data + data_offset)); > > or better yet, increment data on each iteration for: > > *dwordp = cpu_to_le32(*(u32 *)data); I'll refine it. > vfio_fill_vconfig_bytes() does almost this same thing, getting > the data > from config space rather than a buffer, so please figure out how to > avoid duplicating the logic. This patch is to emulate the PASID/PRI capability for VF. And per my understanding, the cap data from PF's config space cannot be filled to VF's vconfig directly. Take the control register in PASID capability structure as an example. If PASID cap is enabled in PF, the PASID enable bit in the control register will be set. If the cap data is filled to VF's vconfig directly, then guest will see a default enabled PASID capability in the VF. 
I guess it is not good. But, I may be wrong. If no such requirement, I can use vfio_fill_vconfig_bytes() directly, no need to do copy and modify and then copy. Also, if still needs to do the copy and modification, I may modify the vfio_fill_vconfig_bytes() to pass in the buffer and an indicator to tell vfio_fill_vconfig_bytes() whether fetch data from buffer or from config space. > > + filled = 4; > > + } else if (size >= 2 && !(offset % 2)) { > > + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; > > + u16 word; > > + > > + memcpy(&word, data + data_offset, 2); > > + *wordp = cpu_to_le16(word); > > + filled = 2; > > + } else { > > + u8 *byte = &vdev->vconfig[offset]; > > + > > + memcpy(byte, data + data_offset, 1); > > This one is really silly. > > vdev->vconfig[offset] = *data; got it. > > + filled = 1; > > + } > > + > > + offset += filled; > > + data_offset += filled; > > + size -= filled; > > + } > > + > > + return ret; > > +} > > + > > +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, > > + int cap, int cap_len, u8 *content) > > +{ > > + int pos, offset, len = cap_len, ret = 0; > > + > > + pos = pci_find_ext_capability(pdev, cap); > > + if (!pos) > > + return -EINVAL; > > + > > + offset = 0; > > + while (len) { > > + int fetched; > > + > > + if (len >= 4 && !(pos % 4)) { > > + u32 *dwordp = (u32 *) (content + offset); > > + u32 dword; > > + __le32 *dwptr = (__le32 *) &dword; > > + > > + ret = pci_read_config_dword(pdev, pos, &dword); > > + if (ret) > > + return ret; > > + *dwordp = le32_to_cpu(*dwptr); > > WHAT??? pci_read_config_dword() returns cpu endian data! my bad. will remove the silly le32_to_cpu() convert. 
> > + fetched = 4; > > + } else if (len >= 2 && !(pos % 2)) { > > + u16 *wordp = (u16 *) (content + offset); > > + u16 word; > > + __le16 *wptr = (__le16 *) &word; > > + > > + ret = pci_read_config_word(pdev, pos, &word); > > + if (ret) > > + return ret; > > + *wordp = le16_to_cpu(*wptr); > > + fetched = 2; > > + } else { > > + u8 *byte = (u8 *) (content + offset); > > + > > + ret = pci_read_config_byte(pdev, pos, byte); > > + if (ret) > > + return ret; > > + fetched = 1; > > + } > > + > > + pos += fetched; > > + offset += fetched; > > + len -= fetched; > > + } > > + > > + return ret; > > +} > > + > > +struct vfio_pci_pasid_cap_data { > > + u32 id:16; > > + u32 version:4; > > + u32 next:12; > > + union { > > + u16 cap_reg_val; > > + struct { > > + u16 rsv1:1; > > + u16 execs:1; > > + u16 prvs:1; > > + u16 rsv2:5; > > + u16 pasid_bits:5; > > + u16 rsv3:3; > > + }; > > + } cap_reg; > > + union { > > + u16 control_reg_val; > > + struct { > > + u16 paside:1; > > + u16 exece:1; > > + u16 prve:1; > > + u16 rsv4:13; > > + }; > > + } control_reg; > > +}; > > + > > +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev, > > + struct pci_dev *pdev, > > + u16 epos, u16 *next, __le32 **prevp) > > +{ > > + u8 *map = vdev->pci_config_map; > > + int ecap = PCI_EXT_CAP_ID_PASID; > > + int len = pci_ext_cap_length[ecap]; > > + struct vfio_pci_pasid_cap_data pasid_cap; > > + struct vfio_pci_pasid_cap_data vpasid_cap; > > + int ret; > > + > > + /* > > + * If no cap filled in this function, should make sure the next > > + * pointer points to current epos. > > + */ > > + *next = epos; > > + > > + if (!len) { > > + pr_info("%s: VF %s hiding PASID capability\n", > > + __func__, dev_name(&vdev->pdev->dev)); > > + ret = 0; > > + goto out; > > + } > > No! Why? This is dead code. will remove it. thanks. 
> > + > > + /* Add PASID capability*/ > > + ret = vfio_pci_get_ecap_content(pdev, ecap, > > + len, (u8 *)&pasid_cap); > > > Why wouldn't we just use vfio_fill_config_bytes() rather than this > ridiculous approach of coping it to a buffer, modifying it and copying > it out? As the above reply, if copying PF's config space data to VF's vconfig directly is fine, then I can drop them. > > + if (ret) > > + goto out; > > + > > + if (!pasid_cap.control_reg.paside) { > > + pr_debug("%s: its PF's PASID capability is not enabled\n", > > + dev_name(&vdev->pdev->dev)); > > + ret = 0; > > + goto out; > > + } > > What happens if the PF's PASID gets disabled while we're using it?? This is actually the open I highlighted in cover letter. Per the reply from Baolu, this seems to be an open for bare-metal all the same. https://lkml.org/lkml/2020/3/31/95 > > + > > + memcpy(&vpasid_cap, &pasid_cap, len); > > + > > + vpasid_cap.id = 0x18; > > What is going on here? > > #define PCI_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Reporting */ my bad... This one was used in development phase. not needed in formal patch. really sorry for the mislead.. will remove it. > > + vpasid_cap.next = 0; > > + /* clear the control reg for guest */ > > + memset(&vpasid_cap.control_reg, 0x0, > > + sizeof(vpasid_cap.control_reg)); > > So we zero a couple registers and that's why we need a structure to > define the entire capability and this crazy copy to a cpu endian > buffer. No. there are two reasons for define a structure for the capability. For one, to check if PASID capability is enabled in PF. For two, to zero the control registers to ensure the guest see a clean control register. But if it is not necessary to do the two operations, it can be dropped. > > + > > + memset(map + epos, vpasid_cap.id, len); > > See below. 
> > > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > > + (u8 *)&vpasid_cap, len); > > + if (!ret) { > > + /* > > + * Successfully filled in PASID cap, update > > + * the next offset in previous cap header, > > + * and also update caller about the offset > > + * of next cap if any. > > + */ > > + u32 val = epos; > > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > > + **prevp |= cpu_to_le32(val << 20); > > + *prevp = (__le32 *)&vdev->vconfig[epos]; > > + *next = epos + len; > > Could we make this any more complicated? yeah, I should have added some comments. /* update the next field of prior cap structure */ **prevp &= cpu_to_le32(~(0xffcU << 20)); **prevp |= cpu_to_le32(val << 20); /* update the prevp pointer to point to the current cap */ *prevp = (__le32 *)&vdev->vconfig[epos]; *next = epos + len; > > + } > > + > > +out: > > + return ret; > > +} > > + > > +struct vfio_pci_pri_cap_data { > > + u32 id:16; > > + u32 version:4; > > + u32 next:12; > > + union { > > + u16 control_reg_val; > > + struct { > > + u16 enable:1; > > + u16 reset:1; > > + u16 rsv1:14; > > + }; > > + } control_reg; > > + union { > > + u16 status_reg_val; > > + struct { > > + u16 rf:1; > > + u16 uprgi:1; > > + u16 rsv2:6; > > + u16 stop:1; > > + u16 rsv3:6; > > + u16 pasid_required:1; > > + }; > > + } status_reg; > > + u32 prq_capacity; > > + u32 prq_quota; > > +}; > > + > > +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev, > > + struct pci_dev *pdev, > > + u16 epos, u16 *next, __le32 **prevp) > > +{ > > + u8 *map = vdev->pci_config_map; > > + int ecap = PCI_EXT_CAP_ID_PRI; > > + int len = pci_ext_cap_length[ecap]; > > + struct vfio_pci_pri_cap_data pri_cap; > > + struct vfio_pci_pri_cap_data vpri_cap; > > + int ret; > > + > > + /* > > + * If no cap filled in this function, should make sure the next > > + * pointer points to current epos. 
> > + */ > > + *next = epos; > > + > > + if (!len) { > > + pr_info("%s: VF %s hiding PRI capability\n", > > + __func__, dev_name(&vdev->pdev->dev)); > > + ret = 0; > > + goto out; > > + } > > + > > + /* Add PASID capability*/ > > + ret = vfio_pci_get_ecap_content(pdev, ecap, > > + len, (u8 *)&pri_cap); > > + if (ret) > > + goto out; > > + > > + if (!pri_cap.control_reg.enable) { > > + pr_debug("%s: its PF's PRI capability is not enabled\n", > > + dev_name(&vdev->pdev->dev)); > > + ret = 0; > > + goto out; > > + } > > + > > + memcpy(&vpri_cap, &pri_cap, len); > > + > > + vpri_cap.id = 0x19; > > #define PCI_EXT_CAP_ID_SECPCI 0x19 /* Secondary PCIe Capability */ > > ??? my bad... This line was used in development phase. not needed in formal patch. really sorry for the mislead.. will remove it. > > + vpri_cap.next = 0; > > + /* clear the control reg for guest */ > > + memset(&vpri_cap.control_reg, 0x0, > > + sizeof(vpri_cap.control_reg)); > > + > > + memset(map + epos, vpri_cap.id, len); > > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > > + (u8 *)&vpri_cap, len); > > + if (!ret) { > > + /* > > + * Successfully filled in PASID cap, update > > + * the next offset in previous cap header, > > + * and also update caller about the offset > > + * of next cap if any. 
> > + */ > > + u32 val = epos; > > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > > + **prevp |= cpu_to_le32(val << 20); > > + *prevp = (__le32 *)&vdev->vconfig[epos]; > > + *next = epos + len; > > + } > > + > > +out: > > + return ret; > > +} > > + > > +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev, > > + struct pci_dev *pdev, u16 start_epos, __le32 *prev) > > +{ > > + __le32 *__prev = prev; > > + u16 epos = start_epos, epos_next = start_epos; > > + int ret = 0; > > + > > + /* Add PASID capability*/ > > + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, > > + &epos_next, &__prev); > > + if (ret) > > + return ret; > > + > > + /* Add PRI capability */ > > + epos = epos_next; > > + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, > > + &epos_next, &__prev); > > + > > + return ret; > > +} > > + > > static int vfio_ecap_init(struct vfio_pci_device *vdev) > > { > > struct pci_dev *pdev = vdev->pdev; > > u8 *map = vdev->pci_config_map; > > - u16 epos; > > + u16 epos, epos_max; > > __le32 *prev = NULL; > > int loops, ret, ecaps = 0; > > > > @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > return 0; > > > > epos = PCI_CFG_SPACE_SIZE; > > + epos_max = PCI_CFG_SPACE_SIZE; > > > > loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; > > > > @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > } > > } > > > > + if (epos_max <= (epos + len)) > > + epos_max = epos + len; > > + > > if (!len) { > > pci_info(pdev, "%s: hiding ecap %#x@%#x\n", > > __func__, ecap, epos); > > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > if (!ecaps) > > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > > > +#ifdef CONFIG_PCI_ATS > > + if (pdev->is_virtfn) { > > + struct pci_dev *physfn = pdev->physfn; > > + > > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > > + physfn, epos_max, prev); > > + if (ret) > > + pr_info("%s, failed to add special caps for VF %s\n", > > + 
__func__, dev_name(&vdev->pdev->dev)); > > + } > > +#endif > > I can only imagine that we should place the caps at the same location > they exist on the PF, we don't know what hidden registers might be > hiding in config space. but we are not sure whether the same location is available on VF. In this patch, it actually places the emulated cap physically behind the cap which lays farthest (its offset is largest) within VF's config space as the PCIe caps are linked in a chain. > > + > > return 0; > > } > > > > @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct > vfio_pci_device *vdev, > > return i; > > } > > > > +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) > > +{ > > +#ifdef CONFIG_PCI_ATS > > + return (pdev->is_virtfn && > > + (cap_id == PCI_EXT_CAP_ID_PASID || > > + cap_id == PCI_EXT_CAP_ID_PRI)); > > +#else > > + return false; > > +#endif > > +} > > + > > static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > > size_t count, loff_t *ppos, bool iswrite) > > { > > @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device > *vdev, char __user *buf, > > if (cap_id == PCI_CAP_ID_INVALID) { > > perm = &unassigned_perms; > > cap_start = *ppos; > > - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { > > + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || > > + vfio_pci_need_virt_perm(pdev, cap_id)) { > > Why not simply fill the map with PCI_CAP_ID_INVALID_VIRT? good idea. thanks. > > perm = &virt_perms; > > cap_start = *ppos; > > } else { > > This is really not close to acceptable as is. Sorry. Thanks, really sorry for the silly mistakes, I can try to make a new version after we got a decision on whether needs to do data copy and medication before filling into vconfig. Thanks, Yi Liu
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 7:00 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs > > On Sun, 22 Mar 2020 05:33:14 -0700 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the > > PF PASID configuration is shared by its VFs. VFs must not implement their > > own PASID Capability. > > > > Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If > > the PF implements PRI, it is shared by the VFs. > > > > On bare metal, it has been fixed by below efforts. > > to PASID/PRI are > > https://lkml.org/lkml/2019/9/5/996 > > https://lkml.org/lkml/2019/9/5/995 > > > > This patch adds emulated PASID/PRI capabilities for VFs when assigned to > > VMs via vfio-pci driver. This is required for enabling vSVA on pass-through > > VFs as VFs have no PASID/PRI capability structure in its configure space. 
> > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > --- > > drivers/vfio/pci/vfio_pci_config.c | 325 > ++++++++++++++++++++++++++++++++++++- > > 1 file changed, 323 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > > index 4b9af99..84b4ea0 100644 > > --- a/drivers/vfio/pci/vfio_pci_config.c > > +++ b/drivers/vfio/pci/vfio_pci_config.c > > @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) > > return 0; > > } > > > > +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, > > + int offset, uint8_t *data, int size) > > +{ > > + int ret = 0, data_offset = 0; > > + > > + while (size) { > > + int filled; > > + > > + if (size >= 4 && !(offset % 4)) { > > + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; > > + u32 dword; > > + > > + memcpy(&dword, data + data_offset, 4); > > + *dwordp = cpu_to_le32(dword); > > Why wouldn't we do: > > *dwordp = cpu_to_le32(*(u32 *)(data + data_offset)); > > or better yet, increment data on each iteration for: > > *dwordp = cpu_to_le32(*(u32 *)data); > > vfio_fill_vconfig_bytes() does almost this same thing, getting the data > from config space rather than a buffer, so please figure out how to > avoid duplicating the logic. Got another alternative. I may use the vfio_fill_vconfig_bytes() to fill the cap data from PF's config space into VF's vconfig. And after that, I can further modify the data in vconfig. e.g. zero the control reg of pasid cap. would it make more sense? 
> > + filled = 4; > > + } else if (size >= 2 && !(offset % 2)) { > > + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; > > + u16 word; > > + > > + memcpy(&word, data + data_offset, 2); > > + *wordp = cpu_to_le16(word); > > + filled = 2; > > + } else { > > + u8 *byte = &vdev->vconfig[offset]; > > + > > + memcpy(byte, data + data_offset, 1); [...] > > > + > > + memset(map + epos, vpasid_cap.id, len); > > See below. > > > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > > + (u8 *)&vpasid_cap, len); > > + if (!ret) { > > + /* > > + * Successfully filled in PASID cap, update > > + * the next offset in previous cap header, > > + * and also update caller about the offset > > + * of next cap if any. > > + */ > > + u32 val = epos; > > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > > + **prevp |= cpu_to_le32(val << 20); > > + *prevp = (__le32 *)&vdev->vconfig[epos]; > > + *next = epos + len; > > Could we make this any more complicated? I'm not sure if adding comments addressed this comment. After adding new cap in vconfig, it needs to update the cap.next field of prior cap. And in case of further add other cap, this function needs to update the prevp pointer to the address of the newly added cap. Regards, Yi Liu
On Fri, 3 Apr 2020 07:53:55 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > Hi Alex, > > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 7:00 AM > > To: Liu, Yi L <yi.l.liu@intel.com> > > Subject: Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs > > > > On Sun, 22 Mar 2020 05:33:14 -0700 > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > > > Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the > > > PF PASID configuration is shared by its VFs. VFs must not implement their > > > own PASID Capability. > > > > > > Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If > > > the PF implements PRI, it is shared by the VFs. > > > > > > On bare metal, it has been fixed by below efforts. > > > to PASID/PRI are > > > https://lkml.org/lkml/2019/9/5/996 > > > https://lkml.org/lkml/2019/9/5/995 > > > > > > This patch adds emulated PASID/PRI capabilities for VFs when assigned to > > > VMs via vfio-pci driver. This is required for enabling vSVA on pass-through > > > VFs as VFs have no PASID/PRI capability structure in its configure space. 
> > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > Cc: Eric Auger <eric.auger@redhat.com> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > > --- > > > drivers/vfio/pci/vfio_pci_config.c | 325 > > ++++++++++++++++++++++++++++++++++++- > > > 1 file changed, 323 insertions(+), 2 deletions(-) > > > > > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > > > index 4b9af99..84b4ea0 100644 > > > --- a/drivers/vfio/pci/vfio_pci_config.c > > > +++ b/drivers/vfio/pci/vfio_pci_config.c > > > @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) > > > return 0; > > > } > > > > > > +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, > > > + int offset, uint8_t *data, int size) > > > +{ > > > + int ret = 0, data_offset = 0; > > > + > > > + while (size) { > > > + int filled; > > > + > > > + if (size >= 4 && !(offset % 4)) { > > > + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; > > > + u32 dword; > > > + > > > + memcpy(&dword, data + data_offset, 4); > > > + *dwordp = cpu_to_le32(dword); > > > > Why wouldn't we do: > > > > *dwordp = cpu_to_le32(*(u32 *)(data + data_offset)); > > > > or better yet, increment data on each iteration for: > > > > *dwordp = cpu_to_le32(*(u32 *)data); > > I'll refine it. > > > vfio_fill_vconfig_bytes() does almost this same thing, getting > > the data > > from config space rather than a buffer, so please figure out how to > > avoid duplicating the logic. > > This patch is to emulate the PASID/PRI capability for VF. And per > my understanding, the cap data from PF's config space cannot be > filled to VF's vconfig directly. Take the control register in PASID > capability structure as an example. If PASID cap is enabled in PF, > the PASID enable bit in the control register will be set. 
If the cap > data is filled to VF's vconfig directly, then guest will see a default > enabled PASID capability in the VF. I guess it is not good. But, I may > be wrong. If no such requirement, I can use vfio_fill_vconfig_bytes() > directly, no need to do copy and modify and then copy. > > Also, if still needs to do the copy and modification, I may modify > the vfio_fill_vconfig_bytes() to pass in the buffer and an indicator > to tell vfio_fill_vconfig_bytes() whether fetch data from buffer or > from config space. Why is vconfig not your buffer? We're building config space emulation before the user has access to the device, so it's really not an issue to copy the PF config into the VF vconfig, then modify the enable bit in the vconfig space. > > > + filled = 4; > > > + } else if (size >= 2 && !(offset % 2)) { > > > + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; > > > + u16 word; > > > + > > > + memcpy(&word, data + data_offset, 2); > > > + *wordp = cpu_to_le16(word); > > > + filled = 2; > > > + } else { > > > + u8 *byte = &vdev->vconfig[offset]; > > > + > > > + memcpy(byte, data + data_offset, 1); > > > > This one is really silly. > > > > vdev->vconfig[offset] = *data; > > got it. 
> > > > + filled = 1; > > > + } > > > + > > > + offset += filled; > > > + data_offset += filled; > > > + size -= filled; > > > + } > > > + > > > + return ret; > > > +} > > > + > > > +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, > > > + int cap, int cap_len, u8 *content) > > > +{ > > > + int pos, offset, len = cap_len, ret = 0; > > > + > > > + pos = pci_find_ext_capability(pdev, cap); > > > + if (!pos) > > > + return -EINVAL; > > > + > > > + offset = 0; > > > + while (len) { > > > + int fetched; > > > + > > > + if (len >= 4 && !(pos % 4)) { > > > + u32 *dwordp = (u32 *) (content + offset); > > > + u32 dword; > > > + __le32 *dwptr = (__le32 *) &dword; > > > + > > > + ret = pci_read_config_dword(pdev, pos, &dword); > > > + if (ret) > > > + return ret; > > > + *dwordp = le32_to_cpu(*dwptr); > > > > WHAT??? pci_read_config_dword() returns cpu endian data! > > my bad. will remove the silly le32_to_cpu() convert. > > > > + fetched = 4; > > > + } else if (len >= 2 && !(pos % 2)) { > > > + u16 *wordp = (u16 *) (content + offset); > > > + u16 word; > > > + __le16 *wptr = (__le16 *) &word; > > > + > > > + ret = pci_read_config_word(pdev, pos, &word); > > > + if (ret) > > > + return ret; > > > + *wordp = le16_to_cpu(*wptr); > > > + fetched = 2; > > > + } else { > > > + u8 *byte = (u8 *) (content + offset); > > > + > > > + ret = pci_read_config_byte(pdev, pos, byte); > > > + if (ret) > > > + return ret; > > > + fetched = 1; > > > + } > > > + > > > + pos += fetched; > > > + offset += fetched; > > > + len -= fetched; > > > + } > > > + > > > + return ret; > > > +} > > > + > > > +struct vfio_pci_pasid_cap_data { > > > + u32 id:16; > > > + u32 version:4; > > > + u32 next:12; > > > + union { > > > + u16 cap_reg_val; > > > + struct { > > > + u16 rsv1:1; > > > + u16 execs:1; > > > + u16 prvs:1; > > > + u16 rsv2:5; > > > + u16 pasid_bits:5; > > > + u16 rsv3:3; > > > + }; > > > + } cap_reg; > > > + union { > > > + u16 control_reg_val; > > > + struct { > > > + u16 paside:1; 
> > > + u16 exece:1; > > > + u16 prve:1; > > > + u16 rsv4:13; > > > + }; > > > + } control_reg; > > > +}; > > > + > > > +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev, > > > + struct pci_dev *pdev, > > > + u16 epos, u16 *next, __le32 **prevp) > > > +{ > > > + u8 *map = vdev->pci_config_map; > > > + int ecap = PCI_EXT_CAP_ID_PASID; > > > + int len = pci_ext_cap_length[ecap]; > > > + struct vfio_pci_pasid_cap_data pasid_cap; > > > + struct vfio_pci_pasid_cap_data vpasid_cap; > > > + int ret; > > > + > > > + /* > > > + * If no cap filled in this function, should make sure the next > > > + * pointer points to current epos. > > > + */ > > > + *next = epos; > > > + > > > + if (!len) { > > > + pr_info("%s: VF %s hiding PASID capability\n", > > > + __func__, dev_name(&vdev->pdev->dev)); > > > + ret = 0; > > > + goto out; > > > + } > > > > No! Why? This is dead code. > > will remove it. thanks. > > > > + > > > + /* Add PASID capability*/ > > > + ret = vfio_pci_get_ecap_content(pdev, ecap, > > > + len, (u8 *)&pasid_cap); > > > > > > Why wouldn't we just use vfio_fill_config_bytes() rather than this > > ridiculous approach of coping it to a buffer, modifying it and copying > > it out? > > As the above reply, if copying PF's config space data to VF's vconfig > directly is fine, then I can drop them. > > > > + if (ret) > > > + goto out; > > > + > > > + if (!pasid_cap.control_reg.paside) { > > > + pr_debug("%s: its PF's PASID capability is not enabled\n", > > > + dev_name(&vdev->pdev->dev)); > > > + ret = 0; > > > + goto out; > > > + } > > > > What happens if the PF's PASID gets disabled while we're using it?? > > This is actually the open I highlighted in cover letter. Per the reply > from Baolu, this seems to be an open for bare-metal all the same. > https://lkml.org/lkml/2020/3/31/95 Seems that needs to get sorted out before we can expose this. Maybe some sort of registration with the PF driver that PASID is being used by a VF so it cannot be disabled? 
> > > + > > > + memcpy(&vpasid_cap, &pasid_cap, len); > > > + > > > + vpasid_cap.id = 0x18; > > > > What is going on here? > > > > #define PCI_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Reporting */ > > my bad... This one was used in development phase. not needed in formal > patch. really sorry for the mislead.. will remove it. > > > > + vpasid_cap.next = 0; > > > + /* clear the control reg for guest */ > > > + memset(&vpasid_cap.control_reg, 0x0, > > > + sizeof(vpasid_cap.control_reg)); > > > > So we zero a couple registers and that's why we need a structure to > > define the entire capability and this crazy copy to a cpu endian > > buffer. No. > > there are two reasons for define a structure for the capability. For > one, to check if PASID capability is enabled in PF. For two, to zero > the control registers to ensure the guest see a clean control register. > But if it is not necessary to do the two operations, it can be dropped. Neither of these require the structures you've defined and I haven't convinced myself that the bit fields don't have their own issues with endianness. Please drop them. > > > + > > > + memset(map + epos, vpasid_cap.id, len); > > > > See below. > > > > > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > > > + (u8 *)&vpasid_cap, len); > > > + if (!ret) { > > > + /* > > > + * Successfully filled in PASID cap, update > > > + * the next offset in previous cap header, > > > + * and also update caller about the offset > > > + * of next cap if any. > > > + */ > > > + u32 val = epos; > > > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > > > + **prevp |= cpu_to_le32(val << 20); > > > + *prevp = (__le32 *)&vdev->vconfig[epos]; > > > + *next = epos + len; > > > > Could we make this any more complicated? > > yeah, I should have added some comments. 
> > /* update the next field of prior cap structure */ > **prevp &= cpu_to_le32(~(0xffcU << 20)); > **prevp |= cpu_to_le32(val << 20); > /* update the prevp pointer to point to the current cap */ > *prevp = (__le32 *)&vdev->vconfig[epos]; > *next = epos + len; The main loop when processing capabilities already has this sort of logic. Please figure out a way to not duplicate it. > > > + } > > > + > > > +out: > > > + return ret; > > > +} > > > + > > > +struct vfio_pci_pri_cap_data { > > > + u32 id:16; > > > + u32 version:4; > > > + u32 next:12; > > > + union { > > > + u16 control_reg_val; > > > + struct { > > > + u16 enable:1; > > > + u16 reset:1; > > > + u16 rsv1:14; > > > + }; > > > + } control_reg; > > > + union { > > > + u16 status_reg_val; > > > + struct { > > > + u16 rf:1; > > > + u16 uprgi:1; > > > + u16 rsv2:6; > > > + u16 stop:1; > > > + u16 rsv3:6; > > > + u16 pasid_required:1; > > > + }; > > > + } status_reg; > > > + u32 prq_capacity; > > > + u32 prq_quota; > > > +}; > > > + > > > +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev, > > > + struct pci_dev *pdev, > > > + u16 epos, u16 *next, __le32 **prevp) > > > +{ > > > + u8 *map = vdev->pci_config_map; > > > + int ecap = PCI_EXT_CAP_ID_PRI; > > > + int len = pci_ext_cap_length[ecap]; > > > + struct vfio_pci_pri_cap_data pri_cap; > > > + struct vfio_pci_pri_cap_data vpri_cap; > > > + int ret; > > > + > > > + /* > > > + * If no cap filled in this function, should make sure the next > > > + * pointer points to current epos. 
> > > + */ > > > + *next = epos; > > > + > > > + if (!len) { > > > + pr_info("%s: VF %s hiding PRI capability\n", > > > + __func__, dev_name(&vdev->pdev->dev)); > > > + ret = 0; > > > + goto out; > > > + } > > > + > > > + /* Add PASID capability*/ > > > + ret = vfio_pci_get_ecap_content(pdev, ecap, > > > + len, (u8 *)&pri_cap); > > > + if (ret) > > > + goto out; > > > + > > > + if (!pri_cap.control_reg.enable) { > > > + pr_debug("%s: its PF's PRI capability is not enabled\n", > > > + dev_name(&vdev->pdev->dev)); > > > + ret = 0; > > > + goto out; > > > + } > > > + > > > + memcpy(&vpri_cap, &pri_cap, len); > > > + > > > + vpri_cap.id = 0x19; > > > > #define PCI_EXT_CAP_ID_SECPCI 0x19 /* Secondary PCIe Capability */ > > > > ??? > > my bad... This line was used in development phase. not needed in formal > patch. really sorry for the mislead.. will remove it. > > > > + vpri_cap.next = 0; > > > + /* clear the control reg for guest */ > > > + memset(&vpri_cap.control_reg, 0x0, > > > + sizeof(vpri_cap.control_reg)); > > > + > > > + memset(map + epos, vpri_cap.id, len); > > > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > > > + (u8 *)&vpri_cap, len); > > > + if (!ret) { > > > + /* > > > + * Successfully filled in PASID cap, update > > > + * the next offset in previous cap header, > > > + * and also update caller about the offset > > > + * of next cap if any. 
> > > + */ > > > + u32 val = epos; > > > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > > > + **prevp |= cpu_to_le32(val << 20); > > > + *prevp = (__le32 *)&vdev->vconfig[epos]; > > > + *next = epos + len; > > > + } > > > + > > > +out: > > > + return ret; > > > +} > > > + > > > +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev, > > > + struct pci_dev *pdev, u16 start_epos, __le32 *prev) > > > +{ > > > + __le32 *__prev = prev; > > > + u16 epos = start_epos, epos_next = start_epos; > > > + int ret = 0; > > > + > > > + /* Add PASID capability*/ > > > + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, > > > + &epos_next, &__prev); > > > + if (ret) > > > + return ret; > > > + > > > + /* Add PRI capability */ > > > + epos = epos_next; > > > + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, > > > + &epos_next, &__prev); > > > + > > > + return ret; > > > +} > > > + > > > static int vfio_ecap_init(struct vfio_pci_device *vdev) > > > { > > > struct pci_dev *pdev = vdev->pdev; > > > u8 *map = vdev->pci_config_map; > > > - u16 epos; > > > + u16 epos, epos_max; > > > __le32 *prev = NULL; > > > int loops, ret, ecaps = 0; > > > > > > @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > > return 0; > > > > > > epos = PCI_CFG_SPACE_SIZE; > > > + epos_max = PCI_CFG_SPACE_SIZE; > > > > > > loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; > > > > > > @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > > } > > > } > > > > > > + if (epos_max <= (epos + len)) > > > + epos_max = epos + len; > > > + > > > if (!len) { > > > pci_info(pdev, "%s: hiding ecap %#x@%#x\n", > > > __func__, ecap, epos); > > > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > > > if (!ecaps) > > > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > > > > > +#ifdef CONFIG_PCI_ATS > > > + if (pdev->is_virtfn) { > > > + struct pci_dev *physfn = pdev->physfn; > > > + > > > + ret = 
vfio_pci_add_emulated_cap_for_vf(vdev, > > > + physfn, epos_max, prev); > > > + if (ret) > > > + pr_info("%s, failed to add special caps for VF %s\n", > > > + __func__, dev_name(&vdev->pdev->dev)); > > > + } > > > +#endif > > > > I can only imagine that we should place the caps at the same location > > they exist on the PF, we don't know what hidden registers might be > > hiding in config space. > > but we are not sure whether the same location is available on VF. In > this patch, it actually places the emulated cap physically behind the > cap which lays farthest (its offset is largest) within VF's config space > as the PCIe caps are linked in a chain. But, as we've found on Broadcom NICs (iirc), hardware developers have a nasty habit of hiding random registers in PCI config space, outside of defined capabilities. I feel like IGD might even do this too, is that true? So I don't think we can guarantee that just because a section of config space isn't part of a defined capability that it's unused. It only means that it's unused by common code, but it might have device specific purposes. So if the PCIe spec indicates that VFs cannot include these capabilities and virtualization software needs to emulate them, we need somewhere safe to place them in config space, and simply placing them off the end of known capabilities doesn't give me any confidence. Also, hardware has no requirement to make compact use of extended config space. The first capability must be at 0x100, the very next capability could consume all the way to the last byte of the 4K extended range, and the next link in the chain could be somewhere in the middle.
Thanks, Alex > > > + > > > return 0; > > > } > > > > > > @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct > > vfio_pci_device *vdev, > > > return i; > > > } > > > > > > +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) > > > +{ > > > +#ifdef CONFIG_PCI_ATS > > > + return (pdev->is_virtfn && > > > + (cap_id == PCI_EXT_CAP_ID_PASID || > > > + cap_id == PCI_EXT_CAP_ID_PRI)); > > > +#else > > > + return false; > > > +#endif > > > +} > > > + > > > static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > > > size_t count, loff_t *ppos, bool iswrite) > > > { > > > @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device > > *vdev, char __user *buf, > > > if (cap_id == PCI_CAP_ID_INVALID) { > > > perm = &unassigned_perms; > > > cap_start = *ppos; > > > - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { > > > + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || > > > + vfio_pci_need_virt_perm(pdev, cap_id)) { > > > > Why not simply fill the map with PCI_CAP_ID_INVALID_VIRT? > > good idea. thanks. > > > > perm = &virt_perms; > > > cap_start = *ppos; > > > } else { > > > > This is really not close to acceptable as is. Sorry. Thanks, > > really sorry for the silly mistakes, I can try to make a new version > after we got a decision on whether we need to do data copy and modification > before filling into vconfig. > > Thanks, > Yi Liu >
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Saturday, April 4, 2020 1:26 AM [...] > > > > + if (!pasid_cap.control_reg.paside) { > > > > + pr_debug("%s: its PF's PASID capability is not enabled\n", > > > > + dev_name(&vdev->pdev->dev)); > > > > + ret = 0; > > > > + goto out; > > > > + } > > > > > > What happens if the PF's PASID gets disabled while we're using it?? > > > > This is actually the open I highlighted in cover letter. Per the reply > > from Baolu, this seems to be an open for bare-metal all the same. > > https://lkml.org/lkml/2020/3/31/95 > > Seems that needs to get sorted out before we can expose this. Maybe > some sort of registration with the PF driver that PASID is being used > by a VF so it cannot be disabled? I guess we may do vSVA for PF first, and then add VF vSVA later given above additional need. It's not necessary to enable both in one step. [...] > > > > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct > vfio_pci_device *vdev) > > > > if (!ecaps) > > > > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > > > > > > > +#ifdef CONFIG_PCI_ATS > > > > + if (pdev->is_virtfn) { > > > > + struct pci_dev *physfn = pdev->physfn; > > > > + > > > > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > > > > + physfn, epos_max, prev); > > > > + if (ret) > > > > + pr_info("%s, failed to add special caps for VF %s\n", > > > > + __func__, dev_name(&vdev->pdev->dev)); > > > > + } > > > > +#endif > > > > > > I can only imagine that we should place the caps at the same location > > > they exist on the PF, we don't know what hidden registers might be > > > hiding in config space. Is there a vendor guarantee that hidden registers will be located at the same offset between PF and VF config space? > > > > but we are not sure whether the same location is available on VF. 
In > > this patch, it actually places the emulated cap physically behind the > > cap which lays farthest (its offset is largest) within VF's config space > > as the PCIe caps are linked in a chain. > > But, as we've found on Broadcom NICs (iirc), hardware developers have a > nasty habit of hiding random registers in PCI config space, outside of > defined capabilities. I feel like IGD might even do this too, is that > true? So I don't think we can guarantee that just because a section of > config space isn't part of a defined capability that its unused. It > only means that it's unused by common code, but it might have device > specific purposes. So of the PCIe spec indicates that VFs cannot > include these capabilities and virtialization software needs to > emulate them, we need somewhere safe to place them in config space, and > simply placing them off the end of known capabilities doesn't give me > any confidence. Also, hardware has no requirement to make compact use > of extended config space. The first capability must be at 0x100, the > very next capability could consume all the way to the last byte of the > 4K extended range, and the next link in the chain could be somewhere in > the middle. Thanks, > Then what would be a viable option? Vendor nasty habit implies no standard, thus I don't see how VFIO can find a safe location by itself. Also curious how those hidden registers are identified by VFIO and employed with proper r/w policy today. If sort of quirks are used, then could such quirk way be extended to also carry the information about vendor specific safe location? When no such quirk info is provided (the majority case), VFIO then finds out a free location to carry the new cap. Thanks Kevin
On Tue, 7 Apr 2020 04:26:23 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Saturday, April 4, 2020 1:26 AM > [...] > > > > > + if (!pasid_cap.control_reg.paside) { > > > > > + pr_debug("%s: its PF's PASID capability is not enabled\n", > > > > > + dev_name(&vdev->pdev->dev)); > > > > > + ret = 0; > > > > > + goto out; > > > > > + } > > > > > > > > What happens if the PF's PASID gets disabled while we're using it?? > > > > > > This is actually the open I highlighted in cover letter. Per the reply > > > from Baolu, this seems to be an open for bare-metal all the same. > > > https://lkml.org/lkml/2020/3/31/95 > > > > Seems that needs to get sorted out before we can expose this. Maybe > > some sort of registration with the PF driver that PASID is being used > > by a VF so it cannot be disabled? > > I guess we may do vSVA for PF first, and then adding VF vSVA later > given above additional need. It's not necessarily to enable both > in one step. > > [...] > > > > > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct > > vfio_pci_device *vdev) > > > > > if (!ecaps) > > > > > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > > > > > > > > > +#ifdef CONFIG_PCI_ATS > > > > > + if (pdev->is_virtfn) { > > > > > + struct pci_dev *physfn = pdev->physfn; > > > > > + > > > > > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > > > > > + physfn, epos_max, prev); > > > > > + if (ret) > > > > > + pr_info("%s, failed to add special caps for VF %s\n", > > > > > + __func__, dev_name(&vdev->pdev->dev)); > > > > > + } > > > > > +#endif > > > > > > > > I can only imagine that we should place the caps at the same location > > > > they exist on the PF, we don't know what hidden registers might be > > > > hiding in config space. > > Is there vendor guarantee that hidden registers will locate at the > same offset between PF and VF config space? 
I'm not sure if the spec really precludes hidden registers, but the fact that these registers are explicitly outside of the capability chain implies they're only intended for device specific use, so I'd say there are no guarantees about anything related to these registers. FWIW, vfio started out being more strict about restricting config space access to defined capabilities, until... commit a7d1ea1c11b33bda2691f3294b4d735ed635535a Author: Alex Williamson <alex.williamson@redhat.com> Date: Mon Apr 1 09:04:12 2013 -0600 vfio-pci: Enable raw access to unassigned config space Devices like be2net hide registers between the gaps in capabilities and architected regions of PCI config space. Our choices to support such devices is to either build an ever growing and unmanageable white list or rely on hardware isolation to protect us. These registers are really no different than MMIO or I/O port space registers, which we don't attempt to regulate, so treat PCI config space in the same way. > > > but we are not sure whether the same location is available on VF. In > > > this patch, it actually places the emulated cap physically behind the > > > cap which lays farthest (its offset is largest) within VF's config space > > > as the PCIe caps are linked in a chain. > > > > But, as we've found on Broadcom NICs (iirc), hardware developers have a > > nasty habit of hiding random registers in PCI config space, outside of > > defined capabilities. I feel like IGD might even do this too, is that > > true? So I don't think we can guarantee that just because a section of > > config space isn't part of a defined capability that its unused. It > > only means that it's unused by common code, but it might have device > > specific purposes. 
So of the PCIe spec indicates that VFs cannot > > include these capabilities and virtialization software needs to > > emulate them, we need somewhere safe to place them in config space, and > > simply placing them off the end of known capabilities doesn't give me > > any confidence. Also, hardware has no requirement to make compact use > > of extended config space. The first capability must be at 0x100, the > > very next capability could consume all the way to the last byte of the > > 4K extended range, and the next link in the chain could be somewhere in > > the middle. Thanks, > > > > Then what would be a viable option? Vendor nasty habit implies > no standard, thus I don't see how VFIO can find a safe location > by itself. Also curious how those hidden registers are identified > by VFIO and employed with proper r/w policy today. If sort of quirks > are used, then could such quirk way be extended to also carry > the information about vendor specific safe location? When no > such quirk info is provided (the majority case), VFIO then finds > out a free location to carry the new cap. See above commit, rather than quirks we allow raw access to any config space outside of the capability chain. My preference for trying to place virtual capabilities at the same offset as the capability exists on the PF is my impression that the PF config space is often a template for the VF config space. The PF and VF are clearly not independent devices, they share design aspects, and sometimes drivers. Therefore if I was a lazy engineer trying to find a place to hide a register in config space (and ignoring vendor capabilities*), I'd probably put it in the same place on both devices. Thus if we maintain the same capability footprint as the PF, we have a better chance of avoiding them. It's a gamble and maybe we're overthinking it, but this has always been a concern when adding virtual capabilities to a physical device. 
We can always fail over to an approach where we simply find free space. Thanks, Alex * ISTR the Broadcom device implemented the hidden register in standard config space, which was otherwise entirely packed, ie. there was no room for the register to be implemented as a vendor cap.
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Tuesday, April 7, 2020 11:58 PM > > On Tue, 7 Apr 2020 04:26:23 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Saturday, April 4, 2020 1:26 AM > > [...] > > > > > > + if (!pasid_cap.control_reg.paside) { > > > > > > + pr_debug("%s: its PF's PASID capability is not > enabled\n", > > > > > > + dev_name(&vdev->pdev->dev)); > > > > > > + ret = 0; > > > > > > + goto out; > > > > > > + } > > > > > > > > > > What happens if the PF's PASID gets disabled while we're using it?? > > > > > > > > This is actually the open I highlighted in cover letter. Per the reply > > > > from Baolu, this seems to be an open for bare-metal all the same. > > > > https://lkml.org/lkml/2020/3/31/95 > > > > > > Seems that needs to get sorted out before we can expose this. Maybe > > > some sort of registration with the PF driver that PASID is being used > > > by a VF so it cannot be disabled? > > > > I guess we may do vSVA for PF first, and then adding VF vSVA later > > given above additional need. It's not necessarily to enable both > > in one step. > > > > [...] 
> > > > > > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct > > > vfio_pci_device *vdev) > > > > > > if (!ecaps) > > > > > > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > > > > > > > > > > > +#ifdef CONFIG_PCI_ATS > > > > > > + if (pdev->is_virtfn) { > > > > > > + struct pci_dev *physfn = pdev->physfn; > > > > > > + > > > > > > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > > > > > > + physfn, epos_max, prev); > > > > > > + if (ret) > > > > > > + pr_info("%s, failed to add special caps for > VF %s\n", > > > > > > + __func__, dev_name(&vdev->pdev- > >dev)); > > > > > > + } > > > > > > +#endif > > > > > > > > > > I can only imagine that we should place the caps at the same location > > > > > they exist on the PF, we don't know what hidden registers might be > > > > > hiding in config space. > > > > Is there vendor guarantee that hidden registers will locate at the > > same offset between PF and VF config space? > > I'm not sure if the spec really precludes hidden registers, but the > fact that these registers are explicitly outside of the capability > chain implies they're only intended for device specific use, so I'd say > there are no guarantees about anything related to these registers. > > FWIW, vfio started out being more strict about restricting config space > access to defined capabilities, until... > > commit a7d1ea1c11b33bda2691f3294b4d735ed635535a > Author: Alex Williamson <alex.williamson@redhat.com> > Date: Mon Apr 1 09:04:12 2013 -0600 > > vfio-pci: Enable raw access to unassigned config space > > Devices like be2net hide registers between the gaps in capabilities > and architected regions of PCI config space. Our choices to support > such devices is to either build an ever growing and unmanageable white > list or rely on hardware isolation to protect us. These registers are > really no different than MMIO or I/O port space registers, which we > don't attempt to regulate, so treat PCI config space in the same way. 
> > > > > but we are not sure whether the same location is available on VF. In > > > > this patch, it actually places the emulated cap physically behind the > > > > cap which lays farthest (its offset is largest) within VF's config space > > > > as the PCIe caps are linked in a chain. > > > > > > But, as we've found on Broadcom NICs (iirc), hardware developers have a > > > nasty habit of hiding random registers in PCI config space, outside of > > > defined capabilities. I feel like IGD might even do this too, is that > > > true? So I don't think we can guarantee that just because a section of > > > config space isn't part of a defined capability that its unused. It > > > only means that it's unused by common code, but it might have device > > > specific purposes. So of the PCIe spec indicates that VFs cannot > > > include these capabilities and virtialization software needs to > > > emulate them, we need somewhere safe to place them in config space, > and > > > simply placing them off the end of known capabilities doesn't give me > > > any confidence. Also, hardware has no requirement to make compact > use > > > of extended config space. The first capability must be at 0x100, the > > > very next capability could consume all the way to the last byte of the > > > 4K extended range, and the next link in the chain could be somewhere in > > > the middle. Thanks, > > > > > > > Then what would be a viable option? Vendor nasty habit implies > > no standard, thus I don't see how VFIO can find a safe location > > by itself. Also curious how those hidden registers are identified > > by VFIO and employed with proper r/w policy today. If sort of quirks > > are used, then could such quirk way be extended to also carry > > the information about vendor specific safe location? When no > > such quirk info is provided (the majority case), VFIO then finds > > out a free location to carry the new cap. 
> > See above commit, rather than quirks we allow raw access to any config > space outside of the capability chain. My preference for trying to > place virtual capabilities at the same offset as the capability exists > on the PF is my impression that the PF config space is often a template > for the VF config space. The PF and VF are clearly not independent > devices, they share design aspects, and sometimes drivers. Therefore > if I was a lazy engineer trying to find a place to hide a register in > config space (and ignoring vendor capabilities*), I'd probably put it > in the same place on both devices. Thus if we maintain the same We are checking internally whether this assumption makes sense at least for Intel devices which are PASID-capable. > capability footprint as the PF, we have a better chance of avoiding > them. It's a gamble and maybe we're overthinking it, but this has > always been a concern when adding virtual capabilities to a physical > device. We can always fail over to an approach where we simply find > free space. Thanks, Curious how failover could be triggered in your mind. It's easy to detect conflict with other PCI caps, but not for conflict with hidden registers. The latter can be identified only with device specific knowledge. Possibly in the end we may leverage Yan's vendor ops to find a safe location... > > Alex > > * ISTR the Broadcom device implemented the hidden register in standard > config space, which was otherwise entirely packed, ie. there was no > room for the register to be implemented as a vendor cap. I suppose such packed design is mostly for PF. Ideally VF is much simpler and the requirement of hidden registers should be much fewer. Otherwise even using same PF offset doesn't work. Long-term it is better for PCISIG to add some recommendations, e.g. for capabilities that are shared between PF/VF, VF config space should still reserve a range at the same location/size as of the PF ones. Thanks Kevin
Hi Alex + Bjorn FWIW I can't understand why PCI SIG went different ways with ATS, where it's enumerated on PF and VF. But for PASID and PRI it's only in PF. I'm checking with our internal SIG reps to follow up on that. On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > Is there vendor guarantee that hidden registers will locate at the > > same offset between PF and VF config space? > > I'm not sure if the spec really precludes hidden registers, but the > fact that these registers are explicitly outside of the capability > chain implies they're only intended for device specific use, so I'd say > there are no guarantees about anything related to these registers. As you had suggested in the other thread, we could consider using the same offset as in PF, but even that's a better guess still not reliable. The other option is to maybe extend driver ops in the PF to expose where the offsets should be. Sort of adding the quirk in the implementation. I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting making VF's first class citizen, we might ask them to add some verbiage to suggest leave the same offsets as PF open to help emulation software. > > FWIW, vfio started out being more strict about restricting config space > access to defined capabilities, until... > > commit a7d1ea1c11b33bda2691f3294b4d735ed635535a > Author: Alex Williamson <alex.williamson@redhat.com> > Date: Mon Apr 1 09:04:12 2013 -0600 > Cheers, Ashok
On Tue, 7 Apr 2020 21:00:21 -0700 "Raj, Ashok" <ashok.raj@intel.com> wrote: > Hi Alex > > + Bjorn + Don > FWIW I can't understand why PCI SIG went different ways with ATS, > where its enumerated on PF and VF. But for PASID and PRI its only > in PF. > > I'm checking with our internal SIG reps to followup on that. > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > Is there vendor guarantee that hidden registers will locate at the > > > same offset between PF and VF config space? > > > > I'm not sure if the spec really precludes hidden registers, but the > > fact that these registers are explicitly outside of the capability > > chain implies they're only intended for device specific use, so I'd say > > there are no guarantees about anything related to these registers. > > As you had suggested in the other thread, we could consider > using the same offset as in PF, but even that's a better guess > still not reliable. > > The other option is to maybe extend driver ops in the PF to expose > where the offsets should be. Sort of adding the quirk in the > implementation. > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting > making VF's first class citizen, we might ask them to add some verbiage > to suggest leave the same offsets as PF open to help emulation software. Even if we know where to expose these capabilities on the VF, it's not clear to me how we can actually virtualize the capability itself. If the spec defines, for example, an enable bit as r/w then software that interacts with that register expects the bit is settable. There's no protocol for "try to set the bit and re-read it to see if the hardware accepted it". Therefore a capability with a fixed enable bit representing the state of the PF, not settable by the VF, is disingenuous to the spec. 
If what we're trying to do is expose that PASID and PRI are enabled on the PF to a VF driver, maybe duplicating the PF capabilities on the VF without the ability to control it is not the right approach. Maybe we need new capabilities exposing these as slave features that cannot be controlled? We could define our own vendor capability for this, but of course we have both the where to put it in config space issue, as well as the issue of trying to push an ad-hoc standard. vfio could expose these as device features rather than emulating capabilities, but that still leaves a big gap between vfio in the hypervisor and the driver in the guest VM. That might still help push the responsibility and policy for how to expose it to the VM as a userspace problem though. I agree though, I don't know why the SIG would preclude implementing per VF control of these features. Thanks, Alex > > FWIW, vfio started out being more strict about restricting config space > > access to defined capabilities, until... > > > > commit a7d1ea1c11b33bda2691f3294b4d735ed635535a > > Author: Alex Williamson <alex.williamson@redhat.com> > > Date: Mon Apr 1 09:04:12 2013 -0600 > > > > Cheers, > Ashok >
Hi Alex On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > On Tue, 7 Apr 2020 21:00:21 -0700 > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > Hi Alex > > > > + Bjorn > > + Don > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > where its enumerated on PF and VF. But for PASID and PRI its only > > in PF. > > > > I'm checking with our internal SIG reps to followup on that. > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > Is there vendor guarantee that hidden registers will locate at the > > > > same offset between PF and VF config space? > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > fact that these registers are explicitly outside of the capability > > > chain implies they're only intended for device specific use, so I'd say > > > there are no guarantees about anything related to these registers. > > > > As you had suggested in the other thread, we could consider > > using the same offset as in PF, but even that's a better guess > > still not reliable. > > > > The other option is to maybe extend driver ops in the PF to expose > > where the offsets should be. Sort of adding the quirk in the > > implementation. > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting > > making VF's first class citizen, we might ask them to add some verbiage > > to suggest leave the same offsets as PF open to help emulation software. > > Even if we know where to expose these capabilities on the VF, it's not > clear to me how we can actually virtualize the capability itself. If > the spec defines, for example, an enable bit as r/w then software that > interacts with that register expects the bit is settable. There's no > protocol for "try to set the bit and re-read it to see if the hardware > accepted it". Therefore a capability with a fixed enable bit > representing the state of the PF, not settable by the VF, is > disingenuous to the spec. 
> > If what we're trying to do is expose that PASID and PRI are enabled on > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > without the ability to control it is not the right approach. Maybe we > need new capabilities exposing these as slave features that cannot be > controlled? We could define our own vendor capability for this, but of The other option is to say, VFIO would never emulate these fake capabilities. If exposing a VF with PASID/PRI is required the PF driver would simply wrap it into a VDCM-like model which we do today for Scalable IOV devices. So PF handles all aspects of this interface. I also like the suggestion you propose, maybe an offset where these capabilities are exposed to VF's. Maybe have an architected DEVCAPx which exposes these RO capabilities. No control, and the offset should be preserved by the SIG, so VMM can have a safe place to stash them. > course we have both the where to put it in config space issue, as well > as the issue of trying to push an ad-hoc standard. vfio could expose > these as device features rather than emulating capabilities, but that > still leaves a big gap between vfio in the hypervisor and the driver in > the guest VM. That might still help push the responsibility and policy > for how to expose it to the VM as a userspace problem though. > > I agree though, I don't know why the SIG would preclude implementing > per VF control of these features. Thanks, Even if we ask SIG for clarification, it might affect today's devices. So it might not be useful to solve our current situation. Ashok
On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > On Tue, 7 Apr 2020 21:00:21 -0700 > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > Hi Alex > > > > + Bjorn > > + Don > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > where its enumerated on PF and VF. But for PASID and PRI its only > > in PF. > > > > I'm checking with our internal SIG reps to followup on that. > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > Is there vendor guarantee that hidden registers will locate at the > > > > same offset between PF and VF config space? > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > fact that these registers are explicitly outside of the capability > > > chain implies they're only intended for device specific use, so I'd say > > > there are no guarantees about anything related to these registers. > > > > As you had suggested in the other thread, we could consider > > using the same offset as in PF, but even that's a better guess > > still not reliable. > > > > The other option is to maybe extend driver ops in the PF to expose > > where the offsets should be. Sort of adding the quirk in the > > implementation. > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting > > making VF's first class citizen, we might ask them to add some verbiage > > to suggest leave the same offsets as PF open to help emulation software. > > Even if we know where to expose these capabilities on the VF, it's not > clear to me how we can actually virtualize the capability itself. If > the spec defines, for example, an enable bit as r/w then software that > interacts with that register expects the bit is settable. There's no > protocol for "try to set the bit and re-read it to see if the hardware > accepted it". Therefore a capability with a fixed enable bit > representing the state of the PF, not settable by the VF, is > disingenuous to the spec. 
Would it be OK to implement a lock down mechanism for the PF PASID capability, preventing changes to the PF cap when the VF is in use by VFIO? The emulation would still break the spec: since the PF cap would always be enabled the VF configuration bits would have no effect, but it seems preferable to having the Enable bit not enable anything. > > If what we're trying to do is expose that PASID and PRI are enabled on > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > without the ability to control it is not the right approach. Maybe we > need new capabilities exposing these as slave features that cannot be > controlled? We could define our own vendor capability for this, but of > course we have both the where to put it in config space issue, as well > as the issue of trying to push an ad-hoc standard. vfio could expose > these as device features rather than emulating capabilities, but that > still leaves a big gap between vfio in the hypervisor and the driver in > the guest VM. That might still help push the responsibility and policy > for how to expose it to the VM as a userspace problem though. Userspace might have more difficulty working around the issues mentioned in this thread. They would still need a guarantee that the PF PASID configuration doesn't change at runtime, and they wouldn't have the ability to talk to a vendor driver to figure out where to place the fake PASID capability. Thanks, Jean > > I agree though, I don't know why the SIG would preclude implementing > per VF control of these features. Thanks, > > Alex > > > > FWIW, vfio started out being more strict about restricting config space > > > access to defined capabilities, until... > > > > > > commit a7d1ea1c11b33bda2691f3294b4d735ed635535a > > > Author: Alex Williamson <alex.williamson@redhat.com> > > > Date: Mon Apr 1 09:04:12 2013 -0600 > > > > > > > Cheers, > > Ashok > > >
On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > On Tue, 7 Apr 2020 21:00:21 -0700 > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > Hi Alex > > > > + Bjorn > > + Don > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > where its enumerated on PF and VF. But for PASID and PRI its only > > in PF. > > > > I'm checking with our internal SIG reps to followup on that. > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > Is there vendor guarantee that hidden registers will locate at the > > > > same offset between PF and VF config space? > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > fact that these registers are explicitly outside of the capability > > > chain implies they're only intended for device specific use, so I'd say > > > there are no guarantees about anything related to these registers. > > > > As you had suggested in the other thread, we could consider > > using the same offset as in PF, but even that's a better guess > > still not reliable. > > > > The other option is to maybe extend driver ops in the PF to expose > > where the offsets should be. Sort of adding the quirk in the > > implementation. > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting > > making VF's first class citizen, we might ask them to add some verbiage > > to suggest leave the same offsets as PF open to help emulation software. > > Even if we know where to expose these capabilities on the VF, it's not > clear to me how we can actually virtualize the capability itself. If > the spec defines, for example, an enable bit as r/w then software that > interacts with that register expects the bit is settable. There's no > protocol for "try to set the bit and re-read it to see if the hardware > accepted it". Therefore a capability with a fixed enable bit > representing the state of the PF, not settable by the VF, is > disingenuous to the spec. 
I think we are all in violent agreement. A lot of times the pci spec gets defined several years ahead of real products and no one remembers the justification on why they restricted things the way they did. Maybe someone's early product wasn't quite exposing these features to the VF and hence the spec is bug compatible :-) > > If what we're trying to do is expose that PASID and PRI are enabled on > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > without the ability to control it is not the right approach. Maybe we As long as the capability enable is only provided when the PF has enabled the feature. Then the hardware seems to do the right thing. Assume we expose PASID/PRI only when PF has enabled it. It will be the case since the PF driver needs to exist, and IOMMU would have set the PASID/PRI/ATS on PF. If the emulation is purely spoofing the capability. Once vIOMMU driver enables PASID, the context entries for the VF are completely independent from the PF context entries. vIOMMU would enable PASID, and we just spoof the PASID capability. If vIOMMU or guest for some reason does disable_pasid(), then the vIOMMU driver can disable PASID on the VF context entries. So the VF although the capability is blanket enabled on PF, IOMMU guarantees the transactions are blocked. In the interim, it seems like the intent of the virtual capability can be honored via help from the IOMMU for the controlling aspect.. Did I miss anything? > need new capabilities exposing these as slave features that cannot be > controlled? We could define our own vendor capability for this, but of > course we have both the where to put it in config space issue, as well > as the issue of trying to push an ad-hoc standard. vfio could expose > these as device features rather than emulating capabilities, but that > still leaves a big gap between vfio in the hypervisor and the driver in > the guest VM. 
That might still help push the responsibility and policy > for how to expose it to the VM as a userspace problem though. I think this is a good long term solution, but if the vIOMMU implementations can carry us for the time being, we can probably defer them unless we are stuck. > > I agree though, I don't know why the SIG would preclude implementing > per VF control of these features. Thanks, > Cheers, Ashok
Hi Alex Going through the PCIe Spec, there seem to be a lot of such capabilities that are different between PF and VF. Some that make sense and some don't. On Sun, Apr 12, 2020 at 08:10:43PM -0700, Raj, Ashok wrote: > > > > > I agree though, I don't know why the SIG would preclude implementing > > per VF control of these features. Thanks, > > For e.g. VF doesn't have I/O and Mem space enables, but has BME Interrupt Status Correctable Error Reporting Almost all of Device Control Register. So it seems like there is a ton of them we have to deal with today for VF's. How do we manage to emulate them without any support for them in VF's? > > Cheers, > Ashok
> From: Raj, Ashok <ashok.raj@linux.intel.com> > Sent: Monday, April 13, 2020 11:11 AM > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > On Tue, 7 Apr 2020 21:00:21 -0700 > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > Hi Alex > > > > > > + Bjorn > > > > + Don > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > in PF. > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > same offset between PF and VF config space? > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > fact that these registers are explicitly outside of the capability > > > > chain implies they're only intended for device specific use, so I'd say > > > > there are no guarantees about anything related to these registers. > > > > > > As you had suggested in the other thread, we could consider > > > using the same offset as in PF, but even that's a better guess > > > still not reliable. > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > where the offsets should be. Sort of adding the quirk in the > > > implementation. > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is > resisting > > > making VF's first class citizen, we might ask them to add some verbiage > > > to suggest leave the same offsets as PF open to help emulation software. > > > > Even if we know where to expose these capabilities on the VF, it's not > > clear to me how we can actually virtualize the capability itself. If > > the spec defines, for example, an enable bit as r/w then software that > > interacts with that register expects the bit is settable. 
There's no > > protocol for "try to set the bit and re-read it to see if the hardware > > accepted it". Therefore a capability with a fixed enable bit > > representing the state of the PF, not settable by the VF, is > > disingenuous to the spec. > > I think we are all in violent agreement. A lot of times the pci spec gets > defined several years ahead of real products and no one remembers > the justification on why they restricted things the way they did. > > Maybe someone early product wasn't quite exposing these features to the > VF > and hence the spec is bug compatible :-) > > > > > If what we're trying to do is expose that PASID and PRI are enabled on > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > without the ability to control it is not the right approach. Maybe we > > As long as the capability enable is only provided when the PF has enabled > the feature. Then it seems the hardware seems to do the right thing. > > Assume we expose PASID/PRI only when PF has enabled it. It will be the > case since the PF driver needs to exist, and IOMMU would have set the > PASID/PRI/ATS on PF. > > If the emulation is purely spoofing the capability. Once vIOMMU driver > enables PASID, the context entries for the VF are completely independent > from the PF context entries. > > vIOMMU would enable PASID, and we just spoof the PASID capability. > > If vIOMMU or guest for some reason does disable_pasid(), then the > vIOMMU driver can disaable PASID on the VF context entries. So the VF > although the capability is blanket enabled on PF, IOMMU gaurantees the > transactions are blocked. > > > In the interim, it seems like the intent of the virtual capability > can be honored via help from the IOMMU for the controlling aspect.. > > Did i miss anything? Above works for emulating the enable bit (under the assumption that PF driver won't disable pasid when vf is assigned). 
However, there are also "Execute permission enable" and "Privileged mode enable" bits in PASID control registers. I don't know how those bits could be cleanly emulated when the guest writes a value different from PF's... Similar problem also exists when talking about PRI emulation, e.g. to enable PRI the software usually waits until the 'stopped' bit is set (indicating all previously issued requests have completed). How to emulate this bit accurately when one guest toggles the enable bit while the PF and other VFs are actively issuing page requests through the shared page request interface? from pcie spec I didn't find a way to catch when all previously-issued requests from a specific VF have completed. Can a conservative big-enough timeout value help here? I don't know... similar puzzle also exists for emulating the 'reset' control bit which is supposed to clear the pending request state for the whole page request interface. I feel the main problem in pcie spec is that, while they invent SR-IOV to address I/O virtualization requirement (where strict isolation is required), they blurred the boundary by leaving some shared resource cross PF and VFs which imply sort of cooperation between PF and VF drivers. On bare metal such cooperation is easy to build, by enabling/ disabling a capability en mass, by using the same set of setting, etc. However it doesn't consider the virtualization case where a VF is assigned to the guest which considers the VF as a standard PCI/PCIe endpoint thus such cooperation is missing. A vendor capability could help fix the gap here but making it adopted by major guest OSes will take time. But honestly speaking I don't know other good alternative now... :/ > > > need new capabilities exposing these as slave features that cannot be > > controlled? We could define our own vendor capability for this, but of > > course we have both the where to put it in config space issue, as well > > as the issue of trying to push an ad-hoc standard. 
vfio could expose > > these as device features rather than emulating capabilities, but that > > still leaves a big gap between vfio in the hypervisor and the driver in > > the guest VM. That might still help push the responsibility and policy > > for how to expose it to the VM as a userspace problem though. > > I think this is a good long term solution, but if the vIOMMU implenentations > can carry us for the time being, we can probably defer them unless > we are stuck. > > > > > I agree though, I don't know why the SIG would preclude implementing > > per VF control of these features. Thanks, > > > > Cheers, > Ashok Thanks Kevin
> From: Tian, Kevin > Sent: Monday, April 13, 2020 3:55 PM > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > Sent: Monday, April 13, 2020 11:11 AM > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > Hi Alex > > > > > > > > + Bjorn > > > > > > + Don > > > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > > in PF. > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > > same offset between PF and VF config space? > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > > fact that these registers are explicitly outside of the capability > > > > > chain implies they're only intended for device specific use, so I'd say > > > > > there are no guarantees about anything related to these registers. > > > > > > > > As you had suggested in the other thread, we could consider > > > > using the same offset as in PF, but even that's a better guess > > > > still not reliable. > > > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > > where the offsets should be. Sort of adding the quirk in the > > > > implementation. > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is > > resisting > > > > making VF's first class citizen, we might ask them to add some verbiage > > > > to suggest leave the same offsets as PF open to help emulation software. > > > > > > Even if we know where to expose these capabilities on the VF, it's not > > > clear to me how we can actually virtualize the capability itself. 
If > > > the spec defines, for example, an enable bit as r/w then software that > > > interacts with that register expects the bit is settable. There's no > > > protocol for "try to set the bit and re-read it to see if the hardware > > > accepted it". Therefore a capability with a fixed enable bit > > > representing the state of the PF, not settable by the VF, is > > > disingenuous to the spec. > > > > I think we are all in violent agreement. A lot of times the pci spec gets > > defined several years ahead of real products and no one remembers > > the justification on why they restricted things the way they did. > > > > Maybe someone early product wasn't quite exposing these features to the > > VF > > and hence the spec is bug compatible :-) > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled on > > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > > without the ability to control it is not the right approach. Maybe we > > > > As long as the capability enable is only provided when the PF has enabled > > the feature. Then it seems the hardware seems to do the right thing. > > > > Assume we expose PASID/PRI only when PF has enabled it. It will be the > > case since the PF driver needs to exist, and IOMMU would have set the > > PASID/PRI/ATS on PF. > > > > If the emulation is purely spoofing the capability. Once vIOMMU driver > > enables PASID, the context entries for the VF are completely independent > > from the PF context entries. > > > > vIOMMU would enable PASID, and we just spoof the PASID capability. > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > vIOMMU driver can disaable PASID on the VF context entries. So the VF > > although the capability is blanket enabled on PF, IOMMU gaurantees the > > transactions are blocked. > > > > > > In the interim, it seems like the intent of the virtual capability > > can be honored via help from the IOMMU for the controlling aspect.. 
> > > > Did i miss anything? > > Above works for emulating the enable bit (under the assumption that > PF driver won't disable pasid when vf is assigned). However, there are > also "Execute permission enable" and "Privileged mode enable" bits in > PASID control registers. I don't know how those bits could be cleanly > emulated when the guest writes a value different from PF's... sent too quick. the IOMMU also includes control bits for allowing/ blocking execute requests and supervisor requests. We can rely on IOMMU to block those requests to emulate the disabled cases of all three control bits in the pasid cap. Thanks Kevin > > Similar problem also exists when talking about PRI emulation, e.g. > to enable PRI the software usually waits until the 'stopped' bit > is set (indicating all previously issued requests have completed). How > to emulate this bit accurately when one guest toggles the enable bit > while the PF and other VFs are actively issuing page requests through > the shared page request interface? from pcie spec I didn't find a way > to catch when all previously-issued requests from a specific VF have > completed. Can a conservative big-enough timeout value help here? > I don't know... similar puzzle also exists for emulating the 'reset' > control bit which is supposed to clear the pending request state for > the whole page request interface. > > I feel the main problem in pcie spec is that, while they invent SR-IOV > to address I/O virtualization requirement (where strict isolation is > required), they blurred the boundary by leaving some shared resource > cross PF and VFs which imply sort of cooperation between PF and VF > drivers. On bare metal such cooperation is easy to build, by enabling/ > disabling a capability en mass, by using the same set of setting, etc. > However it doesn't consider the virtualization case where a VF is > assigned to the guest which considers the VF as a standard PCI/PCIe > endpoint thus such cooperation is missing. 
A vendor capability could > help fix the gap here but making it adopted by major guest OSes will > take time. But honestly speaking I don't know other good alternative > now... :/ > > > > > > need new capabilities exposing these as slave features that cannot be > > > controlled? We could define our own vendor capability for this, but of > > > course we have both the where to put it in config space issue, as well > > > as the issue of trying to push an ad-hoc standard. vfio could expose > > > these as device features rather than emulating capabilities, but that > > > still leaves a big gap between vfio in the hypervisor and the driver in > > > the guest VM. That might still help push the responsibility and policy > > > for how to expose it to the VM as a userspace problem though. > > > > I think this is a good long term solution, but if the vIOMMU implenentations > > can carry us for the time being, we can probably defer them unless > > we are stuck. > > > > > > > > I agree though, I don't know why the SIG would preclude implementing > > > per VF control of these features. Thanks, > > > > > > > Cheers, > > Ashok > > Thanks > Kevin
On Sun, 12 Apr 2020 20:29:31 -0700 "Raj, Ashok" <ashok.raj@intel.com> wrote: > Hi Alex > > Going through the PCIe Spec, there seems a lot of such capabilities > that are different between PF and VF. Some that make sense > and some don't. > > > On Sun, Apr 12, 2020 at 08:10:43PM -0700, Raj, Ashok wrote: > > > > > > > > I agree though, I don't know why the SIG would preclude implementing > > > per VF control of these features. Thanks, > > > > > For e.g. > > VF doesn't have I/O and Mem space enables, but has BME VFs don't have I/O, so I/O enable is irrelevant. The memory enable bit is emulated, so it doesn't really do anything from the VM perspective. The hypervisor could provide more emulation around this, but it hasn't proven necessary. > Interrupt Status VFs don't have INTx, so this is irrelevant. > Correctable Error Reporting > Almost all of Device Control Register. Are we doing anything to virtualize these for VFs? I think we've addressed access control to these for PFs, but I don't see that we try to virtualize them for the VF. > So it seems like there is a ton of them we have to deal with today for > VF's. How do we manage to emulate them without any support for them > in VF's? The memory enable bit is just access to the MMIO space of the device, the hypervisor could choose to do more, but currently emulating the bit itself is sufficient. This doesn't really affect the device, just access to the device. The device control registers, I don't think we've had a need to virtualize them yet and I think we'd run into many of the same questions. If your point is that there exists gaps in the spec that make things difficult to virtualize, I won't argue with you there. MPS is a nearby one that's difficult to virtualize on the PF since its setting needs to take entire communication channels into account. 
So far though we aren't inventing new capabilities to add to VF config space and pretending they work, we're just stumbling on what the VF exposes whether on bare metal or in a VM. Thanks, Alex
On Mon, 13 Apr 2020 08:05:33 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Tian, Kevin > > Sent: Monday, April 13, 2020 3:55 PM > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > Hi Alex > > > > > > > > > > + Bjorn > > > > > > > > + Don > > > > > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > > > in PF. > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > > > fact that these registers are explicitly outside of the capability > > > > > > chain implies they're only intended for device specific use, so I'd say > > > > > > there are no guarantees about anything related to these registers. > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > using the same offset as in PF, but even that's a better guess > > > > > still not reliable. > > > > > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > implementation. > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is > > > resisting > > > > > making VF's first class citizen, we might ask them to add some verbiage > > > > > to suggest leave the same offsets as PF open to help emulation software. 
> > > > > > > > Even if we know where to expose these capabilities on the VF, it's not > > > > clear to me how we can actually virtualize the capability itself. If > > > > the spec defines, for example, an enable bit as r/w then software that > > > > interacts with that register expects the bit is settable. There's no > > > > protocol for "try to set the bit and re-read it to see if the hardware > > > > accepted it". Therefore a capability with a fixed enable bit > > > > representing the state of the PF, not settable by the VF, is > > > > disingenuous to the spec. > > > > > > I think we are all in violent agreement. A lot of times the pci spec gets > > > defined several years ahead of real products and no one remembers > > > the justification on why they restricted things the way they did. > > > > > > Maybe someone early product wasn't quite exposing these features to the > > > VF > > > and hence the spec is bug compatible :-) > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled on > > > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > > > without the ability to control it is not the right approach. Maybe we > > > > > > As long as the capability enable is only provided when the PF has enabled > > > the feature. Then it seems the hardware seems to do the right thing. > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will be the > > > case since the PF driver needs to exist, and IOMMU would have set the > > > PASID/PRI/ATS on PF. > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU driver > > > enables PASID, the context entries for the VF are completely independent > > > from the PF context entries. > > > > > > vIOMMU would enable PASID, and we just spoof the PASID capability. > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > > vIOMMU driver can disaable PASID on the VF context entries. 
So the VF > > > although the capability is blanket enabled on PF, IOMMU guarantees the > > > transactions are blocked. > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > can be honored via help from the IOMMU for the controlling aspect.. > > > > > > Did i miss anything? > > > > Above works for emulating the enable bit (under the assumption that > > PF driver won't disable pasid when vf is assigned). However, there are > > also "Execute permission enable" and "Privileged mode enable" bits in > > PASID control registers. I don't know how those bits could be cleanly > > emulated when the guest writes a value different from PF's... > > sent too quick. the IOMMU also includes control bits for allowing/ > blocking execute requests and supervisor requests. We can rely on > IOMMU to block those requests to emulate the disabled cases of > all three control bits in the pasid cap. So if the emulation of the PASID capability takes into account the IOMMU configuration to back that emulation, shouldn't we do that emulation in the hypervisor, ie. QEMU, rather than the kernel vfio layer? Thanks, Alex > > Similar problem also exists when talking about PRI emulation, e.g. > > to enable PRI the software usually waits until the 'stopped' bit > > is set (indicating all previously issued requests have completed). How > > to emulate this bit accurately when one guest toggles the enable bit > > while the PF and other VFs are actively issuing page requests through > > the shared page request interface? from pcie spec I didn't find a way > > to catch when all previously-issued requests from a specific VF have > > completed. Can a conservative big-enough timeout value help here? > > I don't know... similar puzzle also exists for emulating the 'reset' > > control bit which is supposed to clear the pending request state for > > the whole page request interface.
> > > > I feel the main problem in pcie spec is that, while they invent SR-IOV > > to address I/O virtualization requirement (where strict isolation is > > required), they blurred the boundary by leaving some shared resource > > cross PF and VFs which imply sort of cooperation between PF and VF > > drivers. On bare metal such cooperation is easy to build, by enabling/ > > disabling a capability en mass, by using the same set of setting, etc. > > However it doesn't consider the virtualization case where a VF is > > assigned to the guest which considers the VF as a standard PCI/PCIe > > endpoint thus such cooperation is missing. A vendor capability could > > help fix the gap here but making it adopted by major guest OSes will > > take time. But honestly speaking I don't know other good alternative > > now... :/ > > > > > > > > > need new capabilities exposing these as slave features that cannot be > > > > controlled? We could define our own vendor capability for this, but of > > > > course we have both the where to put it in config space issue, as well > > > > as the issue of trying to push an ad-hoc standard. vfio could expose > > > > these as device features rather than emulating capabilities, but that > > > > still leaves a big gap between vfio in the hypervisor and the driver in > > > > the guest VM. That might still help push the responsibility and policy > > > > for how to expose it to the VM as a userspace problem though. > > > > > > I think this is a good long term solution, but if the vIOMMU implenentations > > > can carry us for the time being, we can probably defer them unless > > > we are stuck. > > > > > > > > > > > I agree though, I don't know why the SIG would preclude implementing > > > > per VF control of these features. Thanks, > > > > > > > > > > Cheers, > > > Ashok > > > > Thanks > > Kevin >
On Thu, 9 Apr 2020 09:35:33 +0200 Jean-Philippe Brucker <jean-philippe@linaro.org> wrote: > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > On Tue, 7 Apr 2020 21:00:21 -0700 > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > Hi Alex > > > > > > + Bjorn > > > > + Don > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > in PF. > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > same offset between PF and VF config space? > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > fact that these registers are explicitly outside of the capability > > > > chain implies they're only intended for device specific use, so I'd say > > > > there are no guarantees about anything related to these registers. > > > > > > As you had suggested in the other thread, we could consider > > > using the same offset as in PF, but even that's a better guess > > > still not reliable. > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > where the offsets should be. Sort of adding the quirk in the > > > implementation. > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is resisting > > > making VF's first class citizen, we might ask them to add some verbiage > > > to suggest leave the same offsets as PF open to help emulation software. > > > > Even if we know where to expose these capabilities on the VF, it's not > > clear to me how we can actually virtualize the capability itself. If > > the spec defines, for example, an enable bit as r/w then software that > > interacts with that register expects the bit is settable. 
There's no > > protocol for "try to set the bit and re-read it to see if the hardware > > accepted it". Therefore a capability with a fixed enable bit > > representing the state of the PF, not settable by the VF, is > > disingenuous to the spec. > > Would it be OK to implement a lock down mechanism for the PF PASID > capability, preventing changes to the PF cap when the VF is in use by > VFIO? The emulation would still break the spec: since the PF cap would > always be enabled the VF configuration bits would have no effect, but it > seems preferable to having the Enable bit not enable anything. I think we absolutely need some mechanism to make sure the PF driver doesn't change the PASID enable bit while we're using it. A PASID user registration perhaps. And yes, that doesn't necessarily map to being able to actually disable PASID, but it sounds like Kevin and Ashok have some ideas how the emulation could use the IOMMU settings to achieve that more precisely. > > If what we're trying to do is expose that PASID and PRI are enabled on > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > without the ability to control it is not the right approach. Maybe we > > need new capabilities exposing these as slave features that cannot be > > controlled? We could define our own vendor capability for this, but of > > course we have both the where to put it in config space issue, as well > > as the issue of trying to push an ad-hoc standard. vfio could expose > > these as device features rather than emulating capabilities, but that > > still leaves a big gap between vfio in the hypervisor and the driver in > > the guest VM. That might still help push the responsibility and policy > > for how to expose it to the VM as a userspace problem though. > > Userspace might have more difficulty working around the issues mentioned > in this thread. 
They would still need a guarantee that the PF PASID > configuration doesn't change at runtime, and they wouldn't have the > ability to talk to a vendor driver to figure out where to place the fake > PASID capability. Couldn't we do this with the DEVICE_FEATURE ioctl we just added? A user could set a PASID user or get a capability offset, where vfio-pci would plumb these through to whatever PF/vendor driver provides that interface, just as if it was implemented in the vfio-pci driver. But that still lets us leave the policy of inserting a capability and using the IOMMU to implement the bits to the user/QEMU. Thanks, Alex
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Tuesday, April 14, 2020 3:21 AM > > On Mon, 13 Apr 2020 08:05:33 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Tian, Kevin > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > Hi Alex > > > > > > > > > > > > + Bjorn > > > > > > > > > > + Don > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > > > > in PF. > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > > > > fact that these registers are explicitly outside of the capability > > > > > > > chain implies they're only intended for device specific use, so I'd > say > > > > > > > there are no guarantees about anything related to these registers. > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > still not reliable. > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > > implementation. > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. 
If SIG is > > > > resisting > > > > > > making VF's first class citizen, we might ask them to add some > verbiage > > > > > > to suggest leave the same offsets as PF open to help emulation > software. > > > > > > > > > > Even if we know where to expose these capabilities on the VF, it's not > > > > > clear to me how we can actually virtualize the capability itself. If > > > > > the spec defines, for example, an enable bit as r/w then software that > > > > > interacts with that register expects the bit is settable. There's no > > > > > protocol for "try to set the bit and re-read it to see if the hardware > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > representing the state of the PF, not settable by the VF, is > > > > > disingenuous to the spec. > > > > > > > > I think we are all in violent agreement. A lot of times the pci spec gets > > > > defined several years ahead of real products and no one remembers > > > > the justification on why they restricted things the way they did. > > > > > > > > Maybe someone early product wasn't quite exposing these features to > the > > > > VF > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled on > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > > > > without the ability to control it is not the right approach. Maybe we > > > > > > > > As long as the capability enable is only provided when the PF has > enabled > > > > the feature. Then it seems the hardware seems to do the right thing. > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will be the > > > > case since the PF driver needs to exist, and IOMMU would have set the > > > > PASID/PRI/ATS on PF. > > > > > > > > If the emulation is purely spoofing the capability. 
Once vIOMMU driver > > > > enables PASID, the context entries for the VF are completely > independent > > > > from the PF context entries. > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID capability. > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > > > vIOMMU driver can disable PASID on the VF context entries. So the VF > > > > although the capability is blanket enabled on PF, IOMMU guarantees > the > > > > transactions are blocked. > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > can be honored via help from the IOMMU for the controlling aspect.. > > > > > > > > Did i miss anything? > > > > > > Above works for emulating the enable bit (under the assumption that > > > PF driver won't disable pasid when vf is assigned). However, there are > > > also "Execute permission enable" and "Privileged mode enable" bits in > > > PASID control registers. I don't know how those bits could be cleanly > > > emulated when the guest writes a value different from PF's... > > > > sent too quick. the IOMMU also includes control bits for allowing/ > > blocking execute requests and supervisor requests. We can rely on > > IOMMU to block those requests to emulate the disabled cases of > > all three control bits in the pasid cap. > > > So if the emulation of the PASID capability takes into account the > IOMMU configuration to back that emulation, shouldn't we do that > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > layer? Thanks, > > Alex We need to enforce it in physical IOMMU, to ensure that even if the VF may send requests which violate the guest expectation, those requests are always blocked by IOMMU. Kernel vfio identifies such need when emulating the pasid cap and then forwards the request to the host iommu driver. Thanks Kevin > > > > Similar problem also exists when talking about PRI emulation, e.g.
> > > to enable PRI the software usually waits until the 'stopped' bit > > > is set (indicating all previously issued requests have completed). How > > > to emulate this bit accurately when one guest toggles the enable bit > > > while the PF and other VFs are actively issuing page requests through > > > the shared page request interface? from pcie spec I didn't find a way > > > to catch when all previously-issued requests from a specific VF have > > > completed. Can a conservative big-enough timeout value help here? > > > I don't know... similar puzzle also exists for emulating the 'reset' > > > control bit which is supposed to clear the pending request state for > > > the whole page request interface. > > > > > > I feel the main problem in pcie spec is that, while they invent SR-IOV > > > to address I/O virtualization requirement (where strict isolation is > > > required), they blurred the boundary by leaving some shared resource > > > cross PF and VFs which imply sort of cooperation between PF and VF > > > drivers. On bare metal such cooperation is easy to build, by enabling/ > > > disabling a capability en mass, by using the same set of setting, etc. > > > However it doesn't consider the virtualization case where a VF is > > > assigned to the guest which considers the VF as a standard PCI/PCIe > > > endpoint thus such cooperation is missing. A vendor capability could > > > help fix the gap here but making it adopted by major guest OSes will > > > take time. But honestly speaking I don't know other good alternative > > > now... :/ > > > > > > > > > > > > need new capabilities exposing these as slave features that cannot be > > > > > controlled? We could define our own vendor capability for this, but > of > > > > > course we have both the where to put it in config space issue, as well > > > > > as the issue of trying to push an ad-hoc standard. 
vfio could expose > > > > > these as device features rather than emulating capabilities, but that > > > > > still leaves a big gap between vfio in the hypervisor and the driver in > > > > > the guest VM. That might still help push the responsibility and policy > > > > > for how to expose it to the VM as a userspace problem though. > > > > > > > > I think this is a good long term solution, but if the vIOMMU > implenentations > > > > can carry us for the time being, we can probably defer them unless > > > > we are stuck. > > > > > > > > > > > > > > I agree though, I don't know why the SIG would preclude > implementing > > > > > per VF control of these features. Thanks, > > > > > > > > > > > > > Cheers, > > > > Ashok > > > > > > Thanks > > > Kevin > >
On Tue, 14 Apr 2020 02:40:58 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > From: Tian, Kevin > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > + Don > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways with ATS, > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its only > > > > > > > in PF. > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson wrote: > > > > > > > > > Is there vendor guarantee that hidden registers will locate at the > > > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but the > > > > > > > > fact that these registers are explicitly outside of the capability > > > > > > > > chain implies they're only intended for device specific use, so I'd > > say > > > > > > > > there are no guarantees about anything related to these registers. > > > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > > still not reliable. > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to expose > > > > > > > where the offsets should be. 
Sort of adding the quirk in the > > > > > > > implementation. > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If SIG is > > > > > resisting > > > > > > > making VF's first class citizen, we might ask them to add some > > verbiage > > > > > > > to suggest leave the same offsets as PF open to help emulation > > software. > > > > > > > > > > > > Even if we know where to expose these capabilities on the VF, it's not > > > > > > clear to me how we can actually virtualize the capability itself. If > > > > > > the spec defines, for example, an enable bit as r/w then software that > > > > > > interacts with that register expects the bit is settable. There's no > > > > > > protocol for "try to set the bit and re-read it to see if the hardware > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > disingenuous to the spec. > > > > > > > > > > I think we are all in violent agreement. A lot of times the pci spec gets > > > > > defined several years ahead of real products and no one remembers > > > > > the justification on why they restricted things the way they did. > > > > > > > > > > Maybe someone early product wasn't quite exposing these features to > > the > > > > > VF > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled on > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on the VF > > > > > > without the ability to control it is not the right approach. Maybe we > > > > > > > > > > As long as the capability enable is only provided when the PF has > > enabled > > > > > the feature. Then it seems the hardware seems to do the right thing. > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. 
It will be the > > > > > case since the PF driver needs to exist, and IOMMU would have set the > > > > > PASID/PRI/ATS on PF. > > > > > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU driver > > > > > enables PASID, the context entries for the VF are completely > > independent > > > > > from the PF context entries. > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID capability. > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > > > > vIOMMU driver can disaable PASID on the VF context entries. So the VF > > > > > although the capability is blanket enabled on PF, IOMMU gaurantees > > the > > > > > transactions are blocked. > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > can be honored via help from the IOMMU for the controlling aspect.. > > > > > > > > > > Did i miss anything? > > > > > > > > Above works for emulating the enable bit (under the assumption that > > > > PF driver won't disable pasid when vf is assigned). However, there are > > > > also "Execute permission enable" and "Privileged mode enable" bits in > > > > PASID control registers. I don't know how those bits could be cleanly > > > > emulated when the guest writes a value different from PF's... > > > > > > sent too quick. the IOMMU also includes control bits for allowing/ > > > blocking execute requests and supervisor requests. We can rely on > > > IOMMU to block those requests to emulate the disabled cases of > > > all three control bits in the pasid cap. > > > > > > So if the emulation of the PASID capability takes into account the > > IOMMU configuration to back that emulation, shouldn't we do that > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > layer? 
Thanks, > > > > Alex > > We need enforce it in physical IOMMU, to ensure that even the > VF may send requests which violate the guest expectation those > requests are always blocked by IOMMU. Kernel vfio identifies > such need when emulating the pasid cap and then forward the > request to host iommu driver. Implementing this in the kernel would be necessary if we needed to protect from the guest device doing something bad to the host or other devices. Making sure the physical IOMMU is configured to meet guest expectations doesn't sound like it necessarily falls into that category. We do that on a regular basis to program the DMA mappings. Tell me more about why the hypervisor can't handle this piece of guest/host synchronization on top of all the other things it synchronizes to make a VM. Thanks, Alex
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Tuesday, April 14, 2020 11:29 AM > > On Tue, 14 Apr 2020 02:40:58 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > From: Tian, Kevin > > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > > > + Don > > > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways with > ATS, > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its > only > > > > > > > > in PF. > > > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson > wrote: > > > > > > > > > > Is there vendor guarantee that hidden registers will locate at > the > > > > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but > the > > > > > > > > > fact that these registers are explicitly outside of the capability > > > > > > > > > chain implies they're only intended for device specific use, so > I'd > > > say > > > > > > > > > there are no guarantees about anything related to these > registers. 
> > > > > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > > > still not reliable. > > > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to > expose > > > > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > > > > implementation. > > > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If > SIG is > > > > > > resisting > > > > > > > > making VF's first class citizen, we might ask them to add some > > > verbiage > > > > > > > > to suggest leave the same offsets as PF open to help emulation > > > software. > > > > > > > > > > > > > > Even if we know where to expose these capabilities on the VF, it's > not > > > > > > > clear to me how we can actually virtualize the capability itself. If > > > > > > > the spec defines, for example, an enable bit as r/w then software > that > > > > > > > interacts with that register expects the bit is settable. There's no > > > > > > > protocol for "try to set the bit and re-read it to see if the hardware > > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > > disingenuous to the spec. > > > > > > > > > > > > I think we are all in violent agreement. A lot of times the pci spec > gets > > > > > > defined several years ahead of real products and no one > remembers > > > > > > the justification on why they restricted things the way they did. 
> > > > > > > > > > > > Maybe someone early product wasn't quite exposing these features > to > > > the > > > > > > VF > > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled > on > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on the > VF > > > > > > > without the ability to control it is not the right approach. Maybe > we > > > > > > > > > > > > As long as the capability enable is only provided when the PF has > > > enabled > > > > > > the feature. Then it seems the hardware seems to do the right thing. > > > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will be > the > > > > > > case since the PF driver needs to exist, and IOMMU would have set > the > > > > > > PASID/PRI/ATS on PF. > > > > > > > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU > driver > > > > > > enables PASID, the context entries for the VF are completely > > > independent > > > > > > from the PF context entries. > > > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID > capability. > > > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > > > > > vIOMMU driver can disaable PASID on the VF context entries. So the > VF > > > > > > although the capability is blanket enabled on PF, IOMMU gaurantees > > > the > > > > > > transactions are blocked. > > > > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > > can be honored via help from the IOMMU for the controlling aspect.. > > > > > > > > > > > > Did i miss anything? > > > > > > > > > > Above works for emulating the enable bit (under the assumption that > > > > > PF driver won't disable pasid when vf is assigned). However, there are > > > > > also "Execute permission enable" and "Privileged mode enable" bits in > > > > > PASID control registers. 
I don't know how those bits could be cleanly > > > > > emulated when the guest writes a value different from PF's... > > > > Sent too quickly. The IOMMU also includes control bits for allowing/ > > > > blocking execute requests and supervisor requests. We can rely on > > > > IOMMU to block those requests to emulate the disabled cases of > > > > all three control bits in the pasid cap. > > > > > > > > > So if the emulation of the PASID capability takes into account the > > > IOMMU configuration to back that emulation, shouldn't we do that > > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > > layer? Thanks, > > > > > > Alex > > > > We need to enforce it in physical IOMMU, to ensure that even the > > VF may send requests which violate the guest expectation those > > requests are always blocked by IOMMU. Kernel vfio identifies > > such need when emulating the pasid cap and then forwards the > > request to host iommu driver. > > Implementing this in the kernel would be necessary if we needed to > protect from the guest device doing something bad to the host or > other devices. Making sure the physical IOMMU is configured to meet > guest expectations doesn't sound like it necessarily falls into that > category. We do that on a regular basis to program the DMA mappings. > Tell me more about why the hypervisor can't handle this piece of > guest/host synchronization on top of all the other things it > synchronizes to make a VM. Thanks, > I care more about "execution permission" and "privileged mode". It might be dangerous when the guest disallows the VF from sending DMA requests which have the execute or privileged bit set while the VF can still do so w/o proper protection in the IOMMU. This is all about vSVA for vfio devices, where the guest page table is directly linked in IOMMU and vfio doesn't participate in the specific DMA mappings. If an emulated device includes a pasid cap, Qemu vIOMMU will handle it for sure. Thanks Kevin
On Tue, 14 Apr 2020 03:42:42 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Tuesday, April 14, 2020 11:29 AM > > > > On Tue, 14 Apr 2020 02:40:58 +0000 > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > From: Tian, Kevin > > > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote: > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > > > > > + Don > > > > > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways with > > ATS, > > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its > > only > > > > > > > > > in PF. > > > > > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson > > wrote: > > > > > > > > > > > Is there vendor guarantee that hidden registers will locate at > > the > > > > > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, but > > the > > > > > > > > > > fact that these registers are explicitly outside of the capability > > > > > > > > > > chain implies they're only intended for device specific use, so > > I'd > > > > say > > > > > > > > > > there are no guarantees about anything related to these > > registers. 
> > > > > > > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > > > > still not reliable. > > > > > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to > > expose > > > > > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > > > > > implementation. > > > > > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If > > SIG is > > > > > > > resisting > > > > > > > > > making VF's first class citizen, we might ask them to add some > > > > verbiage > > > > > > > > > to suggest leave the same offsets as PF open to help emulation > > > > software. > > > > > > > > > > > > > > > > Even if we know where to expose these capabilities on the VF, it's > > not > > > > > > > > clear to me how we can actually virtualize the capability itself. If > > > > > > > > the spec defines, for example, an enable bit as r/w then software > > that > > > > > > > > interacts with that register expects the bit is settable. There's no > > > > > > > > protocol for "try to set the bit and re-read it to see if the hardware > > > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > > > disingenuous to the spec. > > > > > > > > > > > > > > I think we are all in violent agreement. A lot of times the pci spec > > gets > > > > > > > defined several years ahead of real products and no one > > remembers > > > > > > > the justification on why they restricted things the way they did. 
> > > > > > > > > > > > > > Maybe someone early product wasn't quite exposing these features > > to > > > > the > > > > > > > VF > > > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are enabled > > on > > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on the > > VF > > > > > > > > without the ability to control it is not the right approach. Maybe > > we > > > > > > > > > > > > > > As long as the capability enable is only provided when the PF has > > > > enabled > > > > > > > the feature. Then it seems the hardware seems to do the right thing. > > > > > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will be > > the > > > > > > > case since the PF driver needs to exist, and IOMMU would have set > > the > > > > > > > PASID/PRI/ATS on PF. > > > > > > > > > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU > > driver > > > > > > > enables PASID, the context entries for the VF are completely > > > > independent > > > > > > > from the PF context entries. > > > > > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID > > capability. > > > > > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then the > > > > > > > vIOMMU driver can disaable PASID on the VF context entries. So the > > VF > > > > > > > although the capability is blanket enabled on PF, IOMMU gaurantees > > > > the > > > > > > > transactions are blocked. > > > > > > > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > > > can be honored via help from the IOMMU for the controlling aspect.. > > > > > > > > > > > > > > Did i miss anything? > > > > > > > > > > > > Above works for emulating the enable bit (under the assumption that > > > > > > PF driver won't disable pasid when vf is assigned). 
However, there are > > > > > > also "Execute permission enable" and "Privileged mode enable" bits in > > > > > > PASID control registers. I don't know how those bits could be cleanly > > > > > > emulated when the guest writes a value different from PF's... > > > > > > > > > > sent too quick. the IOMMU also includes control bits for allowing/ > > > > > blocking execute requests and supervisor requests. We can rely on > > > > > IOMMU to block those requests to emulate the disabled cases of > > > > > all three control bits in the pasid cap. > > > > > > > > > > > > So if the emulation of the PASID capability takes into account the > > > > IOMMU configuration to back that emulation, shouldn't we do that > > > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > > > layer? Thanks, > > > > > > > > Alex > > > > > > We need enforce it in physical IOMMU, to ensure that even the > > > VF may send requests which violate the guest expectation those > > > requests are always blocked by IOMMU. Kernel vfio identifies > > > such need when emulating the pasid cap and then forward the > > > request to host iommu driver. > > > > Implementing this in the kernel would be necessary if we needed to > > protect from the guest device doing something bad to the host or > > other devices. Making sure the physical IOMMU is configured to meet > > guest expectations doesn't sound like it necessarily falls into that > > category. We do that on a regular basis to program the DMA mappings. > > Tell me more about why the hypervisor can't handle this piece of > > guest/host synchronization on top of all the other things it > > synchronizes to make a VM. Thanks, > > > > I care more about "execution permission" and "privileged mode". > It might be dangerous when the guest disallows the VF from sending "Dangerous" how? 
We're generally ok with the user managing their own consistency, it's when the user can affect other users/devices that we require vfio in the kernel to actively manage something. There's a very different scope to the vfio-pci kernel module implementing a fake capability and trying to make it behave indistinguishably from the real capability versus a userspace driver piecing together an emulation that's good enough for their purposes. Thanks, Alex > DMA requests which have the execute or privileged bit set while the VF > can still do so w/o proper protection in the IOMMU. This is all about > vSVA for vfio devices, where the guest page table is directly linked in > IOMMU and vfio doesn't participate in the specific DMA mappings. If > an emulated device includes a pasid cap, Qemu vIOMMU will handle > it for sure. > > Thanks > Kevin >
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Tuesday, April 14, 2020 11:24 PM > > On Tue, 14 Apr 2020 03:42:42 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Tuesday, April 14, 2020 11:29 AM > > > > > > On Tue, 14 Apr 2020 02:40:58 +0000 > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > > > > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > > > From: Tian, Kevin > > > > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson > wrote: > > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > > > > > > > + Don > > > > > > > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways > with > > > ATS, > > > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its > > > only > > > > > > > > > > in PF. > > > > > > > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson > > > wrote: > > > > > > > > > > > > Is there vendor guarantee that hidden registers will locate > at > > > the > > > > > > > > > > > > same offset between PF and VF config space? 
> > > > > > > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, > but > > > the > > > > > > > > > > > fact that these registers are explicitly outside of the > capability > > > > > > > > > > > chain implies they're only intended for device specific use, > so > > > I'd > > > > > say > > > > > > > > > > > there are no guarantees about anything related to these > > > registers. > > > > > > > > > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > > > > > still not reliable. > > > > > > > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to > > > expose > > > > > > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > > > > > > implementation. > > > > > > > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If > > > SIG is > > > > > > > > resisting > > > > > > > > > > making VF's first class citizen, we might ask them to add > some > > > > > verbiage > > > > > > > > > > to suggest leave the same offsets as PF open to help > emulation > > > > > software. > > > > > > > > > > > > > > > > > > Even if we know where to expose these capabilities on the VF, > it's > > > not > > > > > > > > > clear to me how we can actually virtualize the capability itself. > If > > > > > > > > > the spec defines, for example, an enable bit as r/w then > software > > > that > > > > > > > > > interacts with that register expects the bit is settable. There's > no > > > > > > > > > protocol for "try to set the bit and re-read it to see if the > hardware > > > > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > > > > disingenuous to the spec. > > > > > > > > > > > > > > > > I think we are all in violent agreement. 
A lot of times the pci spec > > > gets > > > > > > > > defined several years ahead of real products and no one > > > remembers > > > > > > > > the justification on why they restricted things the way they did. > > > > > > > > > > > > > > > > Maybe someone early product wasn't quite exposing these > features > > > to > > > > > the > > > > > > > > VF > > > > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are > enabled > > > on > > > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on > the > > > VF > > > > > > > > > without the ability to control it is not the right approach. > Maybe > > > we > > > > > > > > > > > > > > > > As long as the capability enable is only provided when the PF has > > > > > enabled > > > > > > > > the feature. Then it seems the hardware seems to do the right > thing. > > > > > > > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will > be > > > the > > > > > > > > case since the PF driver needs to exist, and IOMMU would have > set > > > the > > > > > > > > PASID/PRI/ATS on PF. > > > > > > > > > > > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU > > > driver > > > > > > > > enables PASID, the context entries for the VF are completely > > > > > independent > > > > > > > > from the PF context entries. > > > > > > > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID > > > capability. > > > > > > > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then > the > > > > > > > > vIOMMU driver can disaable PASID on the VF context entries. So > the > > > VF > > > > > > > > although the capability is blanket enabled on PF, IOMMU > gaurantees > > > > > the > > > > > > > > transactions are blocked. 
> > > > > > > > > > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > > > > can be honored via help from the IOMMU for the controlling > aspect.. > > > > > > > > > > > > > > > > Did i miss anything? > > > > > > > > > > > > > > Above works for emulating the enable bit (under the assumption > that > > > > > > > PF driver won't disable pasid when vf is assigned). However, there > are > > > > > > > also "Execute permission enable" and "Privileged mode enable" > bits in > > > > > > > PASID control registers. I don't know how those bits could be > cleanly > > > > > > > emulated when the guest writes a value different from PF's... > > > > > > > > > > > > sent too quick. the IOMMU also includes control bits for allowing/ > > > > > > blocking execute requests and supervisor requests. We can rely on > > > > > > IOMMU to block those requests to emulate the disabled cases of > > > > > > all three control bits in the pasid cap. > > > > > > > > > > > > > > > So if the emulation of the PASID capability takes into account the > > > > > IOMMU configuration to back that emulation, shouldn't we do that > > > > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > > > > layer? Thanks, > > > > > > > > > > Alex > > > > > > > > We need enforce it in physical IOMMU, to ensure that even the > > > > VF may send requests which violate the guest expectation those > > > > requests are always blocked by IOMMU. Kernel vfio identifies > > > > such need when emulating the pasid cap and then forward the > > > > request to host iommu driver. > > > > > > Implementing this in the kernel would be necessary if we needed to > > > protect from the guest device doing something bad to the host or > > > other devices. Making sure the physical IOMMU is configured to meet > > > guest expectations doesn't sound like it necessarily falls into that > > > category. We do that on a regular basis to program the DMA mappings. 
> > > Tell me more about why the hypervisor can't handle this piece of > > > guest/host synchronization on top of all the other things it > > > synchronizes to make a VM. Thanks, > > > > > > > I care more about "execution permission" and "privileged mode". > > It might be dangerous when the guest disallows the VF from sending > > "Dangerous" how? We're generally ok with the user managing their own > consistency, it's when the user can affect other users/devices that we > require vfio in the kernel to actively manage something. There's a very > different scope to the vfio-pci kernel module implementing a fake > capability and trying to make it behave indistinguishably from the real > capability versus a userspace driver piecing together an emulation > that's good enough for their purposes. Thanks, > How could emulation fix this gap when the VF DMAs don't go through the vIOMMU? What you explained all makes sense before talking about the emulation of PASID capability, i.e. vfio only cares about isolation between assigned devices. However now vfio exposes a capability which is shared by PF/VF while pure software emulation may break the guest expectation, and now the only viable mitigation is to get the help from physical IOMMU. then why cannot vfio include such mitigation in its emulation of the PASID capability? Thanks Kevin > > > DMA requests which have the execute or privileged bit set while the VF > > can still do so w/o proper protection in the IOMMU. This is all about > > vSVA for vfio devices, where the guest page table is directly linked in > > IOMMU and vfio doesn't participate in the specific DMA mappings. if > > an emulated device includes a pasid cap, Qemu vIOMMU will handle > > it for sure. > > > > Thanks > > Kevin > >
On Tue, 14 Apr 2020 23:57:33 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Tuesday, April 14, 2020 11:24 PM > > > > On Tue, 14 Apr 2020 03:42:42 +0000 > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > Sent: Tuesday, April 14, 2020 11:29 AM > > > > > > > > On Tue, 14 Apr 2020 02:40:58 +0000 > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > > > > > > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > > > > > From: Tian, Kevin > > > > > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson > > wrote: > > > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > > > > > > > > > + Don > > > > > > > > > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways > > with > > > > ATS, > > > > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its > > > > only > > > > > > > > > > > in PF. > > > > > > > > > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on that. > > > > > > > > > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson > > > > wrote: > > > > > > > > > > > > > Is there vendor guarantee that hidden registers will locate > > at > > > > the > > > > > > > > > > > > > same offset between PF and VF config space? 
> > > > > > > > > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, > > but > > > > the > > > > > > > > > > > > fact that these registers are explicitly outside of the > > capability > > > > > > > > > > > > chain implies they're only intended for device specific use, > > so > > > > I'd > > > > > > say > > > > > > > > > > > > there are no guarantees about anything related to these > > > > registers. > > > > > > > > > > > > > > > > > > > > > > As you had suggested in the other thread, we could consider > > > > > > > > > > > using the same offset as in PF, but even that's a better guess > > > > > > > > > > > still not reliable. > > > > > > > > > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF to > > > > expose > > > > > > > > > > > where the offsets should be. Sort of adding the quirk in the > > > > > > > > > > > implementation. > > > > > > > > > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. If > > > > SIG is > > > > > > > > > resisting > > > > > > > > > > > making VF's first class citizen, we might ask them to add > > some > > > > > > verbiage > > > > > > > > > > > to suggest leave the same offsets as PF open to help > > emulation > > > > > > software. > > > > > > > > > > > > > > > > > > > > Even if we know where to expose these capabilities on the VF, > > it's > > > > not > > > > > > > > > > clear to me how we can actually virtualize the capability itself. > > If > > > > > > > > > > the spec defines, for example, an enable bit as r/w then > > software > > > > that > > > > > > > > > > interacts with that register expects the bit is settable. There's > > no > > > > > > > > > > protocol for "try to set the bit and re-read it to see if the > > hardware > > > > > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > > > > > disingenuous to the spec. 
> > > > > > > > > > > > > > > > > > I think we are all in violent agreement. A lot of times the pci spec > > > > gets > > > > > > > > > defined several years ahead of real products and no one > > > > remembers > > > > > > > > > the justification on why they restricted things the way they did. > > > > > > > > > > > > > > > > > > Maybe someone early product wasn't quite exposing these > > features > > > > to > > > > > > the > > > > > > > > > VF > > > > > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are > > enabled > > > > on > > > > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on > > the > > > > VF > > > > > > > > > > without the ability to control it is not the right approach. > > Maybe > > > > we > > > > > > > > > > > > > > > > > > As long as the capability enable is only provided when the PF has > > > > > > enabled > > > > > > > > > the feature. Then it seems the hardware seems to do the right > > thing. > > > > > > > > > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will > > be > > > > the > > > > > > > > > case since the PF driver needs to exist, and IOMMU would have > > set > > > > the > > > > > > > > > PASID/PRI/ATS on PF. > > > > > > > > > > > > > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU > > > > driver > > > > > > > > > enables PASID, the context entries for the VF are completely > > > > > > independent > > > > > > > > > from the PF context entries. > > > > > > > > > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID > > > > capability. > > > > > > > > > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), then > > the > > > > > > > > > vIOMMU driver can disaable PASID on the VF context entries. 
So > > the > > > > VF > > > > > > > > > although the capability is blanket enabled on PF, IOMMU > > gaurantees > > > > > > the > > > > > > > > > transactions are blocked. > > > > > > > > > > > > > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > > > > > can be honored via help from the IOMMU for the controlling > > aspect.. > > > > > > > > > > > > > > > > > > Did i miss anything? > > > > > > > > > > > > > > > > Above works for emulating the enable bit (under the assumption > > that > > > > > > > > PF driver won't disable pasid when vf is assigned). However, there > > are > > > > > > > > also "Execute permission enable" and "Privileged mode enable" > > bits in > > > > > > > > PASID control registers. I don't know how those bits could be > > cleanly > > > > > > > > emulated when the guest writes a value different from PF's... > > > > > > > > > > > > > > sent too quick. the IOMMU also includes control bits for allowing/ > > > > > > > blocking execute requests and supervisor requests. We can rely on > > > > > > > IOMMU to block those requests to emulate the disabled cases of > > > > > > > all three control bits in the pasid cap. > > > > > > > > > > > > > > > > > > So if the emulation of the PASID capability takes into account the > > > > > > IOMMU configuration to back that emulation, shouldn't we do that > > > > > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > > > > > layer? Thanks, > > > > > > > > > > > > Alex > > > > > > > > > > We need enforce it in physical IOMMU, to ensure that even the > > > > > VF may send requests which violate the guest expectation those > > > > > requests are always blocked by IOMMU. Kernel vfio identifies > > > > > such need when emulating the pasid cap and then forward the > > > > > request to host iommu driver. 
> > > > > > > > Implementing this in the kernel would be necessary if we needed to > > > > protect from the guest device doing something bad to the host or > > > > other devices. Making sure the physical IOMMU is configured to meet > > > > guest expectations doesn't sound like it necessarily falls into that > > > > category. We do that on a regular basis to program the DMA mappings. > > > > Tell me more about why the hypervisor can't handle this piece of > > > > guest/host synchronization on top of all the other things it > > > > synchronizes to make a VM. Thanks, > > > > > > > > > > I care more about "execution permission" and "privileged mode". > > > It might be dangerous when the guest disallows the VF from sending > > > > "Dangerous" how? We're generally ok with the user managing their own > > consistency, it's when the user can affect other users/devices that we > > require vfio in the kernel to actively manage something. There's a very > > different scope to the vfio-pci kernel module implementing a fake > > capability and trying to make it behave indistinguishably from the real > > capability versus a userspace driver piecing together an emulation > > that's good enough for their purposes. Thanks, > > > > How could emulation fix this gap when the VF DMAs don't go through > the vIOMMU? What you explained all makes sense before talking about > the emulation of PASID capability, i.e. vfio only cares about isolation > between assigned devices. However now vfio exposes a capability > which is shared by PF/VF while pure software emulation may break > the guest expectation, and now the only viable mitigation is to get > the help from physical IOMMU. then why cannot vfio include such > mitigation in its emulation of the PASID capability? DMA never actually goes "through" the vIOMMU. 
I'm not suggesting that vfio doesn't participate somehow, but I don't know that emulating a capability that doesn't exist and involves policy should be done in the kernel, versus providing userspace with an interface to control what they need to implement that emulation. Thanks, Alex
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Wednesday, April 15, 2020 8:36 AM > > On Tue, 14 Apr 2020 23:57:33 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Tuesday, April 14, 2020 11:24 PM > > > > > > On Tue, 14 Apr 2020 03:42:42 +0000 > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > > Sent: Tuesday, April 14, 2020 11:29 AM > > > > > > > > > > On Tue, 14 Apr 2020 02:40:58 +0000 > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > > > > Sent: Tuesday, April 14, 2020 3:21 AM > > > > > > > > > > > > > > On Mon, 13 Apr 2020 08:05:33 +0000 > > > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > > > > > > > > > > > From: Tian, Kevin > > > > > > > > > Sent: Monday, April 13, 2020 3:55 PM > > > > > > > > > > > > > > > > > > > From: Raj, Ashok <ashok.raj@linux.intel.com> > > > > > > > > > > Sent: Monday, April 13, 2020 11:11 AM > > > > > > > > > > > > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson > > > wrote: > > > > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700 > > > > > > > > > > > "Raj, Ashok" <ashok.raj@intel.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > Hi Alex > > > > > > > > > > > > > > > > > > > > > > > > + Bjorn > > > > > > > > > > > > > > > > > > > > > > + Don > > > > > > > > > > > > > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways > > > with > > > > > ATS, > > > > > > > > > > > > where its enumerated on PF and VF. But for PASID and > PRI its > > > > > only > > > > > > > > > > > > in PF. > > > > > > > > > > > > > > > > > > > > > > > > I'm checking with our internal SIG reps to followup on > that. 
> > > > > > > > > > > > > > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex > Williamson > > > > > wrote: > > > > > > > > > > > > > > Is there vendor guarantee that hidden registers will > locate > > > at > > > > > the > > > > > > > > > > > > > > same offset between PF and VF config space? > > > > > > > > > > > > > > > > > > > > > > > > > > I'm not sure if the spec really precludes hidden registers, > > > but > > > > > the > > > > > > > > > > > > > fact that these registers are explicitly outside of the > > > capability > > > > > > > > > > > > > chain implies they're only intended for device specific > use, > > > so > > > > > I'd > > > > > > > say > > > > > > > > > > > > > there are no guarantees about anything related to these > > > > > registers. > > > > > > > > > > > > > > > > > > > > > > > > As you had suggested in the other thread, we could > consider > > > > > > > > > > > > using the same offset as in PF, but even that's a better > guess > > > > > > > > > > > > still not reliable. > > > > > > > > > > > > > > > > > > > > > > > > The other option is to maybe extend driver ops in the PF > to > > > > > expose > > > > > > > > > > > > where the offsets should be. Sort of adding the quirk in > the > > > > > > > > > > > > implementation. > > > > > > > > > > > > > > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF > devices. If > > > > > SIG is > > > > > > > > > > resisting > > > > > > > > > > > > making VF's first class citizen, we might ask them to add > > > some > > > > > > > verbiage > > > > > > > > > > > > to suggest leave the same offsets as PF open to help > > > emulation > > > > > > > software. > > > > > > > > > > > > > > > > > > > > > > Even if we know where to expose these capabilities on the > VF, > > > it's > > > > > not > > > > > > > > > > > clear to me how we can actually virtualize the capability > itself. 
> > > If > > > > > > > > > > > the spec defines, for example, an enable bit as r/w then > > > software > > > > > that > > > > > > > > > > > interacts with that register expects the bit is settable. > There's > > > no > > > > > > > > > > > protocol for "try to set the bit and re-read it to see if the > > > hardware > > > > > > > > > > > accepted it". Therefore a capability with a fixed enable bit > > > > > > > > > > > representing the state of the PF, not settable by the VF, is > > > > > > > > > > > disingenuous to the spec. > > > > > > > > > > > > > > > > > > > > I think we are all in violent agreement. A lot of times the pci > spec > > > > > gets > > > > > > > > > > defined several years ahead of real products and no one > > > > > remembers > > > > > > > > > > the justification on why they restricted things the way they > did. > > > > > > > > > > > > > > > > > > > > Maybe someone early product wasn't quite exposing these > > > features > > > > > to > > > > > > > the > > > > > > > > > > VF > > > > > > > > > > and hence the spec is bug compatible :-) > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > If what we're trying to do is expose that PASID and PRI are > > > enabled > > > > > on > > > > > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities > on > > > the > > > > > VF > > > > > > > > > > > without the ability to control it is not the right approach. > > > Maybe > > > > > we > > > > > > > > > > > > > > > > > > > > As long as the capability enable is only provided when the PF > has > > > > > > > enabled > > > > > > > > > > the feature. Then it seems the hardware seems to do the > right > > > thing. > > > > > > > > > > > > > > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It > will > > > be > > > > > the > > > > > > > > > > case since the PF driver needs to exist, and IOMMU would > have > > > set > > > > > the > > > > > > > > > > PASID/PRI/ATS on PF. 
> > > > > > > > > > > > > > > > > > > > If the emulation is purely spoofing the capability. Once > vIOMMU > > > > > driver > > > > > > > > > > enables PASID, the context entries for the VF are completely > > > > > > > independent > > > > > > > > > > from the PF context entries. > > > > > > > > > > > > > > > > > > > > vIOMMU would enable PASID, and we just spoof the PASID > > > > > capability. > > > > > > > > > > > > > > > > > > > > If vIOMMU or guest for some reason does disable_pasid(), > then > > > the > > > > > > > > > > vIOMMU driver can disaable PASID on the VF context entries. > So > > > the > > > > > VF > > > > > > > > > > although the capability is blanket enabled on PF, IOMMU > > > gaurantees > > > > > > > the > > > > > > > > > > transactions are blocked. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > In the interim, it seems like the intent of the virtual capability > > > > > > > > > > can be honored via help from the IOMMU for the controlling > > > aspect.. > > > > > > > > > > > > > > > > > > > > Did i miss anything? > > > > > > > > > > > > > > > > > > Above works for emulating the enable bit (under the > assumption > > > that > > > > > > > > > PF driver won't disable pasid when vf is assigned). However, > there > > > are > > > > > > > > > also "Execute permission enable" and "Privileged mode > enable" > > > bits in > > > > > > > > > PASID control registers. I don't know how those bits could be > > > cleanly > > > > > > > > > emulated when the guest writes a value different from PF's... > > > > > > > > > > > > > > > > sent too quick. the IOMMU also includes control bits for > allowing/ > > > > > > > > blocking execute requests and supervisor requests. We can rely > on > > > > > > > > IOMMU to block those requests to emulate the disabled cases of > > > > > > > > all three control bits in the pasid cap. 
> > > > > > > > > > > > > > > > > > > > > So if the emulation of the PASID capability takes into account the > > > > > > > IOMMU configuration to back that emulation, shouldn't we do > that > > > > > > > emulation in the hypervisor, ie. QEMU, rather than the kernel vfio > > > > > > > layer? Thanks, > > > > > > > > > > > > > > Alex > > > > > > > > > > > > We need enforce it in physical IOMMU, to ensure that even the > > > > > > VF may send requests which violate the guest expectation those > > > > > > requests are always blocked by IOMMU. Kernel vfio identifies > > > > > > such need when emulating the pasid cap and then forward the > > > > > > request to host iommu driver. > > > > > > > > > > Implementing this in the kernel would be necessary if we needed to > > > > > protect from the guest device doing something bad to the host or > > > > > other devices. Making sure the physical IOMMU is configured to meet > > > > > guest expectations doesn't sound like it necessarily falls into that > > > > > category. We do that on a regular basis to program the DMA > mappings. > > > > > Tell me more about why the hypervisor can't handle this piece of > > > > > guest/host synchronization on top of all the other things it > > > > > synchronizes to make a VM. Thanks, > > > > > > > > > > > > > I care more about "execution permission" and "privileged mode". > > > > It might be dangerous when the guest disallows the VF from sending > > > > > > "Dangerous" how? We're generally ok with the user managing their own > > > consistency, it's when the user can affect other users/devices that we > > > require vfio in the kernel to actively manage something. There's a very > > > different scope to the vfio-pci kernel module implementing a fake > > > capability and trying to make it behave indistinguishably from the real > > > capability versus a userspace driver piecing together an emulation > > > that's good enough for their purposes. 
Thanks, > > > > > > > How could emulation fix this gap when the VF DMAs don't go through > > the vIOMMU? What you explained all makes sense before talking about > > the emulation of PASID capability, i.e. vfio only cares about isolation > > between assigned devices. However now vfio exposes a capability > > which is shared by PF/VF while pure software emulation may break > > the guest expectation, and now the only viable mitigation is to get > > the help from physical IOMMU. then why cannot vfio include such > > mitigation in its emulation of the PASID capability? > > DMA never actually goes "through" the vIOMMU. I'm not suggesting that > vfio doesn't participate some how, but I don't know that emulating a > capability that doesn't exist and involves policy should be done in the > kernel, versus providing userspace with an interface to control what > they need to implement that emulation. Thanks, > OK, I see your point. We'll think about the latter option (e.g. through DEVICE_FEATURE) and put a proposal for further discussion. btw as I commented earlier, let's focus on PF first to enable vSVA given that there are already many flying bits. After that we can then extend the vSVA support to VF based on discussions in this thread. Thanks Kevin
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 4b9af99..84b4ea0 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) return 0; } +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, + int offset, uint8_t *data, int size) +{ + int ret = 0, data_offset = 0; + + while (size) { + int filled; + + if (size >= 4 && !(offset % 4)) { + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; + u32 dword; + + memcpy(&dword, data + data_offset, 4); + *dwordp = cpu_to_le32(dword); + filled = 4; + } else if (size >= 2 && !(offset % 2)) { + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; + u16 word; + + memcpy(&word, data + data_offset, 2); + *wordp = cpu_to_le16(word); + filled = 2; + } else { + u8 *byte = &vdev->vconfig[offset]; + + memcpy(byte, data + data_offset, 1); + filled = 1; + } + + offset += filled; + data_offset += filled; + size -= filled; + } + + return ret; +} + +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, + int cap, int cap_len, u8 *content) +{ + int pos, offset, len = cap_len, ret = 0; + + pos = pci_find_ext_capability(pdev, cap); + if (!pos) + return -EINVAL; + + offset = 0; + while (len) { + int fetched; + + if (len >= 4 && !(pos % 4)) { + u32 *dwordp = (u32 *) (content + offset); + u32 dword; + __le32 *dwptr = (__le32 *) &dword; + + ret = pci_read_config_dword(pdev, pos, &dword); + if (ret) + return ret; + *dwordp = le32_to_cpu(*dwptr); + fetched = 4; + } else if (len >= 2 && !(pos % 2)) { + u16 *wordp = (u16 *) (content + offset); + u16 word; + __le16 *wptr = (__le16 *) &word; + + ret = pci_read_config_word(pdev, pos, &word); + if (ret) + return ret; + *wordp = le16_to_cpu(*wptr); + fetched = 2; + } else { + u8 *byte = (u8 *) (content + offset); + + ret = pci_read_config_byte(pdev, pos, byte); + if (ret) + return ret; + fetched = 1; + } + + pos += fetched; + offset += 
fetched; + len -= fetched; + } + + return ret; +} + +struct vfio_pci_pasid_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 cap_reg_val; + struct { + u16 rsv1:1; + u16 execs:1; + u16 prvs:1; + u16 rsv2:5; + u16 pasid_bits:5; + u16 rsv3:3; + }; + } cap_reg; + union { + u16 control_reg_val; + struct { + u16 paside:1; + u16 exece:1; + u16 prve:1; + u16 rsv4:13; + }; + } control_reg; +}; + +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PASID; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pasid_cap_data pasid_cap; + struct vfio_pci_pasid_cap_data vpasid_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. + */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PASID capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pasid_cap); + if (ret) + goto out; + + if (!pasid_cap.control_reg.paside) { + pr_debug("%s: its PF's PASID capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpasid_cap, &pasid_cap, len); + + vpasid_cap.id = 0x18; + vpasid_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpasid_cap.control_reg, 0x0, + sizeof(vpasid_cap.control_reg)); + + memset(map + epos, vpasid_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpasid_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. 
+ */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +struct vfio_pci_pri_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 control_reg_val; + struct { + u16 enable:1; + u16 reset:1; + u16 rsv1:14; + }; + } control_reg; + union { + u16 status_reg_val; + struct { + u16 rf:1; + u16 uprgi:1; + u16 rsv2:6; + u16 stop:1; + u16 rsv3:6; + u16 pasid_required:1; + }; + } status_reg; + u32 prq_capacity; + u32 prq_quota; +}; + +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PRI; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pri_cap_data pri_cap; + struct vfio_pci_pri_cap_data vpri_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. + */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PRI capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pri_cap); + if (ret) + goto out; + + if (!pri_cap.control_reg.enable) { + pr_debug("%s: its PF's PRI capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpri_cap, &pri_cap, len); + + vpri_cap.id = 0x19; + vpri_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpri_cap.control_reg, 0x0, + sizeof(vpri_cap.control_reg)); + + memset(map + epos, vpri_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpri_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. 
+ */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev, + struct pci_dev *pdev, u16 start_epos, __le32 *prev) +{ + __le32 *__prev = prev; + u16 epos = start_epos, epos_next = start_epos; + int ret = 0; + + /* Add PASID capability*/ + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, + &epos_next, &__prev); + if (ret) + return ret; + + /* Add PRI capability */ + epos = epos_next; + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, + &epos_next, &__prev); + + return ret; +} + static int vfio_ecap_init(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; - u16 epos; + u16 epos, epos_max; __le32 *prev = NULL; int loops, ret, ecaps = 0; @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) return 0; epos = PCI_CFG_SPACE_SIZE; + epos_max = PCI_CFG_SPACE_SIZE; loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) } } + if (epos_max <= (epos + len)) + epos_max = epos + len; + if (!len) { pci_info(pdev, "%s: hiding ecap %#x@%#x\n", __func__, ecap, epos); @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) if (!ecaps) *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; +#ifdef CONFIG_PCI_ATS + if (pdev->is_virtfn) { + struct pci_dev *physfn = pdev->physfn; + + ret = vfio_pci_add_emulated_cap_for_vf(vdev, + physfn, epos_max, prev); + if (ret) + pr_info("%s, failed to add special caps for VF %s\n", + __func__, dev_name(&vdev->pdev->dev)); + } +#endif + return 0; } @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, return i; } +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) +{ +#ifdef CONFIG_PCI_ATS + return (pdev->is_virtfn && 
+ (cap_id == PCI_EXT_CAP_ID_PASID || + cap_id == PCI_EXT_CAP_ID_PRI)); +#else + return false; +#endif +} + static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, if (cap_id == PCI_CAP_ID_INVALID) { perm = &unassigned_perms; cap_start = *ppos; - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || + vfio_pci_need_virt_perm(pdev, cap_id)) { perm = &virt_perms; cap_start = *ppos; } else {