Message ID | 1497595719-107855-1-git-send-email-chao.gao@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
+ Eric On Fri, Jun 16, 2017 at 02:48:39PM +0800, Chao Gao wrote: >The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >we would wrongly use 00:00.0 to search VT-d unit. > >To search VT-d unit for a VF, the BDF of the PF is used. And If the >PF is an Extended Function, the BDF of one traditional function is >used. The following line (from acpi_find_matched_drhd_unit()): > devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >sets 'devfn' to 0 if PF's devfn > 8. Apparently, this line treats all >PFs as ARI-capable function and assumes the Root Port or Switch >Downstream Port immediately above the PF has ARIforwarding enabled. >However, according to SRIOV spec 3.7.3, ARI is not applicable to RC >integrated PF. For this case, we should use PF's BDF directly other >than using 0 as devfn. > >This patch adds a new pdev type to indicate a function is RC >integrated. And check whether PF is a RC integrated endpoint when >searching VT-d unit. > >Reported-by: Crawford, Eric R <Eric.R.Crawford@intel.com> >Signed-off-by: Chao Gao <chao.gao@intel.com> >--- > xen/drivers/passthrough/pci.c | 28 +++++++++++++++++++--------- > xen/drivers/passthrough/vtd/dmar.c | 7 ++++++- > xen/drivers/passthrough/vtd/intremap.c | 1 + > xen/drivers/passthrough/vtd/iommu.c | 2 ++ > xen/include/xen/pci.h | 1 + > 5 files changed, 29 insertions(+), 10 deletions(-) > >diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c >index 6e7126b..9842d76 100644 >--- a/xen/drivers/passthrough/pci.c >+++ b/xen/drivers/passthrough/pci.c >@@ -345,6 +345,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) > break; > > case DEV_TYPE_PCIe_ENDPOINT: >+ case DEV_TYPE_RC_ENDPOINT: > pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), > PCI_FUNC(devfn), PCI_CAP_ID_EXP); > BUG_ON(!pos); >@@ -854,23 +855,24 @@ int pci_release_devices(struct domain *d) > > enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) > { >- u16 class_device, creg; >- u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); >+ uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); > int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP); >+ int pcie_type = -1; > >- class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE); >- switch ( class_device ) >+ if ( pos ) >+ pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f, >+ pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE); >+ switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) ) > { > case PCI_CLASS_BRIDGE_PCI: >- if ( !pos ) >- return DEV_TYPE_LEGACY_PCI_BRIDGE; >- creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS); >- switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 ) >+ switch ( pcie_type ) > { > case PCI_EXP_TYPE_PCI_BRIDGE: > return DEV_TYPE_PCIe2PCI_BRIDGE; > case PCI_EXP_TYPE_PCIE_BRIDGE: > return DEV_TYPE_PCI2PCIe_BRIDGE; >+ case -1: >+ return DEV_TYPE_LEGACY_PCI_BRIDGE; > } > return DEV_TYPE_PCIe_BRIDGE; > case PCI_CLASS_BRIDGE_HOST: >@@ -880,7 +882,15 @@ enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) > return DEV_TYPE_PCI_UNKNOWN; > } > >- return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI; >+ switch ( pcie_type ) >+ { >+ case PCI_EXP_TYPE_RC_END: >+ return DEV_TYPE_RC_ENDPOINT; >+ case -1: >+ return DEV_TYPE_PCI; >+ } >+ >+ return DEV_TYPE_PCIe_ENDPOINT; > } > > /* >diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c >index 82040dd..7c9d17b 100644 >--- a/xen/drivers/passthrough/vtd/dmar.c >+++ b/xen/drivers/passthrough/vtd/dmar.c >@@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) > else if ( pdev->info.is_virtfn ) > { > bus = pdev->info.physfn.bus; >- devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >+ /* ARI is not appliable to Root Complex Integrated Endpoints */ >+ if ( PCI_SLOT(pdev->info.physfn.devfn) && >+ (pdev->type != DEV_TYPE_RC_ENDPOINT) ) >+ devfn = 0; >+ else >+ devfn = pdev->info.physfn.devfn; > } > else > { >diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c >index 1e0317c..bae0d3b 100644 >--- a/xen/drivers/passthrough/vtd/intremap.c >+++ b/xen/drivers/passthrough/vtd/intremap.c >@@ -486,6 +486,7 @@ static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) > unsigned int sq; > > case DEV_TYPE_PCIe_ENDPOINT: >+ case DEV_TYPE_RC_ENDPOINT: > case DEV_TYPE_PCIe_BRIDGE: > case DEV_TYPE_PCIe2PCI_BRIDGE: > case DEV_TYPE_PCI_HOST_BRIDGE: >diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c >index 19328f6..73f3095 100644 >--- a/xen/drivers/passthrough/vtd/iommu.c >+++ b/xen/drivers/passthrough/vtd/iommu.c >@@ -1493,6 +1493,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, > break; > > case DEV_TYPE_PCIe_ENDPOINT: >+ case DEV_TYPE_RC_ENDPOINT: > if ( iommu_debug ) > printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", > domain->domain_id, seg, bus, >@@ -1644,6 +1645,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, > goto out; > > case DEV_TYPE_PCIe_ENDPOINT: >+ case DEV_TYPE_RC_ENDPOINT: > if ( iommu_debug ) > printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", > domain->domain_id, seg, bus, >diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h >index 59b6e8a..4ec74b1 100644 >--- a/xen/include/xen/pci.h >+++ b/xen/include/xen/pci.h >@@ -67,6 +67,7 @@ struct pci_dev { > enum pdev_type { > DEV_TYPE_PCI_UNKNOWN, > DEV_TYPE_PCIe_ENDPOINT, >+ DEV_TYPE_RC_ENDPOINT, // Root Complex Integrated Endpoint > DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch > DEV_TYPE_PCIe2PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge > DEV_TYPE_PCI2PCIe_BRIDGE, // PCI/PCIx-to-PCIe bridge >-- >1.8.3.1 >
>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: > The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), > we would wrongly use 00:00.0 to search VT-d unit. > > To search VT-d unit for a VF, the BDF of the PF is used. And If the > PF is an Extended Function, the BDF of one traditional function is > used. The following line (from acpi_find_matched_drhd_unit()): > devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; > sets 'devfn' to 0 if PF's devfn > 8. Is that really the relevant line? Since you say PF is an Extended Function, wouldn't if ( pdev->info.is_extfn ) { bus = pdev->bus; devfn = 0; } be the relevant code? Or else - is is_extfn not being set correctly? > @@ -854,23 +855,24 @@ int pci_release_devices(struct domain *d) > > enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) > { > - u16 class_device, creg; > - u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); > + uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); > int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP); > + int pcie_type = -1; > > - class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE); > - switch ( class_device ) > + if ( pos ) > + pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f, > + pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE); Indentation. > + switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) ) > { > case PCI_CLASS_BRIDGE_PCI: > - if ( !pos ) > - return DEV_TYPE_LEGACY_PCI_BRIDGE; > - creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS); > - switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 ) > + switch ( pcie_type ) > { > case PCI_EXP_TYPE_PCI_BRIDGE: > return DEV_TYPE_PCIe2PCI_BRIDGE; > case PCI_EXP_TYPE_PCIE_BRIDGE: > return DEV_TYPE_PCI2PCIe_BRIDGE; > + case -1: > + return DEV_TYPE_LEGACY_PCI_BRIDGE; > } > return DEV_TYPE_PCIe_BRIDGE; While overall I appreciate the cleanup your doing at once, please don't merge it with a change which isn't easy to follow for most of us. Aiui there's no behavioral change in this first case block, but that's rather un-obvious to work out. > --- a/xen/drivers/passthrough/vtd/dmar.c > +++ b/xen/drivers/passthrough/vtd/dmar.c > @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) > else if ( pdev->info.is_virtfn ) > { > bus = pdev->info.physfn.bus; > - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; > + /* ARI is not appliable to Root Complex Integrated Endpoints */ > + if ( PCI_SLOT(pdev->info.physfn.devfn) && > + (pdev->type != DEV_TYPE_RC_ENDPOINT) ) > + devfn = 0; > + else > + devfn = pdev->info.physfn.devfn; > } While I think I understand some of PCI, I have to admit that the connection to ARI is not at all obvious to me at this place in the sources. Hence I'd appreciate if you would extend the comment. Jan
On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote: >>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: >> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >> we would wrongly use 00:00.0 to search VT-d unit. >> >> To search VT-d unit for a VF, the BDF of the PF is used. And If the >> PF is an Extended Function, the BDF of one traditional function is >> used. The following line (from acpi_find_matched_drhd_unit()): >> devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >> sets 'devfn' to 0 if PF's devfn > 8. > >Is that really the relevant line? Since you say PF is an Extended >Function, wouldn't > > if ( pdev->info.is_extfn ) > { > bus = pdev->bus; > devfn = 0; > } > >be the relevant code? Or else - is is_extfn not being set correctly? > I think this field is not being set for VF. And here what we want to know is whether the PF of this VF is an extended functin. We also can add a new field 'is_extfn' in pdev->info.physfn and change the caller in linux kernel accordingly. But it will be not compatible with the old kernel. > >> --- a/xen/drivers/passthrough/vtd/dmar.c >> +++ b/xen/drivers/passthrough/vtd/dmar.c >> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) >> else if ( pdev->info.is_virtfn ) >> { >> bus = pdev->info.physfn.bus; >> - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >> + /* ARI is not appliable to Root Complex Integrated Endpoints */ >> + if ( PCI_SLOT(pdev->info.physfn.devfn) && >> + (pdev->type != DEV_TYPE_RC_ENDPOINT) ) >> + devfn = 0; >> + else >> + devfn = pdev->info.physfn.devfn; >> } > >While I think I understand some of PCI, I have to admit that the >connection to ARI is not at all obvious to me at this place in the >sources. Hence I'd appreciate if you would extend the comment. Ok. How about this: - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; + /* + * Use 0 as the devfn to search VT-d unit when the PF is an Extended + * Function (means within an ARI Device, a Function whose Function Number + * is greater than 7). + */ + if ( PCI_SLOT(pdev->info.physfn.devfn) && + (pci_find_ext_capability(pdev->seg, bus, + pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) ) + devfn = 0; + else + devfn = pdev->info.physfn.devfn; Thanks Chao
>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote: > On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote: >>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: >>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >>> we would wrongly use 00:00.0 to search VT-d unit. >>> >>> To search VT-d unit for a VF, the BDF of the PF is used. And If the >>> PF is an Extended Function, the BDF of one traditional function is >>> used. The following line (from acpi_find_matched_drhd_unit()): >>> devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>> sets 'devfn' to 0 if PF's devfn > 8. >> >>Is that really the relevant line? Since you say PF is an Extended >>Function, wouldn't >> >> if ( pdev->info.is_extfn ) >> { >> bus = pdev->bus; >> devfn = 0; >> } >> >>be the relevant code? Or else - is is_extfn not being set correctly? > > I think this field is not being set for VF. And here what we want to > know is whether the PF of this VF is an extended functin. We also can add > a new field 'is_extfn' in pdev->info.physfn and change the caller in > linux kernel accordingly. But it will be not compatible with the old kernel. Wait, no - I did describe things slightly wrongly, and hence perhaps managed to confuse you (besides myself). For the VF we don't want to see is_extfn set, but for its PF I'd expect that to be the case. With that I'd then think looking up the struct pci_dev for the PF is all it takes to tell apart both cases, the more that I'm not sure ... >>> --- a/xen/drivers/passthrough/vtd/dmar.c >>> +++ b/xen/drivers/passthrough/vtd/dmar.c >>> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) >>> else if ( pdev->info.is_virtfn ) >>> { >>> bus = pdev->info.physfn.bus; >>> - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>> + /* ARI is not appliable to Root Complex Integrated Endpoints */ >>> + if ( PCI_SLOT(pdev->info.physfn.devfn) && >>> + (pdev->type != DEV_TYPE_RC_ENDPOINT) ) ... checking VF's type (instead of PF's) here is appropriate / most compatible. >>> + devfn = 0; >>> + else >>> + devfn = pdev->info.physfn.devfn; >>> } >> >>While I think I understand some of PCI, I have to admit that the >>connection to ARI is not at all obvious to me at this place in the >>sources. Hence I'd appreciate if you would extend the comment. > > Ok. How about this: > > - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; > + /* > + * Use 0 as the devfn to search VT-d unit when the PF is an Extended > + * Function (means within an ARI Device, a Function whose Function Number > + * is greater than 7). > + */ > + if ( PCI_SLOT(pdev->info.physfn.devfn) && > + (pci_find_ext_capability(pdev->seg, bus, > + pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) ) > + devfn = 0; > + else > + devfn = pdev->info.physfn.devfn; That's better, but I'm still having some difficulty. In the Linux kernel we have if (pci_dev->is_virtfn) { ... } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) add->flags = XEN_PCI_DEV_EXTFN; which tells me that the mere presence of an ARI capability may not be enough. Furthermore Linux checks whether devfn is zero in pci_configure_ari(), not whether PCI_SLOT(devfn) is non-zero - wouldn't that mean you want to pass a zero devfn to pci_find_ext_capability() above? Jan
On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote: >>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote: >> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote: >>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: >>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >>>> we would wrongly use 00:00.0 to search VT-d unit. >>>> >>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the >>>> PF is an Extended Function, the BDF of one traditional function is >>>> used. The following line (from acpi_find_matched_drhd_unit()): >>>> devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>>> sets 'devfn' to 0 if PF's devfn > 8. >>> >>>Is that really the relevant line? Since you say PF is an Extended >>>Function, wouldn't >>> >>> if ( pdev->info.is_extfn ) >>> { >>> bus = pdev->bus; >>> devfn = 0; >>> } >>> >>>be the relevant code? Or else - is is_extfn not being set correctly? >> >> I think this field is not being set for VF. And here what we want to >> know is whether the PF of this VF is an extended functin. We also can add >> a new field 'is_extfn' in pdev->info.physfn and change the caller in >> linux kernel accordingly. But it will be not compatible with the old kernel. > >Wait, no - I did describe things slightly wrongly, and hence perhaps >managed to confuse you (besides myself). For the VF we don't want >to see is_extfn set, but for its PF I'd expect that to be the case. >With that I'd then think looking up the struct pci_dev for the PF is all >it takes to tell apart both cases, the more that I'm not sure ... Impressive! > >>>> --- a/xen/drivers/passthrough/vtd/dmar.c >>>> +++ b/xen/drivers/passthrough/vtd/dmar.c >>>> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) >>>> else if ( pdev->info.is_virtfn ) >>>> { >>>> bus = pdev->info.physfn.bus; >>>> - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>>> + /* ARI is not appliable to Root Complex Integrated Endpoints */ >>>> + if ( PCI_SLOT(pdev->info.physfn.devfn) && >>>> + (pdev->type != DEV_TYPE_RC_ENDPOINT) ) > >... checking VF's type (instead of PF's) here is appropriate / most >compatible. It was a mistake. Sorry for this. > >>>> + devfn = 0; >>>> + else >>>> + devfn = pdev->info.physfn.devfn; >>>> } >>> >>>While I think I understand some of PCI, I have to admit that the >>>connection to ARI is not at all obvious to me at this place in the >>>sources. Hence I'd appreciate if you would extend the comment. >> >> Ok. How about this: >> >> - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >> + /* >> + * Use 0 as the devfn to search VT-d unit when the PF is an Extended >> + * Function (means within an ARI Device, a Function whose Function Number >> + * is greater than 7). >> + */ >> + if ( PCI_SLOT(pdev->info.physfn.devfn) && >> + (pci_find_ext_capability(pdev->seg, bus, >> + pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) ) >> + devfn = 0; >> + else >> + devfn = pdev->info.physfn.devfn; > >That's better, but I'm still having some difficulty. In the Linux kernel >we have > > if (pci_dev->is_virtfn) { > ... > } else > if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) > add->flags = XEN_PCI_DEV_EXTFN; > >which tells me that the mere presence of an ARI capability may not >be enough. Furthermore Linux checks whether devfn is zero in >pci_configure_ari(), not whether PCI_SLOT(devfn) is non-zero - >wouldn't that mean you want to pass a zero devfn to >pci_find_ext_capability() above? No. pci_configure_ari() is to enable ARI forwarding, which is a feature of downstream port. ARI capability is a feature of device. Anyhow, I finally understand what you said, which is actually what I want. If I can easily handle the _pcidevs_lock to call pci_get_pdev(), it is the best solution I think.
On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote: >>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote: >> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote: >>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: >>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >>>> we would wrongly use 00:00.0 to search VT-d unit. >>>> >>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the >>>> PF is an Extended Function, the BDF of one traditional function is >>>> used. The following line (from acpi_find_matched_drhd_unit()): >>>> devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>>> sets 'devfn' to 0 if PF's devfn > 8. >>> >>>Is that really the relevant line? Since you say PF is an Extended >>>Function, wouldn't >>> >>> if ( pdev->info.is_extfn ) >>> { >>> bus = pdev->bus; >>> devfn = 0; >>> } >>> >>>be the relevant code? Or else - is is_extfn not being set correctly? >> >> I think this field is not being set for VF. And here what we want to >> know is whether the PF of this VF is an extended functin. We also can add >> a new field 'is_extfn' in pdev->info.physfn and change the caller in >> linux kernel accordingly. But it will be not compatible with the old kernel. > >Wait, no - I did describe things slightly wrongly, and hence perhaps >managed to confuse you (besides myself). For the VF we don't want >to see is_extfn set, but for its PF I'd expect that to be the case. >With that I'd then think looking up the struct pci_dev for the PF is all >it takes to tell apart both cases, the more that I'm not sure ... Hi, Jan. in pci_add_device(): else if (info->is_virtfn) { pcidevs_lock(); pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn); pcidevs_unlock(); if ( !pdev ) pci_add_device(seg, info->physfn.bus, info->physfn.devfn, NULL, node); pdev_type = "virtual function"; } could you recall in which case, we can't get the PF by pci_get_pdev() above? The reason why I want to know is in this case, is_extfn of the PF may not be set correctly. Thanks Chao
>>> On 20.06.17 at 12:51, <chao.gao@intel.com> wrote: > On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote: >>>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote: >>> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote: >>>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote: >>>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), >>>>> we would wrongly use 00:00.0 to search VT-d unit. >>>>> >>>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the >>>>> PF is an Extended Function, the BDF of one traditional function is >>>>> used. The following line (from acpi_find_matched_drhd_unit()): >>>>> devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; >>>>> sets 'devfn' to 0 if PF's devfn > 8. >>>> >>>>Is that really the relevant line? Since you say PF is an Extended >>>>Function, wouldn't >>>> >>>> if ( pdev->info.is_extfn ) >>>> { >>>> bus = pdev->bus; >>>> devfn = 0; >>>> } >>>> >>>>be the relevant code? Or else - is is_extfn not being set correctly? >>> >>> I think this field is not being set for VF. And here what we want to >>> know is whether the PF of this VF is an extended functin. We also can add >>> a new field 'is_extfn' in pdev->info.physfn and change the caller in >>> linux kernel accordingly. But it will be not compatible with the old kernel. >> >>Wait, no - I did describe things slightly wrongly, and hence perhaps >>managed to confuse you (besides myself). For the VF we don't want >>to see is_extfn set, but for its PF I'd expect that to be the case. >>With that I'd then think looking up the struct pci_dev for the PF is all >>it takes to tell apart both cases, the more that I'm not sure ... > > Hi, Jan. in pci_add_device(): > > else if (info->is_virtfn) > { > pcidevs_lock(); > pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn); > pcidevs_unlock(); > if ( !pdev ) > pci_add_device(seg, info->physfn.bus, info->physfn.devfn, > NULL, node); > pdev_type = "virtual function"; > } > > could you recall in which case, we can't get the PF by > pci_get_pdev() above? This is just a safety measure to make sure we have a PF device to refer to (and we don't want to return failure because of that). I'm not aware of the path actually being needed (and even if it was taken, I'd expect a subsequent hypercall to report the PF). Jan
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c index 6e7126b..9842d76 100644 --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -345,6 +345,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) break; case DEV_TYPE_PCIe_ENDPOINT: + case DEV_TYPE_RC_ENDPOINT: pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_CAP_ID_EXP); BUG_ON(!pos); @@ -854,23 +855,24 @@ int pci_release_devices(struct domain *d) enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) { - u16 class_device, creg; - u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); + uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP); + int pcie_type = -1; - class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE); - switch ( class_device ) + if ( pos ) + pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f, + pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE); + switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) ) { case PCI_CLASS_BRIDGE_PCI: - if ( !pos ) - return DEV_TYPE_LEGACY_PCI_BRIDGE; - creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS); - switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 ) + switch ( pcie_type ) { case PCI_EXP_TYPE_PCI_BRIDGE: return DEV_TYPE_PCIe2PCI_BRIDGE; case PCI_EXP_TYPE_PCIE_BRIDGE: return DEV_TYPE_PCI2PCIe_BRIDGE; + case -1: + return DEV_TYPE_LEGACY_PCI_BRIDGE; } return DEV_TYPE_PCIe_BRIDGE; case PCI_CLASS_BRIDGE_HOST: @@ -880,7 +882,15 @@ enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) return DEV_TYPE_PCI_UNKNOWN; } - return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI; + switch ( pcie_type ) + { + case PCI_EXP_TYPE_RC_END: + return DEV_TYPE_RC_ENDPOINT; + case -1: + return DEV_TYPE_PCI; + } + + return DEV_TYPE_PCIe_ENDPOINT; } /* diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c index 82040dd..7c9d17b 100644 --- a/xen/drivers/passthrough/vtd/dmar.c +++ b/xen/drivers/passthrough/vtd/dmar.c @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) else if ( pdev->info.is_virtfn ) { bus = pdev->info.physfn.bus; - devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; + /* ARI is not appliable to Root Complex Integrated Endpoints */ + if ( PCI_SLOT(pdev->info.physfn.devfn) && + (pdev->type != DEV_TYPE_RC_ENDPOINT) ) + devfn = 0; + else + devfn = pdev->info.physfn.devfn; } else { diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c index 1e0317c..bae0d3b 100644 --- a/xen/drivers/passthrough/vtd/intremap.c +++ b/xen/drivers/passthrough/vtd/intremap.c @@ -486,6 +486,7 @@ static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) unsigned int sq; case DEV_TYPE_PCIe_ENDPOINT: + case DEV_TYPE_RC_ENDPOINT: case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_PCI_HOST_BRIDGE: diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c index 19328f6..73f3095 100644 --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -1493,6 +1493,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, break; case DEV_TYPE_PCIe_ENDPOINT: + case DEV_TYPE_RC_ENDPOINT: if ( iommu_debug ) printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, @@ -1644,6 +1645,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, goto out; case DEV_TYPE_PCIe_ENDPOINT: + case DEV_TYPE_RC_ENDPOINT: if ( iommu_debug ) printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h index 59b6e8a..4ec74b1 100644 --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -67,6 +67,7 @@ struct pci_dev { enum pdev_type { DEV_TYPE_PCI_UNKNOWN, DEV_TYPE_PCIe_ENDPOINT, + DEV_TYPE_RC_ENDPOINT, // Root Complex Integrated Endpoint DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch DEV_TYPE_PCIe2PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge DEV_TYPE_PCI2PCIe_BRIDGE, // PCI/PCIx-to-PCIe bridge
The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0), we would wrongly use 00:00.0 to search VT-d unit. To search VT-d unit for a VF, the BDF of the PF is used. And If the PF is an Extended Function, the BDF of one traditional function is used. The following line (from acpi_find_matched_drhd_unit()): devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn; sets 'devfn' to 0 if PF's devfn > 8. Apparently, this line treats all PFs as ARI-capable function and assumes the Root Port or Switch Downstream Port immediately above the PF has ARIforwarding enabled. However, according to SRIOV spec 3.7.3, ARI is not applicable to RC integrated PF. For this case, we should use PF's BDF directly other than using 0 as devfn. This patch adds a new pdev type to indicate a function is RC integrated. And check whether PF is a RC integrated endpoint when searching VT-d unit. Reported-by: Crawford, Eric R <Eric.R.Crawford@intel.com> Signed-off-by: Chao Gao <chao.gao@intel.com> --- xen/drivers/passthrough/pci.c | 28 +++++++++++++++++++--------- xen/drivers/passthrough/vtd/dmar.c | 7 ++++++- xen/drivers/passthrough/vtd/intremap.c | 1 + xen/drivers/passthrough/vtd/iommu.c | 2 ++ xen/include/xen/pci.h | 1 + 5 files changed, 29 insertions(+), 10 deletions(-)