diff mbox

VT-d: fix VF of RC integrated endpoint matched to wrong VT-d unit

Message ID 1497595719-107855-1-git-send-email-chao.gao@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Chao Gao June 16, 2017, 6:48 a.m. UTC
The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
we would wrongly use 00:00.0 to search VT-d unit.

To search VT-d unit for a VF, the BDF of the PF is used. And If the
PF is an Extended Function, the BDF of one traditional function is
used. The following line (from acpi_find_matched_drhd_unit()):
    devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
sets 'devfn' to 0 if PF's devfn > 8. Apparently, this line treats all
PFs as ARI-capable function and assumes the Root Port or Switch
Downstream Port immediately above the PF has ARIforwarding enabled.
However, according to SRIOV spec 3.7.3, ARI is not applicable to RC
integrated PF. For this case, we should use PF's BDF directly other
than using 0 as devfn.

This patch adds a new pdev type to indicate a function is RC
integrated. And check whether PF is a RC integrated endpoint when
searching VT-d unit.

Reported-by: Crawford, Eric R <Eric.R.Crawford@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 xen/drivers/passthrough/pci.c          | 28 +++++++++++++++++++---------
 xen/drivers/passthrough/vtd/dmar.c     |  7 ++++++-
 xen/drivers/passthrough/vtd/intremap.c |  1 +
 xen/drivers/passthrough/vtd/iommu.c    |  2 ++
 xen/include/xen/pci.h                  |  1 +
 5 files changed, 29 insertions(+), 10 deletions(-)

Comments

Chao Gao June 16, 2017, 6:57 a.m. UTC | #1
+ Eric

On Fri, Jun 16, 2017 at 02:48:39PM +0800, Chao Gao wrote:
>The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>we would wrongly use 00:00.0 to search VT-d unit.
>
>To search VT-d unit for a VF, the BDF of the PF is used. And If the
>PF is an Extended Function, the BDF of one traditional function is
>used. The following line (from acpi_find_matched_drhd_unit()):
>    devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>sets 'devfn' to 0 if PF's devfn > 8. Apparently, this line treats all
>PFs as ARI-capable function and assumes the Root Port or Switch
>Downstream Port immediately above the PF has ARIforwarding enabled.
>However, according to SRIOV spec 3.7.3, ARI is not applicable to RC
>integrated PF. For this case, we should use PF's BDF directly other
>than using 0 as devfn.
>
>This patch adds a new pdev type to indicate a function is RC
>integrated. And check whether PF is a RC integrated endpoint when
>searching VT-d unit.
>
>Reported-by: Crawford, Eric R <Eric.R.Crawford@intel.com>
>Signed-off-by: Chao Gao <chao.gao@intel.com>
>---
> xen/drivers/passthrough/pci.c          | 28 +++++++++++++++++++---------
> xen/drivers/passthrough/vtd/dmar.c     |  7 ++++++-
> xen/drivers/passthrough/vtd/intremap.c |  1 +
> xen/drivers/passthrough/vtd/iommu.c    |  2 ++
> xen/include/xen/pci.h                  |  1 +
> 5 files changed, 29 insertions(+), 10 deletions(-)
>
>diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>index 6e7126b..9842d76 100644
>--- a/xen/drivers/passthrough/pci.c
>+++ b/xen/drivers/passthrough/pci.c
>@@ -345,6 +345,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>             break;
> 
>         case DEV_TYPE_PCIe_ENDPOINT:
>+        case DEV_TYPE_RC_ENDPOINT:
>             pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn),
>                                       PCI_FUNC(devfn), PCI_CAP_ID_EXP);
>             BUG_ON(!pos);
>@@ -854,23 +855,24 @@ int pci_release_devices(struct domain *d)
> 
> enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn)
> {
>-    u16 class_device, creg;
>-    u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
>+    uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
>     int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP);
>+    int pcie_type = -1;
> 
>-    class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE);
>-    switch ( class_device )
>+    if ( pos )
>+        pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f,
>+                                  pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE);
>+    switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) )
>     {
>     case PCI_CLASS_BRIDGE_PCI:
>-        if ( !pos )
>-            return DEV_TYPE_LEGACY_PCI_BRIDGE;
>-        creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS);
>-        switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 )
>+        switch ( pcie_type )
>         {
>         case PCI_EXP_TYPE_PCI_BRIDGE:
>             return DEV_TYPE_PCIe2PCI_BRIDGE;
>         case PCI_EXP_TYPE_PCIE_BRIDGE:
>             return DEV_TYPE_PCI2PCIe_BRIDGE;
>+        case -1:
>+            return DEV_TYPE_LEGACY_PCI_BRIDGE;
>         }
>         return DEV_TYPE_PCIe_BRIDGE;
>     case PCI_CLASS_BRIDGE_HOST:
>@@ -880,7 +882,15 @@ enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn)
>         return DEV_TYPE_PCI_UNKNOWN;
>     }
> 
>-    return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI;
>+    switch ( pcie_type )
>+    {
>+    case PCI_EXP_TYPE_RC_END:
>+        return DEV_TYPE_RC_ENDPOINT;
>+    case -1:
>+        return DEV_TYPE_PCI;
>+    }
>+
>+    return DEV_TYPE_PCIe_ENDPOINT;
> }
> 
> /*
>diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
>index 82040dd..7c9d17b 100644
>--- a/xen/drivers/passthrough/vtd/dmar.c
>+++ b/xen/drivers/passthrough/vtd/dmar.c
>@@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>     else if ( pdev->info.is_virtfn )
>     {
>         bus = pdev->info.physfn.bus;
>-        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>+        /* ARI is not appliable to Root Complex Integrated Endpoints */
>+        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
>+             (pdev->type != DEV_TYPE_RC_ENDPOINT) )
>+            devfn = 0;
>+        else
>+            devfn = pdev->info.physfn.devfn;
>     }
>     else
>     {
>diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c
>index 1e0317c..bae0d3b 100644
>--- a/xen/drivers/passthrough/vtd/intremap.c
>+++ b/xen/drivers/passthrough/vtd/intremap.c
>@@ -486,6 +486,7 @@ static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
>         unsigned int sq;
> 
>     case DEV_TYPE_PCIe_ENDPOINT:
>+    case DEV_TYPE_RC_ENDPOINT:
>     case DEV_TYPE_PCIe_BRIDGE:
>     case DEV_TYPE_PCIe2PCI_BRIDGE:
>     case DEV_TYPE_PCI_HOST_BRIDGE:
>diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
>index 19328f6..73f3095 100644
>--- a/xen/drivers/passthrough/vtd/iommu.c
>+++ b/xen/drivers/passthrough/vtd/iommu.c
>@@ -1493,6 +1493,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
>         break;
> 
>     case DEV_TYPE_PCIe_ENDPOINT:
>+    case DEV_TYPE_RC_ENDPOINT:
>         if ( iommu_debug )
>             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
>                    domain->domain_id, seg, bus,
>@@ -1644,6 +1645,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
>         goto out;
> 
>     case DEV_TYPE_PCIe_ENDPOINT:
>+    case DEV_TYPE_RC_ENDPOINT:
>         if ( iommu_debug )
>             printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
>                    domain->domain_id, seg, bus,
>diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
>index 59b6e8a..4ec74b1 100644
>--- a/xen/include/xen/pci.h
>+++ b/xen/include/xen/pci.h
>@@ -67,6 +67,7 @@ struct pci_dev {
>     enum pdev_type {
>         DEV_TYPE_PCI_UNKNOWN,
>         DEV_TYPE_PCIe_ENDPOINT,
>+        DEV_TYPE_RC_ENDPOINT,       // Root Complex Integrated Endpoint
>         DEV_TYPE_PCIe_BRIDGE,       // PCIe root port, switch
>         DEV_TYPE_PCIe2PCI_BRIDGE,   // PCIe-to-PCI/PCIx bridge
>         DEV_TYPE_PCI2PCIe_BRIDGE,   // PCI/PCIx-to-PCIe bridge
>-- 
>1.8.3.1
>
Jan Beulich June 16, 2017, 3:52 p.m. UTC | #2
>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
> we would wrongly use 00:00.0 to search VT-d unit.
> 
> To search VT-d unit for a VF, the BDF of the PF is used. And If the
> PF is an Extended Function, the BDF of one traditional function is
> used. The following line (from acpi_find_matched_drhd_unit()):
>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
> sets 'devfn' to 0 if PF's devfn > 8.

Is that really the relevant line? Since you say PF is an Extended
Function, wouldn't

    if ( pdev->info.is_extfn )
    {
        bus = pdev->bus;
        devfn = 0;
    }

be the relevant code? Or else - is is_extfn not being set correctly?

> @@ -854,23 +855,24 @@ int pci_release_devices(struct domain *d)
>  
>  enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn)
>  {
> -    u16 class_device, creg;
> -    u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
> +    uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
>      int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP);
> +    int pcie_type = -1;
>  
> -    class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE);
> -    switch ( class_device )
> +    if ( pos )
> +        pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f,
> +                                  pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE);

Indentation.

> +    switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) )
>      {
>      case PCI_CLASS_BRIDGE_PCI:
> -        if ( !pos )
> -            return DEV_TYPE_LEGACY_PCI_BRIDGE;
> -        creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS);
> -        switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 )
> +        switch ( pcie_type )
>          {
>          case PCI_EXP_TYPE_PCI_BRIDGE:
>              return DEV_TYPE_PCIe2PCI_BRIDGE;
>          case PCI_EXP_TYPE_PCIE_BRIDGE:
>              return DEV_TYPE_PCI2PCIe_BRIDGE;
> +        case -1:
> +            return DEV_TYPE_LEGACY_PCI_BRIDGE;
>          }
>          return DEV_TYPE_PCIe_BRIDGE;

While overall I appreciate the cleanup your doing at once, please
don't merge it with a change which isn't easy to follow for most of
us. Aiui there's no behavioral change in this first case block, but
that's rather un-obvious to work out.

> --- a/xen/drivers/passthrough/vtd/dmar.c
> +++ b/xen/drivers/passthrough/vtd/dmar.c
> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>      else if ( pdev->info.is_virtfn )
>      {
>          bus = pdev->info.physfn.bus;
> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
> +        /* ARI is not appliable to Root Complex Integrated Endpoints */
> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
> +             (pdev->type != DEV_TYPE_RC_ENDPOINT) )
> +            devfn = 0;
> +        else
> +            devfn = pdev->info.physfn.devfn;
>      }

While I think I understand some of PCI, I have to admit that the
connection to ARI is not at all obvious to me at this place in the
sources. Hence I'd appreciate if you would extend the comment.

Jan
Chao Gao June 19, 2017, 6:33 a.m. UTC | #3
On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote:
>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>> we would wrongly use 00:00.0 to search VT-d unit.
>> 
>> To search VT-d unit for a VF, the BDF of the PF is used. And If the
>> PF is an Extended Function, the BDF of one traditional function is
>> used. The following line (from acpi_find_matched_drhd_unit()):
>>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>> sets 'devfn' to 0 if PF's devfn > 8.
>
>Is that really the relevant line? Since you say PF is an Extended
>Function, wouldn't
>
>    if ( pdev->info.is_extfn )
>    {
>        bus = pdev->bus;
>        devfn = 0;
>    }
>
>be the relevant code? Or else - is is_extfn not being set correctly?
>

I think this field is not being set for VF. And here what we want to
know is whether the PF of this VF is an extended functin. We also can add
a new field 'is_extfn' in pdev->info.physfn and change the caller in
linux kernel accordingly. But it will be not compatible with the old kernel.

>
>> --- a/xen/drivers/passthrough/vtd/dmar.c
>> +++ b/xen/drivers/passthrough/vtd/dmar.c
>> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>>      else if ( pdev->info.is_virtfn )
>>      {
>>          bus = pdev->info.physfn.bus;
>> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>> +        /* ARI is not appliable to Root Complex Integrated Endpoints */
>> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
>> +             (pdev->type != DEV_TYPE_RC_ENDPOINT) )
>> +            devfn = 0;
>> +        else
>> +            devfn = pdev->info.physfn.devfn;
>>      }
>
>While I think I understand some of PCI, I have to admit that the
>connection to ARI is not at all obvious to me at this place in the
>sources. Hence I'd appreciate if you would extend the comment.

Ok. How about this:

-        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
+        /* 
+         * Use 0 as the devfn to search VT-d unit when the PF is an Extended
+         * Function (means within an ARI Device, a Function whose Function Number
+         * is greater than 7).
+         */
+        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
+             (pci_find_ext_capability(pdev->seg, bus,
+                         pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) )
+            devfn = 0;
+        else
+            devfn = pdev->info.physfn.devfn;

Thanks
Chao
Jan Beulich June 19, 2017, 7:43 a.m. UTC | #4
>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote:
> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote:
>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>>> we would wrongly use 00:00.0 to search VT-d unit.
>>> 
>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the
>>> PF is an Extended Function, the BDF of one traditional function is
>>> used. The following line (from acpi_find_matched_drhd_unit()):
>>>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>> sets 'devfn' to 0 if PF's devfn > 8.
>>
>>Is that really the relevant line? Since you say PF is an Extended
>>Function, wouldn't
>>
>>    if ( pdev->info.is_extfn )
>>    {
>>        bus = pdev->bus;
>>        devfn = 0;
>>    }
>>
>>be the relevant code? Or else - is is_extfn not being set correctly?
> 
> I think this field is not being set for VF. And here what we want to
> know is whether the PF of this VF is an extended functin. We also can add
> a new field 'is_extfn' in pdev->info.physfn and change the caller in
> linux kernel accordingly. But it will be not compatible with the old kernel.

Wait, no - I did describe things slightly wrongly, and hence perhaps
managed to confuse you (besides myself). For the VF we don't want
to see is_extfn set, but for its PF I'd expect that to be the case.
With that I'd then think looking up the struct pci_dev for the PF is all
it takes to tell apart both cases, the more that I'm not sure ...

>>> --- a/xen/drivers/passthrough/vtd/dmar.c
>>> +++ b/xen/drivers/passthrough/vtd/dmar.c
>>> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>>>      else if ( pdev->info.is_virtfn )
>>>      {
>>>          bus = pdev->info.physfn.bus;
>>> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>> +        /* ARI is not appliable to Root Complex Integrated Endpoints */
>>> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
>>> +             (pdev->type != DEV_TYPE_RC_ENDPOINT) )

... checking VF's type (instead of PF's) here is appropriate / most
compatible.

>>> +            devfn = 0;
>>> +        else
>>> +            devfn = pdev->info.physfn.devfn;
>>>      }
>>
>>While I think I understand some of PCI, I have to admit that the
>>connection to ARI is not at all obvious to me at this place in the
>>sources. Hence I'd appreciate if you would extend the comment.
> 
> Ok. How about this:
> 
> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
> +        /* 
> +         * Use 0 as the devfn to search VT-d unit when the PF is an Extended
> +         * Function (means within an ARI Device, a Function whose Function Number
> +         * is greater than 7).
> +         */
> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
> +             (pci_find_ext_capability(pdev->seg, bus,
> +                         pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) )
> +            devfn = 0;
> +        else
> +            devfn = pdev->info.physfn.devfn;

That's better, but I'm still having some difficulty. In the Linux kernel
we have

	if (pci_dev->is_virtfn) {
		...
	} else
	if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
		add->flags = XEN_PCI_DEV_EXTFN;

which tells me that the mere presence of an ARI capability may not
be enough. Furthermore Linux checks whether devfn is zero in
pci_configure_ari(), not whether PCI_SLOT(devfn) is non-zero -
wouldn't that mean you want to pass a zero devfn to
pci_find_ext_capability() above?

Jan
Chao Gao June 19, 2017, 8:28 a.m. UTC | #5
On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote:
>>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote:
>> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote:
>>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
>>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>>>> we would wrongly use 00:00.0 to search VT-d unit.
>>>> 
>>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the
>>>> PF is an Extended Function, the BDF of one traditional function is
>>>> used. The following line (from acpi_find_matched_drhd_unit()):
>>>>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>>> sets 'devfn' to 0 if PF's devfn > 8.
>>>
>>>Is that really the relevant line? Since you say PF is an Extended
>>>Function, wouldn't
>>>
>>>    if ( pdev->info.is_extfn )
>>>    {
>>>        bus = pdev->bus;
>>>        devfn = 0;
>>>    }
>>>
>>>be the relevant code? Or else - is is_extfn not being set correctly?
>> 
>> I think this field is not being set for VF. And here what we want to
>> know is whether the PF of this VF is an extended functin. We also can add
>> a new field 'is_extfn' in pdev->info.physfn and change the caller in
>> linux kernel accordingly. But it will be not compatible with the old kernel.
>
>Wait, no - I did describe things slightly wrongly, and hence perhaps
>managed to confuse you (besides myself). For the VF we don't want
>to see is_extfn set, but for its PF I'd expect that to be the case.
>With that I'd then think looking up the struct pci_dev for the PF is all
>it takes to tell apart both cases, the more that I'm not sure ...

Impressive!

>
>>>> --- a/xen/drivers/passthrough/vtd/dmar.c
>>>> +++ b/xen/drivers/passthrough/vtd/dmar.c
>>>> @@ -219,7 +219,12 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>>>>      else if ( pdev->info.is_virtfn )
>>>>      {
>>>>          bus = pdev->info.physfn.bus;
>>>> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>>> +        /* ARI is not appliable to Root Complex Integrated Endpoints */
>>>> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
>>>> +             (pdev->type != DEV_TYPE_RC_ENDPOINT) )
>
>... checking VF's type (instead of PF's) here is appropriate / most
>compatible.

It was a mistake. Sorry for this.

>
>>>> +            devfn = 0;
>>>> +        else
>>>> +            devfn = pdev->info.physfn.devfn;
>>>>      }
>>>
>>>While I think I understand some of PCI, I have to admit that the
>>>connection to ARI is not at all obvious to me at this place in the
>>>sources. Hence I'd appreciate if you would extend the comment.
>> 
>> Ok. How about this:
>> 
>> -        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>> +        /* 
>> +         * Use 0 as the devfn to search VT-d unit when the PF is an Extended
>> +         * Function (means within an ARI Device, a Function whose Function Number
>> +         * is greater than 7).
>> +         */
>> +        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
>> +             (pci_find_ext_capability(pdev->seg, bus,
>> +                         pdev->info.physfn.devfn, PCI_EXT_CAP_ID_ARI)) )
>> +            devfn = 0;
>> +        else
>> +            devfn = pdev->info.physfn.devfn;
>
>That's better, but I'm still having some difficulty. In the Linux kernel
>we have
>
>	if (pci_dev->is_virtfn) {
>		...
>	} else
>	if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
>		add->flags = XEN_PCI_DEV_EXTFN;
>
>which tells me that the mere presence of an ARI capability may not
>be enough. Furthermore Linux checks whether devfn is zero in
>pci_configure_ari(), not whether PCI_SLOT(devfn) is non-zero -
>wouldn't that mean you want to pass a zero devfn to
>pci_find_ext_capability() above?

No. pci_configure_ari() is to enable ARI forwarding, which is a feature
of downstream port. ARI capability is a feature of device.
Anyhow, I finally understand what you said, which is actually what I
want. If I can easily handle the _pcidevs_lock to call pci_get_pdev(), it is the
best solution I think.
Chao Gao June 20, 2017, 10:51 a.m. UTC | #6
On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote:
>>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote:
>> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote:
>>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
>>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>>>> we would wrongly use 00:00.0 to search VT-d unit.
>>>> 
>>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the
>>>> PF is an Extended Function, the BDF of one traditional function is
>>>> used. The following line (from acpi_find_matched_drhd_unit()):
>>>>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>>> sets 'devfn' to 0 if PF's devfn > 8.
>>>
>>>Is that really the relevant line? Since you say PF is an Extended
>>>Function, wouldn't
>>>
>>>    if ( pdev->info.is_extfn )
>>>    {
>>>        bus = pdev->bus;
>>>        devfn = 0;
>>>    }
>>>
>>>be the relevant code? Or else - is is_extfn not being set correctly?
>> 
>> I think this field is not being set for VF. And here what we want to
>> know is whether the PF of this VF is an extended functin. We also can add
>> a new field 'is_extfn' in pdev->info.physfn and change the caller in
>> linux kernel accordingly. But it will be not compatible with the old kernel.
>
>Wait, no - I did describe things slightly wrongly, and hence perhaps
>managed to confuse you (besides myself). For the VF we don't want
>to see is_extfn set, but for its PF I'd expect that to be the case.
>With that I'd then think looking up the struct pci_dev for the PF is all
>it takes to tell apart both cases, the more that I'm not sure ...

Hi, Jan. in pci_add_device():

    else if (info->is_virtfn)
    {
        pcidevs_lock();
        pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
        pcidevs_unlock();
        if ( !pdev )
            pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
                           NULL, node);
        pdev_type = "virtual function";
    }

could you recall in which case, we can't get the PF by
pci_get_pdev() above? The reason why I want to know is in this case,
is_extfn of the PF may not be set correctly.

Thanks
Chao
Jan Beulich June 20, 2017, 11:31 a.m. UTC | #7
>>> On 20.06.17 at 12:51, <chao.gao@intel.com> wrote:
> On Mon, Jun 19, 2017 at 01:43:25AM -0600, Jan Beulich wrote:
>>>>> On 19.06.17 at 08:33, <chao.gao@intel.com> wrote:
>>> On Fri, Jun 16, 2017 at 09:52:11AM -0600, Jan Beulich wrote:
>>>>>>> On 16.06.17 at 08:48, <chao.gao@intel.com> wrote:
>>>>> The problem is a VF of RC integrated PF (e.g. PF's BDF is 00:02.0),
>>>>> we would wrongly use 00:00.0 to search VT-d unit.
>>>>> 
>>>>> To search VT-d unit for a VF, the BDF of the PF is used. And If the
>>>>> PF is an Extended Function, the BDF of one traditional function is
>>>>> used. The following line (from acpi_find_matched_drhd_unit()):
>>>>>     devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
>>>>> sets 'devfn' to 0 if PF's devfn > 8.
>>>>
>>>>Is that really the relevant line? Since you say PF is an Extended
>>>>Function, wouldn't
>>>>
>>>>    if ( pdev->info.is_extfn )
>>>>    {
>>>>        bus = pdev->bus;
>>>>        devfn = 0;
>>>>    }
>>>>
>>>>be the relevant code? Or else - is is_extfn not being set correctly?
>>> 
>>> I think this field is not being set for VF. And here what we want to
>>> know is whether the PF of this VF is an extended functin. We also can add
>>> a new field 'is_extfn' in pdev->info.physfn and change the caller in
>>> linux kernel accordingly. But it will be not compatible with the old kernel.
>>
>>Wait, no - I did describe things slightly wrongly, and hence perhaps
>>managed to confuse you (besides myself). For the VF we don't want
>>to see is_extfn set, but for its PF I'd expect that to be the case.
>>With that I'd then think looking up the struct pci_dev for the PF is all
>>it takes to tell apart both cases, the more that I'm not sure ...
> 
> Hi, Jan. in pci_add_device():
> 
>     else if (info->is_virtfn)
>     {
>         pcidevs_lock();
>         pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
>         pcidevs_unlock();
>         if ( !pdev )
>             pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>                            NULL, node);
>         pdev_type = "virtual function";
>     }
> 
> could you recall in which case, we can't get the PF by
> pci_get_pdev() above?

This is just a safety measure to make sure we have a PF device
to refer to (and we don't want to return failure because of that).
I'm not aware of the path actually being needed (and even if it
was taken, I'd expect a subsequent hypercall to report the PF).

Jan
diff mbox

Patch

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 6e7126b..9842d76 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -345,6 +345,7 @@  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
             break;
 
         case DEV_TYPE_PCIe_ENDPOINT:
+        case DEV_TYPE_RC_ENDPOINT:
             pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn),
                                       PCI_FUNC(devfn), PCI_CAP_ID_EXP);
             BUG_ON(!pos);
@@ -854,23 +855,24 @@  int pci_release_devices(struct domain *d)
 
 enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn)
 {
-    u16 class_device, creg;
-    u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
+    uint8_t d = PCI_SLOT(devfn), f = PCI_FUNC(devfn);
     int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP);
+    int pcie_type = -1;
 
-    class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE);
-    switch ( class_device )
+    if ( pos )
+        pcie_type = MASK_EXTR(pci_conf_read16(seg, bus, d, f,
+                                  pos + PCI_EXP_FLAGS), PCI_EXP_FLAGS_TYPE);
+    switch ( pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE) )
     {
     case PCI_CLASS_BRIDGE_PCI:
-        if ( !pos )
-            return DEV_TYPE_LEGACY_PCI_BRIDGE;
-        creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS);
-        switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 )
+        switch ( pcie_type )
         {
         case PCI_EXP_TYPE_PCI_BRIDGE:
             return DEV_TYPE_PCIe2PCI_BRIDGE;
         case PCI_EXP_TYPE_PCIE_BRIDGE:
             return DEV_TYPE_PCI2PCIe_BRIDGE;
+        case -1:
+            return DEV_TYPE_LEGACY_PCI_BRIDGE;
         }
         return DEV_TYPE_PCIe_BRIDGE;
     case PCI_CLASS_BRIDGE_HOST:
@@ -880,7 +882,15 @@  enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn)
         return DEV_TYPE_PCI_UNKNOWN;
     }
 
-    return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI;
+    switch ( pcie_type )
+    {
+    case PCI_EXP_TYPE_RC_END:
+        return DEV_TYPE_RC_ENDPOINT;
+    case -1:
+        return DEV_TYPE_PCI;
+    }
+
+    return DEV_TYPE_PCIe_ENDPOINT;
 }
 
 /*
diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..7c9d17b 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -219,7 +219,12 @@  struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
     else if ( pdev->info.is_virtfn )
     {
         bus = pdev->info.physfn.bus;
-        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
+        /* ARI is not appliable to Root Complex Integrated Endpoints */
+        if ( PCI_SLOT(pdev->info.physfn.devfn) &&
+             (pdev->type != DEV_TYPE_RC_ENDPOINT) )
+            devfn = 0;
+        else
+            devfn = pdev->info.physfn.devfn;
     }
     else
     {
diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c
index 1e0317c..bae0d3b 100644
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -486,6 +486,7 @@  static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
         unsigned int sq;
 
     case DEV_TYPE_PCIe_ENDPOINT:
+    case DEV_TYPE_RC_ENDPOINT:
     case DEV_TYPE_PCIe_BRIDGE:
     case DEV_TYPE_PCIe2PCI_BRIDGE:
     case DEV_TYPE_PCI_HOST_BRIDGE:
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 19328f6..73f3095 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -1493,6 +1493,7 @@  static int domain_context_mapping(struct domain *domain, u8 devfn,
         break;
 
     case DEV_TYPE_PCIe_ENDPOINT:
+    case DEV_TYPE_RC_ENDPOINT:
         if ( iommu_debug )
             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
                    domain->domain_id, seg, bus,
@@ -1644,6 +1645,7 @@  static int domain_context_unmap(struct domain *domain, u8 devfn,
         goto out;
 
     case DEV_TYPE_PCIe_ENDPOINT:
+    case DEV_TYPE_RC_ENDPOINT:
         if ( iommu_debug )
             printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
                    domain->domain_id, seg, bus,
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..4ec74b1 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -67,6 +67,7 @@  struct pci_dev {
     enum pdev_type {
         DEV_TYPE_PCI_UNKNOWN,
         DEV_TYPE_PCIe_ENDPOINT,
+        DEV_TYPE_RC_ENDPOINT,       // Root Complex Integrated Endpoint
         DEV_TYPE_PCIe_BRIDGE,       // PCIe root port, switch
         DEV_TYPE_PCIe2PCI_BRIDGE,   // PCIe-to-PCI/PCIx bridge
         DEV_TYPE_PCI2PCIe_BRIDGE,   // PCI/PCIx-to-PCIe bridge