diff mbox

[v4,3/9] x86/physdev: enable PHYSDEVOP_pci_mmcfg_reserved for PVH Dom0

Message ID 20170630150117.88489-4-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roger Pau Monné June 30, 2017, 3:01 p.m. UTC
So that hotplug (or MMCFG regions not present in the MCFG ACPI table)
can be added at run time by the hardware domain.

When a new MMCFG area is added to a PVH Dom0, Xen will scan it and add
the devices to the hardware domain.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
changes since v3:
 - New in this version.
---
 tools/tests/vpci/emul.h       |  2 --
 xen/arch/x86/hvm/hypercall.c  |  4 ++++
 xen/arch/x86/physdev.c        | 19 +++++++++++++++++++
 xen/drivers/passthrough/pci.c | 37 ++++++++++++++++++++++++++++++++++---
 xen/drivers/vpci/vpci.c       |  4 ++--
 xen/include/xen/pci.h         |  1 +
 6 files changed, 60 insertions(+), 7 deletions(-)

Comments

Jan Beulich July 14, 2017, 10:32 a.m. UTC | #1
>>> On 30.06.17 at 17:01, <roger.pau@citrix.com> wrote:
> So that hotplug (or MMCFG regions not present in the MCFG ACPI table)
> can be added at run time by the hardware domain.

I think the emphasis should be the other way around. I'm rather certain
hotplug of bridges doesn't really work right now anyway; at least
IO-APIC hotplug code is completely missing.

> When a new MMCFG area is added to a PVH Dom0, Xen will scan it and add
> the devices to the hardware domain.

Adding the MMIO regions is certainly necessary, but what's the point of
also scanning the bus and adding the devices? We expect Dom0 to tell us
anyway, and not doing the scan in Xen avoids complications we presently
have in the segment 0 case when Dom0 decides to re-number busses (e.g.
in order to fit in SR-IOV VFs).

> --- a/xen/arch/x86/hvm/hypercall.c
> +++ b/xen/arch/x86/hvm/hypercall.c
> @@ -89,6 +89,10 @@ static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>          if ( !has_pirq(curr->domain) )
>              return -ENOSYS;
>          break;
> +    case PHYSDEVOP_pci_mmcfg_reserved:
> +        if ( !is_hardware_domain(curr->domain) )
> +            return -ENOSYS;
> +        break;

This physdevop (like most ones) is restricted to Dom0 use anyway
(properly expressed via XSM check), so I'd rather see you check
has_vpci() here, in line with e.g. the check visible in context.

> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -559,6 +559,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>  
>          ret = pci_mmcfg_reserved(info.address, info.segment,
>                                   info.start_bus, info.end_bus, info.flags);
> +        if ( ret || !is_hvm_domain(currd) )
> +            break;
> +
> +        /*
> +         * For HVM (PVH) domains try to add the newly found MMCFG to the
> +         * domain.
> +         */
> +        ret = register_vpci_mmcfg_handler(currd, info.address, info.start_bus,
> +                                          info.end_bus, info.segment);
> +        if ( ret == -EEXIST )
> +        {
> +            ret = 0;
> +            break;

I don't really understand this part: Why would handlers be registered
already? If you consider double registration, wouldn't that better
either be detected by pci_mmcfg_reserved() (and the call here avoided
altogether) or the fact indeed be reported back to the caller?

> @@ -1110,6 +1110,37 @@ void __hwdom_init setup_hwdom_pci_devices(
>      pcidevs_unlock();
>  }
>  
> +static int add_device(uint8_t devfn, struct pci_dev *pdev)
> +{
> +    return iommu_add_device(pdev);
> +}

You're discarding devfn here, just for iommu_add_device() to re-do the
phantom function handling. At the very least this is wasteful. Perhaps
you minimally want to call iommu_add_device() only when
devfn == pdev->devfn (if all of this code stays in the first place)?

> +int pci_scan_and_setup_segment(uint16_t segment)
> +{
> +    struct pci_seg *pseg = get_pseg(segment);
> +    struct setup_hwdom ctxt = {
> +        .d = current->domain,
> +        .handler = add_device,
> +    };
> +    int ret;
> +
> +    if ( !pseg )
> +        return -EINVAL;
> +
> +    pcidevs_lock();
> +    ret = _scan_pci_devices(pseg, NULL);
> +    if ( ret )
> +        goto out;
> +
> +    ret = _setup_hwdom_pci_devices(pseg, &ctxt);
> +    if ( ret )
> +        goto out;
> +
> + out:

Please let's avoid such unnecessary goto-s. Even the first one could be
easily avoided without making the code anywhere near unreadable.

Jan
Roger Pau Monné July 20, 2017, 10:23 a.m. UTC | #2
On Fri, Jul 14, 2017 at 04:32:19AM -0600, Jan Beulich wrote:
> >>> On 30.06.17 at 17:01, <roger.pau@citrix.com> wrote:
> > So that hotplug (or MMCFG regions not present in the MCFG ACPI table)
> > can be added at run time by the hardware domain.
> 
> I think the emphasis should be the other way around. I'm rather certain
> hotplug of bridges doesn't really work right now anyway; at least
> IO-APIC hotplug code is completely missing.

IO-APICs can also be hot-plugged? Didn't even know about that...

> > When a new MMCFG area is added to a PVH Dom0, Xen will scan it and add
> > the devices to the hardware domain.
> 
> Adding the MMIO regions is certainly necessary, but what's the point of
> also scanning the bus and adding the devices?

It's not strictly necessary, the same can be accomplished by Dom0
calling PHYSDEVOP_manage_pci_add on each device.

Just thought it wouldn't hurt to do it here, but given your comment
below I'm not sure. I will wait for your reply before deciding what to
do.

> We expect Dom0 to tell us
> anyway, and not doing the scan in Xen avoids complications we presently
> have in the segment 0 case when Dom0 decides to re-number busses (e.g.
> in order to fit in SR-IOV VFs).

Is this renumbering performed by changing the
Primary/Secondary/Subordinate bus number registers in the bridge?

If so we could detect such accesses (by adding traps to type 01h
headers) and react accordingly.

What if Dom0 re-numbers the bus after having already registered the
devices with Xen?

> > --- a/xen/arch/x86/hvm/hypercall.c
> > +++ b/xen/arch/x86/hvm/hypercall.c
> > @@ -89,6 +89,10 @@ static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
> >          if ( !has_pirq(curr->domain) )
> >              return -ENOSYS;
> >          break;
> > +    case PHYSDEVOP_pci_mmcfg_reserved:
> > +        if ( !is_hardware_domain(curr->domain) )
> > +            return -ENOSYS;
> > +        break;
> 
> This physdevop (like most ones) is restricted to Dom0 use anyway
> (properly expressed via XSM check), so I'd rather see you check
> has_vpci() here, in line with e.g. the check visible in context.

Ack.

> > --- a/xen/arch/x86/physdev.c
> > +++ b/xen/arch/x86/physdev.c
> > @@ -559,6 +559,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
> >  
> >          ret = pci_mmcfg_reserved(info.address, info.segment,
> >                                   info.start_bus, info.end_bus, info.flags);
> > +        if ( ret || !is_hvm_domain(currd) )
> > +            break;
> > +
> > +        /*
> > +         * For HVM (PVH) domains try to add the newly found MMCFG to the
> > +         * domain.
> > +         */
> > +        ret = register_vpci_mmcfg_handler(currd, info.address, info.start_bus,
> > +                                          info.end_bus, info.segment);
> > +        if ( ret == -EEXIST )
> > +        {
> > +            ret = 0;
> > +            break;
> 
> I don't really understand this part: Why would handlers be registered
> already? If you consider double registration, wouldn't that better
> either be detected by pci_mmcfg_reserved() (and the call here avoided
> altogether) or the fact indeed be reported back to the caller?

Yes, this can be done in pci_mmcfg_reserved, it's just that so far
pci_mmcfg_reserved doesn't return -EEXIST for duplicated bridges.

> > @@ -1110,6 +1110,37 @@ void __hwdom_init setup_hwdom_pci_devices(
> >      pcidevs_unlock();
> >  }
> >  
> > +static int add_device(uint8_t devfn, struct pci_dev *pdev)
> > +{
> > +    return iommu_add_device(pdev);
> > +}
> 
> You're discarding devfn here, just for iommu_add_device() to re-do the
> phantom function handling. At the very least this is wasteful. Perhaps
> you minimally want to call iommu_add_device() only when
> devfn == pdev->devfn (if all of this code stays in the first place)?

Doesn't the IOMMU also need to know about the phantom functions in
order to add translations for them too?

I assume phantom_stride already takes care of this, so yes, if this
has to stay here a pdev->dev == devfn check should be added.

> > +int pci_scan_and_setup_segment(uint16_t segment)
> > +{
> > +    struct pci_seg *pseg = get_pseg(segment);
> > +    struct setup_hwdom ctxt = {
> > +        .d = current->domain,
> > +        .handler = add_device,
> > +    };
> > +    int ret;
> > +
> > +    if ( !pseg )
> > +        return -EINVAL;
> > +
> > +    pcidevs_lock();
> > +    ret = _scan_pci_devices(pseg, NULL);
> > +    if ( ret )
> > +        goto out;
> > +
> > +    ret = _setup_hwdom_pci_devices(pseg, &ctxt);
> > +    if ( ret )
> > +        goto out;
> > +
> > + out:
> 
> Please let's avoid such unnecessary goto-s. Even the first one could be
> easily avoided without making the code anywhere near unreadable.

Right, that's not a problem.

Thanks, Roger.
Jan Beulich July 28, 2017, 12:31 p.m. UTC | #3
>>> Roger Pau Monne <roger.pau@citrix.com> 07/20/17 12:24 PM >>>
>On Fri, Jul 14, 2017 at 04:32:19AM -0600, Jan Beulich wrote:
>> >>> On 30.06.17 at 17:01, <roger.pau@citrix.com> wrote:
>> > So that hotplug (or MMCFG regions not present in the MCFG ACPI table)
>> > can be added at run time by the hardware domain.
>> 
>> I think the emphasis should be the other way around. I'm rather certain
>> hotplug of bridges doesn't really work right now anyway; at least
>> IO-APIC hotplug code is completely missing.
>
>IO-APICs can also be hot-plugged? Didn't even know about that...

Think of hot adding an entire node.

>> > When a new MMCFG area is added to a PVH Dom0, Xen will scan it and add
>> > the devices to the hardware domain.
>> 
>> Adding the MMIO regions is certainly necessary, but what's the point of
>> also scanning the bus and adding the devices?
>
>It's not strictly necessary, the same can be accomplished by Dom0
>calling PHYSDEVOP_manage_pci_add on each device.
>
>Just thought it wouldn't hurt to do it here, but given your comment
>below I'm not sure. I will wait for your reply before deciding what to
>do.
>
>> We expect Dom0 to tell us
>> anyway, and not doing the scan in Xen avoids complications we presently
>> have in the segment 0 case when Dom0 decides to re-number busses (e.g.
>> in order to fit in SR-IOV VFs).
>
>Is this renumbering performed by changing the
>Primary/Secondary/Subordinate bus number registers in the bridge?

Yes.

>If so we could detect such accesses (by adding traps to type 01h
>headers) and react accordingly.

Yes.

>What if Dom0 re-numbers the bus after having already registered the
>devices with Xen?

The expectation would be for Dom0 to first unregister all devices in the
sub-topology, to the re-numbering, and then re-add them. That doesn't
happen in Linux though, afair.

>> > @@ -1110,6 +1110,37 @@ void __hwdom_init setup_hwdom_pci_devices(
>> >      pcidevs_unlock();
>> >  }
>> >  
>> > +static int add_device(uint8_t devfn, struct pci_dev *pdev)
>> > +{
>> > +    return iommu_add_device(pdev);
>> > +}
>> 
>> You're discarding devfn here, just for iommu_add_device() to re-do the
>> phantom function handling. At the very least this is wasteful. Perhaps
>> you minimally want to call iommu_add_device() only when
>> devfn == pdev->devfn (if all of this code stays in the first place)?
>
>Doesn't the IOMMU also need to know about the phantom functions in
>order to add translations for them too?

Yes, that's why iommu_add_device() and others have a respective loop.

Jan
diff mbox

Patch

diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h
index 1b0217e7e3..047079de4c 100644
--- a/tools/tests/vpci/emul.h
+++ b/tools/tests/vpci/emul.h
@@ -58,8 +58,6 @@  extern struct pci_dev test_pdev;
 
 #include "vpci.h"
 
-#define __hwdom_init
-
 #define has_vpci(d) true
 
 /* Define our own locks. */
diff --git a/xen/arch/x86/hvm/hypercall.c b/xen/arch/x86/hvm/hypercall.c
index e7238ce293..89625d514c 100644
--- a/xen/arch/x86/hvm/hypercall.c
+++ b/xen/arch/x86/hvm/hypercall.c
@@ -89,6 +89,10 @@  static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( !has_pirq(curr->domain) )
             return -ENOSYS;
         break;
+    case PHYSDEVOP_pci_mmcfg_reserved:
+        if ( !is_hardware_domain(curr->domain) )
+            return -ENOSYS;
+        break;
     }
 
     if ( !curr->hcall_compat )
diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index 0eb409758f..6b1c92fa0b 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -559,6 +559,25 @@  ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 
         ret = pci_mmcfg_reserved(info.address, info.segment,
                                  info.start_bus, info.end_bus, info.flags);
+        if ( ret || !is_hvm_domain(currd) )
+            break;
+
+        /*
+         * For HVM (PVH) domains try to add the newly found MMCFG to the
+         * domain.
+         */
+        ret = register_vpci_mmcfg_handler(currd, info.address, info.start_bus,
+                                          info.end_bus, info.segment);
+        if ( ret == -EEXIST )
+        {
+            ret = 0;
+            break;
+        }
+        if ( ret )
+            break;
+
+        ret = pci_scan_and_setup_segment(info.segment);
+
         break;
     }
 
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 3208cd5d71..2d38a5a297 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -924,7 +924,7 @@  out:
     return ret;
 }
 
-bool_t __init pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func)
+bool pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func)
 {
     u32 vendor;
 
@@ -971,7 +971,7 @@  void pci_check_disable_device(u16 seg, u8 bus, u8 devfn)
  * scan pci devices to add all existed PCI devices to alldevs_list,
  * and setup pci hierarchy in array bus2bridge.
  */
-static int __init _scan_pci_devices(struct pci_seg *pseg, void *arg)
+static int _scan_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
     int bus, dev, func;
@@ -1050,7 +1050,7 @@  static void setup_one_hwdom_device(const struct setup_hwdom *ctxt,
                ctxt->d->domain_id, err);
 }
 
-static int __hwdom_init _setup_hwdom_pci_devices(struct pci_seg *pseg, void *arg)
+static int _setup_hwdom_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct setup_hwdom *ctxt = arg;
     int bus, devfn;
@@ -1110,6 +1110,37 @@  void __hwdom_init setup_hwdom_pci_devices(
     pcidevs_unlock();
 }
 
+static int add_device(uint8_t devfn, struct pci_dev *pdev)
+{
+    return iommu_add_device(pdev);
+}
+
+int pci_scan_and_setup_segment(uint16_t segment)
+{
+    struct pci_seg *pseg = get_pseg(segment);
+    struct setup_hwdom ctxt = {
+        .d = current->domain,
+        .handler = add_device,
+    };
+    int ret;
+
+    if ( !pseg )
+        return -EINVAL;
+
+    pcidevs_lock();
+    ret = _scan_pci_devices(pseg, NULL);
+    if ( ret )
+        goto out;
+
+    ret = _setup_hwdom_pci_devices(pseg, &ctxt);
+    if ( ret )
+        goto out;
+
+ out:
+    pcidevs_unlock();
+    return ret;
+}
+
 #ifdef CONFIG_ACPI
 #include <acpi/acpi.h>
 #include <acpi/apei.h>
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index c54de83b82..7d4ecd5fb5 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -33,12 +33,12 @@  struct vpci_register {
     struct list_head node;
 };
 
-int __hwdom_init vpci_add_handlers(struct pci_dev *pdev)
+int vpci_add_handlers(struct pci_dev *pdev)
 {
     unsigned int i;
     int rc = 0;
 
-    if ( !has_vpci(pdev->domain) )
+    if ( !has_vpci(pdev->domain) || pdev->vpci )
         return 0;
 
     pdev->vpci = xzalloc(struct vpci);
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index a9b80e330b..e550effcc9 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -131,6 +131,7 @@  struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_pdev_by_domain(
     struct domain *, int seg, int bus, int devfn);
 void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
+int pci_scan_and_setup_segment(uint16_t segment);
 
 uint8_t pci_conf_read8(
     unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func,